In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

READING THE DATASET

In [30]:
# Load the car listings from car_data.csv (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer="car_data.csv")

MODIFYING THE DATASET

In [36]:
# Preview the first rows to confirm the file parsed as expected.
print(df.head())


# Drop the 'Owner' column; it is not among the features selected for the model.
# errors='ignore' keeps this cell re-runnable: `df` is reassigned here, so on a
# second execution 'Owner' is already gone and a plain drop would raise KeyError.
df = df.drop(columns=['Owner'], errors='ignore')

# Count missing values per column.
print(df.isnull().sum())



  Car_Name  Year  Selling_Price  Present_Price  Driven_kms Fuel_Type  \
0     ritz  2014           3.35           5.59       27000    Petrol   
1      sx4  2013           4.75           9.54       43000    Diesel   
2     ciaz  2017           7.25           9.85        6900    Petrol   
3  wagon r  2011           2.85           4.15        5200    Petrol   
4    swift  2014           4.60           6.87       42450    Diesel   

  Selling_type Transmission  Owner  
0       Dealer       Manual      0  
1       Dealer       Manual      0  
2       Dealer       Manual      0  
3       Dealer       Manual      0  
4       Dealer       Manual      0  
Car_Name         0
Year             0
Selling_Price    0
Present_Price    0
Driven_kms       0
Fuel_Type        0
Selling_type     0
Transmission     0
dtype: int64


SPLITTING DATA

In [39]:
# Identify features (X) and target variable (y)
X = df[['Year', 'Present_Price', 'Driven_kms', 'Fuel_Type', 'Selling_type', 'Transmission']]
y = df['Selling_Price']

# One-hot encode the categorical columns. Encoding happens BEFORE the split,
# so the training and testing partitions are cut from the same encoded frame
# and therefore share the same dummy columns.
X = pd.get_dummies(X)

# Split the data into training and testing sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Guarantee the test frame has exactly the training columns, in training order.
# reindex returns a new frame (no assignment into a slice of X) and fills any
# column absent from the test partition with 0; with the encoding above this is
# a no-op safeguard rather than a required repair.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

INITIALISING RANDOM FOREST REGRESSOR

In [40]:
# Hyperparameters for the forest: 100 trees, fixed seed so results are repeatable.
forest_params = {"n_estimators": 100, "random_state": 42}
rf_regressor = RandomForestRegressor(**forest_params)

# Fit the forest on the training partition.
rf_regressor.fit(X_train, y_train)

MAKING PREDICTIONS ON THE TESTING SET

In [41]:
# Predict selling prices for the held-out rows.
y_pred = rf_regressor.predict(X_test)

# Score the predictions against the true test targets.
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

# Report both metrics, one per line.
print(f"Mean Squared Error: {mse:.2f}", f"R-squared: {r2:.2f}", sep="\n")

Mean Squared Error: 0.86
R-squared: 0.96


FEATURE NAMES

In [42]:
# Show the column order of the training frame.
print("Training Feature Names:", X_train.columns, sep="\n")

# Show the column order of the testing frame (preceded by a blank line).
print("\nTesting Feature Names:", X_test.columns, sep="\n")

Training Feature Names:
Index(['Year', 'Present_Price', 'Driven_kms', 'Fuel_Type_CNG',
       'Fuel_Type_Diesel', 'Fuel_Type_Petrol', 'Selling_type_Dealer',
       'Selling_type_Individual', 'Transmission_Automatic',
       'Transmission_Manual'],
      dtype='object')

Testing Feature Names:
Index(['Year', 'Present_Price', 'Driven_kms', 'Fuel_Type_CNG',
       'Fuel_Type_Diesel', 'Fuel_Type_Petrol', 'Selling_type_Dealer',
       'Selling_type_Individual', 'Transmission_Automatic',
       'Transmission_Manual'],
      dtype='object')


TESTING THE MODEL

In [44]:
# Test the model with custom data.
# Describe the car with the same raw columns the training features were built
# from, instead of hand-typing every one-hot column: this way the categories
# cannot contradict each other and the column list cannot drift from training.
custom_raw = pd.DataFrame({
    'Year': [2018],
    'Present_Price': [10.0],
    'Driven_kms': [50000],
    'Fuel_Type': ['Petrol'],
    'Selling_type': ['Individual'],
    'Transmission': ['Manual']
})

# One-hot encode the single row, then align it to the training columns:
# reindex puts the columns in training order and fills every dummy column that
# this row did not produce (e.g. the other fuel types) with 0.
custom_data = pd.get_dummies(custom_raw).reindex(columns=X_train.columns, fill_value=0)

custom_prediction = rf_regressor.predict(custom_data)

print(f"Predicted Selling Price for Custom Data: {custom_prediction[0]:.2f}")

Predicted Selling Price for Custom Data: 7.84
