Load Dataset

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load your dataset
data = pd.read_excel('C:/Users/ACER/Downloads/Car Dheko/cleaned_combined_file.xlsx')
data.head()


Unnamed: 0,city,car_links,it,ft,bt,km,transmission,ownerNo,oem,model,...,Miscellaneous - Miscellaneous - Steering Type,Miscellaneous - Miscellaneous - Turning Radius,Miscellaneous - Miscellaneous - Front Brake Type,Miscellaneous - Miscellaneous - Rear Brake Type,Miscellaneous - Miscellaneous - Alloy Wheel Size,Miscellaneous - Miscellaneous - No Door Numbers,Miscellaneous - Miscellaneous - Cargo Volumn,Miscellaneous - Miscellaneous - Acceleration,Miscellaneous - Miscellaneous - Top Speed,price_column
0,Kolkata,https://www.cardekho.com/used-car-details/used...,0,Petrol,Sedan,70000,Automatic,3,Toyota,Toyota Camry,...,Power,5.5,Ventilated Disc,Solid Disc,17.0,4.0,,,,9.75
1,Kolkata,https://www.cardekho.com/buy-used-car-details/...,0,Petrol,Hatchback,23981,Manual,1,Datsun,Datsun RediGO,...,Power,4.7,Disc,Drum,,5.0,222.0,,,2.66
2,Kolkata,https://www.cardekho.com/used-car-details/used...,0,Petrol,SUV,7100,Automatic,1,Renault,Renault Kiger,...,Electric,,Disc,Drum,,5.0,405.0,,,5.95
3,Kolkata,https://www.cardekho.com/used-car-details/used...,0,Diesel,SUV,50000,Automatic,2,Audi,Audi Q3,...,Power,5.9,Ventilated Disc,Drum,16.0,5.0,460.0,8.2,212.0,12.0
4,Kolkata,https://www.cardekho.com/used-car-details/used...,0,Petrol,Hatchback,35629,Automatic,1,Maruti,Maruti Wagon R,...,Power,4.7,Disc,Drum,,5.0,341.0,18.6,,4.25


 Handle Missing Values

In [2]:
# Fill missing numerical values with mean
for column in data.select_dtypes(include=[np.number]).columns:
    data[column].fillna(data[column].mean(), inplace=True)

# Fill missing categorical values with mode
for column in data.select_dtypes(include=['object']).columns:
    data[column].fillna(data[column].mode()[0], inplace=True)


Select Numerical and Categorical Columns

In [3]:
# Identify numerical and categorical columns
numerical_columns = data.select_dtypes(include=[np.number]).columns.tolist()
categorical_columns = data.select_dtypes(include=['object']).columns.tolist()

print("Numerical columns:", numerical_columns)
print("Categorical columns:", categorical_columns)


Categorical columns: ['city', 'car_links', 'ft', 'bt', 'transmission', 'oem', 'model', 'variantName', 'price', 'trendingText.imgUrl', 'trendingText.heading', 'trendingText.desc', 'Engine and Transmission - Engine - Color', 'Engine and Transmission - Engine - Engine Type', 'Engine and Transmission - Engine - Value Configuration', 'Engine and Transmission - Engine - Fuel Suppy System', 'Engine and Transmission - Engine - Turbo Charger', 'Engine and Transmission - Engine - Super Charger', 'Miscellaneous - Miscellaneous - Drive Type', 'Miscellaneous - Miscellaneous - Steering Type', 'Miscellaneous - Miscellaneous - Front Brake Type', 'Miscellaneous - Miscellaneous - Rear Brake Type']


Feature Selection for Numerical Columns and categorical columns

In [4]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.preprocessing import LabelEncoder

# Step 1: Define the target variable
y = data['price_column']  # Replace 'price_column' with your actual target column name

# Step 2: Feature selection for numerical columns
X_numerical = data[numerical_columns].drop('price_column', axis=1)  # Drop the target column from numerical features
selector_numerical = SelectKBest(score_func=f_regression, k=25)  # Select top 10 numerical features
X_numerical_selected = selector_numerical.fit_transform(X_numerical, y)

# Automatically selected numerical columns
selected_numerical_columns = [numerical_columns[i] for i in selector_numerical.get_support(indices=True)]
print("Automatically Selected Numerical Columns:", selected_numerical_columns)

# Step 3: Manually choose the numerical columns from selected ones
manual_selected_numerical_columns = [
   'km', 'ownerNo', 'modelYear', 'Seats', 'Power Steering', 'Mileage', 'Engine', 'Max Power'
]

# Convert selected numerical features into DataFrame and choose manually selected columns
X_numerical_selected_df = pd.DataFrame(X_numerical_selected, columns=selected_numerical_columns)
X_numerical_final = X_numerical_selected_df[manual_selected_numerical_columns]

# Step 4: Label encoding for categorical columns
label_encoder = LabelEncoder()
for column in categorical_columns:
    data[column] = label_encoder.fit_transform(data[column])

# Step 5: Feature selection for categorical columns
X_categorical = data[categorical_columns]
selector_categorical = SelectKBest(score_func=f_regression, k=25)  # Select top 10 categorical features
X_categorical_selected = selector_categorical.fit_transform(X_categorical, y)

# Automatically selected categorical columns
selected_categorical_columns = [categorical_columns[i] for i in selector_categorical.get_support(indices=True)]
print("Automatically Selected Categorical Columns:", selected_categorical_columns)

# Step 6: Manually choose the categorical columns from selected ones
manual_selected_categorical_columns = [
    'city', 'ft', 'bt', 'transmission', 'oem', 'model', 'variantName'
]

# Convert selected categorical features into DataFrame and choose manually selected columns
X_categorical_selected_df = pd.DataFrame(X_categorical_selected, columns=selected_categorical_columns)
X_categorical_final = X_categorical_selected_df[manual_selected_categorical_columns]

# Step 7: Combine the final selected numerical and categorical features into one DataFrame
X_selected_final = pd.concat([X_numerical_final, X_categorical_final], axis=1)

# Output the shape of the final selected features
print("Shape of manually selected features:", X_selected_final.shape)
print("Final selected features:")
print(X_selected_final.head())


Automatically Selected Numerical Columns: ['km', 'ownerNo', 'modelYear', 'centralVariantId', 'Comfort & Convenience - Comfort - Navigation System', 'Mileage', 'Engine', 'Max Power', 'Torque', 'Wheel Size', 'Seats', 'Engine and Transmission - Engine - Displacement', 'Engine and Transmission - Engine - No of Cylinder', 'Engine and Transmission - Engine - Values per Cylinder', 'Dimensions & Capacity - Dimensions - Length', 'Dimensions & Capacity - Dimensions - Width', 'Dimensions & Capacity - Dimensions - Height', 'Dimensions & Capacity - Dimensions - Wheel Base', 'Dimensions & Capacity - Dimensions - Front Tread', 'Dimensions & Capacity - Dimensions - Rear Tread', 'Dimensions & Capacity - Dimensions - Kerb Weight', 'Dimensions & Capacity - Dimensions - Gross Weight', 'Miscellaneous - Miscellaneous - Gear Box', 'Miscellaneous - Miscellaneous - Seating Capacity', 'Miscellaneous - Miscellaneous - Turning Radius', 'Miscellaneous - Miscellaneous - Alloy Wheel Size', 'Miscellaneous - Miscellan



Outlier Detection and Handling using IQR

In [5]:
# Outlier detection and handling using IQR
for column in selected_numerical_columns:
    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Mark outliers
    outliers = data[(data[column] < lower_bound) | (data[column] > upper_bound)]
    print(f"Detected outliers in {column}:")
    print(outliers)


Detected outliers in km:
      city  car_links  it  ft  bt       km  transmission  ownerNo  oem  model  \
190      0       6958   0   1   4   177000             1        1   27    255   
557      0       4501   0   4   7   975000             1        1   12     94   
1274     5       6979   0   1   4   188000             1        2   27    255   
1426     5       6931   0   1   7   170000             1        1   27    251   
1708     5       5515   0   1   8   191867             1        1   18    158   
1715     5       4617   0   1   7   157312             1        4   16    121   
1734     5       4979   0   4   2   152783             1        4   18    136   
1762     5       5335   0   1   4   208978             1        1   18    146   
1764     5       6296   0   1   7   152633             1        1   22    201   
1775     5       6278   0   4   2   164910             1        1   22    199   
1799     5       3472   0   4   8   176550             1        3    7     54   
181

Train-Test Split

In [6]:
from sklearn.model_selection import train_test_split

# Assuming X_selected_final is already defined and y is your target variable
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_selected_final, y, test_size=0.2, random_state=42)

# Print the sizes of the training and testing sets
print("Training set size:", X_train.shape)
print("Testing set size:", X_test.shape)


Training set size: (5833, 14)
Testing set size: (1459, 14)


Model Development

In [7]:
# Initialize machine learning models
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest": RandomForestRegressor(),
    "Gradient Boosting": GradientBoostingRegressor()
}


Model Training and Evaluation

In [8]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
import numpy as np

# Train and evaluate each model
for name, model in models.items():
    # Fit the model on the training data
    model.fit(X_train, y_train)
    
    # Cross-validation
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
    mean_cv_score = -np.mean(cv_scores)  # Convert from negative to positive MSE
    
    # Predict on test set
    y_pred = model.predict(X_test)

    # Calculate performance metrics
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Print the evaluation metrics
    print(f"{name} - MAE: {mae:.2f}, MSE: {mse:.2f}, R²: {r2:.2f}, CV MSE: {mean_cv_score:.2f}")


Linear Regression - MAE: 3.40, MSE: 40.09, R²: 0.62, CV MSE: 36.83
Decision Tree - MAE: 1.84, MSE: 21.31, R²: 0.80, CV MSE: 23.71
Random Forest - MAE: 1.44, MSE: 14.96, R²: 0.86, CV MSE: 13.63
Gradient Boosting - MAE: 1.73, MSE: 14.82, R²: 0.86, CV MSE: 16.34


Hyperparameter Tuning

In [10]:
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.model_selection import GridSearchCV, train_test_split
# from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# # Make sure you've encoded categorical features and that the dataset is clean
# # Use X_train_encoded and X_test_encoded, and the split y_train, y_test

# print("Starting Grid Search...")
# param_grid = {
#     'n_estimators': [50, 100, 150],
#     'max_depth': [None, 5, 10, 20, 30],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4]
# }

# grid_search = GridSearchCV(RandomForestRegressor(), param_grid, cv=5, scoring='neg_mean_squared_error', verbose=3)
# grid_search.fit(X_train_encoded, y_train)
# print("Grid Search Completed!")

# # Check if best estimator is found
# best_rf_model = grid_search.best_estimator_
# print(f"Best Estimator: {best_rf_model}")

# # Predict on the test set and check predictions
# y_pred_best_rf = best_rf_model.predict(X_test_encoded)
# print(f"Predictions: {y_pred_best_rf[:10]}")

# # Evaluate the best model
# best_mae = mean_absolute_error(y_test, y_pred_best_rf)
# best_mse = mean_squared_error(y_test, y_pred_best_rf)
# best_r2 = r2_score(y_test, y_pred_best_rf)

# # Print the evaluation metrics
# print(f"Best Random Forest - MAE: {best_mae:.2f}, MSE: {best_mse:.2f}, R²: {best_r2:.2f}")


Here storing the model in a pickle file

In [12]:
import pickle

# Save the label encoders for categorical columns
encoders = {}
for column in manual_selected_categorical_columns:
    encoder = LabelEncoder()
    data[column] = encoder.fit_transform(data[column])
    encoders[column] = encoder

# Save encoders to a file
with open('encoders.pkl', 'wb') as f:
    pickle.dump(encoders, f)

# Now proceed with model training as usual


In [11]:
import pickle

# Save the model to a file
with open('Random Forest.pkl', 'wb') as file:
    pickle.dump(model, file)
print("Model saved as Random Forest.pkl")


Model saved as Random Forest.pkl
