In [2]:
# Import required modules
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import RandomizedSearchCV
import joblib
from sklearn.feature_selection import RFE

In [3]:
# Read the data and store it in a variable "df"
df = pd.read_csv('final_data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7647 entries, 0 to 7646
Data columns (total 55 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   it                             7647 non-null   int64  
 1   bt                             7647 non-null   object 
 2   km                             7647 non-null   float64
 3   transmission                   7647 non-null   object 
 4   ownerNo                        7647 non-null   int64  
 5   oem                            7647 non-null   object 
 6   model                          7647 non-null   object 
 7   modelYear                      7647 non-null   int64  
 8   centralVariantId               7647 non-null   int64  
 9   variantName                    7647 non-null   object 
 10  price                          7647 non-null   float64
 11  Registration Year              7647 non-null   int64  
 12  Insurance Validity             7647 non-null   o

In [4]:
df.head(2)

Unnamed: 0,it,bt,km,transmission,ownerNo,oem,model,modelYear,centralVariantId,variantName,...,Rear Brake Type,Top Speed,Acceleration,Tyre Type,No Door Numbers,Cargo Volumn,Wheel Size,Alloy Wheel Size,city,Features Count
0,0,Hatchback,120000.0,Manual,3,Maruti,Maruti Celerio,2015,3979,VXI,...,Drum,150.0,15.05,"Tubeless, Radial",5.0,235.0,14.0,14.0,bangalore,8
1,0,SUV,32706.0,Manual,2,Ford,Ford Ecosport,2018,6087,1.5 Petrol Titanium BSIV,...,Drum,182.0,13.5,"Tubeless, Radial",4.0,352.0,16.0,16.0,bangalore,9


In [5]:
# Extract the categorical columns
cat_cols = df.select_dtypes(include='object').columns
cat_cols

Index(['bt', 'transmission', 'oem', 'model', 'variantName',
       'Insurance Validity', 'Fuel Type', 'RTO', 'Color', 'Engine Type',
       'Value Configuration', 'Fuel Suppy System', 'Turbo Charger',
       'Super Charger', 'Drive Type', 'Steering Type', 'Front Brake Type',
       'Rear Brake Type', 'Tyre Type', 'city'],
      dtype='object')

In [6]:
# Extract the numerical columns
cont_cols = df.select_dtypes(include=['float64', 'int64']).columns
cont_cols

Index(['it', 'km', 'ownerNo', 'modelYear', 'centralVariantId', 'price',
       'Registration Year', 'Seats', 'Engine Displacement',
       'Comfort & Convenience', 'Interior', 'Exterior', 'Safety',
       'Entertainment & Communication', 'Mileage', 'Max Power', 'Torque',
       'No of Cylinder', 'Values per Cylinder', 'Length', 'Width', 'Height',
       'Wheel Base', 'Front Tread', 'Rear Tread', 'Kerb Weight', 'Gear Box',
       'Turning Radius', 'Top Speed', 'Acceleration', 'No Door Numbers',
       'Cargo Volumn', 'Wheel Size', 'Alloy Wheel Size', 'Features Count'],
      dtype='object')

In [7]:
# Encode categorical features
encoder = LabelEncoder()
encoded_mappings = {}
for i in cat_cols:
    df[i] = encoder.fit_transform(df[i])
    encoded_mappings[i] = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))

for col, mapping in encoded_mappings.items():
    print(f"Encoded values for '{col}':")
    for original, encoded in mapping.items():
        print(f"  {original} -> {encoded}")
    print()

# Save the encoded mappings in a .pkl file
joblib.dump(encoded_mappings, "encoded_mappings.pkl")


Encoded values for 'bt':
  Coupe -> 0
  Hatchback -> 1
  MUV -> 2
  Minivans -> 3
  Pickup Trucks -> 4
  SUV -> 5
  Sedan -> 6
  Wagon -> 7

Encoded values for 'transmission':
  Automatic -> 0
  Manual -> 1

Encoded values for 'oem':
  Audi -> 0
  BMW -> 1
  Chevrolet -> 2
  Citroen -> 3
  Datsun -> 4
  Fiat -> 5
  Ford -> 6
  Honda -> 7
  Hyundai -> 8
  Isuzu -> 9
  Jaguar -> 10
  Jeep -> 11
  Kia -> 12
  Land Rover -> 13
  MG -> 14
  Mahindra -> 15
  Maruti -> 16
  Mercedes-Benz -> 17
  Mini -> 18
  Mitsubishi -> 19
  Nissan -> 20
  Opel -> 21
  Renault -> 22
  Skoda -> 23
  Tata -> 24
  Toyota -> 25
  Volkswagen -> 26
  Volvo -> 27

Encoded values for 'model':
  Audi A3 -> 0
  Audi A4 -> 1
  Audi A6 -> 2
  Audi Q2 -> 3
  Audi Q3 -> 4
  Audi Q3 Sportback -> 5
  Audi Q5 -> 6
  Audi Q7 -> 7
  BMW 1 Series -> 8
  BMW 2 Series -> 9
  BMW 3 Series -> 10
  BMW 3 Series GT -> 11
  BMW 3 Series Gran Limousine -> 12
  BMW 5 Series -> 13
  BMW 6 Series -> 14
  BMW X1 -> 15
  BMW X3 -> 16
  BMW

['encoded_mappings.pkl']

In [8]:
# Split dataset into features and target
X = df.drop(columns=['price'])
y = df['price']

In [9]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# Initialize models
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'ElasticNet': ElasticNet(),
    'Support Vector Regression (SVR)': SVR(),
    'K-Nearest Neighbors (KNN)': KNeighborsRegressor(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42)
}

# Store evaluation metrics for each model
evaluation_results = []

# Iterate over models
for name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions
    train_preds = model.predict(X_train)
    test_pred = model.predict(X_test)
    
    # Compute metrics
    train_mae = mean_absolute_error(y_train, train_preds)
    train_r2 = r2_score(y_train, train_preds)
    test_mae = mean_absolute_error(y_test, test_pred)
    test_r2 = r2_score(y_test, test_pred)
    
    # Append metrics to results
    evaluation_results.append({
        'Model': name,
        'Train MAE': train_mae,
        'Train R2 Score': train_r2,
        'Test MAE' : test_mae,
        'Test R2 Score' : test_r2
    })

# Convert results to a DataFrame for better readability
evaluation_df = pd.DataFrame(evaluation_results)

  model = cd_fast.enet_coordinate_descent(


In [11]:
evaluation_df

Unnamed: 0,Model,Train MAE,Train R2 Score,Test MAE,Test R2 Score
0,Linear Regression,247448.287614,0.790847,247060.935734,0.803653
1,Ridge Regression,247461.448835,0.79084,246987.48424,0.803765
2,Lasso Regression,247448.187773,0.790847,247060.520356,0.803654
3,ElasticNet,247494.015615,0.775299,249917.64459,0.790964
4,Support Vector Regression (SVR),468594.207593,-0.085748,516447.976903,-0.093154
5,K-Nearest Neighbors (KNN),220516.309531,0.748458,302761.817124,0.595689
6,Random Forest,36088.332317,0.991894,96994.436584,0.957258
7,Gradient Boosting,93773.680064,0.969452,113097.812628,0.957117
8,XGBoost,29329.348518,0.997849,90438.141979,0.963288


In [12]:
# Perform Recursive Feature Elimination (RFE) using XgBoost as the base model
base_model_xg = XGBRegressor(random_state=42)
rfe = RFE(estimator=base_model_xg, n_features_to_select=10)
rfe.fit(X_train, y_train)

In [13]:
selected_features_xg = list(X_train.columns[rfe.support_])
selected_features_xg.append('oem')
selected_features_xg.append('model')
selected_features_xg.append('ownerNo')
selected_features_xg.append('modelYear')
print("Selected Features:", selected_features_xg)

Selected Features: ['Registration Year', 'Safety', 'Max Power', 'Width', 'Wheel Base', 'Gear Box', 'Turning Radius', 'Acceleration', 'Tyre Type', 'Wheel Size', 'oem', 'model', 'ownerNo', 'modelYear']


In [14]:
# Filter dataset to include only selected features
X_train_selected = X_train[selected_features_xg]
X_test_selected = X_test[selected_features_xg]

In [15]:
# perform hyperparameter tuning for XgBoost using RandomizedSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [3, 5, 7, 10],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.3],
    'reg_alpha': [0, 0.1, 1],
    'reg_lambda': [1, 1.5, 2]
}

# Initialize the model
xgb = XGBRegressor(random_state=42)

# Perform RandomizedSearchCV
random_search_xgb = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_grid,
    n_iter=50,
    scoring='r2',
    cv=5,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

# Fit the random search to the data
random_search_xgb.fit(X_train_selected, y_train)

# Best parameters and score
print("Best Parameters:", random_search_xgb.best_params_)
print("Best R2 Score:", random_search_xgb.best_score_)

# Evaluate the best model on the test set
best_model_xgb = random_search_xgb.best_estimator_
y_pred = best_model_xgb.predict(X_test_selected)
xgb_mae = mean_absolute_error(y_test, y_pred)
xgb_r2 = r2_score(y_test, y_pred)
print(f"MAE: {xgb_mae}, R2 Score: {xgb_r2}")


Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best Parameters: {'subsample': 0.8, 'reg_lambda': 1.5, 'reg_alpha': 0.1, 'n_estimators': 300, 'max_depth': 5, 'learning_rate': 0.05, 'gamma': 0.3, 'colsample_bytree': 0.8}
Best R2 Score: 0.9381214694941802
MAE: 110544.36548713235, R2 Score: 0.9614408864291027


In [16]:
results = {
    'Model': ['XGBoost'],
    'MAE': [xgb_mae],
    'R2 Score': [xgb_r2]
}
results_df = pd.DataFrame(results)
print(results_df)

     Model            MAE  R2 Score
0  XGBoost  110544.365487  0.961441


In [17]:
# Best Parameters: {'subsample': 0.8, 'reg_lambda': 1.5, 'reg_alpha': 0.1, 'n_estimators': 300, 'max_depth': 5, 'learning_rate': 0.05, 'gamma': 0.3, 'colsample_bytree': 0.8}

# Train the best model using best parameters

best_xgb_params = random_search_xgb.best_params_

best_xgb_model = XGBRegressor(
    n_estimators=best_xgb_params['n_estimators'],
    max_depth=best_xgb_params['max_depth'],
    learning_rate=best_xgb_params['learning_rate'],
    subsample=best_xgb_params['subsample'],
    colsample_bytree=best_xgb_params['colsample_bytree'],
    random_state=42
)

best_xgb_model.fit(X_train_selected, y_train)

y_train_pred_xgb = best_xgb_model.predict(X_train_selected)

mae_train_xgb = mean_absolute_error(y_train, y_train_pred_xgb)
r2_train_xgb = r2_score(y_train, y_train_pred_xgb)

y_test_pred_xgb = best_xgb_model.predict(X_test_selected)

mae_test_xgb = mean_absolute_error(y_test, y_test_pred_xgb)
r2_test_xgb = r2_score(y_test, y_test_pred_xgb)

best_xgb_params_results = {
    'Model': ['XGBoost'],
    'Train MAE': [mae_train_xgb],
    'Train R2 Score': [r2_train_xgb],
    'Test MAE' : [mae_test_xgb],
    'Test R2 Score' : [r2_test_xgb]
}
result = pd.DataFrame(best_xgb_params_results)
print(result)


     Model     Train MAE  Train R2 Score       Test MAE  Test R2 Score
0  XGBoost  86588.689924        0.978481  111095.056306       0.960838


In [18]:
# Save the model
joblib.dump(best_xgb_model, 'carprice_prediction_ml_model.pkl')

['carprice_prediction_ml_model.pkl']

In [19]:
# Load the model
# testing 1
model = joblib.load('carprice_prediction_ml_model.pkl')

# Test prediction
sample_data = [[2015, 20, 88, 1700, 2400, 5, 4.8, 14.5, 1, 14, 16, 152, 1, 2015]]
print("Prediction:", model.predict(sample_data))

Prediction: [454685.34]


In [20]:
# Test prediction
# testing 2
sample_data = [[2015, 35, 170, 2000, 2700, 8, 5.8, 8, 8, 17, 0, 1, 1, 2015]]
print("Prediction:", model.predict(sample_data))

Prediction: [2204282.5]
