In [259]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly as px
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import make_scorer, mean_squared_error, r2_score,mean_absolute_error

In [270]:
# Extracting the data frame from csv file
path2=r'A:\DAIICT_sem1\Used_car_Price_prediction\Data_CSV\cars_24_updated.csv'
cars24_regression= pd.read_csv(path2)
cars24_regression

Unnamed: 0,Price,Engine capacity,KM driven,Ownership,Imperfections,Repainted Parts,car_age,Transmission_Automatic,Transmission_Manual,Fuel type_CNG,Fuel type_Diesel,Fuel type_Petrol
0,561000,1197,25847,2,6,2,7,0,1,0,0,1
1,498000,1197,55511,2,12,1,8,0,1,0,0,1
2,577000,1197,47110,1,4,2,4,0,1,0,0,1
3,1084000,1462,35378,1,2,3,2,1,0,0,0,1
4,603000,1197,91856,1,3,2,5,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
1440,862000,1462,19901,1,1,0,3,0,1,0,0,1
1441,507000,1373,50022,1,5,2,9,0,1,0,0,1
1442,554000,1197,58679,1,24,4,5,0,1,0,0,1
1443,557000,1373,73948,2,4,5,7,0,1,0,0,1


<h1>Split the data in 80:10:10 </h1> 

10p


In [271]:
# Selecting features based on the user's correlation analysis
features = [
     'Engine capacity', 'KM driven', 'Ownership', 
    'Imperfections', 'Repainted Parts', 'Transmission_Automatic', 'Transmission_Manual',
    'Fuel type_CNG',	'Fuel type_Diesel'	,'Fuel type_Petrol','car_age'
]

X_all_features = cars24_regression[features]
y_feature_to_predict = cars24_regression['Price']

In [263]:
X_all_features

Unnamed: 0,Engine capacity,KM driven,Ownership,Imperfections,Repainted Parts,Transmission_Automatic,Transmission_Manual,Fuel type_CNG,Fuel type_Diesel,Fuel type_Petrol,car_age
0,1197,25847,2,6,2,0,1,0,0,1,7
1,1197,55511,2,12,1,0,1,0,0,1,8
2,1197,47110,1,4,2,0,1,0,0,1,4
3,1462,35378,1,2,3,1,0,0,0,1,2
4,1197,91856,1,3,2,0,1,0,0,1,5
...,...,...,...,...,...,...,...,...,...,...,...
1440,1462,19901,1,1,0,0,1,0,0,1,3
1441,1373,50022,1,5,2,0,1,0,0,1,9
1442,1197,58679,1,24,4,0,1,0,0,1,5
1443,1373,73948,2,4,5,0,1,0,0,1,7


# Train Test and Validation split 

In [272]:
X_train, X_test, y_train, y_test = train_test_split(X_all_features, y_feature_to_predict, test_size=0.2, random_state=42)

In [273]:
X_test,X_val,y_test,y_val=train_test_split(X_test,y_test,test_size=0.5,random_state=42)

# Creating a Machine Learning Pipeline

In [278]:
# Define models and pipelines
pipelines = {
    'Ridge Regression': Pipeline([
        ('scaler', StandardScaler()),
        ('model', Ridge())
    ]),
    
    'Polynomial Regression': Pipeline([
        ('poly_features', PolynomialFeatures(degree=2)),
        ('scaler', StandardScaler()),
        ('model', LinearRegression())
    ]),
    'Random Forest': Pipeline([  ('scaler', StandardScaler()),
        ('model', RandomForestRegressor())
    ])
}

# Define hyperparameter grids for tuning inside my pipeline 
param_grids = {
    'Ridge Regression': {
    'model__alpha': [0.001, 0.01, 0.1, 1, 10, 100],  # Regularization strength
    'model__solver': ['auto', 'svd', 'cholesky'],  # Solvers for optimization
    'model__max_iter': [1000, 5000, 10000]  # Maximum number of iterations
                        },
    'Random Forest': {'model__n_estimators': [100, 200, 300],
        'model__max_depth': [None, 10, 20],
        'model__min_samples_split': [2, 5, 10]
                        }
}

# Define evaluation metrics inside my pipeline 
scoring = {
    'MSE': make_scorer(mean_squared_error, greater_is_better=False),
    'R2': make_scorer(r2_score),
    'MAE': make_scorer(mean_absolute_error)
}
# Store the best model for final training
best_models = {}

# Iterate through models and evaluate performance using GridSearchCV
for name, pipeline in pipelines.items():
    print(f"\nTraining {name}...")

    # If hyperparameters exist for the model, use GridSearchCV
    if name in param_grids:
        grid_search = GridSearchCV(pipeline, param_grids[name], scoring=scoring, refit='R2', cv=5, return_train_score=True)
        grid_search.fit(X_train, y_train.to_numpy())
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_
        best_models[name] = grid_search.best_estimator_
    else:
        pipeline.fit(X_train, y_train.to_numpy())
        best_model = pipeline
        best_params = "No hyperparameter tuning"
        best_models[name] = grid_search.best_estimator_

    # Evaluate on validation set
    y_val_pred = best_model.predict(X_val)  # Normalization is automatically applied to X_val
    
    val_mse = mean_squared_error(y_val, y_val_pred)
    val_r2 = r2_score(y_val, y_val_pred)
    val_mae = mean_absolute_error(y_val, y_val_pred)

    print(f"Best Parameters: {best_params}")
    print(f"Validation Mean Squared Error: {val_mse:.2f}")
    print(f"Validation R² Score: {val_r2:.2f}")
    print(f"Validation Mean Absolute Error: {val_mae:.2f}")
    
    
# Evaluate on the test set (without merging it with train/validation)
for name, model in best_models.items():
    print(f"\nEvaluating {name} on Test Set...")

    # Make predictions on the test set
    y_test_pred = model.predict(X_test)
    
    # # Calculate evaluation metrics
    test_mse = mean_squared_error(y_test, y_test_pred)
    test_r2 = r2_score(y_test, y_test_pred)
    test_mae = mean_absolute_error(y_test, y_test_pred)
    
    # # Print the evaluation metrics
    print(f"\n{name} Performance on Test Set:")
    print(f"Test Mean Squared Error: {test_mse:.2f}")
    print(f"Test R² Score: {test_r2:.2f}")
    print(f"Test Mean Absolute Error: {test_mae:.2f}")
    


Training Ridge Regression...
Best Parameters: {'model__alpha': 10, 'model__max_iter': 1000, 'model__solver': 'auto'}
Validation Mean Squared Error: 6020704608.48
Validation R² Score: 0.80
Validation Mean Absolute Error: 59805.84

Training Polynomial Regression...
Best Parameters: No hyperparameter tuning
Validation Mean Squared Error: 4675801101.28
Validation R² Score: 0.84
Validation Mean Absolute Error: 50905.85

Training Random Forest...
Best Parameters: {'model__max_depth': 20, 'model__min_samples_split': 10, 'model__n_estimators': 300}
Validation Mean Squared Error: 4321026123.85
Validation R² Score: 0.86
Validation Mean Absolute Error: 49516.28

Evaluating Ridge Regression on Test Set...

Ridge Regression Performance on Test Set:
Test Mean Squared Error: 7059030608.32
Test R² Score: 0.81
Test Mean Absolute Error: 63008.77

Evaluating Polynomial Regression on Test Set...

Polynomial Regression Performance on Test Set:
Test Mean Squared Error: 7059030608.32
Test R² Score: 0.81
Tes

In [269]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer, mean_squared_error, r2_score,mean_absolute_error
# Define models and pipelines
pipelines = {
    'Linear Regression': Pipeline([  # Scaling is optional for linear regression
        ('model', LinearRegression())
    ]),
    
    'Ridge Regression': Pipeline([
        ('scaler', StandardScaler()),
        ('model', Ridge())
    ]),
    
    'Lasso Regression': Pipeline([
        ('scaler', StandardScaler()),
        ('model', Lasso())
    ]),
    
    'Polynomial Regression': Pipeline([
        ('poly_features', PolynomialFeatures(degree=2)),
        ('scaler', StandardScaler()),
        ('model', LinearRegression())
    ]),
    
    # 'Support Vector Regression': Pipeline([
    #     ('scaler', StandardScaler()),
    #     ('model', SVR())
    # ]),
    
    'Decision Tree': Pipeline([
        ('model', DecisionTreeRegressor())
    ]),
    
    'Random Forest': Pipeline([
        ('model', RandomForestRegressor())
    ])
}

# Define hyperparameter grids for tuning
param_grids = {
    'Ridge Regression': {
    'model__alpha': [0.001, 0.01, 0.1, 1, 10, 100],  # Regularization strength
    'model__solver': ['auto', 'svd', 'cholesky'],  # Solvers for optimization
    'model__max_iter': [1000, 5000, 10000]  # Maximum number of iterations
},
    'Lasso Regression': {'model__alpha': [0.001, 0.01, 0.1, 1, 10, 100],'model__max_iter': [1000, 5000, 10000]},
    'Support Vector Regression':  {
    'model__C': [0.1, 1, 10, 100],  # Regularization parameter
    'model__epsilon': [0.001, 0.01, 0.1, 1],  # Epsilon-tube within which no penalty is given
    'model__gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1]  # Kernel coefficient
},
    'Decision Tree': {'model__max_depth': [3, 5, 10]},
    'Random Forest': {'model__n_estimators': [50, 100], 'model__max_depth': [None, 5, 10]}
}

# Define evaluation metrics
scoring = {
    'MSE': make_scorer(mean_squared_error, greater_is_better=False),
    'R2': make_scorer(r2_score),
    'MAE': make_scorer(mean_absolute_error)
}
# Store the best model for final training
best_models = {}

# Iterate through models and evaluate performance using GridSearchCV
for name, pipeline in pipelines.items():
    print(f"\nTraining {name}...")

    # If hyperparameters exist for the model, use GridSearchCV
    if name in param_grids:
        grid_search = GridSearchCV(pipeline, param_grids[name], scoring=scoring, refit='R2', cv=5, return_train_score=True)
        grid_search.fit(X_train, y_train.ravel())
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_
        best_models[name] = grid_search.best_estimator_
    else:
        pipeline.fit(X_train, y_train.ravel())
        best_model = pipeline
        best_params = "No hyperparameter tuning"
        best_models[name] = grid_search.best_estimator_

    # Evaluate on validation set
    y_val_pred = best_model.predict(X_val)  # Normalization is automatically applied to X_val
    
    val_mse = mean_squared_error(y_val, y_val_pred)
    val_r2 = r2_score(y_val, y_val_pred)
    val_mae = mean_absolute_error(y_val, y_val_pred)

    print(f"Best Parameters: {best_params}")
    print(f"Validation Mean Squared Error: {val_mse:.2f}")
    print(f"Validation R² Score: {val_r2:.2f}")
    print(f"Validation Mean Absolute Error: {val_mae:.2f}")
    
# Train the final model on the entire dataset (training + validation) and evaluate on the test set
for name, model in best_models.items():
    print(f"\nTraining Final {name} on Full Training Data...")

    # Combine training and validation sets
    X_full_train = np.vstack([X_train, X_val])
    y_full_train = np.concatenate([y_train, y_val])
    
    # Train the model on the combined training set
    model.fit(X_full_train, y_full_train.ravel())
    
    # Evaluate on the test set
    y_test_pred = model.predict(X_test)
    test_mse = mean_squared_error(y_test, y_test_pred)
    test_r2 = r2_score(y_test, y_test_pred)
    test_mae = mean_absolute_error(y_test, y_test_pred)

    print(f"Test Mean Squared Error: {test_mse:.2f}")
    print(f"Test R² Score: {test_r2:.2f}")
    print(f"Test Mean Absolute Error: {test_mae:.2f}")


Training Linear Regression...
Best Parameters: No hyperparameter tuning
Validation Mean Squared Error: 6069961262.08
Validation R² Score: 0.80
Validation Mean Absolute Error: 60079.71

Training Ridge Regression...


  pipeline.fit(X_train, y_train.ravel())
  grid_search.fit(X_train, y_train.ravel())


Best Parameters: {'model__alpha': 10, 'model__max_iter': 1000, 'model__solver': 'auto'}
Validation Mean Squared Error: 6020704608.48
Validation R² Score: 0.80
Validation Mean Absolute Error: 59805.84

Training Lasso Regression...


  grid_search.fit(X_train, y_train.ravel())


Best Parameters: {'model__alpha': 100, 'model__max_iter': 1000}
Validation Mean Squared Error: 6065293695.19
Validation R² Score: 0.80
Validation Mean Absolute Error: 60035.60

Training Polynomial Regression...
Best Parameters: No hyperparameter tuning
Validation Mean Squared Error: 4675801101.28
Validation R² Score: 0.84
Validation Mean Absolute Error: 50905.85

Training Decision Tree...
Best Parameters: {'model__max_depth': 5}
Validation Mean Squared Error: 5455647962.88
Validation R² Score: 0.82
Validation Mean Absolute Error: 56488.70

Training Random Forest...


  pipeline.fit(X_train, y_train.ravel())
  grid_search.fit(X_train, y_train.ravel())
  grid_search.fit(X_train, y_train.ravel())


Best Parameters: {'model__max_depth': 10, 'model__n_estimators': 100}
Validation Mean Squared Error: 4217738317.21
Validation R² Score: 0.86
Validation Mean Absolute Error: 47949.54

Training Final Linear Regression on Full Training Data...




Test Mean Squared Error: 7021047144.51
Test R² Score: 0.81
Test Mean Absolute Error: 59906.66

Training Final Ridge Regression on Full Training Data...
Test Mean Squared Error: 7078573891.66
Test R² Score: 0.81
Test Mean Absolute Error: 62585.63

Training Final Lasso Regression on Full Training Data...
Test Mean Squared Error: 7084890588.55
Test R² Score: 0.81
Test Mean Absolute Error: 62782.39

Training Final Polynomial Regression on Full Training Data...
Test Mean Squared Error: 7084890588.55
Test R² Score: 0.81
Test Mean Absolute Error: 62782.39

Training Final Decision Tree on Full Training Data...
Test Mean Squared Error: 8859276475.15
Test R² Score: 0.76
Test Mean Absolute Error: 67949.40

Training Final Random Forest on Full Training Data...
Test Mean Squared Error: 6942578480.83
Test R² Score: 0.81
Test Mean Absolute Error: 59079.19


