In [42]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.svm import SVR

In [43]:
#1 Loading and Preprocessing
df=pd.read_csv("C:/Users/Lenovo/Downloads/CarPrice_Assignment.csv")

In [44]:
df

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.40,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.40,8.0,115,5500,18,22,17450.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,201,-1,volvo 145e (sw),gas,std,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,9.5,114,5400,23,28,16845.0
201,202,-1,volvo 144ea,gas,turbo,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,8.7,160,5300,19,25,19045.0
202,203,-1,volvo 244dl,gas,std,four,sedan,rwd,front,109.1,...,173,mpfi,3.58,2.87,8.8,134,5500,18,23,21485.0
203,204,-1,volvo 246,diesel,turbo,four,sedan,rwd,front,109.1,...,145,idi,3.01,3.40,23.0,106,4800,26,27,22470.0


In [45]:
df.isnull().sum()

car_ID              0
symboling           0
CarName             0
fueltype            0
aspiration          0
doornumber          0
carbody             0
drivewheel          0
enginelocation      0
wheelbase           0
carlength           0
carwidth            0
carheight           0
curbweight          0
enginetype          0
cylindernumber      0
enginesize          0
fuelsystem          0
boreratio           0
stroke              0
compressionratio    0
horsepower          0
peakrpm             0
citympg             0
highwaympg          0
price               0
dtype: int64

In [46]:
#seperate features and target
X = df.drop('price', axis=1) 
y = df['price']

In [47]:
categorical_cols = X.select_dtypes(include=['object']).columns
le = LabelEncoder()
for col in categorical_cols:
    X[col] = le.fit_transform(X[col])

In [48]:
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
scaler = StandardScaler()
X[numerical_cols] = scaler.fit_transform(X[numerical_cols])

In [49]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Data preprocessing completed.", X_train.shape)

Data preprocessing completed. (164, 25)


In [50]:
#2 Model Implementation
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'SVR': SVR()
}
# Train and predict with each model
predictions = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    predictions[name] = model.predict(X_test)

print("Models trained successfully.")

Models trained successfully.


In [51]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Compare the performance of all the models
results = {}
for name, y_pred in predictions.items():
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    results[name] = {'R-squared': r2, 'MSE': mse, 'MAE': mae}

# Display results
results_df = pd.DataFrame(results).T
print("Model Evaluation Results:\n", results_df)

# Identify the best model (highest R-squared, lowest MSE/MAE)
best_model = results_df['R-squared'].idxmax()
print(f"\nBest performing model: {best_model}")
print(f"Reason: {best_model} has the highest R-squared ({results_df.loc[best_model,'R-squared']:.4f})") 

Model Evaluation Results:
                    R-squared           MSE          MAE
Linear Regression   0.844116  1.230612e+07  2087.306212
Decision Tree       0.879253  9.532216e+06  2090.699195
Random Forest       0.956862  3.405525e+06  1309.526171
Gradient Boosting   0.933109  5.280630e+06  1595.420951
SVR                -0.102042  8.699963e+07  5707.944787

Best performing model: Random Forest
Reason: Random Forest has the highest R-squared (0.9569)


In [67]:
best_model_name = 'Random Forest'  # Replace with actual best model
best_model_instance = models[best_model_name]

# Feature Importance
importances = best_model_instance.feature_importances_
feature_names = X.columns
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Display top 10 features
print("Top 10 Significant Features Affecting Car Prices:\n", feature_importance_df.head(10))


Top 10 Significant Features Affecting Car Prices:
        Feature  Importance
16  enginesize    0.556389
13  curbweight    0.290457
24  highwaympg    0.044403
21  horsepower    0.026663
0       car_ID    0.015699
11    carwidth    0.012077
2      CarName    0.009800
10   carlength    0.007784
9    wheelbase    0.006116
23     citympg    0.005177


In [53]:
# Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV
# Define parameter grid for Random Forest
param_grid = {
    'n_estimators': [100,200],
    'max_depth': [10,20],
    'min_samples_split': [2,5]
}

# Perform Grid Search
grid_search = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, cv=5, scoring='r2', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best parameters and score
print("Best Parameters:", grid_search.best_params_)
print("Best R-squared after tuning:", grid_search.best_score_)


# Evaluate tuned model on test set
tuned_model = grid_search.best_estimator_
y_pred_tuned = tuned_model.predict(X_test)
r2_tuned = r2_score(y_test, y_pred_tuned)
mse_tuned = mean_squared_error(y_test, y_pred_tuned)
mae_tuned = mean_absolute_error(y_test, y_pred_tuned)

print(f"Tuned Model Performance: R-squared={r2_tuned:.4f}, MSE={mse_tuned:.4f}, MAE={mae_tuned:.4f}")
# check whether the performance of the model has increased
print(f"Improvement in R-squared: {(r2_tuned - results[best_model_name]['R-squared']):.4f}")

Best Parameters: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 200}
Best R-squared after tuning: 0.8909996001852877
Tuned Model Performance: R-squared=0.9573, MSE=3373246.0542, MAE=1282.7833
Improvement in R-squared: 0.0004
