## Assignement 6 / Car Price Prediction Analysis

In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV


### 1. Loading and Preprocessing


In [3]:
# Load the dataset
df = pd.read_csv('CarPrice_Assignment.csv')

In [9]:
print("Dataset shape:", df.shape)
print("\nFirst 5 rows:")
display(df.head())
print("\nDataset info:")
df.info()
print("\nSummary statistics:")
display(df.describe())
print("\nMissing values per column:")
print(df.isnull().sum())

Dataset shape: (205, 25)

First 5 rows:


Unnamed: 0,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,168.8,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,168.8,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,171.2,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,176.6,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,176.6,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0



Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 25 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   symboling         205 non-null    int64  
 1   CarName           205 non-null    object 
 2   fueltype          205 non-null    object 
 3   aspiration        205 non-null    object 
 4   doornumber        205 non-null    object 
 5   carbody           205 non-null    object 
 6   drivewheel        205 non-null    object 
 7   enginelocation    205 non-null    object 
 8   wheelbase         205 non-null    float64
 9   carlength         205 non-null    float64
 10  carwidth          205 non-null    float64
 11  carheight         205 non-null    float64
 12  curbweight        205 non-null    int64  
 13  enginetype        205 non-null    object 
 14  cylindernumber    205 non-null    object 
 15  enginesize        205 non-null    int64  
 16  fuelsystem        205 non-nul

Unnamed: 0,symboling,wheelbase,carlength,carwidth,carheight,curbweight,enginesize,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
count,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0
mean,0.834146,98.756585,174.049268,65.907805,53.724878,2555.565854,126.907317,3.329756,3.255415,10.142537,104.117073,5125.121951,25.219512,30.75122,13276.710571
std,1.245307,6.021776,12.337289,2.145204,2.443522,520.680204,41.642693,0.270844,0.313597,3.97204,39.544167,476.985643,6.542142,6.886443,7988.852332
min,-2.0,86.6,141.1,60.3,47.8,1488.0,61.0,2.54,2.07,7.0,48.0,4150.0,13.0,16.0,5118.0
25%,0.0,94.5,166.3,64.1,52.0,2145.0,97.0,3.15,3.11,8.6,70.0,4800.0,19.0,25.0,7788.0
50%,1.0,97.0,173.2,65.5,54.1,2414.0,120.0,3.31,3.29,9.0,95.0,5200.0,24.0,30.0,10295.0
75%,2.0,102.4,183.1,66.9,55.5,2935.0,141.0,3.58,3.41,9.4,116.0,5500.0,30.0,34.0,16503.0
max,3.0,120.9,208.1,72.3,59.8,4066.0,326.0,3.94,4.17,23.0,288.0,6600.0,49.0,54.0,45400.0



Missing values per column:
symboling           0
CarName             0
fueltype            0
aspiration          0
doornumber          0
carbody             0
drivewheel          0
enginelocation      0
wheelbase           0
carlength           0
carwidth            0
carheight           0
curbweight          0
enginetype          0
cylindernumber      0
enginesize          0
fuelsystem          0
boreratio           0
stroke              0
compressionratio    0
horsepower          0
peakrpm             0
citympg             0
highwaympg          0
price               0
dtype: int64


In [33]:
# Encode categorical variables
categorical_cols = df.select_dtypes(include=['object']).columns
le = LabelEncoder()
for col in categorical_cols:
    df[col] = le.fit_transform(df[col].astype(str))

In [34]:
# Separate features and target
X = df.drop('price', axis=1)
y = df['price']

In [35]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [36]:
# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [37]:
# Function to evaluate models
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    return {
        'R-squared': r2_score(y_test, y_pred),
        'MSE': mean_squared_error(y_test, y_pred),
        'MAE': mean_absolute_error(y_test, y_pred)
    }

### 2. Model Implementation

#### A. Linear Regression

In [39]:
from sklearn.linear_model import LinearRegression
print("\n=== Linear Regression ===")
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)


=== Linear Regression ===


In [40]:
lr_metrics = evaluate_model(lr, X_test_scaled, y_test)
print("R-squared:", lr_metrics['R-squared'])
print("MSE:", lr_metrics['MSE'])
print("MAE:", lr_metrics['MAE'])


R-squared: 0.836206054937787
MSE: 12930552.618952848
MAE: 2189.017199313628


### B.Decision Tree Regressor

In [41]:
from sklearn.tree import DecisionTreeRegressor
print("\n=== Decision Tree Regressor ===")
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train_scaled, y_train)


=== Decision Tree Regressor ===


In [42]:
dt_metrics = evaluate_model(dt, X_test_scaled, y_test)
print("R-squared:", dt_metrics['R-squared'])
print("MSE:", dt_metrics['MSE'])
print("MAE:", dt_metrics['MAE'])

R-squared: 0.9153274443858332
MSE: 6684391.998338756
MAE: 1832.4674878048781


### C. Random Forest Regressor

In [43]:
from sklearn.ensemble import RandomForestRegressor
print("\n=== Random Forest Regressor ===")
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train_scaled, y_train)


=== Random Forest Regressor ===


In [44]:
rf_metrics = evaluate_model(rf, X_test_scaled, y_test)
print("R-squared:", rf_metrics['R-squared'])
print("MSE:", rf_metrics['MSE'])
print("MAE:", rf_metrics['MAE'])

R-squared: 0.9537064854920896
MSE: 3654596.175905219
MAE: 1332.4984878048779


### D. Gradient Boosting Regressor

In [45]:
from sklearn.ensemble import GradientBoostingRegressor
print("\n=== Gradient Boosting Regressor ===")
gb = GradientBoostingRegressor(random_state=42)
gb.fit(X_train_scaled, y_train)


=== Gradient Boosting Regressor ===


In [46]:
gb_metrics = evaluate_model(gb, X_test_scaled, y_test)
print("R-squared:", gb_metrics['R-squared'])
print("MSE:", gb_metrics['MSE'])
print("MAE:", gb_metrics['MAE'])


R-squared: 0.9321115398855577
MSE: 5359388.01276234
MAE: 1626.492761827474


### E. Support Vector Regressor

In [47]:
from sklearn.svm import SVR
print("\n=== Support Vector Regressor ===")
svr = SVR()
svr.fit(X_train_scaled, y_train)


=== Support Vector Regressor ===


In [48]:
svr_metrics = evaluate_model(svr, X_test_scaled, y_test)
print("R-squared:", svr_metrics['R-squared'])
print("MSE:", svr_metrics['MSE'])
print("MAE:", svr_metrics['MAE'])


R-squared: -0.10006675564418743
MSE: 86843692.926592
MAE: 5696.507678871264


### 3. Model Evaluation

In [49]:
results = {
    'Linear Regression': lr_metrics,
    'Decision Tree': dt_metrics,
    'Random Forest': rf_metrics,
    'Gradient Boosting': gb_metrics,
    'Support Vector': svr_metrics
}

results_df = pd.DataFrame(results).T
print("\n=== Model Comparison ===")
print(results_df)



=== Model Comparison ===
                   R-squared           MSE          MAE
Linear Regression   0.836206  1.293055e+07  2189.017199
Decision Tree       0.915327  6.684392e+06  1832.467488
Random Forest       0.953706  3.654596e+06  1332.498488
Gradient Boosting   0.932112  5.359388e+06  1626.492762
Support Vector     -0.100067  8.684369e+07  5696.507679


In [50]:
# Identify best model
best_model_name = max(results.items(), key=lambda x: x[1]['R-squared'])[0]
print(f"\nBest performing model: {best_model_name}")


Best performing model: Random Forest


### 4. Feature Importance Analysis

#### Feature Importance for Random Forest

In [51]:

feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': rf.feature_importances_
}).sort_values('Importance', ascending=False)
print("\nTop 10 Important Features from Random Forest:")
print(feature_importance.head(10))


Top 10 Important Features from Random Forest:
       Feature  Importance
14  enginesize    0.551483
11  curbweight    0.301507
22  highwaympg    0.045569
19  horsepower    0.027706
1      CarName    0.020822
9    carlength    0.008518
8    wheelbase    0.007621
20     peakrpm    0.006565
17      stroke    0.004830
21     citympg    0.004647


#### Feature Importance For Linear Regression (using coefficients):

In [52]:
lr_coef = pd.DataFrame({
    'Feature': X.columns,
    'Importance': np.abs(lr.coef_)  # Absolute value of coefficients
}).sort_values('Importance', ascending=False)
print("Linear Regression Feature Importance:")
print(lr_coef.head(10))

Linear Regression Feature Importance:
             Feature   Importance
18  compressionratio  4208.505813
2           fueltype  3619.780574
14        enginesize  2939.741125
8          wheelbase  1761.878354
11        curbweight  1693.463856
7     enginelocation  1426.538736
19        horsepower  1301.941641
21           citympg  1160.931412
1            CarName  1118.363763
5            carbody  1093.349926


#### Feature Importance For Gradient Boosting 

In [53]:
gb_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': gb.feature_importances_
}).sort_values('Importance', ascending=False)
print("Gradient Boosting Feature Importance:")
print(gb_importance.head(10))

Gradient Boosting Feature Importance:
       Feature  Importance
14  enginesize    0.614093
11  curbweight    0.153128
22  highwaympg    0.068225
19  horsepower    0.067208
1      CarName    0.020199
9    carlength    0.019487
8    wheelbase    0.015501
15  fuelsystem    0.007926
6   drivewheel    0.006946
21     citympg    0.005262


#### Feature Importance For For SVM

In [54]:
from sklearn.inspection import permutation_importance

svr_importance = permutation_importance(svr, X_test_scaled, y_test, n_repeats=10, random_state=42)
svr_importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': svr_importance.importances_mean
}).sort_values('Importance', ascending=False)
print("SVR Feature Importance (Permutation):")
print(svr_importance_df.head(10))

SVR Feature Importance (Permutation):
       Feature  Importance
11  curbweight    0.000247
22  highwaympg    0.000215
9    carlength    0.000198
21     citympg    0.000197
15  fuelsystem    0.000196
8    wheelbase    0.000178
19  horsepower    0.000155
6   drivewheel    0.000154
16   boreratio    0.000135
14  enginesize    0.000126


### 5. Hyperparameter Tuning for the best model

In [67]:
# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

In [68]:
# Initialize the base model
rf = RandomForestRegressor(random_state=42)

In [69]:
# Initialize GridSearchCV
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    verbose=1
)

In [70]:
# Perform the grid search
print("Starting grid search...")
grid_search.fit(X_train_scaled, y_train)


Starting grid search...
Fitting 5 folds for each of 216 candidates, totalling 1080 fits


  _data = np.array(data, dtype=dtype, copy=copy,


In [71]:
# Get the best parameters
best_params = grid_search.best_params_
print("\nBest parameters found:")
print(best_params)


Best parameters found:
{'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}


In [72]:
# Evaluate the tuned model
tuned_rf = grid_search.best_estimator_
y_pred_tuned = tuned_rf.predict(X_test_scaled)


In [73]:
# Calculate metrics
r2_tuned = r2_score(y_test, y_pred_tuned)
mse_tuned = mean_squared_error(y_test, y_pred_tuned)
mae_tuned = mean_absolute_error(y_test, y_pred_tuned)

print("\n=== Tuned Random Forest Performance ===")
print(f"R-squared: {r2_tuned}")
print(f"MSE: {mse_tuned}")
print(f"MAE: {mae_tuned}")


=== Tuned Random Forest Performance ===
R-squared: 0.9425701701744617
MSE: 4533741.684862483
MAE: 1357.5346666666667


Performance Degradation:

All metrics became worse after tuning

MSE increased by 24% (879k higher)

The model now explains 1.1% less variance in the data (R² decrease)

## Summary:
### Significant Variables in Predicting Car Prices


####  Based on the Random Forest model's feature importance analysis (both before and after tuning), these are the most important factors that determine car prices:

#### Top 10 Important Features from Random Forest:
#### Feature           Importance
14  enginesize         0.551483

11  curbweight         0.301507

22  highwaympg         0.045569

19  horsepower         0.027706

1      CarName         0.020822

9    carlength         0.008518

8    wheelbase         0.007621

20     peakrpm         0.006565

17      stroke         0.004830

21     citympg         0.004647

### How Well These Variables Describe Car Prices?
The R² score of 0.9537 (before tuning) means:
✅ 95.37% of price variation is explained by these features → Excellent fit

✅ Only 4.63% of price differences come from unmeasured factors (brand prestige, rare features, etc.)