In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [9]:
# Load the dataset
url = "C:\\Users\\HP\\Downloads\\CarPrice_Assignment.csv"


df = pd.read_csv(url)

In [11]:
# Display basic information and summary statistics
print(df.info())
print(df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   car_ID            205 non-null    int64  
 1   symboling         205 non-null    int64  
 2   CarName           205 non-null    object 
 3   fueltype          205 non-null    object 
 4   aspiration        205 non-null    object 
 5   doornumber        205 non-null    object 
 6   carbody           205 non-null    object 
 7   drivewheel        205 non-null    object 
 8   enginelocation    205 non-null    object 
 9   wheelbase         205 non-null    float64
 10  carlength         205 non-null    float64
 11  carwidth          205 non-null    float64
 12  carheight         205 non-null    float64
 13  curbweight        205 non-null    int64  
 14  enginetype        205 non-null    object 
 15  cylindernumber    205 non-null    object 
 16  enginesize        205 non-null    int64  
 1

In [13]:
# Drop irrelevant columns and handle missing values (customize based on your dataset)
df.dropna(inplace=True)  


In [15]:
# Encoding categorical variables if present
df = pd.get_dummies(df, drop_first=True)

In [17]:
# Feature scaling
scaler = StandardScaler()
numerical_features = df.select_dtypes(include=['float64', 'int64']).columns
df[numerical_features] = scaler.fit_transform(df[numerical_features])

In [19]:
# Split the dataset into features and target variable
X = df.drop('price', axis=1) 
y = df['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [21]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR

# Initialize the models
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree Regressor': DecisionTreeRegressor(random_state=42),
    'Random Forest Regressor': RandomForestRegressor(random_state=42),
    'Gradient Boosting Regressor': GradientBoostingRegressor(random_state=42),
    'Support Vector Regressor': SVR()
}

In [25]:
# Initialize the models
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree Regressor': DecisionTreeRegressor(random_state=42),
    'Random Forest Regressor': RandomForestRegressor(random_state=42),
    'Gradient Boosting Regressor': GradientBoostingRegressor(random_state=42),
    'Support Vector Regressor': SVR()
}

In [27]:
# Train each model and store results
model_performance = {}

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    model_performance[model_name] = {
        'R-squared': r2_score(y_test, y_pred),
        'MSE': mean_squared_error(y_test, y_pred),
        'MAE': mean_absolute_error(y_test, y_pred)
    }

In [29]:
# Display model performance
model_performance_df = pd.DataFrame(model_performance).T
print(model_performance_df)

                             R-squared       MSE       MAE
Linear Regression            -1.835248  3.093010  1.064698
Decision Tree Regressor       0.862314  0.150204  0.265689
Random Forest Regressor       0.940599  0.064801  0.180743
Gradient Boosting Regressor   0.923781  0.083148  0.202019
Support Vector Regressor      0.855732  0.157384  0.275968


In [31]:
# For Random Forest Regressor feature importance
rf_model = models['Random Forest Regressor']
importances = rf_model.feature_importances_

In [33]:
# Creating a DataFrame for feature importance
feature_importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

print(feature_importance_df)

                               Feature  Importance
7                           enginesize    0.631010
6                           curbweight    0.153259
14                          highwaympg    0.109254
11                          horsepower    0.024475
0                               car_ID    0.012151
..                                 ...         ...
85                 CarName_nissan dayz    0.000000
97                CarName_nissan titan    0.000000
48           CarName_dodge monaco (sw)    0.000000
45   CarName_dodge coronet custom (sw)    0.000000
110             CarName_porsche boxter    0.000000

[190 rows x 2 columns]


In [35]:
from sklearn.model_selection import GridSearchCV

# Example: Hyperparameter tuning for Random Forest Regressor
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, cv=5, scoring='r2', n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best parameters found:", grid_search.best_params_)
print("Best R-squared score:", grid_search.best_score_)

Best parameters found: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}
Best R-squared score: 0.894231961918929


In [36]:
# Train final model with best parameters if needed
best_rf_model = grid_search.best_estimator_
best_rf_model.fit(X_train, y_train)
y_pred_best = best_rf_model.predict(X_test)
print("Final R-squared:", r2_score(y_test, y_pred_best))

Final R-squared: 0.9405990768228465
