In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder


# URL for downloading the CSV from Google Drive
url = 'https://drive.google.com/uc?export=download&id=1FHmYNLs9v0Enc-UExEMpitOFGsWvB2dP'

# Load the dataset into a pandas DataFrame
df = pd.read_csv(url)

# Display the first few rows to understand the data
df.head()

# List all columns
df.columns


Index(['car_ID', 'symboling', 'CarName', 'fueltype', 'aspiration',
       'doornumber', 'carbody', 'drivewheel', 'enginelocation', 'wheelbase',
       'carlength', 'carwidth', 'carheight', 'curbweight', 'enginetype',
       'cylindernumber', 'enginesize', 'fuelsystem', 'boreratio', 'stroke',
       'compressionratio', 'horsepower', 'peakrpm', 'citympg', 'highwaympg',
       'price'],
      dtype='object')

In [21]:
#preprocessing
# Define features (X) and target (y)
X = df.drop(columns=['price'])
y = df['price']

# Identify categorical and numerical columns
categorical_columns = X.select_dtypes(include=['object']).columns
numerical_columns = X.select_dtypes(exclude=['object']).columns

# Define preprocessors
# Categorical features: OneHotEncoding
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Handle missing categorical values
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Numerical features: Imputation + Scaling
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Handle missing numerical values
    ('scaler', StandardScaler())  # Feature scaling
])

# Create a ColumnTransformer to apply transformations to appropriate columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_columns),
        ('cat', categorical_transformer, categorical_columns)
    ]
)

# Apply transformations and split the data into training and testing sets
X_preprocessed = preprocessor.fit_transform(X)

# Split into train and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)

In [25]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Define features (X) and target (y)
X = df.drop(columns=['price'])
y = df['price']

# Identifying categorical and numerical columns
categorical_columns = X.select_dtypes(include=['object']).columns
numerical_columns = X.select_dtypes(exclude=['object']).columns

# Defining preprocessors
# Categorical features: OneHotEncoding with sparse_output=False
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Handle missing categorical values
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))  # Dense matrix output
])

# Numerical features: Imputation + Scaling
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Handle missing numerical values
    ('scaler', StandardScaler())  # Feature scaling
])

# Creating a ColumnTransformer to apply transformations to appropriate columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_columns),
        ('cat', categorical_transformer, categorical_columns)
    ]
)

# Splitting the data into train and test sets (80% train, 20% test)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initializing models
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(),
    'Gradient Boosting': GradientBoostingRegressor(),
    'Support Vector Regressor': SVR()
}

# Store evaluation metrics
model_metrics = {}

# Train and evaluate each model
for name, model in models.items():
    # Create a pipeline with the preprocessor and the model
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
    
    # Train the model
    pipeline.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = pipeline.predict(X_test)
    
    # Evaluate performance
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    
    # Store metrics
    model_metrics[name] = {'R2': r2, 'MSE': mse, 'MAE': mae}

# Display the evaluation metrics for each model
pd.DataFrame(model_metrics)

Unnamed: 0,Linear Regression,Decision Tree,Random Forest,Gradient Boosting,Support Vector Regressor
R2,-1.261189,0.8482671,0.9548691,0.9347586,-0.09986409
MSE,178507400.0,11978400.0,3562814.0,5150417.0,86827690.0
MAE,7036.823,2165.516,1335.531,1612.445,5695.713


In [None]:
Explanation:
After modifying the OneHotEncoder with sparse_output=False, the transformed data will be a dense matrix instead of a sparse matrix. Dense matrices retain the proper column structure for subsequent steps.
The rest of the code for training and evaluating models remains the same.

In [29]:
# Feature Importance Analysiis
for name, model in models.items():
    if isinstance(model, (DecisionTreeRegressor, RandomForestRegressor, GradientBoostingRegressor)):
        pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
        pipeline.fit(X_train, y_train)
        
        # Extracting feature importances
        importance = pipeline.named_steps['model'].feature_importances_
        
        # Ensuring the number of features in importance matches the number of feature names
        feature_names = numerical_columns.tolist() + categorical_columns.tolist()
        
        # If preprocessing changes the number of features (like OneHotEncoding), adjusting feature_names
        transformed_X = pipeline.named_steps['preprocessor'].transform(X_train)
        transformed_feature_names = numerical_columns.tolist() + list(pipeline.named_steps['preprocessor'].transformers_[1][1].get_feature_names_out(categorical_columns))  # This assumes you have one categorical transformer
        
        # Debugging: Checking the lengths of the arrays
        print(f"Number of features in importance: {len(importance)}")
        print(f"Number of feature names after transformation: {len(transformed_feature_names)}")
        
        # Checking if the lengths match
        if len(importance) == len(transformed_feature_names):
            # Creating DataFrame if lengths match
            importance_df = pd.DataFrame({'Feature': transformed_feature_names, 'Importance': importance})
            importance_df = importance_df.sort_values(by='Importance', ascending=False)
            print(f"Feature importance for {name}:")
            print(importance_df.head())
        else:
            print(f"Length mismatch for {name}: importance length = {len(importance)}, feature names length = {len(transformed_feature_names)}")


Number of features in importance: 176
Number of feature names after transformation: 176
Feature importance for Decision Tree:
       Feature  Importance
7   enginesize    0.648274
6   curbweight    0.262232
0       car_ID    0.018296
14  highwaympg    0.015944
4     carwidth    0.009474
Number of features in importance: 176
Number of feature names after transformation: 176
Feature importance for Random Forest:
       Feature  Importance
7   enginesize    0.633281
6   curbweight    0.244027
14  highwaympg    0.029046
0       car_ID    0.018986
11  horsepower    0.018564
Number of features in importance: 176
Number of feature names after transformation: 176
Feature importance for Gradient Boosting:
       Feature  Importance
7   enginesize    0.596375
6   curbweight    0.157593
11  horsepower    0.074358
14  highwaympg    0.061859
0       car_ID    0.019752


In [31]:
# Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
import numpy as np
# Defining parameter grids for each model
param_grid_rf = {
    'model__n_estimators': [100, 200, 300],
    'model__max_depth': [None, 10, 20, 30],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4]
}

param_grid_gb = {
    'model__n_estimators': [100, 200],
    'model__learning_rate': [0.01, 0.1, 0.2],
    'model__max_depth': [3, 5, 7],
    'model__subsample': [0.8, 1.0]
}

param_grid_dt = {
    'model__max_depth': [None, 10, 20, 30],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4]
}
# Creating a dictionary of models
models = {
    'Random Forest': RandomForestRegressor(),
    'Gradient Boosting': GradientBoostingRegressor(),
    'Decision Tree': DecisionTreeRegressor()
}

# Listing of parameter grids for the models
param_grids = {
    'Random Forest': param_grid_rf,
    'Gradient Boosting': param_grid_gb,
    'Decision Tree': param_grid_dt
}

# Initializing the best models dictionary
best_models = {}

# Hyperparameter tuning
for name, model in models.items():
    # Create pipeline
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
    
    # Get the parameter grid
    param_grid = param_grids[name]
    
    # Perform GridSearchCV
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, scoring='neg_mean_squared_error', verbose=1)
    
    # Fit the grid search
    grid_search.fit(X_train, y_train)
    
    # Get the best model from GridSearchCV
    best_models[name] = grid_search.best_estimator_
    
    # Evaluate the performance on the test set
    y_pred = best_models[name].predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    
    print(f"Best model for {name}: {grid_search.best_params_}")
    print(f"Performance (MSE) for {name} after tuning: {mse}")


Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best model for Random Forest: {'model__max_depth': 20, 'model__min_samples_leaf': 2, 'model__min_samples_split': 2, 'model__n_estimators': 200}
Performance (MSE) for Random Forest after tuning: 3525656.302828681
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best model for Gradient Boosting: {'model__learning_rate': 0.2, 'model__max_depth': 3, 'model__n_estimators': 200, 'model__subsample': 0.8}
Performance (MSE) for Gradient Boosting after tuning: 4334643.537520105
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best model for Decision Tree: {'model__max_depth': 10, 'model__min_samples_leaf': 2, 'model__min_samples_split': 5}
Performance (MSE) for Decision Tree after tuning: 7123153.965179465


In [32]:
# comparing the performance
# Original models
original_models = {
    'Random Forest': RandomForestRegressor(),
    'Gradient Boosting': GradientBoostingRegressor(),
    'Decision Tree': DecisionTreeRegressor()
}

# Initial performance before tuning
for name, model in original_models.items():
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(f"Original MSE for {name}: {mse}")
    
# Compare the results of hyperparameter tuning
for name, model in best_models.items():
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(f"Tuned MSE for {name}: {mse}")

Original MSE for Random Forest: 3408423.3692637556
Original MSE for Gradient Boosting: 5151606.228910859
Original MSE for Decision Tree: 9461232.721997293
Tuned MSE for Random Forest: 3525656.302828681
Tuned MSE for Gradient Boosting: 4334643.537520105
Tuned MSE for Decision Tree: 7123153.965179465


In [None]:
Summary:
For, Hyperparameter Tuning, we used GridSearchCV to perform an exhaustive search over the hyperparameters. 
Cross-Validation, cv=5 in GridSearchCV ensures 5-fold cross-validation, which helps avoid overfitting and ensures robust model selection.
After tuning, the performance (MSE) of the models is compared to their performance before tuning to check if the tuning improved the model.
Evaluation:
The model's performance is measured by Mean Squared Error (MSE). since the tuned models have a lower MSE compared to the original models, the hyperparameter tuning was successful in improving performance.