In [11]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Step 1: Load and Preprocess the Data
# Read the car-price dataset; the path is relative to the working directory.
file_path = 'CarPrice_Assignment (1).csv'
car_data = pd.read_csv(file_path)

# Separate features and target variable.
# 'price' is the regression target; 'car_ID' and 'CarName' are left out of the
# feature matrix.
X = car_data.drop(columns=['price', 'car_ID', 'CarName'])
y = car_data['price']

# Identify numerical and categorical columns.
# Every numeric dtype counts as numerical and every remaining column as
# categorical, so each feature is routed to exactly one transformer. The
# ColumnTransformer below keeps its default remainder='drop', which would
# silently discard any column that appears in neither list.
numerical_cols = X.select_dtypes(include='number').columns
categorical_cols = X.select_dtypes(exclude='number').columns

# Preprocessing for numerical data: Standardization
numerical_transformer = StandardScaler()

# Preprocessing for categorical data: One-hot encoding.
# handle_unknown='ignore' keeps transform() from raising on categories that
# were not present when the encoder was fitted.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Combine preprocessors. The output columns follow the order given here:
# the 'num' block first, then the 'cat' block.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ]
)

# Split data into training and test sets (80% / 20%, fixed seed for
# reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Apply preprocessing: fit on the training split only, then reuse the fitted
# transformers on the test split.
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Step 2: Model Implementation and Evaluation
# Candidate regressors, keyed by the label used in the results table.
# NOTE(review): SVR runs with its default hyperparameters on the unscaled
# price target; consider tuning or target scaling if its scores look poor.
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree Regressor": DecisionTreeRegressor(random_state=42),
    "Random Forest Regressor": RandomForestRegressor(random_state=42),
    "Gradient Boosting Regressor": GradientBoostingRegressor(random_state=42),
    "Support Vector Regressor": SVR(),
}

# Metric label -> scoring function, each called as score_fn(y_true, y_pred).
metric_functions = {
    "MSE": mean_squared_error,
    "MAE": mean_absolute_error,
    "R-squared": r2_score,
}

# Fit every model on the training split and score its test-split predictions.
model_results = {}
for label, regressor in models.items():
    predictions = regressor.fit(X_train_transformed, y_train).predict(
        X_test_transformed
    )
    model_results[label] = {
        metric: score_fn(y_test, predictions)
        for metric, score_fn in metric_functions.items()
    }

# One row per model, one column per metric.
results_df = pd.DataFrame(model_results).transpose()
print("Model Evaluation Results:\n", results_df)

# Step 3: Feature Importance (Random Forest as an example)
# Reuse the Random Forest that Step 2 already fitted on the training split
# rather than training an identical RandomForestRegressor(random_state=42)
# on the same data a second time.
best_model = models["Random Forest Regressor"]

# Get feature names after preprocessing. The ColumnTransformer emits the
# 'num' columns first and the one-hot 'cat' columns second, so the names are
# concatenated in that same order to line up with the importances.
encoded_cat_names = preprocessor.named_transformers_['cat'].get_feature_names_out(
    categorical_cols
)
feature_names = numerical_cols.tolist() + list(encoded_cat_names)

# Feature importance from the best model, most important feature first.
importances = best_model.feature_importances_
feature_importance = pd.DataFrame(
    {'Feature': feature_names, 'Importance': importances}
).sort_values(by='Importance', ascending=False)
print("\nFeature Importance:\n", feature_importance)

# Step 4: Hyperparameter Tuning Example (Random Forest)
from sklearn.model_selection import GridSearchCV

# Candidate values; the search tries every combination (2 * 3 * 2 * 2 = 24).
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# 3-fold cross-validated grid search on the training split, ranked by
# R-squared; n_jobs=-1 runs the fits in parallel on all available cores.
# NOTE(review): the features were preprocessed before the search, so the
# scaler and encoder were fitted on the whole training split rather than
# inside each CV fold.
grid_search = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, cv=3, scoring='r2', n_jobs=-1)
grid_search.fit(X_train_transformed, y_train)

print("\nBest Hyperparameters:\n", grid_search.best_params_)
# best_score_ is the mean cross-validated R-squared of the best combination,
# measured on folds of the training split only.
print("Best R-squared after tuning: ", grid_search.best_score_)

# Score the tuned model on the held-out test split as well, so it can be
# compared with the test-set metrics reported in Step 2. best_estimator_ is
# available because GridSearchCV refits the best combination on the full
# training data by default (refit=True).
tuned_test_r2 = r2_score(y_test, grid_search.best_estimator_.predict(X_test_transformed))
print("Test R-squared of tuned model: ", tuned_test_r2)


Model Evaluation Results:
                                       MSE          MAE  R-squared
Linear Regression            1.006731e+07  2244.601726   0.872475
Decision Tree Regressor      8.223687e+06  1847.434951   0.895829
Random Forest Regressor      3.337152e+06  1276.398752   0.957728
Gradient Boosting Regressor  5.786643e+06  1666.400413   0.926699
Support Vector Regressor     8.681186e+07  5694.471715  -0.099663

Feature Importance:
                   Feature  Importance
6              enginesize    0.552162
5              curbweight    0.294245
13             highwaympg    0.045325
10             horsepower    0.031434
3                carwidth    0.014010
2               carlength    0.008621
1               wheelbase    0.007589
11                peakrpm    0.006911
12                citympg    0.006624
8                  stroke    0.004688
7               boreratio    0.004293
9        compressionratio    0.004204
4               carheight    0.003482
48        fuelsystem_mp