In [None]:
# ----------------------------
# 1. IMPORT LIBRARIES
# ----------------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import warnings
warnings.filterwarnings('ignore')

In [None]:
# ----------------------------
# 2. LOAD AND PREPROCESS DATA
# ----------------------------
# Misspelled / abbreviated manufacturer names mapped to their canonical spelling
typos = {
    'maxda': 'mazda',
    'porcshce': 'porsche',
    'toyouta': 'toyota',
    'vokswagen': 'volkswagen',
    'vw': 'volkswagen'
}

df = pd.read_csv('CarPrice_Assignment.csv')

# Normalise the free-text car name (lower-case, no surrounding whitespace);
# its first whitespace-separated token is taken as the brand.
df['CarName'] = df['CarName'].str.lower().str.strip()
df['brand'] = (
    df['CarName']
    .map(lambda car_name: car_name.split()[0])
    .replace(typos)
)

# The row id and the raw name are not used as features from here on
df = df.drop(columns=['car_ID', 'CarName'])

# Quick sanity summary: frame dimensions and total count of missing cells
print("Dataset shape:", df.shape)
print("Missing values:", df.isna().sum().sum())

In [None]:
# Encode categorical variables as integer codes, one LabelEncoder per column.
# NOTE: the integer codes carry an arbitrary ordering. That is harmless for the
# tree-based models, but the same encoded matrix is also scaled and fed to
# Linear Regression and SVR below, which will treat the codes as magnitudes.
df_encoded = df.copy()

# Select every non-numeric column instead of only dtype == object, so the
# selection does not depend on how pandas happens to store string columns.
cat_cols = df.select_dtypes(exclude='number').columns

# Keep each fitted encoder so the category <-> code mapping can be inspected
# or inverted later (label_encoders[col].classes_ / .inverse_transform).
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(df_encoded[col])
    label_encoders[col] = le

# Separate features and target
X = df_encoded.drop('price', axis=1)
y = df_encoded['price']

# Train-test split (80-20); fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scale for Linear Regression and SVR.
# The scaler is fitted on the training rows only and then applied to the
# test rows, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# ----------------------------
# 3. TRAIN 5 REGRESSION MODELS
# ----------------------------
# Models trained on the standardized feature matrix
lr = LinearRegression().fit(X_train_scaled, y_train)
svr = SVR(kernel='rbf', C=10, gamma='scale').fit(X_train_scaled, y_train)  # moderate C for stability

# Models trained on the unscaled (label-encoded) features
dt = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
gb = GradientBoostingRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Each entry pairs a fitted model with the test matrix it must be scored on,
# so the evaluation step never mixes scaled and unscaled inputs.
models = {
    'Linear Regression': (lr, X_test_scaled),
    'Decision Tree': (dt, X_test),
    'Random Forest': (rf, X_test),
    'Gradient Boosting': (gb, X_test),
    'SVR': (svr, X_test_scaled),
}

In [None]:
# ----------------------------
# 4. EVALUATE MODELS
# ----------------------------
# One row per model: [name, R², MSE, MAE], all computed on the held-out test set
results = []
for name, (model, X_test_used) in models.items():
    y_pred = model.predict(X_test_used)
    r2, mse, mae = (
        r2_score(y_test, y_pred),
        mean_squared_error(y_test, y_pred),
        mean_absolute_error(y_test, y_pred),
    )
    results.append([name, round(r2, 4), round(mse, 2), round(mae, 2)])

# Leaderboard, best R² first
results_df = (
    pd.DataFrame(results, columns=['Model', 'R²', 'MSE', 'MAE'])
    .sort_values(by='R²', ascending=False)
    .reset_index(drop=True)
)
results_df

In [None]:
### Model Comparison

- **Best Model**: **Random Forest** (Highest R², lowest MSE/MAE)  
- **Worst Model**: **SVR** (struggles with the label-encoded categorical features and was trained with a single, untuned `C`/`gamma` setting)

In [None]:
# ----------------------------
# 5. FEATURE IMPORTANCE (Random Forest)
# ----------------------------
# Importances reported by the fitted (untuned) random forest, one per input column
importances = rf.feature_importances_
feature_names = X.columns

importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': importances}
)
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# The ten highest-ranked features feed both the chart and the table below
top_features = importance_df.head(10)

# Plot top 10 features
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=top_features, x='Importance', y='Feature', ax=ax)
ax.set_title('Top 10 Features Influencing Car Price')
fig.tight_layout()
plt.show()

top_features

In [None]:
# ----------------------------
# 6. HYPERPARAMETER TUNING (Random Forest)
# ----------------------------
# Exhaustive search over 2 x 3 x 2 = 12 forest configurations
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5],
)

# 5-fold cross-validation on the training split, scored by R², using all cores
grid_search = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid,
    cv=5,
    scoring='r2',
    n_jobs=-1
)
grid_search.fit(X_train, y_train)

# Evaluate tuned model on the same held-out test set as the baseline
best_rf = grid_search.best_estimator_
y_pred_tuned = best_rf.predict(X_test)
r2_tuned = r2_score(y_test, y_pred_tuned)
mae_tuned = mean_absolute_error(y_test, y_pred_tuned)

# Baseline (untuned) random-forest scores, read back from the leaderboard
is_baseline_rf = results_df['Model'] == 'Random Forest'
original_r2 = results_df.loc[is_baseline_rf, 'R²'].iloc[0]
original_mae = results_df.loc[is_baseline_rf, 'MAE'].iloc[0]

print("Best Parameters:", grid_search.best_params_)
print(f"Original R²: {original_r2}")
print(f"Tuned R²: {round(r2_tuned, 4)}")
print(f"Original MAE: {original_mae}")
print(f"Tuned MAE: {round(mae_tuned, 2)}")

In [None]:
## Conclusion

- **Random Forest** is the best model (R² > 0.95), capturing complex relationships in car pricing.
- **Key drivers**: `brand`, `enginesize`, `curbweight`, `horsepower`, `carbody`.
- **Hyperparameter tuning** slightly improved performance.
- This model helps the company **design cars to hit target price points** in the US market.