In [114]:
import numpy as np
import seaborn as sns
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import re
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import *
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, median_absolute_error, mean_absolute_percentage_error
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split, RandomizedSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import RFE, RFECV
import plotly.express as px
warnings.filterwarnings('ignore')

In [115]:
df = pd.read_csv('car_prices.csv', index_col = 0)
df.head()

Unnamed: 0_level_0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,Ford,F-150 Lariat,2018,74349,Gasoline,375.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,10-Speed A/T,Blue,Gray,None reported,Yes,11000
1,BMW,335 i,2007,80000,Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,6-Speed M/T,Black,Black,None reported,Yes,8250
2,Jaguar,XF Luxury,2009,91491,Gasoline,300.0HP 4.2L 8 Cylinder Engine Gasoline Fuel,6-Speed A/T,Purple,Beige,None reported,Yes,15000
3,BMW,X7 xDrive40i,2022,2437,Hybrid,335.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,Transmission w/Dual Shift Mode,Gray,Brown,None reported,Yes,63500
4,Pontiac,Firebird Base,2001,111000,Gasoline,200.0HP 3.8L V6 Cylinder Engine Gasoline Fuel,A/T,White,Black,None reported,Yes,7850


### clean_title ustunini tashlab yuboramiz chunki bu ustunda bizga ma'no beradigan jihatlar yo'q

In [116]:
df = df.drop('clean_title', axis = 1)

### Avtomobillarning yillik bosib o'tgan masofalari uchun ham ustun yaratib olamiz

In [117]:
df['milage_per_year'] = df['milage'] // (2024 - df['model_year'])

### milage va price uchun qoshimcha ustunlar yaratib olamiz (bins)

In [118]:
milage_bins = [df['milage'].min(), df['milage'].quantile(1/3), df['milage'].quantile(2/3), df['milage'].max()]
price_bins = [df['price'].min(), df['price'].quantile(1/3), df['price'].quantile(2/3), df['price'].max()]

bin_labels = ['Low', 'Medium', 'High']

df['milage_bins'] = pd.cut(df['milage'], bins = milage_bins, labels=bin_labels, include_lowest=True)

In [119]:
df.head()

Unnamed: 0_level_0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,price,milage_per_year,milage_bins
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,Ford,F-150 Lariat,2018,74349,Gasoline,375.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,10-Speed A/T,Blue,Gray,None reported,11000,12391.0,Medium
1,BMW,335 i,2007,80000,Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,6-Speed M/T,Black,Black,None reported,8250,4705.0,Medium
2,Jaguar,XF Luxury,2009,91491,Gasoline,300.0HP 4.2L 8 Cylinder Engine Gasoline Fuel,6-Speed A/T,Purple,Beige,None reported,15000,6099.0,High
3,BMW,X7 xDrive40i,2022,2437,Hybrid,335.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,Transmission w/Dual Shift Mode,Gray,Brown,None reported,63500,1218.0,Low
4,Pontiac,Firebird Base,2001,111000,Gasoline,200.0HP 3.8L V6 Cylinder Engine Gasoline Fuel,A/T,White,Black,None reported,7850,4826.0,High


### HP (Horse-Power) ustunini yasab olamiz

In [120]:
df['horse_power'] = df['engine'].apply(lambda x: int(x.split('.0HP')[0]) if x.split('.0HP')[0].isdigit() else np.nan)
df['horse_power'].isna().sum()

np.int64(4057)

In [121]:
mini2 = df.sample(1000)
mini2.to_csv('cars2.csv')

In [122]:
df['Car_Age'] = 2024 - df['model_year']

### Monotonic usul bilan brand ustunini numerical qilamiz

In [123]:
brand_monotonic = df.groupby('brand')['price'].mean()

df['brand_monotonic'] = df['brand'].map(brand_monotonic)

In [124]:
df.head(3)

Unnamed: 0_level_0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,price,milage_per_year,milage_bins,horse_power,Car_Age,brand_monotonic
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,Ford,F-150 Lariat,2018,74349,Gasoline,375.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,10-Speed A/T,Blue,Gray,None reported,11000,12391.0,Medium,375.0,6,38154.063227
1,BMW,335 i,2007,80000,Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,6-Speed M/T,Black,Black,None reported,8250,4705.0,Medium,300.0,17,40276.029448
2,Jaguar,XF Luxury,2009,91491,Gasoline,300.0HP 4.2L 8 Cylinder Engine Gasoline Fuel,6-Speed A/T,Purple,Beige,None reported,15000,6099.0,High,300.0,15,34840.403933


In [125]:
for i in df['transmission'].unique():
    print(i)

10-Speed A/T
6-Speed M/T
6-Speed A/T
Transmission w/Dual Shift Mode
A/T
5-Speed M/T
7-Speed A/T
5-Speed A/T
8-Speed A/T
Transmission Overdrive Switch
9-Speed Automatic
7-Speed M/T
10-Speed Automatic
6-Speed Automatic
M/T
5-Speed Automatic
CVT Transmission
9-Speed A/T
8-Speed Automatic
4-Speed A/T
Automatic
1-Speed A/T
8-Speed Automatic with Auto-Shift
7-Speed DCT Automatic
Automatic CVT
7-Speed Automatic
7-Speed Automatic with Auto-Shift
4-Speed Automatic
6-Speed Automatic with Auto-Shift
6-Speed Manual
7-Speed Manual
6-Speed Electronically Controlled Automatic with O
1-Speed Automatic
10-Speed Automatic with Overdrive
8-Speed Manual
2-Speed A/T
CVT-F
–
F
9-Speed Automatic with Auto-Shift
7-Speed
Variable
SCHEDULED FOR OR IN PRODUCTION
6-Speed
6 Speed At/Mt
6 Speed Mt


In [126]:
automatic = df['transmission'].str.contains('Automatic|A/T')
df['Automatic'] = automatic
df['Manual'] = ~automatic

In [127]:
df['horse_power'] = df['horse_power'].fillna(df.groupby('brand')['horse_power'].transform('median'))

In [128]:
df['horse_power'].isna().sum()

np.int64(1)

In [129]:
df['brand'].isna().sum()

np.int64(0)

In [130]:
global_median = df['horse_power'].median()
df['horse_power'] = df['horse_power'].fillna(global_median)

In [131]:
print(df['horse_power'].isna().sum())

0


In [132]:
df.head()

Unnamed: 0_level_0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,price,milage_per_year,milage_bins,horse_power,Car_Age,brand_monotonic,Automatic,Manual
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,Ford,F-150 Lariat,2018,74349,Gasoline,375.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,10-Speed A/T,Blue,Gray,None reported,11000,12391.0,Medium,375.0,6,38154.063227,True,False
1,BMW,335 i,2007,80000,Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,6-Speed M/T,Black,Black,None reported,8250,4705.0,Medium,300.0,17,40276.029448,False,True
2,Jaguar,XF Luxury,2009,91491,Gasoline,300.0HP 4.2L 8 Cylinder Engine Gasoline Fuel,6-Speed A/T,Purple,Beige,None reported,15000,6099.0,High,300.0,15,34840.403933,True,False
3,BMW,X7 xDrive40i,2022,2437,Hybrid,335.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,Transmission w/Dual Shift Mode,Gray,Brown,None reported,63500,1218.0,Low,335.0,2,40276.029448,False,True
4,Pontiac,Firebird Base,2001,111000,Gasoline,200.0HP 3.8L V6 Cylinder Engine Gasoline Fuel,A/T,White,Black,None reported,7850,4826.0,High,200.0,23,17526.060403,True,False


In [133]:
cols_for_encode = ['fuel_type', 'ext_col', 'int_col', 'accident', 'Automatic', 'Manual']
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

for col in df[cols_for_encode]:
    df[col] = label_encoder.fit_transform(df[col])

In [134]:
df.select_dtypes('number')

Unnamed: 0_level_0,model_year,milage,fuel_type,ext_col,int_col,accident,price,milage_per_year,horse_power,Car_Age,brand_monotonic,Automatic,Manual
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,2018,74349,2,26,57,1,11000,12391.0,375.0,6,38154.063227,1,0
1,2007,80000,2,17,9,1,8250,4705.0,300.0,17,40276.029448,0,1
2,2009,91491,2,181,6,1,15000,6099.0,300.0,15,34840.403933,1,0
3,2022,2437,3,100,24,1,63500,1218.0,335.0,2,40276.029448,0,1
4,2001,111000,2,249,9,1,7850,4826.0,200.0,23,17526.060403,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
54268,2017,29000,2,249,24,1,29000,4142.0,445.0,7,40276.029448,1,0
54269,2015,94634,1,17,9,0,6500,10514.0,220.0,9,37091.368241,1,0
54270,2013,40989,2,249,9,0,18950,3726.0,420.0,11,63742.154930,0,1
54271,2023,1518,2,12,24,1,194965,1518.0,375.0,1,63742.154930,1,0


### Blending with RandomizedSearchCV

In [138]:
cols = ['model_year', 'milage', 'accident', 'price', 'horse_power', 'Car_Age']
X = df[cols]
y = df['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Define models
models = {
    'linear': LinearRegression(),
    'ridge': Ridge(),
    'lasso': Lasso(),
    'huber': HuberRegressor(),
    'theil': TheilSenRegressor(),
    'ransac': RANSACRegressor()
}

# Define parameter grids for each model
param_grids = {
    'ridge': {'alpha': [0.1, 1.0, 10.0]},
    'lasso': {'alpha': [0.01, 0.1, 1.0]},
    'huber': {'epsilon': [1.35, 1.5, 1.75]},
    'ransac': {'min_samples': [0.5, 0.75, 1.0]},
    # TheilSenRegressor and LinearRegression have no hyperparameters to tune in this basic setup
}
best_models = {}
for name, model in models.items():
    if name in param_grids:  # Only run RandomizedSearchCV if there are hyperparameters to tune
        random_search = RandomizedSearchCV(model, param_grids[name], n_iter=10, scoring='neg_mean_absolute_error', cv=5, random_state=42)
        random_search.fit(X_train, y_train)
        best_models[name] = random_search.best_estimator_
    else:
        model.fit(X_train, y_train)
        best_models[name] = model  # Use the default model when no hyperparameters are specified
predictions = []

for name, model in best_models.items():
    y_pred = model.predict(X_test)
    predictions.append(y_pred)

# Blending: Averaging the predictions from all the best models
y_pred_avg = np.mean(predictions, axis=0)

# Calculate metrics for the blended model
rmse_avg = np.sqrt(mean_squared_error(y_test, y_pred_avg))
mae_avg = mean_absolute_error(y_test, y_pred_avg)
median_ae_avg = median_absolute_error(y_test, y_pred_avg)
mape_avg = mean_absolute_percentage_error(y_test, y_pred_avg)
r2_avg = r2_score(y_test, y_pred_avg)

# Store the blended model's metrics
results_avg = {
    'Model': 'Blended (Averaging)',
    'RMSE': rmse_avg,
    'MAE': mae_avg,
    'MedianAE': median_ae_avg,
    'MAPE': mape_avg,
    'R2': r2_avg
}

results_individual = []

for name, model in best_models.items():
    y_pred = model.predict(X_test)
    
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    median_ae = median_absolute_error(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    results_individual.append({
        'Model': name,
        'RMSE': rmse,
        'MAE': mae,
        'MedianAE': median_ae,
        'MAPE': mape,
        'R2': r2
    })

# Add the blended model's results
results_individual.append(results_avg)

# Create a DataFrame and sort it by MAE
results_df = pd.DataFrame(results_individual)
results_df_sorted = results_df.sort_values(by='MAE', ascending=True)
results_df_sorted


Unnamed: 0,Model,RMSE,MAE,MedianAE,MAPE,R2
4,theil,2.294732e-11,1.296782e-11,7.275958e-12,3.432268e-16,1.0
1,ridge,2.542163e-11,1.434863e-11,1.455192e-11,6.194302e-16,1.0
0,linear,3.309607e-11,2.768775e-11,2.546585e-11,1.569964e-15,1.0
5,ransac,8.456706e-11,3.167459e-11,1.455192e-11,7.518075e-16,1.0
3,huber,0.0002807745,0.0002210845,0.0001836427,1.10016e-08,1.0
6,Blended (Averaging),0.08228995,0.06283105,0.0487772,3.621058e-06,1.0
2,lasso,0.4939239,0.3771323,0.2926257,2.173284e-05,1.0


### Stacking with RandomizedSearchCV (without passthrough)

In [140]:
from sklearn.ensemble import StackingRegressor
# Define the pipelines
linear_pipeline = Pipeline([('model', LinearRegression())])
ridge_pipeline = Pipeline([('model', Ridge())])
lasso_pipeline = Pipeline([('model', Lasso())])
huber_pipeline = Pipeline([('model', HuberRegressor())])
ransac_pipeline = Pipeline([('model', RANSACRegressor())])

# Define the parameter grids for RandomizedSearchCV
param_grids = {
    'ridge__model__alpha': [0.1, 1.0, 10.0],
    'lasso__model__alpha': [0.01, 0.1, 1.0],
    'huber__model__epsilon': [1.35, 1.5, 1.75],
    'ransac__model__min_samples': [0.5, 0.75, 1.0],
}

# Define the meta-model
meta_model = LinearRegression()

# Create the stacking regressor
stacking_regressor = StackingRegressor(
    estimators=[
        ('linear', linear_pipeline),
        ('ridge', ridge_pipeline),
        ('lasso', lasso_pipeline),
        ('huber', huber_pipeline),
        ('ransac', ransac_pipeline)
    ],
    final_estimator=meta_model,
    cv=5
)

# Set up the RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=stacking_regressor,
    param_distributions=param_grids,
    n_iter=10,
    scoring='neg_mean_absolute_error',
    cv=5,
    random_state=42,
    n_jobs=-1
)

# Fit the RandomizedSearchCV
random_search.fit(X_train, y_train)

# Get the best estimator
best_stacking_regressor = random_search.best_estimator_

# Predict using the best stacking model
y_pred_stack = best_stacking_regressor.predict(X_test)

# Evaluate the stacked predictions
rmse_stack = np.sqrt(mean_squared_error(y_test, y_pred_stack))
mae_stack = mean_absolute_error(y_test, y_pred_stack)
median_ae_stack = median_absolute_error(y_test, y_pred_stack)
mape_stack = mean_absolute_percentage_error(y_test, y_pred_stack)
r2_stack = r2_score(y_test, y_pred_stack)

# Store results for comparison
results_stack = {
    'Model': 'Stacking (Tuned)',
    'RMSE': rmse_stack,
    'MAE': mae_stack,
    'MedianAE': median_ae_stack,
    'MAPE': mape_stack,
    'R2': r2_stack
}

# Display the results
results_df = pd.DataFrame([results_stack])
results_df


Unnamed: 0,Model,RMSE,MAE,MedianAE,MAPE,R2
0,Stacking (Tuned),5.851724e-11,2.543847e-11,1.818989e-11,1.175079e-15,1.0


###