In [42]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, HuberRegressor, TheilSenRegressor, RANSACRegressor
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import root_mean_squared_error as rmse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import median_absolute_error as mdn
from sklearn.metrics import r2_score as r2
from manual_metrics import adjusted_r2_score as adr2
from sklearn.metrics import mean_absolute_percentage_error as mape
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV, RandomizedSearchCV
from sklearn.datasets import fetch_california_housing
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import r2_score

## Ensemble Methods in Machine Learning

In [43]:
cars_df = pd.read_csv('car_prices_new.csv', index_col=0)

In [44]:
cars_df.shape

(54273, 18)

In [45]:
cars_df.select_dtypes('number').corr().style.background_gradient()

Unnamed: 0,model_year,milage,fuel_type,ext_col,int_col,accident,price,milage_per_year,horse_power,Car_Age,brand_monotonic,Automatic,Manual
model_year,1.0,-0.625226,-0.044165,-0.023136,-0.018272,0.216034,0.21615,0.17891,0.358856,-1.0,0.140151,0.091554,-0.091554
milage,-0.625226,1.0,-0.084541,0.01243,0.002399,-0.295166,-0.248927,0.431499,-0.38881,0.625226,-0.211792,0.052443,-0.052443
fuel_type,-0.044165,-0.084541,1.0,-0.017347,0.015191,0.03195,0.001832,-0.082575,-0.032243,0.044165,-0.024212,-0.066529,0.066529
ext_col,-0.023136,0.01243,-0.017347,1.0,0.072006,-0.008808,-0.009888,-0.001658,-0.041941,0.023136,-0.000421,0.005536,-0.005536
int_col,-0.018272,0.002399,0.015191,0.072006,1.0,-0.019442,0.021952,-0.009901,0.007636,0.018272,0.007511,0.030862,-0.030862
accident,0.216034,-0.295166,0.03195,-0.008808,-0.019442,1.0,0.114705,-0.070513,0.184151,-0.216034,0.10252,-0.025829,0.025829
price,0.21615,-0.248927,0.001832,-0.009888,0.021952,0.114705,1.0,-0.060656,0.245079,-0.21615,0.226354,-0.004036,0.004036
milage_per_year,0.17891,0.431499,-0.082575,-0.001658,-0.009901,-0.070513,-0.060656,1.0,-0.09245,-0.17891,-0.094845,0.107342,-0.107342
horse_power,0.358856,-0.38881,-0.032243,-0.041941,0.007636,0.184151,0.245079,-0.09245,1.0,-0.358856,0.405416,-0.052207,0.052207
Car_Age,-1.0,0.625226,0.044165,0.023136,0.018272,-0.216034,-0.21615,-0.17891,-0.358856,1.0,-0.140151,-0.091554,0.091554


### Datasetimizda infinitive qiymatlar bor ekan shularni nan qiymatga o'tkazib so'ngra tashlab yuboramiz

In [46]:
cars_df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Drop rows with NaN values
cars_df.dropna(inplace=True)

In [47]:
cars_df.shape

(54231, 18)

### Linear regression orqali metricslarni chiqarib olamiz

In [48]:
X = cars_df.select_dtypes('number').drop('price', axis = 1)
y = cars_df[['price']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model_linear = LinearRegression()
model_linear.fit(X_train, y_train)

y_pred = model_linear.predict(X_test)

print('RMSE for linear:', rmse(y_pred, y_test))
print('MSE for linear:', mse(y_pred, y_test))
print('MAE for linear:', mae(y_pred, y_test))
print('MAPE for linear:', mape(y_pred, y_test))
print('R2 for linear:', r2(y_pred, y_test))
n = X_test.shape[0]
p = X_test.shape[1]
print('Adjusted R2 for linear:', adr2(y_pred, y_test, n, p))


RMSE for linear: 73522.55985397419
MSE for linear: 5405566807.481217
MAE for linear: 19631.72945345156
MAPE for linear: 1.2153371101556256
R2 for linear: -7.925515562196514
Adjusted R2 for linear: -7.935401678750543


## Blending (Ensemble Learning)

In [52]:
def adjusted_r2_score(y_true, y_pred, n, p):
    r2_value = r2(y_true, y_pred)
    if n == p + 1:
        return np.nan
    adjusted_r2 = 1 - ((1 - r2_value) * (n - 1)) / (n - p - 1)
    return adjusted_r2
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the base models
models = [
    ('linear', LinearRegression()),
    ('ridge', Ridge(alpha=1)),
    ('lasso', Lasso(alpha=0.1)),
    ('huber', HuberRegressor()),
    ('ransac', RANSACRegressor()),
    ('theil', TheilSenRegressor())
]

# Train models and collect predictions
predictions = []
for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    predictions.append(y_pred)

# Average predictions
y_pred_avg = np.mean(predictions, axis=0)

# Evaluate the averaged predictions
rmse_avg = np.sqrt(mse(y_test, y_pred_avg))
mae_avg = mae(y_test, y_pred_avg)
median_ae_avg = mdn(y_test, y_pred_avg)
mape_avg = np.mean(np.abs((y_test - y_pred_avg) / y_test)) * 100
r2_avg = r2(y_test, y_pred_avg)
n = X_test.shape[0]
p = X_test.shape[1]
r2_adjusted_avg = adjusted_r2_score(y_test, y_pred_avg, n, p)

# Store results for comparison
results_avg = {
    'Model': 'Averaging',
    'RMSE': rmse_avg,
    'MAE': mae_avg,
    'MedianAE': median_ae_avg,
    'MAPE': mape_avg,
    'R2': r2_avg,
    'Adjusted R2': r2_adjusted_avg
}

# Compare with individual models
results_individual = []

for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    rmse = np.sqrt(mse(y_test, y_pred))
    mae_val = mae(y_test, y_pred)
    median_ae = mdn(y_test, y_pred)
    mape_val = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
    r2_val = r2(y_test, y_pred)
    r2_adjusted_val = adjusted_r2_score(y_test, y_pred, n, p)

    results_individual.append({
        'Model': name,
        'RMSE': rmse,
        'MAE': mae_val,
        'MedianAE': median_ae,
        'MAPE': mape_val,
        'R2': r2_val,
        'Adjusted R2': r2_adjusted_val
    })

# Add averaging results to individual results
results_individual.append(results_avg)

# Create a DataFrame to display results
results_df = pd.DataFrame(results_individual)
results_df_sorted = results_df.sort_values(by='MAPE', ascending=True)
results_df_sorted

Unnamed: 0,Model,RMSE,MAE,MedianAE,MAPE,R2,Adjusted R2
0,linear,1.428571,1.428571,1.428571,35.714286,,
4,ransac,1.428571,1.428571,1.428571,35.714286,,
3,huber,1.428636,1.428636,1.428636,35.715911,,
2,lasso,1.485714,1.485714,1.485714,37.142857,,
6,Averaging,1.543382,1.543382,1.543382,38.58456,,
5,theil,1.664477,1.664477,1.664477,41.611915,,
1,ridge,1.824324,1.824324,1.824324,45.608108,,
