In [5]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet,TheilSenRegressor,HuberRegressor,RANSACRegressor
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score,median_absolute_error
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split, RandomizedSearchCV
from sklearn.preprocessing import PolynomialFeatures
import warnings
import re
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')


In [6]:
data = pd.read_csv('car_prices.csv')
col = data.columns
index = data.index

col, index

(Index(['id', 'brand', 'model', 'model_year', 'milage', 'fuel_type', 'engine',
        'transmission', 'ext_col', 'int_col', 'accident', 'clean_title',
        'price'],
       dtype='object'),
 RangeIndex(start=0, stop=54273, step=1))

In [7]:
data.head()

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,Ford,F-150 Lariat,2018,74349,Gasoline,375.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,10-Speed A/T,Blue,Gray,None reported,Yes,11000
1,1,BMW,335 i,2007,80000,Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,6-Speed M/T,Black,Black,None reported,Yes,8250
2,2,Jaguar,XF Luxury,2009,91491,Gasoline,300.0HP 4.2L 8 Cylinder Engine Gasoline Fuel,6-Speed A/T,Purple,Beige,None reported,Yes,15000
3,3,BMW,X7 xDrive40i,2022,2437,Hybrid,335.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,Transmission w/Dual Shift Mode,Gray,Brown,None reported,Yes,63500
4,4,Pontiac,Firebird Base,2001,111000,Gasoline,200.0HP 3.8L V6 Cylinder Engine Gasoline Fuel,A/T,White,Black,None reported,Yes,7850


#### Uniqual datalar sonini korib chiqamiz har bir column uchun

In [8]:
for i in col:
    print(f"{i}: {data[i].nunique()}")
    

id: 54273
brand: 53
model: 1827
model_year: 34
milage: 3212
fuel_type: 7
engine: 1061
transmission: 46
ext_col: 260
int_col: 124
accident: 2
clean_title: 1
price: 1481


#### Feature enginering qilamiz


In [9]:
def feature_engine(engine):
    if pd.isna(engine):
        return pd.Series([np.nan, np.nan, np.nan, np.nan, np.nan])

    hp = re.search(r'(\d+\.?\d*)HP', engine)
    liter = re.search(r'(\d+\.?\d*)L', engine)
    motor = re.search(r'(Straight|V)\s*\d*', engine)
    cylinder = re.search(r'(\d+)\s*Cylinder', engine)
    fuel = re.search(r'(Gasoline|Hybrid|Flex|Diesel)', engine)
    
    return pd.Series([
        hp.group(1) if hp else np.nan,
        liter.group(1) if liter else np.nan,
        motor.group(0) if motor else np.nan,
        cylinder.group(1) if cylinder else np.nan,
        fuel.group(1) if fuel else np.nan
    ])




#### NaN va zero ustunlarni 2ta metrica yani mode va mean boyicha toldirib olamiz feature enginering qilgandan keyin 

In [10]:
### Numeric datalani mean boyicha nan va zerolarini toldirib olamiz

def fillna_mean(df, col, columns):
    print(f"Col: {col}, Columns: {columns}, Nan: {df[col].isna().sum()}", end='')
    value = df[df[col].notna()].groupby(by=columns)[col].mean().reset_index()
    df = pd.merge(df, value, on=columns, how='left', suffixes=('', '_mean'))
    df[col] = df[col].fillna(df[col + '_mean'])
    df.drop(columns=[col + '_mean'], inplace=True)
    print(f", After Clean: {df[col].isna().sum()}")
    return df


###  Objective ustunlardagi nan va zero datalarni  modelari boyicha toldirib olamiz

def fillna_mode(df, col, columns):
    print(f"Col: {col}, Columns: {columns}, Nan: {df[col].isna().sum()}", end='')
    mode_values = df.groupby(columns)[col].apply(lambda x: x.mode().iloc[0] if not x.mode().empty else pd.NA).reset_index()
    df = pd.merge(df, mode_values, on=columns, how='left', suffixes=('', '_mode'))
    df[col] = df[col].fillna(df[col + '_mode'])
    df.drop(columns=[col + '_mode'], inplace=True)

    print(f", After Clean: {df[col].isna().sum()}")
    return df

def to_numeric(x):
    try:
        return float(x)
    except ValueError as err:
        print(err)
        return np.nan


#### datalarni price ustuini boyicha sortlab olamiz

In [11]:
def ordinal_encoder(df, column, on):

    avg_prices = df.groupby(column)[on].mean().reset_index()
    avg_prices.columns = [column, 'avg_price']
    avg_prices = avg_prices.sort_values(by='avg_price').reset_index(drop=True)
    avg_prices[column+'_rank'] = avg_prices.index + 1
    return pd.merge(df, avg_prices[[column, column+'_rank']], on=column, how='left')

####  Feature enginering qilamiz engine ustunidagi datalarni

In [12]:
new_columns = ['hp', 'litr', 'motor', 'Cylinder', 'fuel']
data[new_columns] = data['engine'].apply(feature_engine)

In [13]:
data.head()

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price,hp,litr,motor,Cylinder,fuel
0,0,Ford,F-150 Lariat,2018,74349,Gasoline,375.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,10-Speed A/T,Blue,Gray,None reported,Yes,11000,375.0,3.5,V6,6,Gasoline
1,1,BMW,335 i,2007,80000,Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,6-Speed M/T,Black,Black,None reported,Yes,8250,300.0,3.0,Straight 6,6,Gasoline
2,2,Jaguar,XF Luxury,2009,91491,Gasoline,300.0HP 4.2L 8 Cylinder Engine Gasoline Fuel,6-Speed A/T,Purple,Beige,None reported,Yes,15000,300.0,4.2,,8,Gasoline
3,3,BMW,X7 xDrive40i,2022,2437,Hybrid,335.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,Transmission w/Dual Shift Mode,Gray,Brown,None reported,Yes,63500,335.0,3.0,Straight 6,6,Gasoline
4,4,Pontiac,Firebird Base,2001,111000,Gasoline,200.0HP 3.8L V6 Cylinder Engine Gasoline Fuel,A/T,White,Black,None reported,Yes,7850,200.0,3.8,V6,6,Gasoline


In [14]:
data = data.drop(columns=['clean_title', 'id'])

#### Nan va zero qiymatlarni korib chiqamiz


In [15]:
data[new_columns].isna().sum()

hp           4057
litr          606
motor       29640
Cylinder     4175
fuel         4258
dtype: int64

### Har bir yaratilgan yangi ustunlar boyicha NaN larni toldirib chiqamiz tepadagi yozgan functionlardan foydalanib

#### HP ustuni boyicha 

In [16]:
data['hp'] = data['hp'].agg(to_numeric)

data = fillna_mean(data.copy(), 'hp', ['motor', 'Cylinder', 'litr'])
data = fillna_mean(data.copy(), 'hp', ['motor', 'litr'])
data = fillna_mean(data.copy(), 'hp', ['brand', 'model'])
data = fillna_mean(data.copy(), 'hp', ['litr', 'fuel_type'])
data = fillna_mean(data.copy(), 'hp', ['model'])
data = fillna_mean(data.copy(), 'hp', ['motor'])
data = fillna_mean(data.copy(), 'hp', ['price', 'brand'])
data = fillna_mean(data.copy(), 'hp', ['price', 'fuel_type'])
data = fillna_mean(data.copy(), 'hp', ['engine'])



print(data['hp'].isna().sum())

Col: hp, Columns: ['motor', 'Cylinder', 'litr'], Nan: 4057, After Clean: 4055
Col: hp, Columns: ['motor', 'litr'], Nan: 4055, After Clean: 2809
Col: hp, Columns: ['brand', 'model'], Nan: 2809, After Clean: 369
Col: hp, Columns: ['litr', 'fuel_type'], Nan: 369, After Clean: 119
Col: hp, Columns: ['model'], Nan: 119, After Clean: 111
Col: hp, Columns: ['motor'], Nan: 111, After Clean: 107
Col: hp, Columns: ['price', 'brand'], Nan: 107, After Clean: 23
Col: hp, Columns: ['price', 'fuel_type'], Nan: 23, After Clean: 14
Col: hp, Columns: ['engine'], Nan: 14, After Clean: 0
0


#### Litr ustuni boyicha

In [17]:
data['litr'] = data['litr'].agg(to_numeric)


data = fillna_mean(data.copy(), 'litr', ['motor', 'Cylinder', 'hp'])
data = fillna_mean(data.copy(), 'litr', ['hp', 'fuel_type'])
data = fillna_mean(data.copy(), 'litr', ['model'])
data = fillna_mean(data.copy(), 'litr', ['engine'])
data = fillna_mean(data.copy(), 'litr', ['price', 'fuel_type'])
data = fillna_mean(data.copy(), 'litr', ['price', 'transmission'])


print(data['litr'].isna().sum())

Col: litr, Columns: ['motor', 'Cylinder', 'hp'], Nan: 606, After Clean: 606
Col: litr, Columns: ['hp', 'fuel_type'], Nan: 606, After Clean: 396
Col: litr, Columns: ['model'], Nan: 396, After Clean: 124
Col: litr, Columns: ['engine'], Nan: 124, After Clean: 46
Col: litr, Columns: ['price', 'fuel_type'], Nan: 46, After Clean: 3
Col: litr, Columns: ['price', 'transmission'], Nan: 3, After Clean: 0
0


#### Cylinder boyicha

In [18]:
data['Cylinder'] = data['Cylinder'].agg(to_numeric)

data = fillna_mean(data.copy(), 'Cylinder', ['litr', 'hp'])
data = fillna_mean(data.copy(), 'Cylinder', ['litr', 'model'])
data = fillna_mean(data.copy(), 'Cylinder', ['hp', 'model'])
data = fillna_mean(data.copy(), 'Cylinder', ['model'])
data = fillna_mean(data.copy(), 'Cylinder', ['hp', 'fuel_type'])
data = fillna_mean(data.copy(), 'Cylinder', ['price', 'fuel_type'])
data = fillna_mean(data.copy(), 'Cylinder', ['price', 'transmission'])
data = fillna_mean(data.copy(), 'Cylinder', ['litr', 'fuel_type'])
data = fillna_mean(data.copy(), 'Cylinder', ['engine'])


print(data['Cylinder'].isna().sum())

Col: Cylinder, Columns: ['litr', 'hp'], Nan: 4175, After Clean: 3772
Col: Cylinder, Columns: ['litr', 'model'], Nan: 3772, After Clean: 1383
Col: Cylinder, Columns: ['hp', 'model'], Nan: 1383, After Clean: 1033
Col: Cylinder, Columns: ['model'], Nan: 1033, After Clean: 632
Col: Cylinder, Columns: ['hp', 'fuel_type'], Nan: 632, After Clean: 214
Col: Cylinder, Columns: ['price', 'fuel_type'], Nan: 214, After Clean: 32
Col: Cylinder, Columns: ['price', 'transmission'], Nan: 32, After Clean: 8
Col: Cylinder, Columns: ['litr', 'fuel_type'], Nan: 8, After Clean: 2
Col: Cylinder, Columns: ['engine'], Nan: 2, After Clean: 0
0


#### Objectlarni yani Motor ustunini todiramiz mode boyicha


In [19]:
data = fillna_mode(data.copy(), 'motor', ['litr', 'hp', 'Cylinder', 'model'])
data = fillna_mode(data.copy(), 'motor', ['litr', 'hp', 'Cylinder'])
data = fillna_mode(data.copy(), 'motor', ['litr', 'hp'])
data = fillna_mode(data.copy(), 'motor', ['model'])
data = fillna_mode(data.copy(), 'motor', ['engine'])
data = fillna_mode(data.copy(), 'motor', ['fuel_type', 'hp'])
data = fillna_mode(data.copy(), 'motor', ['litr', 'Cylinder'])
data = fillna_mode(data.copy(), 'motor', ['Cylinder', 'price'])
data = fillna_mode(data.copy(), 'motor', ['Cylinder', 'brand'])
data = fillna_mode(data.copy(), 'motor', ['litr', 'brand'])
data = fillna_mode(data.copy(), 'motor', ['hp', 'brand'])
data = fillna_mode(data.copy(), 'motor', ['hp', 'fuel'])
data = fillna_mode(data.copy(), 'motor', ['brand', 'price'])
data = fillna_mode(data.copy(), 'motor', ['brand', 'price'])
data = fillna_mode(data.copy(), 'motor', ['brand', 'fuel_type'])


print(data['motor'].isna().sum())

Col: motor, Columns: ['litr', 'hp', 'Cylinder', 'model'], Nan: 29640, After Clean: 28986
Col: motor, Columns: ['litr', 'hp', 'Cylinder'], Nan: 28986, After Clean: 21847
Col: motor, Columns: ['litr', 'hp'], Nan: 21847, After Clean: 21438
Col: motor, Columns: ['model'], Nan: 21438, After Clean: 6649
Col: motor, Columns: ['engine'], Nan: 6649, After Clean: 1320
Col: motor, Columns: ['fuel_type', 'hp'], Nan: 1320, After Clean: 522
Col: motor, Columns: ['litr', 'Cylinder'], Nan: 522, After Clean: 76
Col: motor, Columns: ['Cylinder', 'price'], Nan: 76, After Clean: 66
Col: motor, Columns: ['Cylinder', 'brand'], Nan: 66, After Clean: 52
Col: motor, Columns: ['litr', 'brand'], Nan: 52, After Clean: 40
Col: motor, Columns: ['hp', 'brand'], Nan: 40, After Clean: 12
Col: motor, Columns: ['hp', 'fuel'], Nan: 12, After Clean: 11
Col: motor, Columns: ['brand', 'price'], Nan: 11, After Clean: 4
Col: motor, Columns: ['brand', 'price'], Nan: 4, After Clean: 4
Col: motor, Columns: ['brand', 'fuel_type']

#### Fuel ustunini toldiramiz


In [20]:
data = fillna_mode(data.copy(), 'fuel', ['litr', 'hp', 'Cylinder', 'model'])
data = fillna_mode(data.copy(), 'fuel', ['litr', 'hp', 'model'])
data = fillna_mode(data.copy(), 'fuel', ['hp', 'model'])
data = fillna_mode(data.copy(), 'fuel', ['model'])
data = fillna_mode(data.copy(), 'fuel', ['motor', 'hp'])
data = fillna_mode(data.copy(), 'fuel', ['motor', 'hp'])
data = fillna_mode(data.copy(), 'fuel', ['motor', 'Cylinder'])
data = fillna_mode(data.copy(), 'fuel', ['motor', 'litr'])
data = fillna_mode(data.copy(), 'fuel', ['motor'])

print(data['fuel'].isna().sum())

Col: fuel, Columns: ['litr', 'hp', 'Cylinder', 'model'], Nan: 4258, After Clean: 3855
Col: fuel, Columns: ['litr', 'hp', 'model'], Nan: 3855, After Clean: 3756
Col: fuel, Columns: ['hp', 'model'], Nan: 3756, After Clean: 3515
Col: fuel, Columns: ['model'], Nan: 3515, After Clean: 787
Col: fuel, Columns: ['motor', 'hp'], Nan: 787, After Clean: 304
Col: fuel, Columns: ['motor', 'hp'], Nan: 304, After Clean: 304
Col: fuel, Columns: ['motor', 'Cylinder'], Nan: 304, After Clean: 182
Col: fuel, Columns: ['motor', 'litr'], Nan: 182, After Clean: 58
Col: fuel, Columns: ['motor'], Nan: 58, After Clean: 0
0


In [21]:
data.head(1)

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,price,hp,litr,motor,Cylinder,fuel
0,Ford,F-150 Lariat,2018,74349,Gasoline,375.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,10-Speed A/T,Blue,Gray,None reported,11000,375.0,3.5,V6,6.0,Gasoline


In [22]:
data[new_columns].isna().sum()

hp          0
litr        0
motor       0
Cylinder    0
fuel        0
dtype: int64

In [23]:
data.select_dtypes(include='object').columns

Index(['brand', 'model', 'fuel_type', 'engine', 'transmission', 'ext_col',
       'int_col', 'accident', 'motor', 'fuel'],
      dtype='object')

In [24]:
for col in data.select_dtypes(include='object').columns:
    data = ordinal_encoder(data.copy(), col, 'price')

In [25]:
data.head(1)

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,...,brand_rank,model_rank,fuel_type_rank,engine_rank,transmission_rank,ext_col_rank,int_col_rank,accident_rank,motor_rank,fuel_rank
0,Ford,F-150 Lariat,2018,74349,Gasoline,375.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,10-Speed A/T,Blue,Gray,None reported,...,31,1256,4,821,28,91,14,2,1,3


### Chiziqli model

In [26]:
columns = ['model_year',
           'milage',
           'hp',
           'litr',
           'Cylinder',
           'brand_rank',
           'model_rank',
           'fuel_type_rank',
           'engine_rank',
           'transmission_rank',
           'ext_col_rank',
           'int_col_rank',
           'accident_rank',
           'motor_rank',
           'fuel_rank'
           ]

X = data[columns]
y = data['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.37, random_state=125)

model = LinearRegression()

cv = KFold(n_splits=5, shuffle=True, random_state=42)
cv_rmse = np.sqrt(np.mean((-cross_val_score(model, X_train, y_train, cv=cv, scoring='neg_mean_squared_error'))))

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'CV RMSE: {cv_rmse}')
print(f'RMSE: {rmse}')
print(f'MAE: {mae}')
print(f'R2: {r2}')

CV RMSE: 68040.11252712364
RMSE: 69944.67979996078
MAE: 19389.99585093425
R2: 0.1094867627865036


#### Polynomial Reg models

In [27]:
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)


X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.37, random_state=125)

model = LinearRegression()

cv = KFold(n_splits=5, shuffle=True, random_state=42)
cv_rmse = np.sqrt(np.mean((-cross_val_score(model, X_train, y_train, cv=cv, scoring='neg_mean_squared_error'))))

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'CV RMSE: {cv_rmse}')
print(f'RMSE: {rmse}')
print(f'MAE: {mae}')
print(f'R2: {r2}')

CV RMSE: 67275.41297095054
RMSE: 68974.20593149104
MAE: 18491.2921885455
R2: 0.13402685237038292


#### Outlierlarni filtirlab olib korib chiqamiz


In [28]:
data1 = data[data['price'] <= data['price'].quantile(0.95)]

In [29]:
X = data1[columns]
y = data1['price']

In [30]:
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)


X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.37, random_state=125)

model = LinearRegression()

cv = KFold(n_splits=5, shuffle=True, random_state=42)
cv_rmse = np.sqrt(np.mean((-cross_val_score(model, X_train, y_train, cv=cv, scoring='neg_mean_squared_error'))))

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'CV RMSE: {cv_rmse}')
print(f'RMSE: {rmse}')
print(f'MAE: {mae}')
print(f'R2: {r2}')

CV RMSE: 12015.062360617138
RMSE: 11961.465478650307
MAE: 8395.475149823636
R2: 0.6362014146709627


#### Robostic Reg tahlillari

In [31]:
# linear_model = LinearRegression()
# linear_model.fit(X, y)

# tail = TheilSenRegressor()
# tail.fit(X, y)

# huber = HuberRegressor()
# huber.fit(X, y)

# ransac = RANSACRegressor()
# ransac.fit(X, y)


# linear_rmse = np.sqrt(mean_squared_error(y, linear_model.predict(X)))
# tail_rmse = np.sqrt(mean_squared_error(y, tail.predict(X)))
# huber_rmse = np.sqrt(mean_squared_error(y, huber.predict(X)))
# ransac_rmse = np.sqrt(mean_squared_error(y, ransac.predict(X)))

# mae_linear = mean_absolute_error(y, linear_model.predict(X))
# mae_tail = mean_absolute_error(y, tail.predict(X))
# mae_huber = mean_absolute_error(y, huber.predict(X))
# mae_ransac = mean_absolute_error(y, ransac.predict(X))


# median_error = median_absolute_error(y, linear_model.predict(X))
# median_error_tail = median_absolute_error(y, tail.predict(X))
# median_error_huber = median_absolute_error(y, huber.predict(X))
# median_error_ransac = median_absolute_error(y, ransac.predict(X))

# print('Median Error Linear:', median_error)
# print('Linear MAE:', mae_linear)
# print('Linear RMSE:', linear_rmse)
# print()
# print('Median Error RANSAC:', median_error_ransac)
# print('RANSAC MAE:', mae_ransac)
# print('RANSAC RMSE:', ransac_rmse)
# print()
# print('Median Error Tail:', median_error_tail)
# print('Theil MAE:', mae_tail)
# print('Theil RMSE:', tail_rmse)
# print()
# print('Median Error Huber:', median_error_huber)
# print('Huber MAE:', mae_huber)
# print('Huber RMSE:', huber_rmse)

In [32]:
# linear_model = LinearRegression()
# linear_model.fit(X_poly, y)

# tail = TheilSenRegressor()
# tail.fit(X_poly, y)

# huber = HuberRegressor()
# huber.fit(X_poly, y)

# ransac = RANSACRegressor()
# ransac.fit(X_poly, y)


# linear_rmse = np.sqrt(mean_squared_error(y, linear_model.predict(X_poly)))
# tail_rmse = np.sqrt(mean_squared_error(y, tail.predict(X_poly)))
# huber_rmse = np.sqrt(mean_squared_error(y, huber.predict(X_poly)))
# ransac_rmse = np.sqrt(mean_squared_error(y, ransac.predict(X_poly)))

# mae_linear = mean_absolute_error(y, linear_model.predict(X_poly))
# mae_tail = mean_absolute_error(y, tail.predict(X_poly))
# mae_huber = mean_absolute_error(y, huber.predict(X_poly))
# mae_ransac = mean_absolute_error(y, ransac.predict(X_poly))


# median_error = median_absolute_error(y, linear_model.predict(X_poly))
# median_error_tail = median_absolute_error(y, tail.predict(X_poly))
# median_error_huber = median_absolute_error(y, huber.predict(X_poly))
# median_error_ransac = median_absolute_error(y, ransac.predict(X_poly))

# print('Median Error Linear:', median_error)
# print('Linear MAE:', mae_linear)
# print('Linear RMSE:', linear_rmse)
# print()
# print('Median Error RANSAC:', median_error_ransac)
# print('RANSAC MAE:', mae_ransac)
# print('RANSAC RMSE:', ransac_rmse)
# print()
# print('Median Error Tail:', median_error_tail)
# print('Theil MAE:', mae_tail)
# print('Theil RMSE:', tail_rmse)
# print()
# print('Median Error Huber:', median_error_huber)
# print('Huber MAE:', mae_huber)
# print('Huber RMSE:', huber_rmse)

In [33]:
# from sklearn.feature_selection import RFECV
# X = data[columns]
# y = data['price']
# poly = PolynomialFeatures(degree=2, include_bias=False)
# X_poly = pd.DataFrame(poly.fit_transform(X), columns=poly.get_feature_names_out(X.columns))


# cv = KFold(n_splits=6, shuffle=True, random_state=42)

# linear_model = LinearRegression()

# rfecv = RFECV(estimator=linear_model, step=1, cv=cv, scoring='neg_mean_squared_error', verbose=2)

# rfecv.fit(X_poly, y)
# optimal_num_features = rfecv.n_features_
# selected_features = X_poly.columns[rfecv.support_]

# print("Optimal number of features:", optimal_num_features)
# print("Selected features:", selected_features.tolist())

In [34]:
# X_poly = pd.DataFrame(poly.fit_transform(X), columns=poly.get_feature_names_out(X.columns))
# X = X_poly[selected_features]
# y = data['price']

# linear_model = LinearRegression()
# linear_model.fit(X, y)

# tail = TheilSenRegressor()
# tail.fit(X, y)

# huber = HuberRegressor()
# huber.fit(X, y)

# ransac = RANSACRegressor()
# ransac.fit(X, y)


# linear_rmse = np.sqrt(mean_squared_error(y, linear_model.predict(X)))
# tail_rmse = np.sqrt(mean_squared_error(y, tail.predict(X)))
# huber_rmse = np.sqrt(mean_squared_error(y, huber.predict(X)))
# ransac_rmse = np.sqrt(mean_squared_error(y, ransac.predict(X)))

# mae_linear = mean_absolute_error(y, linear_model.predict(X))
# mae_tail = mean_absolute_error(y, tail.predict(X))
# mae_huber = mean_absolute_error(y, huber.predict(X))
# mae_ransac = mean_absolute_error(y, ransac.predict(X))


# median_error = median_absolute_error(y, linear_model.predict(X))
# median_error_tail = median_absolute_error(y, tail.predict(X))
# median_error_huber = median_absolute_error(y, huber.predict(X))
# median_error_ransac = median_absolute_error(y, ransac.predict(X))

# print('Median Error Linear:', median_error)
# print('Linear MAE:', mae_linear)
# print('Linear RMSE:', linear_rmse)
# print()
# print('Median Error RANSAC:', median_error_ransac)
# print('RANSAC MAE:', mae_ransac)
# print('RANSAC RMSE:', ransac_rmse)
# print()
# print('Median Error Tail:', median_error_tail)
# print('Theil MAE:', mae_tail)
# print('Theil RMSE:', tail_rmse)
# print()
# print('Median Error Huber:', median_error_huber)
# print('Huber MAE:', mae_huber)
# print('Huber RMSE:', huber_rmse)

In [35]:
X = data1[columns]
y = data1['price']

In [36]:

def calculate_metrics(X, y):
    model = LinearRegression()
    model.fit(X, y)

    y_pred = model.predict(X)

    r_squared = model.score(X, y)

    n = X.shape[0]
    k = X.shape[1]
    adjusted_r_squared = 1 - (1 - r_squared) * (n - 1) / (n - k - 1)

    
    rmse = np.sqrt(mean_squared_error(y, y_pred))

    
    mae = mean_absolute_error(y, y_pred)

    
    medae = median_absolute_error(y, y_pred)

    
    mape = np.mean(np.abs((y - y_pred) / y)) * 100


    smape = np.mean(2 * np.abs(y - y_pred) / (np.abs(y) + np.abs(y_pred))) * 100


    medape = np.median(np.abs((y - y_pred) / y)) * 100

    return {
        'R-squared': r_squared,
        'Adjusted R-squared': adjusted_r_squared,
        'RMSE': rmse,
        'MAE': mae,
        'MedAE': medae,
        'MAPE': mape,
        'sMAPE': smape,
        'MedAPE': medape
    }



 
metrics = calculate_metrics(X, y)
for metric_name, metric_value in metrics.items():
    print(f"{metric_name}: {metric_value:.4f}")


R-squared: 0.5922
Adjusted R-squared: 0.5920
RMSE: 12738.7051
MAE: 9370.4621
MedAE: 6964.2190
MAPE: 44.6106
sMAPE: 37.7572
MedAPE: 27.5337


### Blending and Stacking

#### Blending

In [37]:
base_models = {
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'ElasticNet': ElasticNet(),
    'TheilSenRegressor': TheilSenRegressor(),
    'HuberRegressor': HuberRegressor(),
    'RANSACRegressor': RANSACRegressor()
}


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


base_model_preds_train = np.zeros((X_train.shape[0], len(base_models)))
base_model_preds_test = np.zeros((X_test.shape[0], len(base_models)))

for i, (model_name, model) in enumerate(base_models.items()):
    model.fit(X_train, y_train)
    base_model_preds_train[:, i] = model.predict(X_train)
    base_model_preds_test[:, i] = model.predict(X_test)


meta_model = LinearRegression()
meta_model.fit(base_model_preds_train, y_train)


final_predictions = meta_model.predict(base_model_preds_test)



r_squared = meta_model.score(base_model_preds_test, y_test)
rmse = np.sqrt(mean_squared_error(y_test, final_predictions))
mae = mean_absolute_error(y_test, final_predictions)

print(f"Blending Model Metrics:")
print(f"  R-squared: {r_squared:.4f}")
print(f"  RMSE: {rmse:.4f}")
print(f"  MAE: {mae:.4f}")

Blending Model Metrics:
  R-squared: 0.5954
  RMSE: 12729.9380
  MAE: 9368.7879


#### Stacking 

In [38]:
from sklearn.ensemble import StackingRegressor
from sklearn.model_selection import cross_val_predict


base_models = [
    ('LinearRegression', LinearRegression()),
    ('Ridge', Ridge()),
    ('Lasso', Lasso()),
    ('ElasticNet', ElasticNet()),
    ('TheilSenRegressor', TheilSenRegressor()),
    ('HuberRegressor', HuberRegressor()),
    ('RANSACRegressor', RANSACRegressor())
]

meta_model = LinearRegression()

stacking_model = StackingRegressor(estimators=base_models, final_estimator=meta_model, cv=5)

stacking_model.fit(X_train, y_train)

stacking_predictions = stacking_model.predict(X_test)

r_squared = stacking_model.score(X_test, y_test)
rmse = np.sqrt(mean_squared_error(y_test, stacking_predictions))
mae = mean_absolute_error(y_test, stacking_predictions)

print(f"Stacking Model Metrics:")
print(f"  R-squared: {r_squared:.4f}")
print(f"  RMSE: {rmse:.4f}")
print(f"  MAE: {mae:.4f}")

Stacking Model Metrics:
  R-squared: 0.5955
  RMSE: 12728.0108
  MAE: 9371.8901


### RandomizedSearchCV for both Blending and Stacking 

#### Blending with RandomizedSearchCV

In [39]:
from sklearn.model_selection import RandomizedSearchCV


In [42]:
param_grids = {
    'LinearRegression': {},
    'Ridge': {'alpha': [0.1, 1, 10, 100]},
    'Lasso': {'alpha': [0.1, 1, 10, 100]},
    'ElasticNet': {'alpha': [0.1, 1, 10, 100], 'l1_ratio': [0.1, 0.5, 0.9]},
    # 'TheilSenRegressor': {'alpha': [1e-6, 1e-4, 1e-2, 1e0]},
    'HuberRegressor': {'alpha': [1e-6, 1e-4, 1e-2, 1e0], 'epsilon': [1.0, 1.1, 1.2]},
    'RANSACRegressor': {'min_samples': [0.5, 0.7, 0.9], 'residual_threshold': [0.1, 1, 10]}
}


best_base_models = {}
for model_name, param_grid in param_grids.items():
    model = eval(f"{model_name}()")  # Create model instance
    random_search = RandomizedSearchCV(model, param_grid, n_iter=10, cv=5, scoring='neg_mean_squared_error', random_state=42)
    random_search.fit(X_train, y_train)
    best_base_models[model_name] = random_search.best_estimator_


base_model_preds_train = np.zeros((X_train.shape[0], len(best_base_models)))
base_model_preds_test = np.zeros((X_test.shape[0], len(best_base_models)))

for i, (model_name, model) in enumerate(best_base_models.items()):
    model.fit(X_train, y_train)
    base_model_preds_train[:, i] = model.predict(X_train)
    base_model_preds_test[:, i] = model.predict(X_test)


meta_model = LinearRegression()
meta_model.fit(base_model_preds_train, y_train)


final_predictions = meta_model.predict(base_model_preds_test)


r_squared = meta_model.score(base_model_preds_test, y_test)
rmse = np.sqrt(mean_squared_error(y_test, final_predictions))
mae = mean_absolute_error(y_test, final_predictions)

print(f"Blending Model Metrics with RandomizedSearchCV:")
print(f"  R-squared: {r_squared:.4f}")
print(f"  RMSE: {rmse:.4f}")
print(f"  MAE: {mae:.4f}")

Blending Model Metrics with RandomizedSearchCV:
  R-squared: 0.5954
  RMSE: 12729.9380
  MAE: 9368.7879


In [None]:
! pip install scipy




[notice] A new release of pip is available: 23.2.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


#### Stacking with RandomizedSearchCV


In [44]:
param_grids = {
    'LinearRegression': {},
    'Ridge': {'alpha': [0.1, 1, 10, 100]},
    'Lasso': {'alpha': [0.1, 1, 10, 100]},
    'ElasticNet': {'alpha': [0.1, 1, 10, 100], 'l1_ratio': [0.1, 0.5, 0.9]},
    # 'TheilSenRegressor': {'alpha': [1e-6, 1e-4, 1e-2, 1e0]},
    'HuberRegressor': {'alpha': [1e-6, 1e-4, 1e-2, 1e0], 'epsilon': [1.0, 1.1, 1.2]},
    'RANSACRegressor': {'min_samples': [0.5, 0.7, 0.9], 'residual_threshold': [0.1, 1, 10]}
}


best_base_models = []
for model_name, param_grid in param_grids.items():
    model = eval(f"{model_name}()")  # Create model instance
    random_search = RandomizedSearchCV(model, param_grid, n_iter=10, cv=5, scoring='neg_mean_squared_error', random_state=42)
    random_search.fit(X_train, y_train)
    best_base_models.append((model_name, random_search.best_estimator_))


meta_model = LinearRegression()


stacking_model = StackingRegressor(estimators=best_base_models, final_estimator=meta_model, cv=5)
stacking_model.fit(X_train, y_train)


stacking_predictions = stacking_model.predict(X_test)


r_squared = stacking_model.score(X_test, y_test)
rmse = np.sqrt(mean_squared_error(y_test, stacking_predictions))
mae = mean_absolute_error(y_test, stacking_predictions)

print(f"Stacking Model Metrics with RandomizedSearchCV:")
print(f"  R-squared: {r_squared:.4f}")
print(f"  RMSE: {rmse:.4f}")
print(f"  MAE: {mae:.4f}")

Stacking Model Metrics with RandomizedSearchCV:
  R-squared: 0.5951
  RMSE: 12734.0199
  MAE: 9367.0636
