In [4]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet,TheilSenRegressor,HuberRegressor,RANSACRegressor
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score,median_absolute_error
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split, RandomizedSearchCV
from sklearn.preprocessing import PolynomialFeatures
import warnings
import re
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')


In [5]:
data = pd.read_csv('car_prices.csv')
col = data.columns
index = data.index

col, index

(Index(['id', 'brand', 'model', 'model_year', 'milage', 'fuel_type', 'engine',
        'transmission', 'ext_col', 'int_col', 'accident', 'clean_title',
        'price'],
       dtype='object'),
 RangeIndex(start=0, stop=54273, step=1))

In [6]:
data.head()

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,Ford,F-150 Lariat,2018,74349,Gasoline,375.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,10-Speed A/T,Blue,Gray,None reported,Yes,11000
1,1,BMW,335 i,2007,80000,Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,6-Speed M/T,Black,Black,None reported,Yes,8250
2,2,Jaguar,XF Luxury,2009,91491,Gasoline,300.0HP 4.2L 8 Cylinder Engine Gasoline Fuel,6-Speed A/T,Purple,Beige,None reported,Yes,15000
3,3,BMW,X7 xDrive40i,2022,2437,Hybrid,335.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,Transmission w/Dual Shift Mode,Gray,Brown,None reported,Yes,63500
4,4,Pontiac,Firebird Base,2001,111000,Gasoline,200.0HP 3.8L V6 Cylinder Engine Gasoline Fuel,A/T,White,Black,None reported,Yes,7850


#### Uniqual datalar sonini korib chiqamiz har bir column uchun

In [7]:
for i in col:
    print(f"{i}: {data[i].nunique()}")
    

id: 54273
brand: 53
model: 1827
model_year: 34
milage: 3212
fuel_type: 7
engine: 1061
transmission: 46
ext_col: 260
int_col: 124
accident: 2
clean_title: 1
price: 1481


#### Feature enginering qilamiz


In [8]:
def feature_engine(engine):
    if pd.isna(engine):
        return pd.Series([np.nan, np.nan, np.nan, np.nan, np.nan])

    hp = re.search(r'(\d+\.?\d*)HP', engine)
    liter = re.search(r'(\d+\.?\d*)L', engine)
    motor = re.search(r'(Straight|V)\s*\d*', engine)
    cylinder = re.search(r'(\d+)\s*Cylinder', engine)
    fuel = re.search(r'(Gasoline|Hybrid|Flex|Diesel)', engine)
    
    return pd.Series([
        hp.group(1) if hp else np.nan,
        liter.group(1) if liter else np.nan,
        motor.group(0) if motor else np.nan,
        cylinder.group(1) if cylinder else np.nan,
        fuel.group(1) if fuel else np.nan
    ])




#### NaN va zero ustunlarni 2ta metrica yani mode va mean boyicha toldirib olamiz feature enginering qilgandan keyin 

In [9]:
### Numeric datalani mean boyicha nan va zerolarini toldirib olamiz

def fillna_mean(df, col, columns):
    print(f"Col: {col}, Columns: {columns}, Nan: {df[col].isna().sum()}", end='')
    value = df[df[col].notna()].groupby(by=columns)[col].mean().reset_index()
    df = pd.merge(df, value, on=columns, how='left', suffixes=('', '_mean'))
    df[col] = df[col].fillna(df[col + '_mean'])
    df.drop(columns=[col + '_mean'], inplace=True)
    print(f", After Clean: {df[col].isna().sum()}")
    return df


###  Objective ustunlardagi nan va zero datalarni  modelari boyicha toldirib olamiz

def fillna_mode(df, col, columns):
    print(f"Col: {col}, Columns: {columns}, Nan: {df[col].isna().sum()}", end='')
    mode_values = df.groupby(columns)[col].apply(lambda x: x.mode().iloc[0] if not x.mode().empty else pd.NA).reset_index()
    df = pd.merge(df, mode_values, on=columns, how='left', suffixes=('', '_mode'))
    df[col] = df[col].fillna(df[col + '_mode'])
    df.drop(columns=[col + '_mode'], inplace=True)

    print(f", After Clean: {df[col].isna().sum()}")
    return df

def to_numeric(x):
    try:
        return float(x)
    except ValueError as err:
        print(err)
        return np.nan


#### datalarni price ustuini boyicha sortlab olamiz

In [10]:
def ordinal_encoder(df, column, on):

    avg_prices = df.groupby(column)[on].mean().reset_index()
    avg_prices.columns = [column, 'avg_price']
    avg_prices = avg_prices.sort_values(by='avg_price').reset_index(drop=True)
    avg_prices[column+'_rank'] = avg_prices.index + 1
    return pd.merge(df, avg_prices[[column, column+'_rank']], on=column, how='left')

####  Feature enginering qilamiz engine ustunidagi datalarni

In [11]:
new_columns = ['hp', 'litr', 'motor', 'Cylinder', 'fuel']
data[new_columns] = data['engine'].apply(feature_engine)

In [12]:
data.head()

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price,hp,litr,motor,Cylinder,fuel
0,0,Ford,F-150 Lariat,2018,74349,Gasoline,375.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,10-Speed A/T,Blue,Gray,None reported,Yes,11000,375.0,3.5,V6,6,Gasoline
1,1,BMW,335 i,2007,80000,Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,6-Speed M/T,Black,Black,None reported,Yes,8250,300.0,3.0,Straight 6,6,Gasoline
2,2,Jaguar,XF Luxury,2009,91491,Gasoline,300.0HP 4.2L 8 Cylinder Engine Gasoline Fuel,6-Speed A/T,Purple,Beige,None reported,Yes,15000,300.0,4.2,,8,Gasoline
3,3,BMW,X7 xDrive40i,2022,2437,Hybrid,335.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,Transmission w/Dual Shift Mode,Gray,Brown,None reported,Yes,63500,335.0,3.0,Straight 6,6,Gasoline
4,4,Pontiac,Firebird Base,2001,111000,Gasoline,200.0HP 3.8L V6 Cylinder Engine Gasoline Fuel,A/T,White,Black,None reported,Yes,7850,200.0,3.8,V6,6,Gasoline


In [13]:
data = data.drop(columns=['clean_title', 'id'])

#### Nan va zero qiymatlarni korib chiqamiz


In [14]:
data[new_columns].isna().sum()

hp           4057
litr          606
motor       29640
Cylinder     4175
fuel         4258
dtype: int64

### Har bir yaratilgan yangi ustunlar boyicha NaN larni toldirib chiqamiz tepadagi yozgan functionlardan foydalanib

#### HP ustuni boyicha 

In [15]:
data['hp'] = data['hp'].agg(to_numeric)

data = fillna_mean(data.copy(), 'hp', ['motor', 'Cylinder', 'litr'])
data = fillna_mean(data.copy(), 'hp', ['motor', 'litr'])
data = fillna_mean(data.copy(), 'hp', ['brand', 'model'])
data = fillna_mean(data.copy(), 'hp', ['litr', 'fuel_type'])
data = fillna_mean(data.copy(), 'hp', ['model'])
data = fillna_mean(data.copy(), 'hp', ['motor'])
data = fillna_mean(data.copy(), 'hp', ['price', 'brand'])
data = fillna_mean(data.copy(), 'hp', ['price', 'fuel_type'])
data = fillna_mean(data.copy(), 'hp', ['engine'])



print(data['hp'].isna().sum())

Col: hp, Columns: ['motor', 'Cylinder', 'litr'], Nan: 4057, After Clean: 4055
Col: hp, Columns: ['motor', 'litr'], Nan: 4055, After Clean: 2809
Col: hp, Columns: ['brand', 'model'], Nan: 2809, After Clean: 369
Col: hp, Columns: ['litr', 'fuel_type'], Nan: 369, After Clean: 119
Col: hp, Columns: ['model'], Nan: 119, After Clean: 111
Col: hp, Columns: ['motor'], Nan: 111, After Clean: 107
Col: hp, Columns: ['price', 'brand'], Nan: 107, After Clean: 23
Col: hp, Columns: ['price', 'fuel_type'], Nan: 23, After Clean: 14
Col: hp, Columns: ['engine'], Nan: 14, After Clean: 0
0


#### Litr ustuni boyicha

In [16]:
data['litr'] = data['litr'].agg(to_numeric)


data = fillna_mean(data.copy(), 'litr', ['motor', 'Cylinder', 'hp'])
data = fillna_mean(data.copy(), 'litr', ['hp', 'fuel_type'])
data = fillna_mean(data.copy(), 'litr', ['model'])
data = fillna_mean(data.copy(), 'litr', ['engine'])
data = fillna_mean(data.copy(), 'litr', ['price', 'fuel_type'])
data = fillna_mean(data.copy(), 'litr', ['price', 'transmission'])


print(data['litr'].isna().sum())

Col: litr, Columns: ['motor', 'Cylinder', 'hp'], Nan: 606, After Clean: 606
Col: litr, Columns: ['hp', 'fuel_type'], Nan: 606, After Clean: 396
Col: litr, Columns: ['model'], Nan: 396, After Clean: 124
Col: litr, Columns: ['engine'], Nan: 124, After Clean: 46
Col: litr, Columns: ['price', 'fuel_type'], Nan: 46, After Clean: 3
Col: litr, Columns: ['price', 'transmission'], Nan: 3, After Clean: 0
0


#### Cylinder boyicha

In [17]:
data['Cylinder'] = data['Cylinder'].agg(to_numeric)

data = fillna_mean(data.copy(), 'Cylinder', ['litr', 'hp'])
data = fillna_mean(data.copy(), 'Cylinder', ['litr', 'model'])
data = fillna_mean(data.copy(), 'Cylinder', ['hp', 'model'])
data = fillna_mean(data.copy(), 'Cylinder', ['model'])
data = fillna_mean(data.copy(), 'Cylinder', ['hp', 'fuel_type'])
data = fillna_mean(data.copy(), 'Cylinder', ['price', 'fuel_type'])
data = fillna_mean(data.copy(), 'Cylinder', ['price', 'transmission'])
data = fillna_mean(data.copy(), 'Cylinder', ['litr', 'fuel_type'])
data = fillna_mean(data.copy(), 'Cylinder', ['engine'])


print(data['Cylinder'].isna().sum())

Col: Cylinder, Columns: ['litr', 'hp'], Nan: 4175, After Clean: 3772
Col: Cylinder, Columns: ['litr', 'model'], Nan: 3772, After Clean: 1383
Col: Cylinder, Columns: ['hp', 'model'], Nan: 1383, After Clean: 1033
Col: Cylinder, Columns: ['model'], Nan: 1033, After Clean: 632
Col: Cylinder, Columns: ['hp', 'fuel_type'], Nan: 632, After Clean: 214
Col: Cylinder, Columns: ['price', 'fuel_type'], Nan: 214, After Clean: 32
Col: Cylinder, Columns: ['price', 'transmission'], Nan: 32, After Clean: 8
Col: Cylinder, Columns: ['litr', 'fuel_type'], Nan: 8, After Clean: 2
Col: Cylinder, Columns: ['engine'], Nan: 2, After Clean: 0
0


#### Objectlarni yani Motor ustunini todiramiz mode boyicha


In [18]:
data = fillna_mode(data.copy(), 'motor', ['litr', 'hp', 'Cylinder', 'model'])
data = fillna_mode(data.copy(), 'motor', ['litr', 'hp', 'Cylinder'])
data = fillna_mode(data.copy(), 'motor', ['litr', 'hp'])
data = fillna_mode(data.copy(), 'motor', ['model'])
data = fillna_mode(data.copy(), 'motor', ['engine'])
data = fillna_mode(data.copy(), 'motor', ['fuel_type', 'hp'])
data = fillna_mode(data.copy(), 'motor', ['litr', 'Cylinder'])
data = fillna_mode(data.copy(), 'motor', ['Cylinder', 'price'])
data = fillna_mode(data.copy(), 'motor', ['Cylinder', 'brand'])
data = fillna_mode(data.copy(), 'motor', ['litr', 'brand'])
data = fillna_mode(data.copy(), 'motor', ['hp', 'brand'])
data = fillna_mode(data.copy(), 'motor', ['hp', 'fuel'])
data = fillna_mode(data.copy(), 'motor', ['brand', 'price'])
data = fillna_mode(data.copy(), 'motor', ['brand', 'price'])
data = fillna_mode(data.copy(), 'motor', ['brand', 'fuel_type'])


print(data['motor'].isna().sum())

Col: motor, Columns: ['litr', 'hp', 'Cylinder', 'model'], Nan: 29640, After Clean: 28986
Col: motor, Columns: ['litr', 'hp', 'Cylinder'], Nan: 28986, After Clean: 21847
Col: motor, Columns: ['litr', 'hp'], Nan: 21847, After Clean: 21438
Col: motor, Columns: ['model'], Nan: 21438, After Clean: 6649
Col: motor, Columns: ['engine'], Nan: 6649, After Clean: 1320
Col: motor, Columns: ['fuel_type', 'hp'], Nan: 1320, After Clean: 522
Col: motor, Columns: ['litr', 'Cylinder'], Nan: 522, After Clean: 76
Col: motor, Columns: ['Cylinder', 'price'], Nan: 76, After Clean: 66
Col: motor, Columns: ['Cylinder', 'brand'], Nan: 66, After Clean: 52
Col: motor, Columns: ['litr', 'brand'], Nan: 52, After Clean: 40
Col: motor, Columns: ['hp', 'brand'], Nan: 40, After Clean: 12
Col: motor, Columns: ['hp', 'fuel'], Nan: 12, After Clean: 11
Col: motor, Columns: ['brand', 'price'], Nan: 11, After Clean: 4
Col: motor, Columns: ['brand', 'price'], Nan: 4, After Clean: 4
Col: motor, Columns: ['brand', 'fuel_type']

#### Fuel ustunini toldiramiz


In [19]:
data = fillna_mode(data.copy(), 'fuel', ['litr', 'hp', 'Cylinder', 'model'])
data = fillna_mode(data.copy(), 'fuel', ['litr', 'hp', 'model'])
data = fillna_mode(data.copy(), 'fuel', ['hp', 'model'])
data = fillna_mode(data.copy(), 'fuel', ['model'])
data = fillna_mode(data.copy(), 'fuel', ['motor', 'hp'])
data = fillna_mode(data.copy(), 'fuel', ['motor', 'hp'])
data = fillna_mode(data.copy(), 'fuel', ['motor', 'Cylinder'])
data = fillna_mode(data.copy(), 'fuel', ['motor', 'litr'])
data = fillna_mode(data.copy(), 'fuel', ['motor'])

print(data['fuel'].isna().sum())

Col: fuel, Columns: ['litr', 'hp', 'Cylinder', 'model'], Nan: 4258, After Clean: 3855
Col: fuel, Columns: ['litr', 'hp', 'model'], Nan: 3855, After Clean: 3756
Col: fuel, Columns: ['hp', 'model'], Nan: 3756, After Clean: 3515
Col: fuel, Columns: ['model'], Nan: 3515, After Clean: 787
Col: fuel, Columns: ['motor', 'hp'], Nan: 787, After Clean: 304
Col: fuel, Columns: ['motor', 'hp'], Nan: 304, After Clean: 304
Col: fuel, Columns: ['motor', 'Cylinder'], Nan: 304, After Clean: 182
Col: fuel, Columns: ['motor', 'litr'], Nan: 182, After Clean: 58
Col: fuel, Columns: ['motor'], Nan: 58, After Clean: 0
0


In [20]:
data.head(1)

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,price,hp,litr,motor,Cylinder,fuel
0,Ford,F-150 Lariat,2018,74349,Gasoline,375.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,10-Speed A/T,Blue,Gray,None reported,11000,375.0,3.5,V6,6.0,Gasoline


In [21]:
data[new_columns].isna().sum()

hp          0
litr        0
motor       0
Cylinder    0
fuel        0
dtype: int64

In [22]:
data.select_dtypes(include='object').columns

Index(['brand', 'model', 'fuel_type', 'engine', 'transmission', 'ext_col',
       'int_col', 'accident', 'motor', 'fuel'],
      dtype='object')

In [23]:
for col in data.select_dtypes(include='object').columns:
    data = ordinal_encoder(data.copy(), col, 'price')

In [24]:
data.head(1)

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,...,brand_rank,model_rank,fuel_type_rank,engine_rank,transmission_rank,ext_col_rank,int_col_rank,accident_rank,motor_rank,fuel_rank
0,Ford,F-150 Lariat,2018,74349,Gasoline,375.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,10-Speed A/T,Blue,Gray,None reported,...,31,1256,4,821,28,91,14,2,1,3


### Chiziqli model

In [25]:
columns = ['model_year',
           'milage',
           'hp',
           'litr',
           'Cylinder',
           'brand_rank',
           'model_rank',
           'fuel_type_rank',
           'engine_rank',
           'transmission_rank',
           'ext_col_rank',
           'int_col_rank',
           'accident_rank',
           'motor_rank',
           'fuel_rank'
           ]

X = data[columns]
y = data['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.37, random_state=125)

model = LinearRegression()

cv = KFold(n_splits=5, shuffle=True, random_state=42)
cv_rmse = np.sqrt(np.mean((-cross_val_score(model, X_train, y_train, cv=cv, scoring='neg_mean_squared_error'))))

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'CV RMSE: {cv_rmse}')
print(f'RMSE: {rmse}')
print(f'MAE: {mae}')
print(f'R2: {r2}')

CV RMSE: 68040.11252712364
RMSE: 69944.67979996078
MAE: 19389.99585093425
R2: 0.1094867627865036


#### Polynomial Reg models

In [26]:
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)


X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.37, random_state=125)

model = LinearRegression()

cv = KFold(n_splits=5, shuffle=True, random_state=42)
cv_rmse = np.sqrt(np.mean((-cross_val_score(model, X_train, y_train, cv=cv, scoring='neg_mean_squared_error'))))

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'CV RMSE: {cv_rmse}')
print(f'RMSE: {rmse}')
print(f'MAE: {mae}')
print(f'R2: {r2}')

CV RMSE: 67275.41297095054
RMSE: 68974.20593149104
MAE: 18491.2921885455
R2: 0.13402685237038292


#### Outlierlarni filtirlab olib korib chiqamiz


In [27]:
data1 = data[data['price'] <= data['price'].quantile(0.95)]

In [28]:
X = data1[columns]
y = data1['price']

In [29]:
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)


X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.37, random_state=125)

model = LinearRegression()

cv = KFold(n_splits=5, shuffle=True, random_state=42)
cv_rmse = np.sqrt(np.mean((-cross_val_score(model, X_train, y_train, cv=cv, scoring='neg_mean_squared_error'))))

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'CV RMSE: {cv_rmse}')
print(f'RMSE: {rmse}')
print(f'MAE: {mae}')
print(f'R2: {r2}')

CV RMSE: 12015.062360617138
RMSE: 11961.465478650307
MAE: 8395.475149823636
R2: 0.6362014146709627


#### Robostic Reg tahlillari

In [30]:
linear_model = LinearRegression()
linear_model.fit(X, y)

tail = TheilSenRegressor()
tail.fit(X, y)

huber = HuberRegressor()
huber.fit(X, y)

ransac = RANSACRegressor()
ransac.fit(X, y)


linear_rmse = np.sqrt(mean_squared_error(y, linear_model.predict(X)))
tail_rmse = np.sqrt(mean_squared_error(y, tail.predict(X)))
huber_rmse = np.sqrt(mean_squared_error(y, huber.predict(X)))
ransac_rmse = np.sqrt(mean_squared_error(y, ransac.predict(X)))

mae_linear = mean_absolute_error(y, linear_model.predict(X))
mae_tail = mean_absolute_error(y, tail.predict(X))
mae_huber = mean_absolute_error(y, huber.predict(X))
mae_ransac = mean_absolute_error(y, ransac.predict(X))


median_error = median_absolute_error(y, linear_model.predict(X))
median_error_tail = median_absolute_error(y, tail.predict(X))
median_error_huber = median_absolute_error(y, huber.predict(X))
median_error_ransac = median_absolute_error(y, ransac.predict(X))

print('Median Error Linear:', median_error)
print('Linear MAE:', mae_linear)
print('Linear RMSE:', linear_rmse)
print()
print('Median Error RANSAC:', median_error_ransac)
print('RANSAC MAE:', mae_ransac)
print('RANSAC RMSE:', ransac_rmse)
print()
print('Median Error Tail:', median_error_tail)
print('Theil MAE:', mae_tail)
print('Theil RMSE:', tail_rmse)
print()
print('Median Error Huber:', median_error_huber)
print('Huber MAE:', mae_huber)
print('Huber RMSE:', huber_rmse)

Median Error Linear: 6964.218996915035
Linear MAE: 9370.46208536432
Linear RMSE: 12738.705142203507

Median Error RANSAC: 7248.238710686332
RANSAC MAE: 10520.305266942089
RANSAC RMSE: 14805.389016713698

Median Error Tail: 8094.999202676481
Theil MAE: 36540.60190795033
Theil RMSE: 101325.75503692553

Median Error Huber: 6714.827474342092
Huber MAE: 9535.035319854673
Huber RMSE: 13294.047785866176


In [31]:
linear_model = LinearRegression()
linear_model.fit(X_poly, y)

tail = TheilSenRegressor()
tail.fit(X_poly, y)

huber = HuberRegressor()
huber.fit(X_poly, y)

ransac = RANSACRegressor()
ransac.fit(X_poly, y)


linear_rmse = np.sqrt(mean_squared_error(y, linear_model.predict(X_poly)))
tail_rmse = np.sqrt(mean_squared_error(y, tail.predict(X_poly)))
huber_rmse = np.sqrt(mean_squared_error(y, huber.predict(X_poly)))
ransac_rmse = np.sqrt(mean_squared_error(y, ransac.predict(X_poly)))

mae_linear = mean_absolute_error(y, linear_model.predict(X_poly))
mae_tail = mean_absolute_error(y, tail.predict(X_poly))
mae_huber = mean_absolute_error(y, huber.predict(X_poly))
mae_ransac = mean_absolute_error(y, ransac.predict(X_poly))


median_error = median_absolute_error(y, linear_model.predict(X_poly))
median_error_tail = median_absolute_error(y, tail.predict(X_poly))
median_error_huber = median_absolute_error(y, huber.predict(X_poly))
median_error_ransac = median_absolute_error(y, ransac.predict(X_poly))

print('Median Error Linear:', median_error)
print('Linear MAE:', mae_linear)
print('Linear RMSE:', linear_rmse)
print()
print('Median Error RANSAC:', median_error_ransac)
print('RANSAC MAE:', mae_ransac)
print('RANSAC RMSE:', ransac_rmse)
print()
print('Median Error Tail:', median_error_tail)
print('Theil MAE:', mae_tail)
print('Theil RMSE:', tail_rmse)
print()
print('Median Error Huber:', median_error_huber)
print('Huber MAE:', mae_huber)
print('Huber RMSE:', huber_rmse)

Median Error Linear: 5758.549297064543
Linear MAE: 8405.500456738731
Linear RMSE: 11940.65805175825

Median Error RANSAC: 11827.48089325428
RANSAC MAE: 522541.9546758093
RANSAC RMSE: 2520527.3366236263

Median Error Tail: 6472.839043731568
Theil MAE: 3260412.5458397055
Theil RMSE: 13794997.959713273

Median Error Huber: 6208.180488821228
Huber MAE: 9029.633004577552
Huber RMSE: 12728.688838867307


In [32]:
from sklearn.feature_selection import RFECV
X = data[columns]
y = data['price']
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = pd.DataFrame(poly.fit_transform(X), columns=poly.get_feature_names_out(X.columns))


cv = KFold(n_splits=6, shuffle=True, random_state=42)

linear_model = LinearRegression()

rfecv = RFECV(estimator=linear_model, step=1, cv=cv, scoring='neg_mean_squared_error', verbose=2)

rfecv.fit(X_poly, y)
optimal_num_features = rfecv.n_features_
selected_features = X_poly.columns[rfecv.support_]

print("Optimal number of features:", optimal_num_features)
print("Selected features:", selected_features.tolist())

Fitting estimator with 135 features.
Fitting estimator with 134 features.
Fitting estimator with 133 features.
Fitting estimator with 132 features.
Fitting estimator with 131 features.
Fitting estimator with 130 features.
Fitting estimator with 129 features.
Fitting estimator with 128 features.
Fitting estimator with 127 features.
Fitting estimator with 126 features.
Fitting estimator with 125 features.
Fitting estimator with 124 features.
Fitting estimator with 123 features.
Fitting estimator with 122 features.
Fitting estimator with 121 features.
Fitting estimator with 120 features.
Fitting estimator with 119 features.
Fitting estimator with 118 features.
Fitting estimator with 117 features.
Fitting estimator with 116 features.
Fitting estimator with 115 features.
Fitting estimator with 114 features.
Fitting estimator with 113 features.
Fitting estimator with 112 features.
Fitting estimator with 111 features.
Fitting estimator with 110 features.
Fitting estimator with 109 features.
F

In [33]:
X_poly = pd.DataFrame(poly.fit_transform(X), columns=poly.get_feature_names_out(X.columns))
X = X_poly[selected_features]
y = data['price']

linear_model = LinearRegression()
linear_model.fit(X, y)

tail = TheilSenRegressor()
tail.fit(X, y)

huber = HuberRegressor()
huber.fit(X, y)

ransac = RANSACRegressor()
ransac.fit(X, y)


linear_rmse = np.sqrt(mean_squared_error(y, linear_model.predict(X)))
tail_rmse = np.sqrt(mean_squared_error(y, tail.predict(X)))
huber_rmse = np.sqrt(mean_squared_error(y, huber.predict(X)))
ransac_rmse = np.sqrt(mean_squared_error(y, ransac.predict(X)))

mae_linear = mean_absolute_error(y, linear_model.predict(X))
mae_tail = mean_absolute_error(y, tail.predict(X))
mae_huber = mean_absolute_error(y, huber.predict(X))
mae_ransac = mean_absolute_error(y, ransac.predict(X))


median_error = median_absolute_error(y, linear_model.predict(X))
median_error_tail = median_absolute_error(y, tail.predict(X))
median_error_huber = median_absolute_error(y, huber.predict(X))
median_error_ransac = median_absolute_error(y, ransac.predict(X))

print('Median Error Linear:', median_error)
print('Linear MAE:', mae_linear)
print('Linear RMSE:', linear_rmse)
print()
print('Median Error RANSAC:', median_error_ransac)
print('RANSAC MAE:', mae_ransac)
print('RANSAC RMSE:', ransac_rmse)
print()
print('Median Error Tail:', median_error_tail)
print('Theil MAE:', mae_tail)
print('Theil RMSE:', tail_rmse)
print()
print('Median Error Huber:', median_error_huber)
print('Huber MAE:', mae_huber)
print('Huber RMSE:', huber_rmse)

Median Error Linear: 9049.879391551018
Linear MAE: 18245.722347294624
Linear RMSE: 67378.69110002086

Median Error RANSAC: 16772.430571436882
RANSAC MAE: 406801.3675774485
RANSAC RMSE: 1722465.312011279

Median Error Tail: 8585.277916659485
Theil MAE: 4210261.422314947
Theil RMSE: 18048572.16085143

Median Error Huber: 6844.224676916378
Huber MAE: 15935.464038467357
Huber RMSE: 69219.55574071071
