In [55]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
import optuna


In [43]:
df = pd.read_csv('cleaned_dataset.csv')

In [44]:
print(df.shape)
print(df.info())
print(df.describe())

(94891, 30)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94891 entries, 0 to 94890
Data columns (total 30 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   bathroomcount      94891 non-null  int64  
 1   bedroomcount       94891 non-null  int64  
 2   constructionyear   94891 non-null  int64  
 3   country            94891 non-null  object 
 4   district           94891 non-null  object 
 5   fireplace          94891 non-null  int64  
 6   floodingzone       94891 non-null  object 
 7   furnished          94891 non-null  int64  
 8   garden             94891 non-null  int64  
 9   kitchen            94891 non-null  int64  
 10  livingarea         94891 non-null  float64
 11  locality           94888 non-null  object 
 12  monthlycharges     94891 non-null  float64
 13  numberoffacades    94891 non-null  int64  
 14  peb                94891 non-null  object 
 15  postalcode         94891 non-null  int64  
 16  price     

In [45]:
df.columns

Index(['bathroomcount', 'bedroomcount', 'constructionyear', 'country',
       'district', 'fireplace', 'floodingzone', 'furnished', 'garden',
       'kitchen', 'livingarea', 'locality', 'monthlycharges',
       'numberoffacades', 'peb', 'postalcode', 'price', 'propertyid',
       'province', 'region', 'roomcount', 'showercount', 'stateofbuilding',
       'subtypeofproperty', 'surfaceofplot', 'swimmingpool', 'terrace',
       'toiletcount', 'typeofproperty', 'typeofsale'],
      dtype='object')

refining cleaned data

In [84]:
import pandas as pd

def clean_data(df):
    # Drop column: 'country'
    df = df.drop(columns=['country'])
    # Drop column: 'propertyid'
    df = df.drop(columns=['propertyid'])
    # Filter rows based on column: 'constructionyear'
    df = df[df['constructionyear'] > 1950]
    # Filter rows based on column: 'constructionyear'
    df = df[df['constructionyear'] < 2025]

    df = df.drop(columns=['locality'])
    # Drop column: 'monthlycharges'
    df = df.drop(columns=['monthlycharges'])
    # Filter rows based on column: 'numberoffacades'
    df = df[df['numberoffacades'] < 5]
    # Drop column: 'postalcode'
    df = df.drop(columns=['postalcode'])
    # Filter rows based on column: 'roomcount'
    df = df[df['roomcount'] < 12]
    # Filter rows based on column: 'showercount'
    df = df[df['showercount'] < 5]
    # Filter rows based on column: 'stateofbuilding'
    df = df[df['stateofbuilding'] > 0]
    # Filter rows based on column: 'surfaceofplot'
    df = df[df['surfaceofplot'] < 20000]
    # Filter rows based on column: 'surfaceofplot'
    df = df[df['surfaceofplot'] <= 1000]
    # Filter rows based on column: 'toiletcount'
    df = df[df['toiletcount'] < 4]
    # Change column type to object for column: 'constructionyear'
    df = df.astype({'constructionyear': 'object'})
    return df

# Loaded variable 'df' from URI: c:\Users\pieta\OneDrive\Bureau\immoModeltraining\immoModel\cleaned_dataset.csv
df = pd.read_csv(r'c:\Users\pieta\OneDrive\Bureau\immoModeltraining\immoModel\cleaned_dataset.csv')

df_clean = clean_data(df.copy())

print(df_clean.shape)
print(df_clean.info())
df_clean.head()


(68378, 25)
<class 'pandas.core.frame.DataFrame'>
Index: 68378 entries, 0 to 94890
Data columns (total 25 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   bathroomcount      68378 non-null  int64  
 1   bedroomcount       68378 non-null  int64  
 2   constructionyear   68378 non-null  object 
 3   district           68378 non-null  object 
 4   fireplace          68378 non-null  int64  
 5   floodingzone       68378 non-null  object 
 6   furnished          68378 non-null  int64  
 7   garden             68378 non-null  int64  
 8   kitchen            68378 non-null  int64  
 9   livingarea         68378 non-null  float64
 10  numberoffacades    68378 non-null  int64  
 11  peb                68378 non-null  object 
 12  price              68378 non-null  int64  
 13  province           68378 non-null  object 
 14  region             68378 non-null  object 
 15  roomcount          68378 non-null  int64  
 16  showercount    

Unnamed: 0,bathroomcount,bedroomcount,constructionyear,district,fireplace,floodingzone,furnished,garden,kitchen,livingarea,...,roomcount,showercount,stateofbuilding,subtypeofproperty,surfaceofplot,swimmingpool,terrace,toiletcount,typeofproperty,typeofsale
0,1,1,1969,Brugge,0,NON_FLOOD_ZONE,0,0,1,29.0,...,1,0,4,flat_studio,203,0,1,1,2,residential_sale
2,2,4,2008,Brugge,0,NON_FLOOD_ZONE,1,0,1,111.0,...,3,0,4,house,0,0,0,2,1,residential_sale
3,1,4,1979,Veurne,0,NON_FLOOD_ZONE,0,1,1,113.6,...,9,1,2,house,170,0,1,2,1,residential_sale
4,0,2,1972,Hasselt,0,NON_FLOOD_ZONE,0,0,1,92.0,...,1,0,5,apartment,400,0,1,1,2,residential_sale
5,1,1,1994,Brussels,0,NON_FLOOD_ZONE,1,0,3,50.0,...,3,1,5,apartment,143,0,1,1,2,residential_sale


In [47]:
print(df_clean.columns)


Index(['bathroomcount', 'bedroomcount', 'constructionyear', 'district',
       'fireplace', 'floodingzone', 'furnished', 'garden', 'kitchen',
       'livingarea', 'numberoffacades', 'peb', 'price', 'province', 'region',
       'roomcount', 'showercount', 'stateofbuilding', 'subtypeofproperty',
       'surfaceofplot', 'swimmingpool', 'terrace', 'toiletcount',
       'typeofproperty', 'typeofsale'],
      dtype='object')


Categorize str values
constructionyear, district, floodingzone, subtypeofproperty, typeofsale

In [85]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

columns_to_encode = ['constructionyear', 'district', 'floodingzone', 'subtypeofproperty', 'typeofsale', 'peb','province', 'region']

data_to_encode = df_clean[columns_to_encode]

one = OneHotEncoder()

encoded_data = one.fit_transform(data_to_encode)

encoded_df = pd.DataFrame(encoded_data.toarray(), columns=one.get_feature_names_out(columns_to_encode))

df_final = pd.concat([df_clean.drop(columns=columns_to_encode), encoded_df], axis=1)

print(df_final.shape)
print(df_final.info())
df_final.head()


(88283, 195)
<class 'pandas.core.frame.DataFrame'>
Index: 88283 entries, 0 to 68375
Columns: 195 entries, bathroomcount to region_Wallonie
dtypes: float64(195)
memory usage: 132.0 MB
None


Unnamed: 0,bathroomcount,bedroomcount,fireplace,furnished,garden,kitchen,livingarea,numberoffacades,price,roomcount,...,province_Hainaut,province_Limburg,province_Liège,province_Luxembourg,province_Namur,province_Walloon Brabant,province_West Flanders,region_Brussels,region_Flanders,region_Wallonie
0,1.0,1.0,0.0,0.0,0.0,1.0,29.0,2.0,99000.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,2.0,4.0,0.0,1.0,0.0,1.0,111.0,2.0,399000.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,1.0,4.0,0.0,0.0,1.0,1.0,113.6,2.0,230000.0,9.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,2.0,0.0,0.0,0.0,1.0,92.0,2.0,198000.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5,1.0,1.0,0.0,1.0,0.0,3.0,50.0,2.0,215000.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


Machine learning process

In [87]:
correlations = df_final.drop(columns=['price']).corrwith(df_final['price'])

print(correlations)

bathroomcount               0.234109
bedroomcount                0.385934
fireplace                   0.076896
furnished                  -0.066459
garden                      0.097737
                              ...   
province_Walloon Brabant   -0.002037
province_West Flanders      0.001812
region_Brussels            -0.002416
region_Flanders             0.007931
region_Wallonie            -0.006954
Length: 194, dtype: float64


  c /= stddev[:, None]
  c /= stddev[None, :]


In [58]:
from sklearn.model_selection import train_test_split

df = df_final

y = np.array(df ['price'])
X = np.array(df.drop(columns=['price']))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


X_train shape: (70626, 194)
X_test shape: (17657, 194)
y_train shape: (70626,)
y_test shape: (17657,)


Cleaning X train, test


In [71]:
X_train

import pandas as pd

def clean_data(X_train_df):
    # Replace missing values with the median of each column in: 0, 1 and 192 other columns
    X_train_df = X_train_df.fillna({0: X_train_df[0].median(), 1: X_train_df[1].median(), 2: X_train_df[2].median(), 3: X_train_df[3].median(), 4: X_train_df[4].median(), 5: X_train_df[5].median(), 6: X_train_df[6].median(), 7: X_train_df[7].median(), 8: X_train_df[8].median(), 9: X_train_df[9].median(), 10: X_train_df[10].median(), 11: X_train_df[11].median(), 12: X_train_df[12].median(), 13: X_train_df[13].median(), 14: X_train_df[14].median(), 15: X_train_df[15].median(), 16: X_train_df[16].median(), 17: X_train_df[17].median(), 18: X_train_df[18].median(), 19: X_train_df[19].median(), 20: X_train_df[20].median(), 21: X_train_df[21].median(), 22: X_train_df[22].median(), 23: X_train_df[23].median(), 24: X_train_df[24].median(), 25: X_train_df[25].median(), 26: X_train_df[26].median(), 27: X_train_df[27].median(), 28: X_train_df[28].median(), 29: X_train_df[29].median(), 30: X_train_df[30].median(), 31: X_train_df[31].median(), 32: X_train_df[32].median(), 33: X_train_df[33].median(), 34: X_train_df[34].median(), 35: X_train_df[35].median(), 36: X_train_df[36].median(), 37: X_train_df[37].median(), 38: X_train_df[38].median(), 39: X_train_df[39].median(), 40: X_train_df[40].median(), 41: X_train_df[41].median(), 42: X_train_df[42].median(), 43: X_train_df[43].median(), 44: X_train_df[44].median(), 45: X_train_df[45].median(), 46: X_train_df[46].median(), 47: X_train_df[47].median(), 48: X_train_df[48].median(), 49: X_train_df[49].median(), 50: X_train_df[50].median(), 51: X_train_df[51].median(), 52: X_train_df[52].median(), 53: X_train_df[53].median(), 54: X_train_df[54].median(), 55: X_train_df[55].median(), 56: X_train_df[56].median(), 57: X_train_df[57].median(), 58: X_train_df[58].median(), 59: X_train_df[59].median(), 60: X_train_df[60].median(), 61: X_train_df[61].median(), 62: X_train_df[62].median(), 63: X_train_df[63].median(), 64: X_train_df[64].median(), 65: X_train_df[65].median(), 66: X_train_df[66].median(), 67: X_train_df[67].median(), 68: X_train_df[68].median(), 69: X_train_df[69].median(), 70: X_train_df[70].median(), 71: X_train_df[71].median(), 72: X_train_df[72].median(), 73: X_train_df[73].median(), 74: X_train_df[74].median(), 75: X_train_df[75].median(), 76: X_train_df[76].median(), 77: X_train_df[77].median(), 78: X_train_df[78].median(), 79: X_train_df[79].median(), 80: X_train_df[80].median(), 81: X_train_df[81].median(), 82: X_train_df[82].median(), 83: X_train_df[83].median(), 84: X_train_df[84].median(), 85: X_train_df[85].median(), 86: X_train_df[86].median(), 87: X_train_df[87].median(), 88: X_train_df[88].median(), 89: X_train_df[89].median(), 90: X_train_df[90].median(), 91: X_train_df[91].median(), 92: X_train_df[92].median(), 93: X_train_df[93].median(), 94: X_train_df[94].median(), 95: X_train_df[95].median(), 96: X_train_df[96].median(), 97: X_train_df[97].median(), 98: X_train_df[98].median(), 99: X_train_df[99].median(), 100: X_train_df[100].median(), 101: X_train_df[101].median(), 102: X_train_df[102].median(), 103: X_train_df[103].median(), 104: X_train_df[104].median(), 105: X_train_df[105].median(), 106: X_train_df[106].median(), 107: X_train_df[107].median(), 108: X_train_df[108].median(), 109: X_train_df[109].median(), 110: X_train_df[110].median(), 111: X_train_df[111].median(), 112: X_train_df[112].median(), 113: X_train_df[113].median(), 114: X_train_df[114].median(), 115: X_train_df[115].median(), 116: X_train_df[116].median(), 117: X_train_df[117].median(), 118: X_train_df[118].median(), 119: X_train_df[119].median(), 120: X_train_df[120].median(), 121: X_train_df[121].median(), 122: X_train_df[122].median(), 123: X_train_df[123].median(), 124: X_train_df[124].median(), 125: X_train_df[125].median(), 126: X_train_df[126].median(), 127: X_train_df[127].median(), 128: X_train_df[128].median(), 129: X_train_df[129].median(), 130: X_train_df[130].median(), 131: X_train_df[131].median(), 132: X_train_df[132].median(), 133: X_train_df[133].median(), 134: X_train_df[134].median(), 135: X_train_df[135].median(), 136: X_train_df[136].median(), 137: X_train_df[137].median(), 138: X_train_df[138].median(), 139: X_train_df[139].median(), 140: X_train_df[140].median(), 141: X_train_df[141].median(), 142: X_train_df[142].median(), 143: X_train_df[143].median(), 144: X_train_df[144].median(), 145: X_train_df[145].median(), 146: X_train_df[146].median(), 147: X_train_df[147].median(), 148: X_train_df[148].median(), 149: X_train_df[149].median(), 150: X_train_df[150].median(), 151: X_train_df[151].median(), 152: X_train_df[152].median(), 153: X_train_df[153].median(), 154: X_train_df[154].median(), 155: X_train_df[155].median(), 156: X_train_df[156].median(), 157: X_train_df[157].median(), 158: X_train_df[158].median(), 159: X_train_df[159].median(), 160: X_train_df[160].median(), 161: X_train_df[161].median(), 162: X_train_df[162].median(), 163: X_train_df[163].median(), 164: X_train_df[164].median(), 165: X_train_df[165].median(), 166: X_train_df[166].median(), 167: X_train_df[167].median(), 168: X_train_df[168].median(), 169: X_train_df[169].median(), 170: X_train_df[170].median(), 171: X_train_df[171].median(), 172: X_train_df[172].median(), 173: X_train_df[173].median(), 174: X_train_df[174].median(), 175: X_train_df[175].median(), 176: X_train_df[176].median(), 177: X_train_df[177].median(), 178: X_train_df[178].median(), 179: X_train_df[179].median(), 180: X_train_df[180].median(), 181: X_train_df[181].median(), 182: X_train_df[182].median(), 183: X_train_df[183].median(), 184: X_train_df[184].median(), 185: X_train_df[185].median(), 186: X_train_df[186].median(), 187: X_train_df[187].median(), 188: X_train_df[188].median(), 189: X_train_df[189].median(), 190: X_train_df[190].median(), 191: X_train_df[191].median(), 192: X_train_df[192].median(), 193: X_train_df[193].median()})
    return X_train_df

# Loaded variable 'X_train' from kernel state
X_train_df = pd.DataFrame(X_train.tolist() if len(X_train.shape) > 2 else X_train)

X_train_df_clean = clean_data(X_train_df.copy())
X_train_df_clean.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,184,185,186,187,188,189,190,191,192,193
0,0.0,4.0,0.0,0.0,0.0,1.0,134.2,2.0,5.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,1.0,1.0,0.0,0.0,0.0,1.0,88.0,2.0,4.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1.0,2.0,0.0,0.0,0.0,1.0,85.0,3.0,3.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1.0,2.0,0.0,0.0,0.0,1.0,82.0,3.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1.0,2.0,0.0,0.0,0.0,1.0,119.0,2.0,9.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [70]:
X_test

import pandas as pd

def clean_data(X_test_df):
    # Replace missing values with the median of each column in: 0, 1 and 192 other columns
    X_test_df = X_test_df.fillna({0: X_test_df[0].median(), 1: X_test_df[1].median(), 2: X_test_df[2].median(), 3: X_test_df[3].median(), 4: X_test_df[4].median(), 5: X_test_df[5].median(), 6: X_test_df[6].median(), 7: X_test_df[7].median(), 8: X_test_df[8].median(), 9: X_test_df[9].median(), 10: X_test_df[10].median(), 11: X_test_df[11].median(), 12: X_test_df[12].median(), 13: X_test_df[13].median(), 14: X_test_df[14].median(), 15: X_test_df[15].median(), 16: X_test_df[16].median(), 17: X_test_df[17].median(), 18: X_test_df[18].median(), 19: X_test_df[19].median(), 20: X_test_df[20].median(), 21: X_test_df[21].median(), 22: X_test_df[22].median(), 23: X_test_df[23].median(), 24: X_test_df[24].median(), 25: X_test_df[25].median(), 26: X_test_df[26].median(), 27: X_test_df[27].median(), 28: X_test_df[28].median(), 29: X_test_df[29].median(), 30: X_test_df[30].median(), 31: X_test_df[31].median(), 32: X_test_df[32].median(), 33: X_test_df[33].median(), 34: X_test_df[34].median(), 35: X_test_df[35].median(), 36: X_test_df[36].median(), 37: X_test_df[37].median(), 38: X_test_df[38].median(), 39: X_test_df[39].median(), 40: X_test_df[40].median(), 41: X_test_df[41].median(), 42: X_test_df[42].median(), 43: X_test_df[43].median(), 44: X_test_df[44].median(), 45: X_test_df[45].median(), 46: X_test_df[46].median(), 47: X_test_df[47].median(), 48: X_test_df[48].median(), 49: X_test_df[49].median(), 50: X_test_df[50].median(), 51: X_test_df[51].median(), 52: X_test_df[52].median(), 53: X_test_df[53].median(), 54: X_test_df[54].median(), 55: X_test_df[55].median(), 56: X_test_df[56].median(), 57: X_test_df[57].median(), 58: X_test_df[58].median(), 59: X_test_df[59].median(), 60: X_test_df[60].median(), 61: X_test_df[61].median(), 62: X_test_df[62].median(), 63: X_test_df[63].median(), 64: X_test_df[64].median(), 65: X_test_df[65].median(), 66: X_test_df[66].median(), 67: X_test_df[67].median(), 68: X_test_df[68].median(), 69: X_test_df[69].median(), 70: X_test_df[70].median(), 71: X_test_df[71].median(), 72: X_test_df[72].median(), 73: X_test_df[73].median(), 74: X_test_df[74].median(), 75: X_test_df[75].median(), 76: X_test_df[76].median(), 77: X_test_df[77].median(), 78: X_test_df[78].median(), 79: X_test_df[79].median(), 80: X_test_df[80].median(), 81: X_test_df[81].median(), 82: X_test_df[82].median(), 83: X_test_df[83].median(), 84: X_test_df[84].median(), 85: X_test_df[85].median(), 86: X_test_df[86].median(), 87: X_test_df[87].median(), 88: X_test_df[88].median(), 89: X_test_df[89].median(), 90: X_test_df[90].median(), 91: X_test_df[91].median(), 92: X_test_df[92].median(), 93: X_test_df[93].median(), 94: X_test_df[94].median(), 95: X_test_df[95].median(), 96: X_test_df[96].median(), 97: X_test_df[97].median(), 98: X_test_df[98].median(), 99: X_test_df[99].median(), 100: X_test_df[100].median(), 101: X_test_df[101].median(), 102: X_test_df[102].median(), 103: X_test_df[103].median(), 104: X_test_df[104].median(), 105: X_test_df[105].median(), 106: X_test_df[106].median(), 107: X_test_df[107].median(), 108: X_test_df[108].median(), 109: X_test_df[109].median(), 110: X_test_df[110].median(), 111: X_test_df[111].median(), 112: X_test_df[112].median(), 113: X_test_df[113].median(), 114: X_test_df[114].median(), 115: X_test_df[115].median(), 116: X_test_df[116].median(), 117: X_test_df[117].median(), 118: X_test_df[118].median(), 119: X_test_df[119].median(), 120: X_test_df[120].median(), 121: X_test_df[121].median(), 122: X_test_df[122].median(), 123: X_test_df[123].median(), 124: X_test_df[124].median(), 125: X_test_df[125].median(), 126: X_test_df[126].median(), 127: X_test_df[127].median(), 128: X_test_df[128].median(), 129: X_test_df[129].median(), 130: X_test_df[130].median(), 131: X_test_df[131].median(), 132: X_test_df[132].median(), 133: X_test_df[133].median(), 134: X_test_df[134].median(), 135: X_test_df[135].median(), 136: X_test_df[136].median(), 137: X_test_df[137].median(), 138: X_test_df[138].median(), 139: X_test_df[139].median(), 140: X_test_df[140].median(), 141: X_test_df[141].median(), 142: X_test_df[142].median(), 143: X_test_df[143].median(), 144: X_test_df[144].median(), 145: X_test_df[145].median(), 146: X_test_df[146].median(), 147: X_test_df[147].median(), 148: X_test_df[148].median(), 149: X_test_df[149].median(), 150: X_test_df[150].median(), 151: X_test_df[151].median(), 152: X_test_df[152].median(), 153: X_test_df[153].median(), 154: X_test_df[154].median(), 155: X_test_df[155].median(), 156: X_test_df[156].median(), 157: X_test_df[157].median(), 158: X_test_df[158].median(), 159: X_test_df[159].median(), 160: X_test_df[160].median(), 161: X_test_df[161].median(), 162: X_test_df[162].median(), 163: X_test_df[163].median(), 164: X_test_df[164].median(), 165: X_test_df[165].median(), 166: X_test_df[166].median(), 167: X_test_df[167].median(), 168: X_test_df[168].median(), 169: X_test_df[169].median(), 170: X_test_df[170].median(), 171: X_test_df[171].median(), 172: X_test_df[172].median(), 173: X_test_df[173].median(), 174: X_test_df[174].median(), 175: X_test_df[175].median(), 176: X_test_df[176].median(), 177: X_test_df[177].median(), 178: X_test_df[178].median(), 179: X_test_df[179].median(), 180: X_test_df[180].median(), 181: X_test_df[181].median(), 182: X_test_df[182].median(), 183: X_test_df[183].median(), 184: X_test_df[184].median(), 185: X_test_df[185].median(), 186: X_test_df[186].median(), 187: X_test_df[187].median(), 188: X_test_df[188].median(), 189: X_test_df[189].median(), 190: X_test_df[190].median(), 191: X_test_df[191].median(), 192: X_test_df[192].median(), 193: X_test_df[193].median()})
    return X_test_df

# Loaded variable 'X_test' from kernel state
X_test_df = pd.DataFrame(X_test.tolist() if len(X_test.shape) > 2 else X_test)

X_test_df_clean = clean_data(X_test_df.copy())
X_test_df_clean.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,184,185,186,187,188,189,190,191,192,193
0,1.0,2.0,0.0,0.0,0.0,1.0,108.0,2.0,5.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1.0,2.0,0.0,0.0,0.0,1.0,73.0,2.0,10.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,1.0,3.0,0.0,0.0,1.0,3.0,212.0,3.0,9.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1.0,2.0,0.0,0.0,1.0,2.0,56.0,2.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,1.0,2.0,0.0,0.0,0.0,1.0,108.0,2.0,5.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


Cleaning Y train and Y test

In [67]:
y_train

import pandas as pd

def clean_data(y_train_df):
    # Replace missing values with the median of each column in: 0
    y_train_df = y_train_df.fillna({0: y_train_df[0].median()})
    return y_train_df

# Loaded variable 'y_train' from kernel state
y_train_df = pd.DataFrame(y_train.tolist() if len(y_train.shape) > 2 else y_train)

y_train_df_clean = clean_data(y_train_df.copy())
y_train_df_clean.head()

Unnamed: 0,0
0,219000.0
1,239000.0
2,239000.0
3,384000.0
4,139000.0


In [78]:
y_test

import pandas as pd

def clean_data(y_test_df):
    # Replace missing values with 0 in column: 0
    y_test_df = y_test_df.fillna({0: 0})
    return y_test_df

# Loaded variable 'y_test' from kernel state
y_test_df = pd.DataFrame(y_test.tolist() if len(y_test.shape) > 2 else y_test)

y_test_df_clean = clean_data(y_test_df.copy())
y_test_df_clean.shape

(17657, 1)

Utiliser des transformers pour choisir quelle variable est plus intérressante pour le modèle
https://www.youtube.com/watch?v=T4nZDuakYlU&list=PLO_fdPEVlfKoHQ3Ua2NtDL4nmynQC8YiS&index=9

Model training


In [81]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train_df_clean, y_train_df_clean)
y_pred = model.predict(X_test_df_clean)

  return fit_method(estimator, *args, **kwargs)


In [82]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Calculate evaluation metrics
mae = mean_absolute_error(y_test_df_clean, y_pred)
mse = mean_squared_error(y_test_df_clean, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test_df_clean, y_pred)

print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R^2 Score: {r2}")

MAE: 113289.38564937972
MSE: 25549177097.776703
RMSE: 159841.0995263005
R^2 Score: 0.15192718805418148
