In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeCV
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn import set_config
set_config(display='diagram')
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import GridSearchCV



In [2]:
data_train = pd.read_csv("data/train.csv")

In [3]:
data_train

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.00,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.00,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.00,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.00,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.00,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.00,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.00,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.00,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.00,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [4]:
X = data_train.drop(columns = ["Id", "SalePrice"])
y = np.log(data_train.SalePrice)

In [5]:
X.replace(['Po', 'Fa', 'TA', 'Gd', 'Ex'], [1, 2, 3, 4, 5], inplace=True)
X.replace(['No', 'Mn','Av'], [1, 2, 3], inplace=True)
X[['LotFrontage', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond', 'BsmtExposure', 'PoolQC', 'FireplaceQu', 'MasVnrArea']] = X[['LotFrontage', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond', 'BsmtExposure', 'PoolQC', 'FireplaceQu', 'MasVnrArea']].fillna(value=0)

In [6]:
X.isna().sum().sort_values(ascending=False)

MiscFeature     1406
Alley           1369
Fence           1179
GarageType        81
GarageYrBlt       81
                ... 
HalfBath           0
BedroomAbvGr       0
KitchenAbvGr       0
KitchenQual        0
MSSubClass         0
Length: 79, dtype: int64

In [7]:
X.drop(columns=['MiscFeature', 'Alley', 'Fence'], inplace=True)

In [8]:
X_cat = X.select_dtypes(include = "object")
X_num = X.select_dtypes(include = ["int64", "float64"])

In [9]:
X_cat.isna().sum().sort_values(ascending=False).head(10)

GarageFinish     81
GarageType       81
BsmtFinType2     38
BsmtFinType1     37
MasVnrType        8
Electrical        1
SaleCondition     0
LotConfig         0
Condition1        0
Neighborhood      0
dtype: int64

In [10]:
X_num.isna().sum().sort_values(ascending=False).head(10)

GarageYrBlt     81
YrSold           0
BsmtFullBath     0
LowQualFinSF     0
2ndFlrSF         0
1stFlrSF         0
HeatingQC        0
TotalBsmtSF      0
BsmtUnfSF        0
BsmtFinSF2       0
dtype: int64

In [11]:
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
X_num_modified = imp_mean.fit_transform(X_num)
df_num_modified = pd.DataFrame(X_num_modified, columns=X_num.columns.values)
imp_frequent = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
X_cat_modified = imp_frequent.fit_transform(X_cat)
df_cat_modified = pd.DataFrame(X_cat_modified, columns=X_cat.columns.values)

In [12]:
encoder = OneHotEncoder(sparse=False)
encoded = encoder.fit_transform(df_cat_modified)
df_encoded = pd.DataFrame(encoded)

In [13]:
df_train_modified = pd.concat([df_num_modified, df_encoded], axis=1)

In [14]:
X_train, X_test, y_train, y_test = train_test_split(df_train_modified, y, test_size=0.30, random_state=42)

In [15]:
#df_cat_train = X_train.select_dtypes(include = "object")
#df_cat_test = X_test.select_dtypes(include = "object")
#df_num_train = X_train.select_dtypes(include = ["int64", "float64"])
#df_num_test = X_test.select_dtypes(include = ["int64", "float64"])

In [16]:
#scaler = RobustScaler()
#scaled_train = scaler.fit_transform(df_num_train)
#scaled_test = scaler.fit_transform(df_num_test)
#df_scaled_train = pd.DataFrame(scaled_train)
#df_scaled_test = pd.DataFrame(scaled_test)

In [17]:
#df_scaled_and_encoded_train = pd.concat([df_scaled_train, df_cat_train], axis=1)
#df_scaled_and_encoded_test = pd.concat([df_scaled_test, df_cat_test], axis=1)

In [18]:
ridge = RidgeCV()

In [19]:
ridge.fit(X_train, y_train)

In [20]:
ridge.score(X_train, y_train)

0.9164327092261149

In [21]:
ridge.score(X_test, y_test)

0.9011875657130977

In [22]:
predictions = ridge.predict(X_test)

In [23]:
np.sqrt(mean_squared_error(y_test, predictions))

0.1294725860191083

In [24]:
np.exp(predictions)

array([151151.28685853, 326888.32574728, 107653.9526527 , 166480.08085518,
       318480.43773443,  82978.71774614, 256373.45914222, 142807.83460902,
        79700.31975353, 142865.79979382, 148636.13873549, 122136.38667251,
        95279.27829575, 215795.0416941 , 172330.03125481, 134114.57604854,
       195115.08896553, 129058.33957799, 108722.52734306, 213782.34954549,
       159126.14201758, 205975.93063407, 182223.0337276 , 137407.98812476,
       205400.40474961, 151037.54690817, 199865.83420367, 105413.29555288,
       171545.34928467, 192399.29258035, 125550.41377977, 275955.76801998,
       186678.27605877, 113221.99527076, 273051.29145846, 147599.53312841,
       143486.64312106, 206212.58966792, 345560.52326816, 102561.28752255,
       138361.18231442, 238736.25068136, 109324.66635137, 319630.13454496,
       130918.14253646, 117588.74565275, 106758.72173764, 131026.76912624,
       417006.36836458, 126505.2134044 , 118419.58426096, 204912.10679696,
       115026.33770765, 3

In [25]:
data_test = pd.read_csv("data/test.csv")
data_test

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.00,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.00,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.00,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.00,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.00,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.00,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.00,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.00,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.00,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [26]:
data_test.replace(['Po', 'Fa', 'TA', 'Gd', 'Ex'], [1, 2, 3, 4, 5], inplace=True)
data_test.replace(['No', 'Mn','Av'], [1, 2, 3], inplace=True)
data_test[['LotFrontage', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond', 'BsmtExposure', 'PoolQC', 'FireplaceQu', 'MasVnrArea']] = data_test[['LotFrontage', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond', 'BsmtExposure', 'PoolQC', 'FireplaceQu', 'MasVnrArea']].fillna(value=0)

In [27]:
data_test.drop(columns=['MiscFeature', 'Alley', 'Fence'], inplace=True)

In [28]:
data_test

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.00,11622,Pave,Reg,Lvl,AllPub,Inside,...,0,0,120,0,0.00,0,6,2010,WD,Normal
1,1462,20,RL,81.00,14267,Pave,IR1,Lvl,AllPub,Corner,...,0,0,0,0,0.00,12500,6,2010,WD,Normal
2,1463,60,RL,74.00,13830,Pave,IR1,Lvl,AllPub,Inside,...,0,0,0,0,0.00,0,3,2010,WD,Normal
3,1464,60,RL,78.00,9978,Pave,IR1,Lvl,AllPub,Inside,...,0,0,0,0,0.00,0,6,2010,WD,Normal
4,1465,120,RL,43.00,5005,Pave,IR1,HLS,AllPub,Inside,...,0,0,144,0,0.00,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.00,1936,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0.00,0,6,2006,WD,Normal
1455,2916,160,RM,21.00,1894,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0.00,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.00,20000,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0.00,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.00,10441,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0.00,700,7,2006,WD,Normal


In [29]:
data_test_cat = data_test.select_dtypes(include = "object")
data_test_num = data_test.select_dtypes(include = ["int64", "float64"])

In [30]:
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
data_test_num_modified = imp_mean.fit_transform(data_test_num)
df_test_num_modified = pd.DataFrame(data_test_num_modified, columns=data_test_num.columns.values)
imp_frequent = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
data_test_cat_modified = imp_frequent.fit_transform(data_test_cat)
df_test_cat_modified = pd.DataFrame(data_test_cat_modified, columns=data_test_cat.columns.values)

In [31]:
encoder = OneHotEncoder(sparse=False)
test_encoded = encoder.fit_transform(df_cat_modified)
df_test_encoded = pd.DataFrame(test_encoded)

In [32]:
df_test_modified = pd.concat([df_test_num_modified, df_test_encoded], axis=1)

In [33]:
df_test_modified

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,...,184,185,186,187,188,189,190,191,192,193
0,1461.00,20.00,80.00,11622.00,5.00,6.00,1961.00,1961.00,0.00,3.00,...,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,1.00,0.00
1,1462.00,20.00,81.00,14267.00,6.00,6.00,1958.00,1958.00,108.00,3.00,...,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,1.00,0.00
2,1463.00,60.00,74.00,13830.00,5.00,5.00,1997.00,1998.00,0.00,3.00,...,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,1.00,0.00
3,1464.00,60.00,78.00,9978.00,6.00,6.00,1998.00,1998.00,20.00,3.00,...,0.00,0.00,0.00,1.00,1.00,0.00,0.00,0.00,0.00,0.00
4,1465.00,120.00,43.00,5005.00,8.00,5.00,1992.00,1992.00,0.00,4.00,...,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,1.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,2916.00,160.00,21.00,1894.00,4.00,5.00,1970.00,1970.00,0.00,3.00,...,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,1.00,0.00
1456,2917.00,20.00,160.00,20000.00,5.00,7.00,1960.00,1996.00,0.00,3.00,...,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,1.00,0.00
1457,2918.00,85.00,62.00,10441.00,5.00,5.00,1992.00,1992.00,0.00,3.00,...,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,1.00,0.00
1458,2919.00,60.00,74.00,9627.00,7.00,5.00,1993.00,1994.00,94.00,3.00,...,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,1.00,0.00


In [34]:
data_id = df_test_modified[['Id']]
data_id = data_id.drop(index=1459)
data_id

Unnamed: 0,Id
0,1461.00
1,1462.00
2,1463.00
3,1464.00
4,1465.00
...,...
1454,2915.00
1455,2916.00
1456,2917.00
1457,2918.00


In [35]:
df_test_modified = df_test_modified.drop(index=1459)

In [36]:
df_test_modified.drop(columns='Id', inplace=True)

In [37]:
predictions_test = ridge.predict(df_test_modified)

In [40]:
submit = pd.DataFrame(np.exp(predictions_test))
submit_final = pd.concat([data_id, submit], axis=1)
submit_final.columns = ["Id", "SalePrice"]
submit_final
submit_final['Id'] = submit_final['Id'].astype(np.int32)
submit_final.to_csv('submit_bis.csv', index=False)

In [41]:
submit_final

Unnamed: 0,Id,SalePrice
0,1461,121179.85
1,1462,127205.89
2,1463,171592.66
3,1464,182381.98
4,1465,179520.37
...,...,...
1454,2915,113569.63
1455,2916,98762.01
1456,2917,159022.59
1457,2918,124657.51
