In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PoissonRegressor, RidgeCV
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn import set_config
set_config(display='diagram')
from lazypredict.Supervised import LazyRegressor
from sklearn.metrics import mean_squared_log_error, make_scorer, mean_squared_error
from sklearn.model_selection import GridSearchCV



In [2]:
data = pd.read_csv("data/train.csv")
data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.00,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.00,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.00,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.00,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.00,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.00,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.00,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.00,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.00,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [3]:
X = data.drop(columns = ["Id", "SalePrice"])
y = np.log(data.SalePrice)

In [4]:
X.replace(['Po', 'Fa', 'TA', 'Gd', 'Ex'], [1, 2, 3, 4, 5], inplace=True)
X.replace(['No', 'Mn','Av'], [1, 2, 3], inplace=True)
X[['LotFrontage', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond', 'BsmtExposure', 'PoolQC', 'FireplaceQu', 'MasVnrArea']] = X[['LotFrontage', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond', 'BsmtExposure', 'PoolQC', 'FireplaceQu', 'MasVnrArea']].fillna(value=0)

In [5]:
null = {}
drop = []
for i in range(0,X.shape[1]):
    if 0 < X.iloc[:,i].isna().sum() <= X.shape[0]*(0.3):
        null[X.columns[i]] = X.iloc[:,i].isna().sum()
    elif X.iloc[:,i].isna().sum() > X.shape[0]*(0.3):# If the null percentage is greater than 30% we will drop that columns. 
        drop.append(X.columns[i])
X.drop(columns = drop, inplace=True)

In [6]:
drop

['Alley', 'Fence', 'MiscFeature']

In [7]:
X_cat = X.select_dtypes(include = "object")
X_num = X.select_dtypes(include = ["int64", "float64"])

In [8]:
#X_cat = X_cat.fillna(value='None')

In [9]:
imp_mean = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
X_cat_modified = imp_mean.fit_transform(X_cat)

In [10]:
X = X.drop(columns=X_cat)
X_new = pd.DataFrame(X_cat_modified, columns=X_cat.columns.values)
X = pd.concat([X, X_new], axis=1)

In [11]:
df_garage = X_num[['YearBuilt', 'GarageYrBlt']]
df_garage.dropna()
df_garage["Difference"] = df_garage["GarageYrBlt"] - df_garage["YearBuilt"]
df_garage["Difference"].mean()

5.547498187092096

In [12]:
#count = 0
#big_cat = pd.DataFrame()
#small_cat = pd.DataFrame()
#for i in range(0,len(X_cat.columns)):
    #column = X_cat.columns[i]
    #print("Valeurs uniques pour " + X_cat.columns[i] + " : " )
    #print(len(X_cat[column].unique()))
    #if len(X_cat[column].unique()) > 7:
        #big_cat[column] = X_cat[column]
    #else:
        #small_cat[column] = X_cat[column]
    #count += len(X_cat[column].unique())

In [13]:
#big_cat

In [14]:
X_cat = X.select_dtypes(include = "object")
X_num = X.select_dtypes(include = ["int64", "float64"])

In [15]:
X_num["GarageYrBlt"] = X_num["GarageYrBlt"].fillna(value = X_num["YearBuilt"] + 6)

In [16]:
#small_cat

In [17]:
scaler = RobustScaler()
scaled = scaler.fit_transform(X_num)
df_scaled = pd.DataFrame(scaled)

In [18]:
encoder = OneHotEncoder(sparse=False)

In [19]:
#small_encoded = encoder.fit_transform(small_cat)
#df_small_encoded = pd.DataFrame(small_encoded)

In [20]:
encoded = encoder.fit_transform(X_cat)
df_encoded = pd.DataFrame(encoded)

In [21]:
#df_test = pd.concat([df_scaled, df_small_encoded], axis=1)
df_total = pd.concat([df_scaled, df_encoded], axis=1)

In [22]:
#X_train, X_test, y_train, y_test = train_test_split(df_test, y, test_size=0.30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(df_total, y, test_size=0.30, random_state=42)

In [23]:
#reg = LazyRegressor(verbose=0, ignore_warnings=False, custom_metric=None)

In [24]:
#models, predictions = reg.fit(X_train, X_test, y_train, y_test)

In [25]:
#models

In [26]:
ridge = RidgeCV()

In [27]:
ridge.fit(X_train, y_train)

In [28]:
ridge.score(X_train, y_train)

0.9163648605198732

In [29]:
ridge.score(X_test, y_test)

0.9019708477125709

In [30]:
y_pred = ridge.predict(X_test)

In [31]:
np.sqrt(mean_squared_error(y_test, y_pred))

0.12895840315061322

In [32]:
#%%time

#params = {
            #'alpha': [1, 2, 5, 10, 25, 50, 100],
            #'max_iter': list(range(1000, 20000, 1000)),
            #'tol' : [1e-4, 1e-5, 1e-6, 1e-7],
         #}

#poisson_regressor_grid = GridSearchCV(PoissonRegressor(), scoring='neg_mean_squared_log_error', param_grid=params, n_jobs=-1, cv=5, verbose=5)
#poisson_regressor_grid.fit(X_train,y_train)

#print('Train Accuracy : %.3f'%poisson_regressor_grid.best_estimator_.score(X_train, y_train))
#print('Test Accuracy : %.3f'%poisson_regressor_grid.best_estimator_.score(X_test, y_test))
#print('Best Accuracy Through Grid Search : %.3f'%poisson_regressor_grid.best_score_)

In [33]:
data_test = pd.read_csv("data/test.csv")
data_test

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.00,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.00,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.00,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.00,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.00,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.00,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.00,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.00,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.00,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [34]:
data_test.replace(['Po', 'Fa', 'TA', 'Gd', 'Ex'], [1, 2, 3, 4, 5], inplace=True)
data_test.replace(['No', 'Mn','Av'], [1, 2, 3], inplace=True)
data_test[['LotFrontage', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond', 'BsmtExposure', 'PoolQC', 'FireplaceQu', 'MasVnrArea']] = data_test[['LotFrontage', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond', 'BsmtExposure', 'PoolQC', 'FireplaceQu', 'MasVnrArea']].fillna(value=0)

In [35]:
data_test

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.00,11622,Pave,,Reg,Lvl,AllPub,...,120,0,0.00,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.00,14267,Pave,,IR1,Lvl,AllPub,...,0,0,0.00,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.00,13830,Pave,,IR1,Lvl,AllPub,...,0,0,0.00,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.00,9978,Pave,,IR1,Lvl,AllPub,...,0,0,0.00,,,0,6,2010,WD,Normal
4,1465,120,RL,43.00,5005,Pave,,IR1,HLS,AllPub,...,144,0,0.00,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.00,1936,Pave,,Reg,Lvl,AllPub,...,0,0,0.00,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.00,1894,Pave,,Reg,Lvl,AllPub,...,0,0,0.00,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.00,20000,Pave,,Reg,Lvl,AllPub,...,0,0,0.00,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.00,10441,Pave,,Reg,Lvl,AllPub,...,0,0,0.00,MnPrv,Shed,700,7,2006,WD,Normal


In [36]:
null = {}
drop = []
for i in range(0,data_test.shape[1]):
    if 0 < data_test.iloc[:,i].isna().sum() <= data_test.shape[0]*(0.3):
        null[data_test.columns[i]] = data_test.iloc[:,i].isna().sum()
    elif data_test.iloc[:,i].isna().sum() > data_test.shape[0]*(0.3):# If the null percentage is greater than 30% we will drop that columns. 
        drop.append(data_test.columns[i])
data_test.drop(columns = drop, inplace=True)

In [37]:
data_cat = data_test.select_dtypes(include = "object")
data_num = data_test.select_dtypes(include = ["int64", "float64"])

In [38]:
imp_test = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
data_cat_modified = imp_test.fit_transform(data_cat)

In [39]:
data_test = data_test.drop(columns=data_cat)
data_new = pd.DataFrame(data_cat_modified, columns=data_cat.columns.values)
data_test = pd.concat([data_test, data_new], axis=1)

In [40]:
data_cat = data_test.select_dtypes(include = "object")
data_num = data_test.select_dtypes(include = ["int64", "float64"])

In [41]:
data_num["GarageYrBlt"] = data_num["GarageYrBlt"].fillna(value = data_num["YearBuilt"] + 6)

In [47]:
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
data_num_modified = imp_mean.fit_transform(data_num)

In [49]:
data_id = data_num[['Id']]
data_id

Unnamed: 0,Id
0,1461
1,1462
2,1463
3,1464
4,1465
...,...
1454,2915
1455,2916
1456,2917
1457,2918


In [50]:
data_num.drop(columns = 'Id', inplace=True)

In [51]:
scaled = scaler.fit_transform(data_num_modified)
data_scaled = pd.DataFrame(scaled)
encoded = encoder.fit_transform(data_cat)
data_encoded = pd.DataFrame(encoded)
data_total = pd.concat([data_scaled, data_encoded], axis=1)

In [52]:
ridge_submit = RidgeCV()

In [53]:
ridge_submit.fit(df_total, y)

In [55]:
data_total

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,169,170,171,172,173,174,175,176,177,178
0,-1.00,-0.60,0.50,0.54,-0.50,1.00,-0.25,-0.76,0.00,0.00,...,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,1.00,0.00
1,-1.00,-0.60,0.53,1.18,0.00,1.00,-0.31,-0.83,0.67,0.00,...,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,1.00,0.00
2,-1.00,0.20,0.32,1.07,-0.50,0.00,0.50,0.15,0.00,0.00,...,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,1.00,0.00
3,-1.00,0.20,0.44,0.14,0.00,1.00,0.52,0.15,0.12,0.00,...,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,1.00,0.00
4,-0.99,1.40,-0.59,-1.06,1.00,0.00,0.40,0.00,0.00,1.00,...,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,1.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,0.99,2.20,-1.24,-1.81,-1.00,2.00,-0.06,-0.54,0.00,0.00,...,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,1.00,0.00
1455,1.00,2.20,-1.24,-1.82,-1.00,0.00,-0.06,-0.54,0.00,0.00,...,0.00,0.00,0.00,1.00,1.00,0.00,0.00,0.00,0.00,0.00
1456,1.00,-0.60,2.85,2.57,-0.50,2.00,-0.27,0.10,0.00,0.00,...,0.00,0.00,0.00,1.00,1.00,0.00,0.00,0.00,0.00,0.00
1457,1.00,0.70,-0.03,0.25,-0.50,0.00,0.40,0.00,0.00,0.00,...,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,1.00,0.00


In [54]:
y_pred_test = ridge_submit.predict(data_total)

ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 241 is different from 227)

In [None]:
predictions = np.exp(y_pred_test)

In [None]:
submit = pd.DataFrame(predictions)

In [None]:
submit_final = pd.concat([data_id, submit], axis=1)

In [None]:
submit_final.columns = ["Id", "SalePrice"]
submit_final
submit_final.iat[1459, 0] = 2920
submit_final = submit_final.drop(index=1459)

In [None]:
submit_final['Id'] = submit_final['Id'].astype(np.int32)
submit_final.to_csv('submit.csv', index=False)

In [None]:
submit_final