In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import r2_score, mean_absolute_percentage_error, mean_squared_error, mean_absolute_error
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.ensemble import BaggingRegressor,GradientBoostingRegressor
from sklearn.metrics import mean_squared_log_error

from sklearn import linear_model
import time


# ***IMPORT DATAS***

In [2]:
url = "https://drive.google.com/file/d/1iVBv5R6U53mofNpI9EkpFUQfwhYBk9MZ/view?usp=share_link"
path = 'https://drive.google.com/uc?export=download&id='+url.split('/')[-2]
data = df = pd.read_csv(path)

# ***SPLIT DATAS***

In [3]:
# X and y creation
X = data.copy()
X.pop('Id')
y = X.pop("SalePrice")
# data splitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=341)

# ***PREPROCESSING***

In [4]:
start_time = time.time()

# 1. defining categorical & ordinal columns
X_cat = X.select_dtypes(exclude="number").copy()
X_num = X.select_dtypes(include="number").copy()

# 2. numerical pipeline
numeric_pipe = make_pipeline(SimpleImputer())

# 3. categorical pipeline
    # # 3.1 defining ordinal & onehot columns

ordinal_col_names = ['ExterQual', 
                     'ExterCond', 
                     'BsmtQual', 
                     'BsmtCond', 
                     'BsmtExposure',
                     'BsmtFinType1', 
                     'KitchenQual', 
                     'FireplaceQu', 
                     'LotShape',
                     'BsmtFinType2', 
                     'HeatingQC', 
                     'GarageFinish', 
                     'GarageQual', 
                     'GarageCond',
                     'PoolQC', 
                     'Fence']

ordinal_cols = X_cat.columns.get_indexer(ordinal_col_names)
ohe_cols = X_cat.columns.get_indexer(list(set(X_cat) - set(ordinal_col_names)))

X_cat_ordinal = X_cat.columns[ordinal_cols]
X_cat_ohe = X_cat.columns[ohe_cols]

    ## 3.2 explicitly determine categories for ordinal encoding including "N_A"
ExterQual_cats = ["N_A","Po", "Fa", "TA", "Gd", "Ex"]
ExterCond_cats = ["N_A","Po", "Fa", "TA", "Gd", "Ex"]
BsmtQual_cats = ["N_A", "Po", "Fa", "TA", "Gd", "Ex"]
BsmtCond_cats = ["N_A", "Po", "Fa", "TA", "Gd", "Ex"]
BsmtExposure_cats = ["N_A", "No", "Mn", "Av", "Gd"]
BsmtFinType1_cats = ["N_A", "Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"]
KitchenQual_cats = ["N_A","Po", "Fa", "TA", "Gd", "Ex"]
FireplaceQu_cats = ["N_A", "Po", "Fa", "TA", "Gd", "Ex"]
LotShape_cats = ["N_A",'Reg', 'IR1', 'IR2', 'IR3']
BsmtFinType2_cats = ['N_A','Unf','LwQ','Rec','BLQ','ALQ','GLQ']
HeatingQC_cats = ["N_A","Po", "Fa", "TA", "Gd", "Ex"]
GarageFinish_cats = ['N_A','Unf','RFn','Fin']
GarageQual_cats = ["N_A", "Po", "Fa", "TA", "Gd", "Ex"]
GarageCond_cats = ["N_A", "Po", "Fa", "TA", "Gd", "Ex"]
PoolQC_cats = ["N_A","Po", "Fa", "TA", "Gd", "Ex"]
Fence_cats = ["N_A",'NA','MnWw','GdWo','MnPrv','GdPrv']

cats_ord = [ExterQual_cats, ExterCond_cats, BsmtQual_cats, BsmtCond_cats, 
            BsmtExposure_cats, BsmtFinType1_cats, KitchenQual_cats, FireplaceQu_cats, 
            LotShape_cats,BsmtFinType2_cats,HeatingQC_cats,GarageFinish_cats,GarageQual_cats,
            GarageCond_cats,PoolQC_cats,Fence_cats]

        ### 3.2.2. defining the categorical encoder: a ColumnTransformer with 2 branches: ordinal & onehot
categorical_encoder = ColumnTransformer(
    transformers=[
        ("cat_ordinal", OrdinalEncoder(categories=cats_ord), ordinal_cols),
        ("cat_onehot", OneHotEncoder(handle_unknown="ignore"), ohe_cols),
    ]
)

    ## 3.3. categorical pipeline = "N_A" imputer + categorical encoder
categorical_pipe = make_pipeline(SimpleImputer(strategy="constant", fill_value="N_A"),
                                 categorical_encoder
                                )

# 4. full preprocessing: a ColumnTransformer with 2 branches: numeric & categorical
full_preprocessing = ColumnTransformer(
    transformers=[
        ("num_pipe", numeric_pipe, X_num.columns),
        ("cat_pipe", categorical_pipe, X_cat.columns),
    ]
)
end_time = time.time()

time_taken_no_pca = end_time - start_time

print(f"Time taken: {time_taken_no_pca} seconds")

Time taken: 0.007885217666625977 seconds


# ***MODELING***

In [5]:
scaler = StandardScaler()
scores = pd.DataFrame()
KBest = SelectKBest(score_func=f_regression)

##***Ridge***

In [6]:
# start_time = time.time()

# Ridge_pipeline = make_pipeline(full_preprocessing, 
#                              scaler,
#                              KBest,
#                              linear_model.Ridge())

# param_grid = {
#     "selectkbest__k": range(10,80,5),
#     "columntransformer__num_pipe__simpleimputer__strategy":["mean","constant"],
#     "standardscaler__with_mean": [True, False],
#     "standardscaler__with_std": [True, False],
#     "ridge__alpha": range(1,30,3)
# }

# Ridge_search = RandomizedSearchCV(Ridge_pipeline,
#                       param_grid,
#                       cv=10,
#                       scoring="neg_root_mean_squared_error",
#                       verbose=1, 
#                       n_iter=100)

# Ridge_search.fit(X_train, y_train)

# scores.loc['Ridge', 'RMSE_best'] = Ridge_search.best_score_

# end_time = time.time()

# time_taken_no_pca = end_time - start_time

# print(f"Time taken: {time_taken_no_pca} seconds")

In [7]:
# Ridge_predictions = Ridge_search.predict(X_test)
# #scores.loc['Ridge', 'RMSLE'] = mean_squared_log_error(y_true = y_test, y_pred = Ridge_predictions, squared=False)
# scores.loc['Ridge', 'RMSE'] = mean_squared_error(y_true = y_test, y_pred = Ridge_predictions, squared=False)

# #scores.loc['Ridge', 'MAPE'] = mean_absolute_percentage_error(y_true = y_test, y_pred = Ridge_predictions)
# #scores.loc['Ridge', 'R2'] = r2_score(y_true = y_test, y_pred = Ridge_predictions)

##***Poisson***

In [8]:
Poisson_pipeline = make_pipeline(full_preprocessing,
                                     scaler, 
                                     KBest,
                                     linear_model.PoissonRegressor())

param_grid = {
    "selectkbest__k": range(10,80,5),
    "columntransformer__num_pipe__simpleimputer__strategy":["mean","constant"],
    "standardscaler__with_mean": [True, False],
    "standardscaler__with_std": [True, False],
    "poissonregressor__max_iter": range(10,1000,10),
    "poissonregressor__alpha": range(1,30,3)

}

Poisson_search = RandomizedSearchCV(Poisson_pipeline,
                      param_grid,
                      cv=10,
                      scoring="neg_root_mean_squared_error",
                      verbose=1, 
                      n_iter=100
                      )

Poisson_search.fit(X_train, y_train)
scores.loc['PoissonRegression', 'RMSE_best'] = Poisson_search.best_score_


end_time = time.time()

time_taken_no_pca = end_time - start_time

print(f"Time taken: {time_taken_no_pca} seconds")

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  return np.exp(lin_pred)
  return np.exp(lin_pred)
  return -2 * (y - y_pred) / self.unit_variance(y_pred)
  return -2 * (y - y_pred) / self.unit_variance(y_pred)
  return -2 * (y - y_pred) / self.unit_variance(y_pred)
  temp = d1 * family.deviance_derivative(y, y_pred, weights)
  dev = 2 * (xlogy(y, y / y_pred) - y + y_pred)
  dev = 2 * (xlogy(y, y / y_pred) - y + y_pred)
  dev = 2 * (xlogy(y, y / y_pred) - y + y_pred)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
  return np.exp(lin_pred)
  return np.exp(lin_pred)
  return -2 * (y - y_pred) / self.unit_variance(y_pred)
  return -2 * (y - y_pred) / self.unit_variance(y_pred)
  return -2 * (y - y_pred) / self.unit_variance(y_pred)
  temp = d1 * family.deviance_derivative(y, y_pred, weights)

Time taken: 205.32519125938416 seconds


  return np.exp(lin_pred)
  return np.exp(lin_pred)
  return -2 * (y - y_pred) / self.unit_variance(y_pred)
  dev = 2 * (xlogy(y, y / y_pred) - y + y_pred)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)


In [9]:
Poisson_predictions = Poisson_search.predict(X_test)

scores.loc['PoissonRegression', 'RMSLE'] = mean_squared_log_error(y_true = y_test, y_pred = Poisson_predictions, squared=False)
scores.loc['PoissonRegression', 'RMSE'] = mean_squared_error(y_true = y_test, y_pred = Poisson_predictions, squared=False)

# scores.loc['PoissonRegression', 'MAPE'] = mean_absolute_percentage_error(y_true = y_test, y_pred = Poisson_predictions)
# scores.loc['PoissonRegression', 'R2'] = r2_score(y_true = y_test, y_pred = Poisson_predictions)


##***Linear***


In [10]:
# Linear_pipeline = make_pipeline(full_preprocessing,
#                                      scaler, 
#                                      KBest,
#                                      linear_model.LinearRegression())

# param_grid = {
#     "selectkbest__k": range(10,80,5),
#     "columntransformer__num_pipe__simpleimputer__strategy":["mean","constant"],
#     "standardscaler__with_mean": [True, False],
#     "standardscaler__with_std": [True, False]

# }

# Linear_search = RandomizedSearchCV(Linear_pipeline,
#                       param_grid,
#                       cv=10,
#                       scoring="neg_root_mean_squared_error",
#                       verbose=1, 
#                       n_iter=100
#                       )

# Linear_search.fit(X_train, y_train)

# # create a dictionary to keep track of the scores of different models 
# scores.loc['LinearRegression', 'RMSE_best'] = Linear_search.best_score_

# end_time = time.time()

# time_taken_no_pca = end_time - start_time

# print(f"Time taken: {time_taken_no_pca} seconds")

In [11]:
# LinearRegression_predictions = Linear_search.predict(X_test)
# #scores.loc['LinearRegression', 'RMSLE'] = mean_squared_log_error(y_true = y_test, y_pred = LinearRegression_predictions, squared=False)
# scores.loc['LinearRegression', 'RMSE'] = mean_squared_error(y_true = y_test, y_pred = LinearRegression_predictions, squared=False)

# # scores.loc['LinearRegression', 'MAPE'] = mean_absolute_percentage_error(y_true = y_test, y_pred = LinearRegression_predictions)
# # scores.loc['LinearRegression', 'R2'] = r2_score(y_true = y_test, y_pred = LinearRegression_predictions)

##***BaggingRegressor***

In [12]:
BaggingRegressor_pipeline = make_pipeline(full_preprocessing,
                                     scaler, 
                                     KBest,
                                     BaggingRegressor())

param_grid = {
    "selectkbest__k": range(10,80,5),
    "columntransformer__num_pipe__simpleimputer__strategy":["mean","constant"],
    "standardscaler__with_mean": [True, False],
    "standardscaler__with_std": [True, False],
    #"gradientboostingregressor__max_depth" : range(5,15)

}

BaggingRegressor_search = RandomizedSearchCV(BaggingRegressor_pipeline,
                      param_grid,
                      cv=10,
                      scoring="neg_root_mean_squared_error",
                      verbose=1, 
                      n_iter=100
                      )

BaggingRegressor_search.fit(X_train, y_train)

# create a dictionary to keep track of the scores of different models 
scores.loc['BaggingRegressor', 'RMSE_best'] = BaggingRegressor_search.best_score_

end_time = time.time()

time_taken_no_pca = end_time - start_time

print(f"Time taken: {time_taken_no_pca} seconds")

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  f_statistic = corr_coef_squared / (1 - corr_coef_squared) * deg_of_freedom
  correlation_coefficient /= X_norms
  f_statistic = corr_coef_squared / (1 - corr_coef_squared) * deg_of_freedom
  correlation_coefficient /= X_norms
  f_statistic = corr_coef_squared / (1 - corr_coef_squared) * deg_of_freedom
  correlation_coefficient /= X_norms
  f_statistic = corr_coef_squared / (1 - corr_coef_squared) * deg_of_freedom
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  f_statistic = corr_coef_squared / (1 - corr_coef_squared) * deg_of_freedom
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  f_statistic = corr_coef_squared / (1 - corr_coef_squared) 

Time taken: 411.56248331069946 seconds


In [13]:
BaggingRegressor_predictions = BaggingRegressor_search.predict(X_test)
scores.loc['BaggingRegressor', 'RMSE'] = mean_squared_error(y_true = y_test, y_pred = BaggingRegressor_predictions, squared=False)
scores.loc['BaggingRegressor', 'RMSLE'] = mean_squared_log_error(y_true = y_test, y_pred = BaggingRegressor_predictions, squared=False)

##***GradientBoostingRegressor***

In [25]:
GradientBoostingRegressor_pipeline = make_pipeline(full_preprocessing,
                                     StandardScaler(with_mean=True), 
                                     SelectKBest(score_func=f_regression),
                                     GradientBoostingRegressor(loss='squared_error'))

param_grid = {
    "selectkbest__k": range(10,80),
    "columntransformer__num_pipe__simpleimputer__strategy":["mean", "median"],
    # "standardscaler__with_mean": [True, False],
    # "standardscaler__with_std": [True, False],
    "gradientboostingregressor__max_depth" : range(5,10),


}

GradientBoostingRegressor_search = GridSearchCV(GradientBoostingRegressor_pipeline,
                      param_grid,
                      cv=10,
                      scoring="neg_root_mean_squared_error",
                      verbose=1 
                      #n_iter=100
                      )

GradientBoostingRegressor_search.fit(X_train, y_train)

# create a dictionary to keep track of the scores of different models 
scores.loc['GradientBoostingRegressor', 'RMSE_best'] = GradientBoostingRegressor_search.best_score_

end_time = time.time()

time_taken_no_pca = end_time - start_time

print(f"Time taken: {time_taken_no_pca} seconds")

Fitting 10 folds for each of 700 candidates, totalling 7000 fits


  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
 

Time taken: 7873.438563108444 seconds


In [26]:
GradientBoostingRegressor_predictions = GradientBoostingRegressor_search.predict(X_test)
scores.loc['GradientBoostingRegressor', 'RMSE'] = mean_squared_error(y_true = y_test, y_pred = GradientBoostingRegressor_predictions, squared=False)
scores.loc['GradientBoostingRegressor', 'RMSLE'] = mean_squared_log_error(y_true = y_test, y_pred = GradientBoostingRegressor_predictions, squared=False)

##***LassoCV***

In [16]:
Lasso_pipeline = make_pipeline(full_preprocessing,
                                     StandardScaler(with_mean=True), 
                                     SelectKBest(score_func=f_regression),
                                     linear_model.LassoCV())

param_grid = {
    "selectkbest__k": range(10,80,5),
    "columntransformer__num_pipe__simpleimputer__strategy":["mean", "median"],
    # "standardscaler__with_mean": [True, False],
    # "standardscaler__with_std": [True, False],
}

Lasso_search = GridSearchCV(Lasso_pipeline,
                      param_grid,
                      cv=10,
                      scoring="neg_root_mean_squared_error",
                      verbose=1 
                      #n_iter=100
                      )

Lasso_search.fit(X_train, y_train)
scores.loc['Lasso', 'RMSE_best'] = Lasso_search.best_score_

Fitting 10 folds for each of 28 candidates, totalling 280 fits


  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
 

In [17]:
Lasso_predictions = Lasso_search.predict(X_test)
scores.loc['Lasso', 'RMSE'] = mean_squared_error(y_true = y_test, y_pred = Lasso_predictions, squared=False)
scores.loc['Lasso', 'RMSLE'] = mean_squared_log_error(y_true = y_test, y_pred = Lasso_predictions, squared=False)

In [27]:
scores

Unnamed: 0,RMSE_best,RMSLE,RMSE
PoissonRegression,-43798.475088,0.143426,34748.421214
BaggingRegressor,-30214.134568,0.144324,33466.650546
GradientBoostingRegressor,-29825.643325,0.13775,31796.974259
Lasso,-34253.656662,0.162398,34431.722356


In [24]:
GradientBoostingRegressor_search.best_params_

{'columntransformer__num_pipe__simpleimputer__strategy': 'median',
 'gradientboostingregressor__max_depth': 5}

In [19]:
parameters_df = pd.DataFrame(index=['PoissonRegression', 'BaggingRegressor', 'GradientBoostingRegressor'],
                             data=[Poisson_search.best_params_,BaggingRegressor_search.best_params_, GradientBoostingRegressor_search.best_params_],
                             columns=['standardscaler__with_std', 'standardscaler__with_mean','columntransformer__num_pipe__simpleimputer__strategy','selectkbest__k'
                             ])

parameters_df = parameters_df.rename(columns=({'standardscaler__with_std':'S_scaler_std'
                            , 'standardscaler__with_mean': 'S_scaler_mean'
                            ,'columntransformer__num_pipe__simpleimputer__strategy': 'Num_Imputer_Strategy'
                            ,'selectkbest__k':'FeatureKBest'}))

In [20]:
parameters_df

Unnamed: 0,S_scaler_std,S_scaler_mean,Num_Imputer_Strategy,FeatureKBest
PoissonRegression,True,False,mean,70.0
BaggingRegressor,False,False,mean,70.0
GradientBoostingRegressor,,,median,


#Submission


In [28]:
url = "https://drive.google.com/file/d/1jnn7sVeWjrKyWe2DDkpbtGpM-vCmWsnW/view?usp=share_link"
path = 'https://drive.google.com/uc?export=download&id='+url.split('/')[-2]
Competition_DF = competition_df = pd.read_csv(path)

Compet_DF = Competition_DF.copy()
Compet_DF.pop('Id')

Alex_Compet_Submission = pd.DataFrame(Competition_DF["Id"])

Submission1 = BaggingRegressor_search.predict(Compet_DF)

Alex_Compet_Submission['SalePrice'] = Submission1

Alex_Compet_Submission.to_csv('Submission.csv', index=False)
from google.colab import files
files.download("Submission.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>