In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import scipy.stats as st
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

import seaborn as sns
import matplotlib.pyplot as plt
from warnings import filterwarnings

# Silence every warning for the rest of the session (this also hides deprecation notices).
filterwarnings("ignore")
# Apply seaborn's default theme to all plots drawn below.
sns.set_theme()

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.impute import KNNImputer
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

from sklearn.feature_selection import RFE, RFECV
from mlxtend.feature_selection import SequentialFeatureSelector

from pandas_profiling import ProfileReport

print("imported")

In [2]:
# Load the training CSV and preview the first rows.
data = pd.read_csv("../input/house-prices-advanced-regression-techniques/train.csv")
data.head()

In [3]:
# Transposed view: one row per column of `data`, first 60 columns.
data.transpose().head(60)

In [4]:
# Transposed view: one row per column of `data`, last 20 columns.
data.transpose().tail(20)

In [5]:
# Build a profiling report for the raw training frame and render it as the cell output.
profile = ProfileReport(data)
profile

## NULL HANDLING

In [6]:
# Percentage of missing values per column, alongside each column's dtype.
null_log = data.isnull().mean().mul(100).to_frame(name="null_percentage")
null_log["datatypes"] = data.dtypes.values
# Columns where more than 40% of the rows are missing.
null_log.query("null_percentage > 40")

In [7]:
# Drop, in place, every column whose missing share exceeds 40%.
data.drop(columns=null_log.index[null_log["null_percentage"] > 40], inplace=True)

In [8]:
# Recompute the missing-value table after the column drop.
null_log = (data.isnull().sum() * 100 / len(data)).to_frame(name="null_percentage")
null_log["datatypes"] = data.dtypes.values
# Columns with some, but fewer than 10%, missing values.
null_log.query("0 < null_percentage < 10")

In [9]:
# Columns with a small (0-10%, exclusive) share of nulls: drop the affected rows in place.
low_null_mask = (null_log["null_percentage"] > 0) & (null_log["null_percentage"] < 10)
indexes = null_log.index[low_null_mask]
data.dropna(subset=list(indexes), inplace=True)

In [10]:
# Missing-value table once more: whatever still has nulls after the row drop.
null_log = data.isnull().mean().mul(100).to_frame(name="null_percentage")
null_log["datatypes"] = data.dtypes.values
null_log.loc[null_log["null_percentage"] > 0]

In [11]:
# Remove, in place, every row that still contains at least one missing value.
data.dropna(axis=0, how="any", inplace=True)

In [12]:
# Total number of missing cells left in the frame.
data.isna().sum().sum()

## OUTLIER HANDLING

In [13]:
# The "Id" column is removed, in place, before any plotting or modelling.
data.drop(columns="Id", inplace=True)

In [14]:
def outliers(df, n_cols=4):
    """Draw one box plot per column of ``df`` on a single grid figure.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are plotted; each column is handed to ``sns.boxplot``.
    n_cols : int, default 4
        Number of subplot columns in the grid.

    Notes
    -----
    The grid keeps the original 10 rows for up to ``10 * n_cols`` columns and
    grows beyond that, so frames with more than 40 columns no longer make
    ``plt.subplot`` fail with an out-of-range index.
    """
    columns = list(df.columns)
    # Ceiling division, but never fewer than the original 10 rows.
    n_rows = max(10, -(-len(columns) // n_cols))
    plt.figure(figsize = (50,100))
    for position, column in enumerate(columns, start=1):
        plt.subplot(n_rows, n_cols, position)
        sns.boxplot(df[column])
        plt.title(str(column).upper())
    plt.show()

In [15]:
# Box plots of every numeric column, to inspect outliers before treating them.
# (The string-quoted earlier copy of this plotting loop was dead code and is removed.)
outliers(data.select_dtypes("number"))

In [16]:
# Numeric columns singled out for a closer look; they are dropped in the next cell.
feature = [
    "BsmtFinSF2",
    "LowQualFinSF",
    "BsmtHalfBath",
    "KitchenAbvGr",
    "EnclosedPorch",
    "3SsnPorch",
    "ScreenPorch",
    "PoolArea",
    "MiscVal",
]

outliers(data.loc[:, feature])

In [17]:
# Remove the columns listed in `feature` from the working frame, in place.
data.drop(columns=feature, inplace=True)

In [18]:
# Re-plot the remaining numeric columns after the column drop.
outliers(data.select_dtypes("number"))

In [19]:
# Cap every numeric column at the Tukey fences (Q1 - 1.5*IQR, Q3 + 1.5*IQR).
# Values outside the fences are replaced by the nearest fence. `clip` does this
# for the whole column at once instead of rewriting cells one by one via `.loc`.
features = data.select_dtypes("number").columns
for i in features:
    n25, n50, n75 = np.percentile(data[i], [25,50,75])
    iqr = n75 - n25
    lb = n25 - (1.5*iqr)
    ub = n75 + (1.5*iqr)
    data[i] = data[i].clip(lower=lb, upper=ub)

In [20]:
# Box plots again, to check the numeric columns after capping.
outliers(data.select_dtypes("number"))

In [21]:
# Separate the numeric columns from the object (string) columns.
data_num = data.select_dtypes(include="number")
data_cat = data.select_dtypes(include="object")

# Correlation matrix of the numeric columns, shaded by magnitude.
data_num.corr().style.background_gradient()

In [22]:
# Standardise every numeric predictor; the target "SalePrice" is left untouched.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# further down, so the hold-out rows contribute to the scaling statistics.
predictor_cols = data_num.columns.drop("SalePrice")
scaler = StandardScaler()
scaled = scaler.fit_transform(data_num[predictor_cols])
data_num[predictor_cols] = scaled
data_num.corr().style.background_gradient()

In [23]:
## -- categorical data

# Number of distinct values per categorical column, as a one-column frame.
cat_columns = data_cat.columns
unique_count = data_cat[cat_columns].nunique().to_frame(name="counts")

# Keep only the categorical columns with at most 8 distinct values.
optimum_count = unique_count.index[unique_count["counts"] <= 8]
optimum_count

In [24]:
# Restrict to the low-cardinality categorical columns and one-hot encode them,
# dropping the first level of each column.
# dtype is pinned to uint8: pandas >= 2 otherwise returns bool dummies, and a mixed
# float/bool design matrix is cast to object dtype, which statsmodels OLS rejects.
data_cat = data_cat[optimum_count]
data_cat_encoded = pd.get_dummies(data_cat, drop_first = True, dtype = "uint8")
data_cat_encoded.head(3)

In [25]:
# Place the scaled numeric columns and the dummy columns side by side.
final_data = pd.concat(objs=[data_num, data_cat_encoded], axis="columns")
final_data.head()

In [26]:
# (rows, columns) of the modelling frame.
(len(final_data.index), len(final_data.columns))

In [27]:
# Profiling report of the fully prepared modelling frame, rendered as the cell output.
report = ProfileReport(final_data)
report

In [28]:
# Target is "SalePrice"; every other column is a predictor.
y = final_data["SalePrice"]
x = final_data.drop(columns="SalePrice")

# 80/20 hold-out split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

In [29]:
# Add the intercept column to both splits, then fit the baseline OLS on all predictors.
# has_constant="add" forces the "const" column to be created. With the default
# ("skip") statsmodels omits it whenever a split already contains a non-zero constant
# column (e.g. a dummy that is 1 for every row), and later selections on "const" fail.
x_train_const = sm.add_constant(x_train, has_constant="add")
x_test_const = sm.add_constant(x_test, has_constant="add")
model = sm.OLS(y_train, x_train_const).fit()
model.summary()

In [30]:
# Author's reading of the OLS summary printed by the previous cell:
# -- no autocorrelation.
# -- multicollinearity is there.
# -- residuals are not normally distributed.

import statsmodels.stats.api as smapi

# Goldfeld-Quandt test on the fitted model's residuals against the training design matrix.
smapi.het_goldfeldquandt(model.resid, x_train_const)

# Author's conclusion from the test output:
# -- no heteroscedasticity. 

In [31]:
# Terms whose p-value exceeds 0.05 in the current fit (despite the name, these are
# the terms with the larger p-values).
p_val = model.pvalues.to_frame(name="p_val")
less_p_val = p_val.index[p_val["p_val"] > 0.05]
less_p_val

In [32]:
# Refit without the terms collected in `less_p_val`.
model = sm.OLS(y_train, x_train_const.drop(columns=less_p_val)).fit()
model.summary()

In [33]:
# Terms of the refitted model with p-value below 0.05.
p_val2 = model.pvalues.to_frame(name="p_val")
sig_features = p_val2.query("p_val < 0.05").index
sig_features

In [34]:
# Fit on the significant terms only.
model = sm.OLS(y_train, x_train_const.loc[:, sig_features]).fit()
model.summary()

In [35]:
# Same model with "GarageQual_TA" additionally removed.
model = sm.OLS(y_train, x_train_const[sig_features].drop(columns=["GarageQual_TA"])).fit()
model.summary()

In [36]:
# -- from cond.no the model is being affected by multicollinearity

trained_features = x_train_const[sig_features].drop(["GarageQual_TA"], axis = 1).columns

# Variance inflation factor of every retained term.
# The design matrix is materialised once instead of once per term, and the
# comprehension keeps the loop variables local, so the module-level `feature`
# list defined earlier is no longer overwritten.
vif_matrix = x_train_const[trained_features].values
vif_dct = {
    name: variance_inflation_factor(vif_matrix, position)
    for position, name in enumerate(trained_features)
}

pd.DataFrame(vif_dct, index = ["vif"]).T

In [37]:

# Drop further terms and recompute the variance inflation factors.
trained_features = x_train_const[sig_features].drop(["GarageQual_TA", "MSZoning_FV", "RoofMatl_CompShg", "KitchenQual_TA", "RoofMatl_Membran"], axis = 1).columns

# The design matrix is materialised once instead of once per term, and the
# comprehension keeps the loop variables local, so the module-level `feature`
# list defined earlier is no longer overwritten.
vif_matrix = x_train_const[trained_features].values
vif_dct = {
    name: variance_inflation_factor(vif_matrix, position)
    for position, name in enumerate(trained_features)
}

pd.DataFrame(vif_dct, index = ["vif"]).T

In [38]:
# Fit on the terms kept after the VIF-based pruning.
model = sm.OLS(endog=y_train, exog=x_train_const[trained_features]).fit()
model.summary()

In [39]:
# Terms with p-value below 0.05 in the VIF-pruned model.
p_val3 = model.pvalues.to_frame(name="p_val")
sig_features = p_val3.loc[p_val3["p_val"] < 0.05].index
sig_features

In [40]:
# Refit on the significant terms.
model = sm.OLS(endog=y_train, exog=x_train_const[sig_features]).fit()
model.summary()

In [41]:
# Same fit as the previous cell (the cell is repeated in the notebook).
model = sm.OLS(y_train, x_train_const.loc[:, sig_features]).fit()
model.summary()

In [42]:
# Plain least-squares estimator, used as the base model for RFECV below.
linear = LinearRegression()

In [43]:
# Recursive feature elimination with cross-validation, scored by R^2.
linear_rfecv = RFECV(estimator=linear,scoring = "r2")

In [44]:
# Fit the selector on the training design matrix (which includes the "const" column)
# and show the rank assigned to each column.
linear_rfecv.fit(x_train_const, y_train)
linear_rfecv.ranking_

In [45]:
# Pair each column with its RFECV rank and keep the rank-1 columns.
ranked = pd.DataFrame({"features": x_train_const.columns, "rank": linear_rfecv.ranking_})
features = ranked.loc[ranked["rank"] == 1, "features"].values
features

In [46]:
# OLS on the columns selected by RFECV.
model = sm.OLS(y_train, x_train_const.loc[:, features]).fit()
model.summary()

In [47]:
# Significant (p < 0.05) terms of the RFECV-based model.
p_val4 = model.pvalues.to_frame(name="p_val")
sig_features = p_val4.index[p_val4["p_val"] < 0.05]
sig_features

In [48]:
# Refit on the significant RFECV terms.
model = sm.OLS(endog=y_train, exog=x_train_const[sig_features]).fit()
model.summary()

### MODEL BUILT WITH RFECV FEATURES

In [49]:
# One more significance pass on the refitted model.
p_val5 = model.pvalues.to_frame(name="p_val")
sig_features = p_val5.query("p_val < 0.05").index
sig_features

In [50]:
# Fit on the terms that survived the second significance pass.
model = sm.OLS(y_train, x_train_const.loc[:, sig_features]).fit()
model.summary()

## SFS (Sequential Feature Selection)

In [51]:
# Hard-coded list of 43 selected column names (presumably the result of a
# SequentialFeatureSelector run that is not part of this notebook — TODO confirm).
linear_sfs = [
    # names without a level suffix
    'MSSubClass', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
    'MasVnrArea', 'BsmtFinSF1', 'GrLivArea', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars',
    'WoodDeckSF', 'OpenPorchSF',
    # names of the form <column>_<level>, as produced by pd.get_dummies
    'MSZoning_RM', 'Street_Pave', 'LotConfig_FR2', 'Condition2_Feedr', 'BldgType_TwnhsE',
    'HouseStyle_1Story', 'HouseStyle_SFoyer', 'HouseStyle_SLvl', 'RoofMatl_Tar&Grv',
    'MasVnrType_None', 'MasVnrType_Stone', 'ExterQual_TA', 'Foundation_PConc',
    'BsmtQual_Fa', 'BsmtQual_Gd', 'BsmtQual_TA', 'BsmtCond_Gd', 'BsmtCond_TA',
    'BsmtExposure_Gd', 'BsmtFinType1_GLQ', 'BsmtFinType2_BLQ', 'Electrical_FuseP',
    'Functional_Typ', 'GarageType_Basment', 'GarageQual_Po', 'PavedDrive_Y',
    'SaleCondition_Alloca', 'SaleCondition_Normal', 'SaleCondition_Partial',
]

In [52]:
# OLS on the SFS column list.
# NOTE(review): `linear_sfs` contains no "const" entry, so this model and the ones
# derived from it are fitted without an intercept — confirm this is intended.
features = [*linear_sfs]
model = sm.OLS(endog=y_train, exog=x_train_const[features]).fit()
model.summary()

In [53]:
# Refit the SFS model with a hand-picked set of terms removed.
model = sm.OLS(
    y_train,
    x_train_const[features].drop(
        columns=[
            "YearBuilt",
            "WoodDeckSF",
            "LotConfig_FR2",
            "HouseStyle_SFoyer",
            "Foundation_PConc",
            "BsmtQual_Fa",
            "BsmtFinType2_BLQ",
            "SaleCondition_Alloca",
            "GarageType_Basment",
            "GarageQual_Po",
        ]
    ),
).fit()
model.summary()

In [54]:
# Significant (p < 0.05) terms of the pruned SFS model.
p_val6 = model.pvalues.to_frame(name="p_val")
sig_features = p_val6.index[p_val6["p_val"] < 0.05]
sig_features

In [55]:
# Final OLS fit; `model` and `sig_features` from here are what the later cells use.
model = sm.OLS(endog=y_train, exog=x_train_const.loc[:, sig_features]).fit()
model.summary()

In [56]:
# Score the final OLS model on the training and hold-out splits.
y_train_pred = model.predict(x_train_const[sig_features])
y_test_pred = model.predict(x_test_const[sig_features])

# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error is deprecated and absent from recent scikit-learn releases.
print(f"train_r2_score   : {r2_score(y_train,y_train_pred)} ")
print(f"test_r2_score    : {r2_score(y_test, y_test_pred)} ")
print(f"train_rmse_score : {np.sqrt(mean_squared_error(y_train,y_train_pred))} ")
print(f"test_rmse_score  : {np.sqrt(mean_squared_error(y_test, y_test_pred))} ")

In [57]:
# -- note : for bias - we use lasso
lasso = Lasso(alpha = 10)
lasso.fit(x_train_const[sig_features], y_train)
y_train_pred = lasso.predict(x_train_const[sig_features])
y_test_pred = lasso.predict(x_test_const[sig_features])

# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error is deprecated and absent from recent scikit-learn releases.
print(f"train_r2_score   : {r2_score(y_train,y_train_pred)} ")
print(f"test_r2_score    : {r2_score(y_test, y_test_pred)} ")
print(f"train_rmse_score : {np.sqrt(mean_squared_error(y_train,y_train_pred))} ")
print(f"test_rmse_score  : {np.sqrt(mean_squared_error(y_test, y_test_pred))} ")
score = cross_val_score(lasso, x_train_const[sig_features], y_train, cv = 5, scoring="r2")
print(f"scores           : {score}")
print(f"bias             : {1 - np.mean(score)}")
# Spread of the fold scores relative to their mean. The previous std/std ratio
# was always 1 and carried no information.
print(f"variance         : {np.std(score)/np.mean(score)}")

In [58]:
# -- for variance -- we use ridge

ridge = Ridge(alpha=100)
ridge.fit(x_train_const[sig_features], y_train)
y_train_pred = ridge.predict(x_train_const[sig_features])
y_test_pred = ridge.predict(x_test_const[sig_features])

# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error is deprecated and absent from recent scikit-learn releases.
print(f"train_r2_score   : {r2_score(y_train,y_train_pred)} ")
print(f"test_r2_score    : {r2_score(y_test, y_test_pred)} ")
print(f"train_rmse_score : {np.sqrt(mean_squared_error(y_train,y_train_pred))} ")
print(f"test_rmse_score  : {np.sqrt(mean_squared_error(y_test, y_test_pred))} ")
score = cross_val_score(ridge, x_train_const[sig_features], y_train, cv = 5, scoring="r2")
print(f"scores           : {score}")
print(f"bias             : {1 - np.mean(score)}")
# Spread of the fold scores relative to their mean. The previous std/std ratio
# was always 1 and carried no information.
print(f"variance         : {np.std(score)/np.mean(score)}")

In [59]:
# 5-fold grid search over the ridge penalty, scored by R^2.
# The grid runs 0, 5, ..., 145; alpha=0 means no penalty at all.
alpha_grid = np.arange(0, 150, 5)
params = {"alpha": alpha_grid}
ridge = Ridge()
grd_search = GridSearchCV(ridge, params, scoring="r2", cv=5)
grd_search.fit(x_train_const[sig_features], y_train)
grd_search.best_params_

In [60]:
# Full cross-validation results table of the grid search.
pd.DataFrame(data=grd_search.cv_results_)

In [61]:
# Decision tree on the same selected terms.
# random_state is fixed so the reported scores are reproducible between runs.
dtr = DecisionTreeRegressor(random_state=0)
dtr.fit(x_train_const[sig_features], y_train)
y_train_pred = dtr.predict(x_train_const[sig_features])
y_test_pred = dtr.predict(x_test_const[sig_features])

# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error is deprecated and absent from recent scikit-learn releases.
print(f"train_r2_score   : {r2_score(y_train,y_train_pred)} ")
print(f"test_r2_score    : {r2_score(y_test, y_test_pred)} ")
print(f"train_rmse_score : {np.sqrt(mean_squared_error(y_train,y_train_pred))} ")
print(f"test_rmse_score  : {np.sqrt(mean_squared_error(y_test, y_test_pred))} ")
score = cross_val_score(dtr, x_train_const[sig_features], y_train, cv = 5, scoring="r2")
print(f"scores           : {score}")
print(f"bias             : {1 - np.mean(score)}")
# Spread of the fold scores relative to their mean. The previous std/std ratio
# was always 1 and carried no information.
print(f"variance         : {np.std(score)/np.mean(score)}")

In [62]:
# scikit-learn linear regression on the same selected terms.
linear = LinearRegression()
linear.fit(x_train_const[sig_features], y_train)
y_train_pred = linear.predict(x_train_const[sig_features])
y_test_pred = linear.predict(x_test_const[sig_features])

# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error is deprecated and absent from recent scikit-learn releases.
print(f"train_r2_score   : {r2_score(y_train,y_train_pred)} ")
print(f"test_r2_score    : {r2_score(y_test, y_test_pred)} ")
print(f"train_rmse_score : {np.sqrt(mean_squared_error(y_train,y_train_pred))} ")
print(f"test_rmse_score  : {np.sqrt(mean_squared_error(y_test, y_test_pred))} ")
score = cross_val_score(linear, x_train_const[sig_features], y_train, cv = 5, scoring="r2")
print(f"scores           : {score}")
print(f"bias             : {1 - np.mean(score)}")
# Spread of the fold scores relative to their mean. The previous std/std ratio
# was always 1 and carried no information.
print(f"variance         : {np.std(score)/np.mean(score)}")

## TEST DATA

In [63]:
# Load the competition test CSV and preview the first rows.
test = pd.read_csv("../input/house-prices-advanced-regression-techniques/test.csv")
test.head(3)

In [64]:
# (rows, columns) of the test frame.
test.shape

In [65]:
# Split the test frame into numeric and object (string) columns.
quantitative = test.select_dtypes(include="number")
qualitative = test.select_dtypes(include="object")
quantitative.head(3)

In [66]:
# Preview of the categorical test columns.
qualitative.head(3)

In [67]:
# -- qualitative features:

# Disabled attempt at mode imputation; note that `mode` is referenced here without being called.
#qualitative.fillna(qualitative.mode, inplace = True)

In [68]:
# Impute missing categories with each column's most frequent value.
# `mode` has to be called (it was previously passed as a bound method), and the
# imputed frame, not the raw one, has to be the one that gets encoded.
opt_qual = qualitative[optimum_count]
opt_qual = opt_qual.fillna(opt_qual.mode().iloc[0])

# Encode every level, then align to the dummy columns built from the training data:
# levels absent from the test file become all-zero columns and levels unseen in
# training are discarded, so later column selections cannot fail on a missing name.
qual_dummy = (
    pd.get_dummies(opt_qual, dtype="uint8")
    .reindex(columns=data_cat_encoded.columns, fill_value=0)
)
qual_dummy.head()

In [69]:
# -- quantitative:

# -- quantitative:

# Keep only the numeric test columns that are also present in the training design matrix.
col1 = x_train_const.columns
opt_quan = [name for name in quantitative.columns if name in col1]

quan_filtered = quantitative.loc[:, opt_quan]
quan_filtered.head()

In [70]:
# Scale the numeric test columns with the scaler fitted earlier in the notebook.
# The input is re-derived from `quantitative`, so re-running this cell does not scale twice.
test_scaled = scaler.transform(quantitative[opt_quan])
# A fresh frame is built instead of assigning into a slice of `quantitative`.
# Missing values pass through the scaler as NaN and would produce NaN predictions;
# they are set to 0, which is the fitted mean on the standardised scale.
quan_filtered = pd.DataFrame(
    test_scaled, columns=opt_quan, index=quantitative.index
).fillna(0)
quan_filtered.head()

In [71]:
# -- final_test_data:

# -- final_test_data:

# Shapes of the numeric and dummy parts before they are combined.
quan_filtered.shape, qual_dummy.shape

In [72]:
# Place the scaled numeric columns and the dummy columns of the test set side by side.
final_test_data = pd.concat([quan_filtered, qual_dummy], axis = 1)
final_test_data

In [73]:
# Add the intercept column and select the terms used by the final model.
# has_constant="add" forces the "const" column to be created. With the default
# ("skip") statsmodels omits it when the frame already holds a non-zero constant
# column (e.g. a dummy that is 1 for every row).
test_const = sm.add_constant(final_test_data, has_constant="add")
final_test_features = test_const[sig_features]
final_test_features.head(10)

In [74]:
# Predict with `model` as last assigned above (the final statsmodels OLS fit).
predictions = model.predict(final_test_features)

In [75]:
# Build the submission file.
# The identifier must be the "Id" column of the test file. The frame's positional
# index (0, 1, 2, ...) was used before, which does not match the ids in the file.
assert len(predictions) == len(test), "one prediction per test row is expected"
output = pd.DataFrame({'Id': test['Id'].to_numpy(), 'SalePrice': np.asarray(predictions)})
output = output.set_index('Id', drop=True)
output.to_csv('my_submission.csv')