In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb

In [2]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [3]:
df = pd.concat([df_train,df_test],sort=False)

In [4]:
df_0 = df.copy()

# Filling null values

In [5]:
df["LotFrontage"].fillna(0, inplace = True)
df["MasVnrArea"].fillna(0, inplace = True)
df["BsmtFinSF1"].fillna(0, inplace = True)
df["BsmtFinSF2"].fillna(0, inplace = True)
df["BsmtUnfSF"].fillna(0, inplace = True)
df["TotalBsmtSF"].fillna(0, inplace = True)
df["GarageCars"].fillna(0, inplace = True)
df["GarageArea"].fillna(0, inplace = True)

In [6]:
df["Alley"].fillna('No', inplace = True)
df["MasVnrType"].fillna('No', inplace = True)
df["GarageType"].fillna('No', inplace = True)
df["MiscFeature"].fillna('No', inplace = True)
df["BsmtQual"].fillna('No', inplace = True)
df["BsmtCond"].fillna('No', inplace = True)
df["BsmtExposure"].fillna('No', inplace = True)
df["BsmtFinType1"].fillna('No', inplace = True)
df["BsmtFinType2"].fillna('No', inplace = True)
df["FireplaceQu"].fillna('No', inplace = True)
df["PoolQC"].fillna('No', inplace = True)
df["Fence"].fillna('No', inplace = True)

In [7]:
null_num =['BsmtFullBath', 'BsmtHalfBath']
null_obj = ['MSZoning', 'Utilities', 'Exterior1st', 'Exterior2nd', 'Electrical', 'KitchenQual', 'Functional', 'SaleType']

In [8]:
for x in null_num:
    df[x].fillna(df[x].median(), inplace = True)

In [9]:
for x in null_obj:
    df[x].fillna(df[x].mode()[0], inplace = True)

In [10]:
i = 0
for x in range(len(df.columns)):
    if df.iloc[:,x].isnull().sum() > 0:
        i += 1
print(i)

5


In [11]:
null_com = ['GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond']

In [12]:
for x in null_com:
    df[x].fillna(0, inplace=True)

In [13]:
for x in range(df.shape[0]):
    for y in null_com:
        if df.iloc[x,df.columns.get_loc("GarageType")] == 'No':
            df.iloc[x,df.columns.get_loc(y)] = 0
        elif df.iloc[x,df.columns.get_loc("GarageType")] != 'No' and df.iloc[x,df.columns.get_loc(y)] == 'No':
            df.iloc[x,df.columns.get_loc(y)] = df[y].median()

In [14]:
columns_numeric = list(df.dtypes[(df.dtypes=='int64') | (df.dtypes=='float64') ].index)
len(columns_numeric)

38

In [15]:
columns_object = list(df.dtypes[df.dtypes=='object'].index)
len(columns_object)

43

# Feature Encoding

In [16]:
df2 = df.copy()
for x in columns_object:
    temp = pd.get_dummies(df2[x],prefix=x)
    df2 = pd.concat([df2,temp],axis=1)
    df2.drop(x,axis=1,inplace=True)
df2.shape

(2919, 304)

In [17]:
df2.drop(['Id'],axis=1,inplace=True)

In [18]:
df2.shape

(2919, 303)

# Feature Selection

In [19]:
x = df2.drop(['SalePrice'],axis=1)

In [20]:
y = df2['SalePrice']

In [21]:
corr = x.corr(method='pearson')

In [22]:
columns = np.full((corr.shape[0],), True, dtype=bool)
for i in range(corr.shape[0]):
    for j in range(i+1, corr.shape[0]):
        if corr.iloc[i,j] >= 0.9:
            if columns[j]:
                columns[j] = False
selected_columns = x.columns[columns]
x2 = x[selected_columns]

In [23]:
x[selected_columns]

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal
0,60,65.0,8450,7,5,2003,2003,196.0,706.0,0.0,...,0,0,0,0,1,0,0,0,0,1
1,20,80.0,9600,6,8,1976,1976,0.0,978.0,0.0,...,0,0,0,0,1,0,0,0,0,1
2,60,68.0,11250,7,5,2001,2002,162.0,486.0,0.0,...,0,0,0,0,1,0,0,0,0,1
3,70,60.0,9550,7,5,1915,1970,0.0,216.0,0.0,...,0,0,0,0,1,1,0,0,0,0
4,60,84.0,14260,8,5,2000,2000,350.0,655.0,0.0,...,0,0,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,160,21.0,1936,4,7,1970,1970,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,1
1455,160,21.0,1894,4,5,1970,1970,0.0,252.0,0.0,...,0,0,0,0,1,1,0,0,0,0
1456,20,160.0,20000,5,7,1960,1996,0.0,1224.0,0.0,...,0,0,0,0,1,1,0,0,0,0
1457,85,62.0,10441,5,5,1992,1992,0.0,337.0,0.0,...,0,0,0,0,1,0,0,0,0,1


In [24]:
x2_train = x2.iloc[:1460,:]
y_train = y.iloc[:1460]

In [29]:
df3 = x2.copy()
df3['SalePrice'] = y

In [30]:
corr = df3.corr(method='pearson')['SalePrice']

In [32]:
flag = 0
for x in range(len(corr)):
    if corr[x] < 0.05 and corr[x] > -0.05:
        flag += 1
        print(f"Dropping column: {df2.columns[x]}: {corr[x]}")
        df3 = df3.drop([x2.columns[x]], axis=1)
        print()
print(f"flag: {flag}")

Dropping column: BsmtFinSF2: -0.011378121450215216

Dropping column: LowQualFinSF: -0.02560613000068015

Dropping column: BsmtHalfBath: -0.016844154297359294

Dropping column: 3SsnPorch: 0.04458366533574792

Dropping column: MiscVal: -0.02118957964030379

Dropping column: MoSold: 0.04643224522381936

Dropping column: YrSold: -0.028922585168730426

Dropping column: MSZoning_RM: -0.04103553550004982

Dropping column: Street_Grvl: 0.04103553550004989

Dropping column: Alley_No: -0.027655379955633733

Dropping column: LotShape_IR2: 0.036720475499535554

Dropping column: LandContour_HLS: 0.04552825032749539

Dropping column: LandContour_Low: -0.02754457933083911

Dropping column: LandContour_Lvl: 0.014314296147248953

Dropping column: Utilities_AllPub: -0.014314296147248952

Dropping column: Utilities_NoSeWa: 0.004144823294072492

Dropping column: LotConfig_CulDSac: -0.0068588907827759355

Dropping column: LotConfig_FR2: 0.018185562475503233

Dropping column: LandSlope_Gtl: 0.04298115994437

In [33]:
x = df3.drop(['SalePrice'],axis=1)

In [34]:
x_train = x.iloc[:1460,:]
y_train = y.iloc[:1460]
x_test = x.iloc[1460:,:]
y_test = y.iloc[1460:]

# Correlation of target variables with all the features

In [35]:
for t in range(1,19):
    df3 = x2.copy()
    df3['SalePrice'] = y
    corr = df3.corr(method='pearson')['SalePrice']
    flag = 0
    for x in range(len(corr)):
        if corr[x] < 0.05 and corr[x] > -0.05:
            flag += 1

            df3 = df3.drop([x2.columns[x]], axis=1)
    print(t/100)
    # print(f"{flag}")

0.01
0.02
0.03
0.04
0.05
0.06
0.07
0.08
0.09
0.1
0.11
0.12
0.13
0.14
0.15
0.16
0.17
0.18


# Low Variance Filter

In [37]:
df11 = df3.copy()
var = df11.var()
i = 0
for x in range(len(var)):
    if var[x] < t/1000:
        i += 1
        # print(var[x:x+1])
        df11 = df11.drop([df3.columns[x]], axis=1)
        #print(f"dropping: {df3.columns[x]}")
print(f'Columns dropped:{i}')

# Model

# Splitting of data

In [39]:
dataset = df11

In [40]:
x_train = dataset.iloc[:1460,:]
y_train = dataset.iloc[:1460]
x_test = dataset.iloc[1460:,:]
y_test = dataset.iloc[1460:]

In [41]:
from sklearn.model_selection import train_test_split

In [43]:
x_tr, x_val, y_tr, y_val = train_test_split(dataset.iloc[:1460,:], dataset.iloc[:1460], test_size = 0.3, random_state = 0)

# Standardizing the Data

In [None]:
'''
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(x_tr)
x_tr_scaled = scaler.transform(x_tr)
x_val_scaled = scaler.transform(x_val)
'''

# Gradient Boosting

In [44]:
from sklearn.ensemble import GradientBoostingRegressor as GB

# ElasticCV

In [45]:
from sklearn.linear_model import ElasticNetCV

In [46]:
ECV = ElasticNetCV()

In [None]:
paragrid = { l1_ratio=[0.5,0.7,0.9,0.95],
            n_alphas=[100,200,300]
            }