# Importing Packages

In [None]:
#### Import Modules
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="ticks", color_codes=True, font_scale=1.5)
color = sns.color_palette()
sns.set_style('darkgrid')
import pylab 
%matplotlib inline

import warnings
# Silence warnings for the rest of the notebook. The FutureWarning-only filter
# is followed by a blanket 'ignore' filter that covers every category.
warnings.simplefilter(action = 'ignore', category=FutureWarning)
warnings.filterwarnings('ignore')
def ignore_warn(*args, **kwargs):
    """No-op replacement for ``warnings.warn``: accepts any arguments and returns None."""
    pass
# NOTE(review): rebinding warnings.warn is process-wide, so warnings raised by
# every library are discarded, not just sklearn/seaborn ones - confirm intended.
warnings.warn = ignore_warn #ignore annoying warning (from sklearn and seaborn)


from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split, GridSearchCV

from scipy import stats

import statsmodels.api as sm
from statsmodels.formula.api import ols

# Reading Data

In [None]:
# Load the housing data from the Excel workbook (path is relative to the
# notebook's working directory) and preview the first rows.
df = pd.read_excel("data/HousingPrice.xls")
df.head()

In [None]:
df.info()

In [None]:
df.columns

##### Fixing Column Names

In [None]:
# Normalise the headers to snake_case: trim whitespace, lower-case, turn spaces
# and '/' into underscores, and remove ')'.
# regex=False is passed explicitly because ')' on its own is not a valid regular
# expression and the default value of `regex` differs between pandas versions.
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('/', '_', regex=False)
    .str.replace(')', '', regex=False)
)
df.columns

In [None]:
df.set_index('pid', inplace=True)  # use the 'pid' column as the row index
df.drop('order', axis =1, inplace=True)  # drop the unnecessary 'order' column

In [None]:
df.describe()

## Step 1 - Treatment of Continuous Variables

In [None]:
# Histogram (36 bins) with a KDE overlay of the target variable, sale_price.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases - confirm
# the installed version still provides it.
sns.distplot(df.sale_price, hist=True, kde=True, 
             bins=int(180/5), color = 'darkblue', 
             hist_kws={'edgecolor':'black'},
             kde_kws={'linewidth': 4});

In [None]:
numeric_cols = df.select_dtypes(np.number).columns

In [None]:
df_numeric = df[numeric_cols]

In [None]:
def des_stat(df, pred=None):
    """Build a per-column descriptive-statistics table for a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. The mean/median/std/skew/kurtosis (and correlation)
        calls are applied to every column, so the columns are expected to be
        numeric.
    pred : str, optional
        Name of a column of ``df``. When given, an extra column named
        ``'corr_' + pred`` holds each column's correlation with ``pred``.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the columns ``counts``,
        ``distincts``, ``nulls``, ``missing ratio`` (percent), ``mean_value``,
        ``median_value``, ``min_value``, ``max_value``, ``std_value``,
        ``skewness``, ``kurtosis`` and, if ``pred`` is given, ``corr_<pred>``.

    Side effects
    ------------
    Prints the shape of ``df`` and a separator line.
    """
    obs = df.shape[0]
    nulls = df.isnull().sum()

    # Insertion order here is the column order of the returned table.
    summary_parts = {
        'counts': df.count(),
        # NaN counts as its own distinct value, matching Series.unique().
        'distincts': df.nunique(dropna=False),
        'nulls': nulls,
        'missing ratio': (nulls / obs) * 100,
        'mean_value': np.round(df.mean(), 2),
        'median_value': df.median(),
        'min_value': df.min(),
        'max_value': df.max(),
        'std_value': np.round(df.std(), 2),
        'skewness': np.round(df.skew(), 2),
        'kurtosis': np.round(df.kurt(), 2),
    }

    print('Data shape:', df.shape)

    if pred is not None:
        summary_parts['corr_' + pred] = df.corr()[pred]

    # Both branches now share one code path; previously the pred=None branch
    # stored its result under a different name and the function crashed.
    summary = pd.concat(list(summary_parts.values()), axis=1, sort=False)
    summary.columns = list(summary_parts.keys())
    print('___________________________')
    return summary

In [None]:
# Summarise every numeric column, including its correlation with sale_price,
# and show the table ordered from the highest correlation down.
details = des_stat(df_numeric, 'sale_price')
display(details.sort_values(by='corr_sale_price', ascending=False))

In [None]:
# Year sold is not a continuous quantity, so store it with object dtype to have
# it treated as a categorical variable.
df['yr_sold'] = df.yr_sold.astype('object')

In [None]:
# Histogram (36 bins) with a KDE overlay of the raw lot_area values.
sns.distplot(df.lot_area, hist=True, kde=True, 
             bins=int(180/5), color = 'darkblue', 
             hist_kws={'edgecolor':'black'},
             kde_kws={'linewidth': 4});

## Correlation Between Continuous Variables

In [None]:
df.select_dtypes(np.number).corr().sort_values(by = 'sale_price', ascending =False)

##### Outlier Removal for Lot Area

In [None]:
# Box-plot whiskers for lot_area: 1.5 * IQR beyond the first and third quartiles.
q25, q75 = np.percentile(df.lot_area, [25, 75])
iqr = q75 - q25
lower_whisker = q25 - 1.5 * iqr
upper_whisker = q75 + 1.5 * iqr
def check(x, ul, ll):
    """Return ``x`` when ``ll <= x <= ul`` (bounds inclusive), otherwise ``None``."""
    inside_limits = ll <= x <= ul
    return x if inside_limits else None

In [None]:
df.lot_area.mean()

In [None]:
# Share of rows whose lot_area falls outside [lower_whisker, upper_whisker].
# Missing values fail the range test and are therefore counted as outliers.
# The label previously named an unrelated column from another notebook.
outlier_mask = ~df.lot_area.between(lower_whisker, upper_whisker)
print("Percentage of Outliers in lot_area:", outlier_mask.sum() / len(df) * 100)

In [None]:
# Mean lot_area computed only over rows that lie within the whiskers.
within_whiskers = df.lot_area.apply(check, args=(upper_whisker, lower_whisker)).notnull()
mean_lot_area = df.loc[within_whiskers, 'lot_area'].mean()
print(mean_lot_area)

In [None]:
def transform_lot_area(x):
    """Replace a lot area above ``upper_whisker`` with ``mean_lot_area``.

    Values at or below the upper whisker (including values below the lower
    whisker) are returned unchanged. Reads the notebook-level names
    ``upper_whisker`` and ``mean_lot_area``.
    """
    if x > upper_whisker:
        return mean_lot_area
    return x

# The function is passed directly; wrapping it in a lambda added nothing.
df['lot_area_mean'] = df['lot_area'].apply(transform_lot_area)
# Title corrected: the plotted column is lot area, not a trip duration.
df['lot_area_mean'].plot.hist(bins=100, title='Frequency distribution of mean transformed Lot Area');

In [None]:
# Histogram (36 bins) with a KDE overlay of lot_area_mean, i.e. lot area after
# high outliers were replaced by the in-range mean.
sns.distplot(df.lot_area_mean, hist=True, kde=True, 
             bins=int(180/5), color = 'darkblue', 
             hist_kws={'edgecolor':'black'},
             kde_kws={'linewidth': 4});

In [None]:
# Correlation of every numeric column with sale_price, strongest first.
# Non-numeric columns are excluded explicitly: DataFrame.corr() raises on
# object columns in recent pandas versions instead of silently skipping them.
df.select_dtypes(np.number).corr()['sale_price'].sort_values(ascending=False)

### Can the correlation between lot area and sale price be improved?

In [None]:
# NOTE: this rebinds transform_lot_area, replacing the mean-imputing version
# defined in an earlier cell; cells run after this one see only this version.
def transform_lot_area(x):
    """Cap a lot area at ``upper_whisker``.

    Values at or below the upper whisker (including values below the lower
    whisker) are returned unchanged. Reads the notebook-level name
    ``upper_whisker``.
    """
    if x > upper_whisker:
        return upper_whisker
    return x

# The function is passed directly; wrapping it in a lambda added nothing.
df['lot_area_upper'] = df['lot_area'].apply(transform_lot_area)
# Title corrected: the plotted column is lot area, not a trip duration.
df['lot_area_upper'].plot.hist(bins=100, title='Frequency distribution of Upper Limit transformed Lot Area');

In [None]:
# Histogram (36 bins) with a KDE overlay of lot_area_upper, i.e. lot area
# capped at the upper whisker.
sns.distplot(df.lot_area_upper, hist=True, kde=True, 
             bins=int(180/5), color = 'darkblue', 
             hist_kws={'edgecolor':'black'},
             kde_kws={'linewidth': 4});

In [None]:
# Correlation of every numeric column with sale_price, strongest first.
# Non-numeric columns are excluded explicitly: DataFrame.corr() raises on
# object columns in recent pandas versions instead of silently skipping them.
df.select_dtypes(np.number).corr()['sale_price'].sort_values(ascending=False)

In [None]:
# Combined floor feature: the sum of the f_flr and s_flr columns.
df['first_second_flr'] = df.f_flr + df.s_flr
# Restrict to numeric columns before correlating; DataFrame.corr() raises on
# object columns in recent pandas versions.
df.select_dtypes(np.number).corr()['sale_price'].sort_values(ascending=False)

In [None]:
# Drop low-correlation and unnecessary columns: the raw lot_area and its
# mean-imputed variant (lot_area_upper is kept), overall_cond, and the two
# floor columns that were summed into first_second_flr.
df.drop(['lot_area', 'overall_cond', 'f_flr', 's_flr', 'lot_area_mean'], axis=1, inplace=True)

In [None]:
# Correlation of the remaining numeric columns with sale_price, strongest first.
# Non-numeric columns are excluded explicitly: DataFrame.corr() raises on
# object columns in recent pandas versions instead of silently skipping them.
df.select_dtypes(np.number).corr()['sale_price'].sort_values(ascending=False)

# Step-2 Treatment of Nominal/Categorical Variables

In [None]:
df.select_dtypes('object').columns

In [None]:
pd.factorize(df.central_air)

In [None]:
# Integer-encode central_air: pd.factorize returns (codes, uniques) and only
# the codes are kept as a new numeric column.
df['central_air_num'] = pd.factorize(df.central_air)[0]
df.head()

In [None]:
# Correlation of every numeric column (now including central_air_num) with
# sale_price, strongest first. Non-numeric columns are excluded explicitly:
# DataFrame.corr() raises on object columns in recent pandas versions.
df.select_dtypes(np.number).corr()['sale_price'].sort_values(ascending=False)

In [None]:
# Box plot of sale_price per central_air category, drawn in the first cell of a
# 2x2 subplot grid.
fig = plt.figure(figsize=(20, 15))
sns.set(font_scale=1.5)

fig1 = fig.add_subplot(221); sns.boxplot(x='central_air', y='sale_price', data=df[['central_air', 'sale_price']]);

#### ANOVA Test

##### 1) Central Air Condition

In [None]:
# Fit an OLS model of sale_price on central_air and show its ANOVA table.
mod1 = ols('sale_price ~ central_air', data=df).fit()
sm.stats.anova_lm(mod1)

##### 2) MS Zoning

In [None]:
df.ms_zoning.value_counts(normalize=True, dropna=False)

In [None]:
# Box plot of sale_price per ms_zoning category, drawn in the first cell of a
# 2x2 subplot grid.
fig = plt.figure(figsize=(20, 15))
sns.set(font_scale=1.5)

fig1 = fig.add_subplot(221); sns.boxplot(x='ms_zoning', y='sale_price', data=df[['ms_zoning', 'sale_price']]);

In [None]:
# Fit an OLS model of sale_price on ms_zoning and show its ANOVA table.
mod2 = ols('sale_price ~ ms_zoning', data=df).fit()
sm.stats.anova_lm(mod2)

##### 3) Lot Shape

In [None]:
df.lot_shape.value_counts(normalize=True, dropna=False)

In [None]:
# Box plot of sale_price per lot_shape category, drawn in the first cell of a
# 2x2 subplot grid.
fig = plt.figure(figsize=(20, 15))
sns.set(font_scale=1.5)

fig1 = fig.add_subplot(221); sns.boxplot(x='lot_shape', y='sale_price', data=df[['lot_shape', 'sale_price']]);

In [None]:
# Fit an OLS model of sale_price on lot_shape (original categories) and show
# its ANOVA table.
mod3= ols('sale_price ~ lot_shape', data=df).fit()
sm.stats.anova_lm(mod3)

In [None]:
# Collapse the irregular lot shapes (IR1/IR2/IR3) into a single 'IR' level;
# 'Reg' is left as is. The result is assigned back to the column because an
# in-place replace on `df.lot_shape` is a chained operation that does not
# update `df` when pandas Copy-on-Write is active.
df['lot_shape'] = df['lot_shape'].replace({'IR1': 'IR', 'IR2': 'IR', 'IR3': 'IR'})
df.lot_shape.value_counts(normalize=True, dropna=False)

In [None]:
# Refit the lot_shape model after the irregular levels were merged and show
# the new ANOVA table (overwrites the earlier mod3).
mod3= ols('sale_price ~ lot_shape', data=df).fit()
sm.stats.anova_lm(mod3)

##### 4) Utilities

In [None]:
df.utilities.value_counts(normalize=True, dropna=False)

In [None]:
# Box plot of sale_price per utilities category, drawn in the first cell of a
# 2x2 subplot grid.
fig = plt.figure(figsize=(20, 15))
sns.set(font_scale=1.5)

fig1 = fig.add_subplot(221); sns.boxplot(x='utilities', y='sale_price', data=df[['utilities', 'sale_price']]);

In [None]:
# Fit an OLS model of sale_price on utilities and show its ANOVA table.
mod4= ols('sale_price ~ utilities', data=df).fit()
sm.stats.anova_lm(mod4)

##### 5) Building Type

In [None]:
df.bldg_type.value_counts(normalize=True, dropna=False)

In [None]:
# Box plot of sale_price per bldg_type category (original levels), drawn in the
# first cell of a 2x2 subplot grid.
fig = plt.figure(figsize=(20, 15))
sns.set(font_scale=1.5)

fig1 = fig.add_subplot(221); sns.boxplot(x='bldg_type', y='sale_price', data=df[['bldg_type', 'sale_price']]);

In [None]:
# Regroup building types: '1Fam' becomes 'Family'; 'Twnhs', 'Duplex' and
# '2fmCon' are pooled into 'Others'; 'TwnhsE' is left as is. The result is
# assigned back to the column because an in-place replace on `df.bldg_type` is
# a chained operation that does not update `df` under pandas Copy-on-Write.
df['bldg_type'] = df['bldg_type'].replace({'1Fam': 'Family', 'Twnhs': 'Others',
                                           'Duplex': 'Others', '2fmCon': 'Others'})
df.bldg_type.value_counts(normalize=True, dropna=False)

In [None]:
# Box plot of sale_price per bldg_type category after regrouping, drawn in the
# first cell of a 2x2 subplot grid.
fig = plt.figure(figsize=(20, 15))
sns.set(font_scale=1.5)

fig1 = fig.add_subplot(221); sns.boxplot(x='bldg_type', y='sale_price', data=df[['bldg_type', 'sale_price']]);

In [None]:
# Fit an OLS model of sale_price on the regrouped bldg_type and show its ANOVA table.
mod5= ols('sale_price ~ bldg_type', data=df).fit()
sm.stats.anova_lm(mod5)

##### 6) House Style

In [None]:
df.house_style.value_counts(normalize=True, dropna=False)

In [None]:
# Box plot of sale_price per house_style category, drawn in the first cell of a
# 2x2 subplot grid.
fig = plt.figure(figsize=(20, 15))
sns.set(font_scale=1.5)

fig1 = fig.add_subplot(221); sns.boxplot(x='house_style', y='sale_price', data=df[['house_style', 'sale_price']]);

In [None]:
# Fit an OLS model of sale_price on house_style and show its ANOVA table.
mod6= ols('sale_price ~ house_style', data=df).fit()
sm.stats.anova_lm(mod6)

##### 7) Foundation

In [None]:
df.foundation.value_counts(normalize=True, dropna=False)

In [None]:
# Box plot of sale_price per foundation category, drawn in the first cell of a
# 2x2 subplot grid.
fig = plt.figure(figsize=(20, 15))
sns.set(font_scale=1.5)

fig1 = fig.add_subplot(221); sns.boxplot(x='foundation', y='sale_price', data=df[['foundation', 'sale_price']]);

In [None]:
# Fit an OLS model of sale_price on foundation and show its ANOVA table.
mod7= ols('sale_price ~ foundation', data=df).fit()
sm.stats.anova_lm(mod7)

##### 8) Basement Quality

In [None]:
df.bsmt_qual.value_counts(normalize=True, dropna=False)

In [None]:
# Fill missing bsmt_qual values with the most frequent category.
# Series.mode() returns a Series (several entries when there is a tie), so the
# first entry is taken explicitly; passing the whole Series to np.where only
# works while there is exactly one mode.
df['bsmt_qual'] = df['bsmt_qual'].fillna(df['bsmt_qual'].mode().iloc[0])

In [None]:
df.bsmt_qual.value_counts(normalize=True, dropna=False)

In [None]:
# Box plot of sale_price per bsmt_qual category, drawn in the first cell of a
# 2x2 subplot grid.
fig = plt.figure(figsize=(20, 15))
sns.set(font_scale=1.5)

fig1 = fig.add_subplot(221); sns.boxplot(x='bsmt_qual', y='sale_price', data=df[['bsmt_qual', 'sale_price']]);

In [None]:
# Fit an OLS model of sale_price on bsmt_qual (original levels) and show its
# ANOVA table.
mod8= ols('sale_price ~ bsmt_qual', data=df).fit()
sm.stats.anova_lm(mod8)

In [None]:
# Pool the 'Fa' and 'Po' basement-quality levels into 'TA'; 'TA', 'Gd' and 'Ex'
# are left as is. The result is assigned back to the column because an in-place
# replace on `df.bsmt_qual` is a chained operation that does not update `df`
# under pandas Copy-on-Write.
df['bsmt_qual'] = df['bsmt_qual'].replace({'Fa': 'TA', 'Po': 'TA'})

In [None]:
# Refit the bsmt_qual model after pooling levels and show the new ANOVA table
# (overwrites the earlier mod8).
mod8= ols('sale_price ~ bsmt_qual', data=df).fit()
sm.stats.anova_lm(mod8)

In [None]:
df.kitchen_qual.value_counts(normalize=True, dropna=False)


In [None]:
# Box plot of sale_price per kitchen_qual category, drawn in the first cell of a
# 2x2 subplot grid.
# NOTE(review): the same plot is repeated under the "9) Kitchen Quality"
# heading further down - this cell looks like a leftover duplicate.
fig = plt.figure(figsize=(20, 15))
sns.set(font_scale=1.5)

fig1 = fig.add_subplot(221); sns.boxplot(x='kitchen_qual', y='sale_price', data=df[['kitchen_qual', 'sale_price']]);

##### 9) Kitchen Quality

In [None]:
df.kitchen_qual.value_counts(normalize=True, dropna=False)

In [None]:
# Box plot of sale_price per kitchen_qual category (original levels), drawn in
# the first cell of a 2x2 subplot grid.
fig = plt.figure(figsize=(20, 15))
sns.set(font_scale=1.5)

fig1 = fig.add_subplot(221); sns.boxplot(x='kitchen_qual', y='sale_price', data=df[['kitchen_qual', 'sale_price']]);

In [None]:
# Pool the 'Fa' and 'Po' kitchen-quality levels into 'TA'; 'TA', 'Gd' and 'Ex'
# are left as is. The result is assigned back to the column because an in-place
# replace on `df.kitchen_qual` is a chained operation that does not update `df`
# under pandas Copy-on-Write.
df['kitchen_qual'] = df['kitchen_qual'].replace({'Fa': 'TA', 'Po': 'TA'})

In [None]:
# Box plot of sale_price per kitchen_qual category after pooling levels, drawn
# in the first cell of a 2x2 subplot grid.
fig = plt.figure(figsize=(20, 15))
sns.set(font_scale=1.5)

fig1 = fig.add_subplot(221); sns.boxplot(x='kitchen_qual', y='sale_price', data=df[['kitchen_qual', 'sale_price']]);

In [None]:
# Fit an OLS model of sale_price on the pooled kitchen_qual and show its ANOVA table.
mod9= ols('sale_price ~ kitchen_qual', data=df).fit()
sm.stats.anova_lm(mod9)

##### Are Kitchen Quality and Basement Quality associated with each other? If so, how can we identify it?
![](img/ques_confuse.jpg)

### CHI-SQUARE Test

In [None]:
# Cross-tabulate bsmt_qual (rows) against kitchen_qual (columns);
# margins=True appends the row and column totals.
contingency_table = pd.crosstab(
    df.bsmt_qual,
    df.kitchen_qual,
    margins = True
)
contingency_table

In [None]:
# Observed frequencies for the chi-square test: the cross-tab without its
# totals. crosstab(margins=True) appends the totals as the last row and last
# column, so dropping those works for any number of categories instead of
# assuming exactly three levels on each axis.
f_obs = contingency_table.iloc[:-1, :-1].values
f_obs

In [None]:
stats.chi2_contingency(f_obs)[0:3]

##### 10) Year Sold - Dropped

In [None]:
df.yr_sold.value_counts()

In [None]:
# Box plot of sale_price per yr_sold value, drawn in the first cell of a 2x2
# subplot grid.
fig = plt.figure(figsize=(20, 15))
sns.set(font_scale=1.5)

fig1 = fig.add_subplot(221); sns.boxplot(x='yr_sold', y='sale_price', data=df[['yr_sold', 'sale_price']]);

##### 11) Condition

In [None]:
df.condition_1.value_counts(normalize=True, dropna=False)

In [None]:
df.condition_2.value_counts(normalize=True, dropna=False)

In [None]:
df[['condition_1', 'condition_2']].head()

In [None]:
# condition_3 keeps condition_2 only where it differs from condition_1; rows
# where the two are equal get NaN so the same value is not counted twice.
df['condition_3'] = np.where(df.condition_1 == df.condition_2, np.nan, df.condition_2)
df[['condition_1', 'condition_2', 'condition_3']].head()

In [None]:
# Build one '|'-separated string per row: condition_1 alone when condition_3 is
# missing, otherwise "condition_1|condition_3".
df['cond']= np.where(df.condition_3.isnull(), df.condition_1, df.condition_1 + '|' + df.condition_3)
df.cond.head()

In [None]:
# Split the '|'-separated condition string into one indicator column per
# distinct condition value.
cond = df.cond.str.get_dummies('|').copy()

In [None]:
drop_cols = ['utilities', 'condition_1', 'condition_2',  'condition_3',  'cond', 'yr_sold', 'central_air_num']

In [None]:
df.drop(drop_cols, axis=1, inplace=True)

In [None]:
# Attach the condition indicator columns to df by matching on the row index.
# NOTE(review): an index-on-index merge multiplies rows if the index contains
# duplicates - presumably pid is unique; verify.
df_final = df.merge(cond, left_index=True, right_index=True)

In [None]:
df_final.info()

##### Converting Categorical Variables to Dummy Variables

In [None]:
# Replace the remaining categorical columns with indicator (dummy) columns,
# using pd.get_dummies with its default settings, then list the resulting
# columns and dtypes.
df_final = pd.get_dummies(df_final)
df_final.info()

In [None]:
df_final.columns

In [None]:
# Explicit column order for modelling. sale_price is placed last because the
# model cells slice the last column off as the target.
# NOTE(review): the dummy-column names are hard-coded and must match what
# pd.get_dummies produced for this data set - verify after any data change.
cols_ord = ['overall_qual', 'year_built', 'year_remod_add', 
       'lot_area_upper', 'first_second_flr', 'Artery',
       'Feedr', 'Norm', 'PosA', 'PosN', 'RRAe', 'RRAn', 'RRNe', 'RRNn',
       'ms_zoning_A (agr)', 'ms_zoning_C (all)', 'ms_zoning_FV',
       'ms_zoning_I (all)', 'ms_zoning_RH', 'ms_zoning_RL', 'ms_zoning_RM',
       'lot_shape_IR', 'lot_shape_Reg', 'bldg_type_Family', 'bldg_type_Others',
       'bldg_type_TwnhsE', 'house_style_1.5Fin', 'house_style_1.5Unf',
       'house_style_1Story', 'house_style_2.5Fin', 'house_style_2.5Unf',
       'house_style_2Story', 'house_style_SFoyer', 'house_style_SLvl',
       'foundation_BrkTil', 'foundation_CBlock', 'foundation_PConc',
       'foundation_Slab', 'foundation_Stone', 'foundation_Wood',
       'bsmt_qual_Ex', 'bsmt_qual_Gd', 'bsmt_qual_TA', 'central_air_N',
       'central_air_Y', 'kitchen_qual_Ex', 'kitchen_qual_Gd',
        'kitchen_qual_TA', 'sale_price']

In [None]:
df_final = df_final[cols_ord].copy()

# Building Model

In [None]:
df_final.head()

In [None]:
# Feature matrix = every column except the last; target = the last column
# (sale_price, given the column order set above). Both are NumPy arrays.
X = df_final.iloc[:, 0:-1].values
y = df_final.iloc[:, -1].values

In [None]:
y

In [None]:
# Hold out a test set. No test_size is given, so scikit-learn's default split
# proportion applies; random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 50)

In [None]:
# Report the shape of each piece of the train/test split.
for shape_label, split_array in (
    ("X_train Shape:", X_train),
    ("X_test Shape:", X_test),
    ("y_train Shape:", y_train),
    ("y_test Shape:", y_test),
):
    print(shape_label, split_array.shape)

#### Base Model

In [None]:
# Baseline: ordinary least squares fitted on the training split, with
# predictions kept for both splits so they can be scored below.
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_train = lr.predict(X_train)
y_pred_test = lr.predict(X_test) 

In [None]:
# R-squared of the baseline model on each split, rounded to two decimals.
baseline_r2_train = np.round(r2_score(y_train, y_pred_train), 2)
baseline_r2_test = np.round(r2_score(y_test, y_pred_test), 2)
print("R2 Training Score for the Baseline Model:", baseline_r2_train)
print("R2 Test Score for the Baseline Model:", baseline_r2_test)


In [None]:
# Test-set error of the baseline model: MSE and its square root (RMSE).
baseline_mse = mean_squared_error(y_test, y_pred_test)
print("Mean Squared Error:", baseline_mse)
print('___________________________')
print("RMSE:", np.sqrt(baseline_mse))

#### Ridge Model

In [None]:
# Ridge regression with the estimator's default settings; .score() reports R^2.
ridge = Ridge().fit(X_train, y_train)
print("Training set score: {:.2f}".format(ridge.score(X_train, y_train)))
print("Test set score: {:.2f}".format(ridge.score(X_test, y_test)))


In [None]:
# Ridge regression with a stronger penalty (alpha=2).
# NOTE(review): the variable is called ridge10 but alpha is 2 - confirm which
# value was intended.
ridge10 = Ridge(alpha=2).fit(X_train, y_train)
print("Training set score: {:.2f}".format(ridge10.score(X_train, y_train)))
print("Test set score: {:.2f}".format(ridge10.score(X_test, y_test)))

#### Lasso

In [None]:
# Lasso with the estimator's default settings; also report how many
# coefficients are non-zero, i.e. how many features the model actually uses.
lasso = Lasso().fit(X_train, y_train)
print("Training set score: {:.2f}".format(lasso.score(X_train, y_train)))
print("Test set score: {:.2f}".format(lasso.score(X_test, y_test)))
print("Number of features used: {}".format(np.sum(lasso.coef_ != 0)))

In [None]:
# Lasso with a much weaker penalty (alpha=0.01) and a raised iteration cap.
lasso001 = Lasso(alpha=0.01, max_iter=100000).fit(X_train, y_train)
print("Training set score: {:.2f}".format(lasso001.score(X_train, y_train)))
print("Test set score: {:.2f}".format(lasso001.score(X_test, y_test)))
print("Number of features used: {}".format(np.sum(lasso001.coef_ != 0)))

In [None]:
# Hyper-parameter grid for the Lasso search: every combination of penalty
# strength (alpha) and iteration cap (max_iter) listed here is tried.
param_grid = {'alpha': [10, 50, 75, 90, 100, 150, 500, 1000],
'max_iter': [500, 700, 1000, 2000, 3000]}
print("Parameter grid:\n{}".format(param_grid))

In [None]:
grid_search = GridSearchCV(Lasso(), param_grid)

In [None]:
grid_search.fit(X_train, y_train)

In [None]:
# Evaluate the best Lasso found by the grid search on the held-out test split.
grid_test_score = grid_search.score(X_test, y_test)
print("Test set score: {:.2f}".format(grid_test_score))
print("Best parameters: {}".format(grid_search.best_params_))

# y_pred_test is rebound here to the grid-search predictions.
y_pred_test = grid_search.predict(X_test)
grid_mse = mean_squared_error(y_test, y_pred_test)
print("Mean Squared Error:", grid_mse)
print('___________________________')
print("RMSE:", np.sqrt(grid_mse))