In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 
import matplotlib.pyplot as plt 
import seaborn as sns
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from scipy.stats import norm
from scipy import stats
from scipy.stats import skew
from scipy.stats.stats import pearsonr

pd.set_option('display.max_columns', 81)



# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
#print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.


In [None]:
# Preparing the TRAINING DATASET
# --------------- load the training data and keep a reference copy ---------------
dfhouse_raw = pd.read_csv('../input/train.csv')   # reference copy of the file as loaded
dfhouse = dfhouse_raw.copy()                      # working frame that the later cells transform
# Handy while exploring: dfhouse.head(), dfhouse.describe(), dfhouse.info()



**Analysis of the target variable 'SalePrice'**


In [None]:

# ----------------- GRAPHICAL analysis of the target variable 'SalePrice' -----------------
# Handles to the price column of the reference copy and of the working frame.
saleprice_raw = dfhouse_raw['SalePrice']
saleprice = dfhouse['SalePrice']

# Distribution plot of the sale prices.
# Author's reading of the plot: the distribution is right (positively) skewed, i.e. most
# houses sold at the lower end of the price range.
# A swarm plot (sns.swarmplot(dfhouse['SalePrice'])) was also tried; author's reading: most
# prices fall within 100K-400K, with outliers extending beyond 400K.
# Inference: remove the outliers.
sns.distplot(saleprice)



In [None]:
# Numeric summary of the sale price: descriptive statistics first, then the skewness.
price_column = dfhouse['SalePrice']
price_summary = price_column.describe()
price_skewness = price_column.skew()
print(price_summary)
print(price_skewness)

* * * **Relationship between OverallQual and SalePrice**

In [None]:
# Summary statistics of the overall-quality rating. Printed explicitly: a notebook cell only
# auto-displays its last expression, so the bare .describe() call used to be discarded.
print(dfhouse['OverallQual'].describe())

# Joint distribution of OverallQual and SalePrice with a fitted regression line.
# x / y are given by keyword as column names resolved through `data`; recent seaborn
# releases no longer accept them as positional arguments.
sns.jointplot(x='OverallQual', y='SalePrice', data=dfhouse, kind="reg")
# INFERENCE (author's reading of the plot): OverallQual and SalePrice share a linear relationship


* * * **Relationship between YearBuilt and SalePrice**

In [None]:
# Summary statistics of the construction year. Printed explicitly: a notebook cell only
# auto-displays its last expression, so the bare .describe() call used to be discarded.
print(dfhouse['YearBuilt'].describe())

# Joint distribution of YearBuilt and SalePrice with a fitted regression line.
# x / y are given by keyword as column names resolved through `data`; recent seaborn
# releases no longer accept them as positional arguments.
sns.jointplot(x='YearBuilt', y='SalePrice', data=dfhouse, kind="reg")
# INFERENCE (author's reading of the plot): YearBuilt and SalePrice share an exponential-looking
# relationship, with the sale price increasing for more recently built houses.


* * * **Relationship between TotRmsAbvGrd and SalePrice**

In [None]:
# Summary statistics of the TotRmsAbvGrd column. Printed explicitly: a notebook cell only
# auto-displays its last expression, so the bare .describe() call used to be discarded.
print(dfhouse['TotRmsAbvGrd'].describe())

# Joint distribution (default scatter + marginal histograms) of TotRmsAbvGrd and SalePrice.
# x / y are given by keyword as column names resolved through `data`; recent seaborn
# releases no longer accept them as positional arguments.
sns.jointplot(x='TotRmsAbvGrd', y='SalePrice', data=dfhouse)
# Author's reading of the plot: roughly normal distribution with the majority of values in
# the center, slightly left skewed.

In [None]:
# Scatter plot: GrLivArea (x) against SalePrice (y).
var = 'GrLivArea'
# Two-column frame holding just the target and the feature being plotted.
data = dfhouse[['SalePrice', var]]
data.plot.scatter(x=var, y='SalePrice', ylim=(0, 800000));
# Author's reading of the plot: houses with the smallest living areas have the lowest prices.


In [None]:
# Scatter plot: TotalBsmtSF (x) against SalePrice (y).
var = 'TotalBsmtSF'
# Two-column frame holding just the target and the feature being plotted.
data = dfhouse[['SalePrice', var]]
data.plot.scatter(x=var, y='SalePrice', ylim=(0, 800000));
# Author's reading of the plot: linear, proportional relationship.

**Creating a *CORRELATION* matrix to deduce relationships between various features**

In [None]:
# Zoomed heatmap restricted to the most valuable features:
# correlation matrix of the k columns most strongly correlated with SalePrice.
dftemp = dfhouse_raw

# Pearson correlation is only defined for numeric columns. Selecting them explicitly keeps
# this working on pandas versions where DataFrame.corr() no longer skips text columns.
corrmat = dftemp.select_dtypes(include=[np.number]).corr()

k = 10   # number of columns shown; SalePrice itself is one of them (self-correlation is 1)
cols = corrmat.nlargest(k, 'SalePrice')['SalePrice'].index

# Reuse the coefficients already computed above instead of recomputing them with
# np.corrcoef (which additionally turns every coefficient of a column containing NaN into NaN).
cm = corrmat.loc[cols, cols].values

# The previous `for i in range(1, 8, 10)` loop executed exactly once (i == 1) and ended by
# dropping two columns from a frame nothing read afterwards, so the figure is drawn once here.
f, ax = plt.subplots(figsize=(15, 10))
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10},
                 yticklabels=cols.values, xticklabels=cols.values, ax=ax)
plt.show()
    





**DATA CLEANING**

In [None]:
# Deleting extreme outlier points from the GrLivArea column.

# Show the two rows with the largest living area. Printed explicitly: a bare expression in
# the middle of a cell is evaluated but never displayed.
print(dfhouse.sort_values(by='GrLivArea', ascending=False)[['Id', 'GrLivArea', 'SalePrice']][:2])

# Ids of the rows to remove.
# NOTE(review): presumably these are the two rows printed above - confirm against the output.
OUTLIER_IDS = [1299, 524]

# Drop by Id. Re-running the cell is harmless: once the rows are gone nothing matches.
dfhouse = dfhouse.drop(dfhouse.index[dfhouse['Id'].isin(OUTLIER_IDS)])


In [None]:
# Applying a log transformation to SalePrice so the column is closer to normally distributed.

# The logarithm is taken from the reference copy (restricted to the rows still present in
# dfhouse) rather than from dfhouse itself, so re-running this cell cannot log the column twice.
dfhouse['SalePrice'] = np.log(dfhouse_raw.loc[dfhouse.index, 'SalePrice'])

# Histogram with a fitted normal curve, then a normal probability plot on its own figure.
sns.distplot(dfhouse['SalePrice'], fit=norm)
fig = plt.figure()
res = stats.probplot(dfhouse['SalePrice'], plot=plt)

In [None]:
# Applying a log transformation to GrLivArea so the column is closer to normally distributed.

# Untransformed values come from the reference copy (restricted to the rows still present in
# dfhouse), so the "before" plots and the transform stay correct when the cell is re-run.
grlivarea_original = dfhouse_raw.loc[dfhouse.index, 'GrLivArea']

# Inspecting the untransformed column: histogram with fitted normal + probability plot.
sns.distplot(grlivarea_original, fit=norm)
fig = plt.figure()
res = stats.probplot(grlivarea_original, plot=plt)

# Log transformation.
dfhouse['GrLivArea'] = np.log(grlivarea_original)

# Inspecting the transformed column. A new figure is opened first; without it the histogram
# would be drawn on top of the probability plot created just above.
fig = plt.figure()
sns.distplot(dfhouse['GrLivArea'], fit=norm)
fig = plt.figure()
res = stats.probplot(dfhouse['GrLivArea'], plot=plt)


In [None]:
# TotalBsmtSF contains zeros (houses without a basement), so a plain log transformation of the
# whole column is not possible. Instead:
#   * HasBsmt flags whether a basement exists (one binary column is enough): 1 if area > 0, else 0
#   * the logarithm is applied only to the strictly positive areas; zeros stay zero

# Untransformed areas come from the reference copy (restricted to the rows still present in
# dfhouse), so re-running this cell cannot log the column twice. Cast to float up front
# because the logged values are written back into the same column.
bsmt_area = dfhouse_raw.loc[dfhouse.index, 'TotalBsmtSF'].astype(float)
has_bsmt = bsmt_area > 0

dfhouse['HasBsmt'] = has_bsmt.astype(int)

# Log transformation on the non-zero data only (avoids evaluating log(0)).
bsmt_area[has_bsmt] = np.log(bsmt_area[has_bsmt])
dfhouse['TotalBsmtSF'] = bsmt_area

# Histogram and normal probability plot of the transformed, non-zero basement areas.
bsmt_logged = dfhouse.loc[dfhouse['HasBsmt'] == 1, 'TotalBsmtSF']
sns.distplot(bsmt_logged, fit=norm);
fig = plt.figure()
res = stats.probplot(bsmt_logged, plot=plt)





**COMPUTING AND FILLING IN MISSING VALUES**

In [None]:
# Convert the categorical (text) columns into dummy / indicator columns.
dfhouse = pd.get_dummies(dfhouse)

# Peek at the row labelled 20 to check the encoding. `.loc` replaces `.ix`, which has been
# removed from pandas; on this integer index `.ix[20, :]` was a label lookup as well.
print(dfhouse.loc[20, :])

# Fill the remaining missing values with the mean of their column.
dfhouse = dfhouse.fillna(dfhouse.mean())
print(dfhouse.MSZoning_RL.dtype)

> **PREPARING AND CLEANING THE TEST DATA**

In [None]:
# Load the test data and keep a reference copy of the file as loaded.
dfhouseTest = pd.read_csv('../input/test.csv')
dfhouseTest_raw = dfhouseTest.copy()

# Apply the same feature transforms that the training frame received, so that a model fitted
# on the training features sees test features on the same scale:
#   * log of GrLivArea
#   * HasBsmt flag, and log of TotalBsmtSF for strictly positive areas only
# NOTE(review): assumes every test GrLivArea is strictly positive, as the training transform does.
dfhouseTest['GrLivArea'] = np.log(dfhouseTest['GrLivArea'])

test_bsmt_area = dfhouseTest['TotalBsmtSF'].astype(float)
test_has_bsmt = test_bsmt_area > 0          # missing areas compare False and are left missing
dfhouseTest['HasBsmt'] = test_has_bsmt.astype(int)
test_bsmt_area[test_has_bsmt] = np.log(test_bsmt_area[test_has_bsmt])
dfhouseTest['TotalBsmtSF'] = test_bsmt_area

# Convert the categorical (text) columns into dummy / indicator columns.
dfhouseTest = pd.get_dummies(dfhouseTest)

# Peek at the row labelled 20 of the TEST frame (this used to print the training frame, and
# used `.ix`, which has been removed from pandas).
print(dfhouseTest.loc[20, :])

# Fill the remaining missing values with the mean of their column.
dfhouseTest = dfhouseTest.fillna(dfhouseTest.mean())
print(dfhouseTest.MSZoning_RL.dtype)

In [None]:
# Column overview of the prepared test frame (dtype and non-null count per column).
# Earlier experiment, left disabled: dropping a hand-picked list of columns from both frames.
#dfhouse = dfhouse.drop(columns = ['SalePrice','Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature'])
#dfhouseTest = dfhouseTest.drop(columns = ['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature'])
dfhouseTest.info()
# Disabled; note that `shape` is an attribute, so the call form below would not work as written.
#dfhouseTest.shape()

**LINEAR REGRESSION**

In [None]:
from sklearn.linear_model import LinearRegression

# ---- build row- and column-aligned inputs for the model ----

# Keep only the training columns that also exist in the test frame. This removes SalePrice
# and any column that was created for the training frame only.
cols_to_drop = dfhouse.columns.difference(dfhouseTest.columns)
dfhouse_temp = dfhouse.drop(columns=cols_to_drop)

# Select the test columns BY NAME in the training order, so both matrices have identical
# columns even if the two frames were encoded with a different set or order of dummy columns.
dfhouseTest_temp = dfhouseTest[dfhouse_temp.columns]

# Target: the sale prices as loaded from the file, restricted to the rows that survived the
# outlier removal. The previous `saleprice` series had been captured before those rows were
# dropped, so it had more rows than the feature matrix and fit() could not accept it.
# NOTE(review): the log-transformed dfhouse['SalePrice'] is not used as the target here, so
# predictions are in the original price units - confirm this is intended.
saleprice = dfhouse_raw.loc[dfhouse_temp.index, 'SalePrice']

# printing to check the dimensions
print(dfhouse_temp.shape)
print(saleprice.shape)
print(dfhouseTest_temp.shape)

# ---- fit and predict ----
regressor = LinearRegression()
regressor.fit(dfhouse_temp, saleprice)
predicted_result = regressor.predict(dfhouseTest_temp)

# R^2 on the training data. (Scoring the test features against the model's own predictions,
# as was done before, always yields 1.0 and carries no information; the test set has no labels.)
print("training R^2: ", regressor.score(dfhouse_temp, saleprice))


In [None]:
#submission code
# One row per test house: its Id (taken from the reference copy of the test file) and the
# predicted sale price. The previous version put the training targets next to the test
# predictions; those two have different lengths and do not describe the same houses.
# NOTE(review): presumably Id / SalePrice is the layout the competition expects, and
# predicted_result is in price units (the regressor is fitted on `saleprice`) - confirm both.
submission = pd.DataFrame({'Id': dfhouseTest_raw['Id'], 'SalePrice': predicted_result})
submission.to_csv('submissionHouses.csv', index=False)


In [None]:
#applying ridge regression to improve model
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

# ---- build row- and column-aligned inputs ----

# Keep only the training columns that also exist in the test frame, then select the test
# columns by name in the same order.
cols_to_drop = dfhouse.columns.difference(dfhouseTest.columns)
dfhouse_temp = dfhouse.drop(columns=cols_to_drop)
dfhouseTest_temp = dfhouseTest[dfhouse_temp.columns]

# Target restricted to the rows still present in the training frame, so X and y have the
# same number of rows. Values are the sale prices as loaded from the file.
saleprice = dfhouse_raw.loc[dfhouse_temp.index, 'SalePrice']

# converting the frames into 2-D numpy arrays (cast to float so indicator columns mix cleanly)
X_train = dfhouse_temp.values.astype(float)
y_train = saleprice.values
X_test = dfhouseTest_temp.values.astype(float)

# ---- models: every model is fitted on the TRAINING data only ----
# (Previously the models were refitted on the test features using predictions as labels,
# which overwrote the trained models and made every reported score meaningless.)

# Ordinary least squares baseline (alpha = 0 equivalent) and its test-set predictions.
ridreg = LinearRegression()
ridreg.fit(X_train, y_train)
y_test = ridreg.predict(X_test)   # predictions, NOT ground truth: the test set has no labels

alpha1 = Ridge(alpha=0.01)
alpha1.fit(X_train, y_train)

alpha2 = Ridge(alpha=100)
alpha2.fit(X_train, y_train)

# ---- scores ----
# "train": R^2 on the data the model was fitted on.
# "test" : mean R^2 over 5 cross-validation folds of the training data, i.e. on rows each
#          fold's model has not seen. This is the stand-in for the unlabeled test set.
Ridge_alpha1_score_train = alpha1.score(X_train, y_train)
Ridge_alpha1_score_test = cross_val_score(Ridge(alpha=0.01), X_train, y_train, cv=5).mean()
Ridge_alpha2_score_train = alpha2.score(X_train, y_train)
Ridge_alpha2_score_test = cross_val_score(Ridge(alpha=100), X_train, y_train, cv=5).mean()

print("ridge regression train score 1 alpha: ", Ridge_alpha1_score_train)
print("ridge regression held-out (5-fold CV) score 1 alpha: ", Ridge_alpha1_score_test)
print("ridge regression train score 2 alpha: ", Ridge_alpha2_score_train)
print("ridge regression held-out (5-fold CV) score 2 alpha: ", Ridge_alpha2_score_test)



In [None]:
# Implementing lasso regression on the X_train / y_train / X_test arrays built for ridge.
from sklearn.linear_model import Lasso
from sklearn.model_selection import cross_val_score

# Iteration cap for the solver. Must be an integer: the previous literal 10e5 is a float,
# which recent scikit-learn versions reject during parameter validation.
LASSO_MAX_ITER = 1000000

# ---- alpha = 1 ----
lasso1 = Lasso(alpha=1, max_iter=LASSO_MAX_ITER)
lasso1.fit(X_train, y_train)
train_score1 = lasso1.score(X_train, y_train)
y_test1 = lasso1.predict(X_test)   # test-set predictions (the test set has no labels)
# Held-out estimate: mean R^2 over 5 cross-validation folds of the training data. Scoring the
# test features against the model's own predictions, as before, always gives a perfect score.
test_score1 = cross_val_score(Lasso(alpha=1, max_iter=LASSO_MAX_ITER), X_train, y_train, cv=5).mean()

print("Alpha=1, train data score ", train_score1)
print("Alpha=1, held-out (5-fold CV) score ", test_score1)

# ---- alpha = 100 ----
# NOTE(review): the variable names say "10" and the old label printed "Alpha=10", but the model
# has always been built with alpha=100. The value is kept and the label corrected - confirm
# which of the two was intended.
lasso10 = Lasso(alpha=100, max_iter=LASSO_MAX_ITER)
lasso10.fit(X_train, y_train)
train_score10 = lasso10.score(X_train, y_train)
y_test10 = lasso10.predict(X_test)
test_score10 = cross_val_score(Lasso(alpha=100, max_iter=LASSO_MAX_ITER), X_train, y_train, cv=5).mean()

print("Alpha=100, train data score ", train_score10)
print("Alpha=100, held-out (5-fold CV) score ", test_score10)


In [None]:
# Write the test-set feature table used as model input (dfhouseTest_temp) to CSV, without the index.
# NOTE(review): despite "Pred" in the file name, this writes input features, not predictions -
# confirm that is what is wanted.
dfhouseTest_temp.to_csv('housePredTestFile.csv',index=False)