In [1]:
#Importing Libraries
import pandas as pd

In [2]:
import numpy as np

In [3]:
# Read the house-price training CSV into the DataFrame used by every later cell.
house_price = pd.read_csv(
    filepath_or_buffer='/dbfs/FileStore/tables/train_housePrice.csv'
)

In [4]:
# Number of features (columns) in the data set.
# Call-style print with a single argument behaves the same on Python 2 and 3;
# the statement form `print len(...)` is a SyntaxError on Python 3.
print(len(house_price.columns))

In [5]:
# Number of data instances (rows) in the data set.
# Call-style print with a single argument behaves the same on Python 2 and 3;
# the statement form `print len(...)` is a SyntaxError on Python 3.
print(len(house_price))

In [6]:
# Show the dtype of every column in house_price (one entry per feature).
house_price.dtypes

In [7]:
# List the feature (column) names of house_price.
house_price.columns

In [8]:
# Summary statistics of the data.
# NOTE: with default arguments describe() summarises only the numeric columns
# of a mixed-dtype frame; object columns are left out.
house_price.describe()

In [9]:
#importing graph and visualisation libraries
import matplotlib.pyplot as plt

In [10]:
import seaborn as sns

In [11]:
#Plotting the house price 

In [12]:
# Scatter of SalePrice sorted ascending against its rank, to eyeball the spread
# of prices and any extreme values at the top end.
# plt.subplots() already creates the figure; the extra plt.figure() call that
# used to precede it only left an unused, empty figure open.
fig, ax = plt.subplots(figsize=(12,8))
sorted_prices = np.sort(house_price['SalePrice'])
ax.scatter(range(len(sorted_prices)), sorted_prices)
ax.set_xlabel('Rank (houses sorted by SalePrice)')
ax.set_ylabel('SalePrice')
# Hand the figure over explicitly rather than relying on the "current figure".
display(fig)

In [13]:
#Checking the distribution for SalePrice

In [14]:
# Histogram of SalePrice with a KDE curve overlaid, drawn on an explicit axes.
fig, ax = plt.subplots()
sns.distplot(house_price['SalePrice'], kde=True, ax=ax)
display()

In [15]:
# Skewness and kurtosis of the raw SalePrice distribution.
# A single pre-formatted string prints identically on Python 2 and 3
# (print("a", b) shows a tuple on Python 2). Also fixes the "Kutosis" typo.
print("Skewness: {}".format(house_price['SalePrice'].skew()))
print("Kurtosis: {}".format(house_price['SalePrice'].kurt()))

In [16]:
#Making Sale Price Normally Distributed

In [17]:
# Same distribution plot as for raw SalePrice, but on the natural-log scale.
fig, ax = plt.subplots()
sns.distplot(np.log(house_price['SalePrice']), kde=True, ax=ax)
display()

In [18]:
# Mean, skewness and kurtosis of log(SalePrice).
# The log is computed once and reused. Call-style print with one pre-formatted
# argument works on both Python 2 and 3 (the statement form is a SyntaxError on
# Python 3). Also fixes the "Kutosis" typo in the label.
log_sale_price = np.log(house_price['SalePrice'])
print(log_sale_price.mean())
print("Skewness after Logged SalePrice: {}".format(log_sale_price.skew()))
print("Kurtosis after Logged SalePrice: {}".format(log_sale_price.kurt()))

In [19]:
#Checking the mean sale price of houses, grouped by the year they were built

In [20]:
# Bar chart of SalePrice per YearBuilt; bar height is seaborn's default
# estimator (the mean) for each year.
# plt.subplots() already creates the figure, so the preceding plt.figure()
# (which only left an empty figure open) is dropped.
fig, ax = plt.subplots(figsize=(18,10))
# x/y are passed by keyword: newer seaborn releases reject them positionally,
# while older releases accept the keyword form as well.
sns.barplot(x=house_price['YearBuilt'], y=house_price['SalePrice'], ax=ax)
# Many distinct years on the x axis: rotate the labels so they stay readable.
plt.xticks(rotation='vertical')
display(fig)

In [21]:
#Counting the columns per dtype, then plotting the number of missing values for each feature

In [22]:
# Turn the dtypes Series into a two-column frame: the former index (column
# names) becomes a regular column next to the dtype of each column.
dftypes_df = house_price.dtypes.reset_index()

In [23]:
# The column holding the feature names is deliberately labelled 'count': after
# the groupby/count on 'column_type' it holds the number of features per dtype.
dftypes_df.columns = ['count','column_type']

In [24]:
# Collapse to one row per dtype, counting how many features have that dtype.
dftypes_df = (
    dftypes_df
    .groupby('column_type')
    .count()
    .reset_index()
)

In [25]:
# Display the per-dtype feature counts.
dftypes_df

In [26]:
# Count the null entries in every column, then move the column names out of
# the index into a regular column so they can be used as a plotting variable.
missing_type = house_price.isnull().sum().reset_index()

In [27]:
# Replace the default labels produced by reset_index() with descriptive names.
missing_type.columns = ['Column_Name','Num_of_Missing_Value']

In [28]:
# Display the missing-value count for every column.
missing_type

In [29]:
# Bar chart of the number of missing values per feature.
# plt.subplots() already creates the figure, so the preceding plt.figure()
# (which only left an empty figure open) is dropped.
fig, ax = plt.subplots(figsize=(18,15))
# x/y are passed by keyword: newer seaborn releases reject them positionally,
# while older releases accept the keyword form as well.
sns.barplot(
    x=missing_type['Column_Name'],
    y=missing_type['Num_of_Missing_Value'],
    ax=ax,
)
# One tick per feature: rotate the labels so they do not overlap.
plt.xticks(rotation="vertical")
ax.set_ylabel('Number of missing values')
ax.set_xlabel('Features')
display(fig)

In [30]:
# Correlation matrix of the numeric features, drawn as a heatmap.
# The numeric (and boolean) columns are selected explicitly: older pandas
# silently skipped object columns in corr(), but pandas >= 2.0 raises on them.
# Selecting first gives the same matrix on both.
corrmat = house_price.select_dtypes(include=['number', 'bool']).corr()
fig, ax = plt.subplots(figsize=(12, 9))
# vmax caps the colour scale at 0.8.
sns.heatmap(corrmat, vmax=.8, square=True, ax=ax)
display(fig)

In [31]:
# Annotated heatmap of the k features most correlated with SalePrice
# (nlargest on the SalePrice column, so SalePrice itself is among them).
k=10
# Set the seaborn theme and font scale before the axes are created so the first
# run of this cell looks the same as any re-run.
sns.set(font_scale=1.25)
# plt.subplots() already creates the figure; the extra plt.figure() call only
# left an unused, empty figure open.
fig, ax = plt.subplots(figsize=(10,8))
corrmat_col = corrmat.nlargest(k,'SalePrice').index
# Slice the already-computed correlation matrix instead of running np.corrcoef
# on the raw values. The numbers match when the columns have no missing
# values, and a single NaN no longer turns whole rows and columns of the
# heatmap into NaN.
corr_map_saleprice = corrmat.loc[corrmat_col, corrmat_col].values
sns.heatmap(
    corr_map_saleprice,
    square=True,
    xticklabels=corrmat_col.values,
    yticklabels=corrmat_col.values,
    cbar=True,
    annot=True,
    ax=ax,
)
display(fig)

In [33]:
#!pip install xgboost

In [34]:
#Using XgBoost to calculate the feature importance with respect to target variable

In [35]:
from sklearn import preprocessing,model_selection
import xgboost as xgb

In [36]:
# Replace every object (string) column with integer codes so the frame can be
# fed to XGBoost. NOTE: this overwrites house_price in place; re-running the
# cell is a no-op because no object columns remain afterwards.
for f in house_price.columns:
  if house_price[f].dtype == 'object':
    lbl = preprocessing.LabelEncoder()
    # Cast to str first: missing entries are float NaN, and LabelEncoder cannot
    # order a mix of str and float on Python 3. After the cast, missing values
    # become their own category 'nan'.
    house_price[f] = lbl.fit_transform(house_price[f].astype(str))

In [37]:
# Re-inspect the dtypes after the label-encoding loop: no column should be
# reported as object any more.
house_price.dtypes

In [38]:
# Feature matrix: every column except the target (SalePrice) and the Id column.
train_X = house_price.drop(labels=['SalePrice', 'Id'], axis=1)
# Target vector as a plain NumPy array.
train_y = house_price['SalePrice'].values

In [39]:
# Booster hyper-parameters for a regression model evaluated with RMSE.
# NOTE(review): newer XGBoost releases deprecate 'reg:linear' in favour of
# 'reg:squarederror' and replace 'silent' with 'verbosity' -- confirm against
# the installed version before changing them.
xgb_params = dict(
    eta=0.05,
    max_depth=8,
    subsample=0.7,
    colsample_bytree=0.7,
    objective='reg:linear',
    eval_metric='rmse',
    silent=1,
)

# Training uses a copy of the parameters with silent switched to 0;
# xgb_params itself keeps silent=1.
train_params = dict(xgb_params, silent=0)

# Wrap features and target in a DMatrix, keeping the DataFrame's column names
# so the importance plot is labelled with real feature names.
dtrain = xgb.DMatrix(train_X, label=train_y, feature_names=train_X.columns.values)
model = xgb.train(train_params, dtrain, num_boost_round=100)

# Plot the 50 most important features of the trained booster.
fig, ax = plt.subplots(figsize=(12, 18))
xgb.plot_importance(model, ax=ax, height=0.8, max_num_features=50)
display()