# Housing Price Prediction

In [4]:
import os
# Walk the Kaggle input mount and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# Import libraries

In [5]:
#import dataset with pandas
import pandas as pd
# visualisation
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import numpy as np
warnings.filterwarnings("ignore")

# Section 1: Exploratory Data Analysis

Train and Test dataframes

In [6]:
# Load the competition's train and test splits from the Kaggle input mount.
train_df = pd.read_csv('/kaggle/input/home-data-for-ml-course/train.csv')
test_df = pd.read_csv('/kaggle/input/home-data-for-ml-course/test.csv')
# Preview the first rows of the training frame.
train_df.head()

In [7]:
# Report how many rows and columns the training frame has.

print("Dataset has {0} Rows(size)  and {1} Columns(features)".format(*train_df.shape))

In [8]:
# Dataset information: per-column dtype and non-null count.
train_df.info()

In [9]:
# Split the training frame into a numeric view (int64 / float64 columns)
# and a categorical view (object-typed columns).
numeric_cols = train_df.select_dtypes(include=[np.int64, np.float64])
categorical_cols = train_df.select_dtypes(include=[object])

In [10]:
# Count the numeric columns, then show their names.
print(f"N.O. of numeric columns: {len(numeric_cols.columns)}")
numeric_cols.columns

In [11]:
# Count the categorical columns, then show their names.
print(f"N.O. of categorical columns: {len(categorical_cols.columns)}")
categorical_cols.columns

# Correlation with numeric values

In [12]:
# Annotated correlation heatmap over every numeric column, drawn on an
# explicitly created 30x30 inch axes.
heat_fig, heat_ax = plt.subplots(figsize=(30, 30))
sns.heatmap(numeric_cols.corr(), annot=True, ax=heat_ax)
plt.show()

### Missing Values

In [13]:
# isnull().mean() is the share of missing values per column.
# null_cols: columns with any missing value; null_cols_to_drop: more than 75% missing.
null_cols = [col for col, share in train_df.isnull().mean().items() if share > 0]
null_cols_to_drop = [col for col, share in train_df.isnull().mean().items() if share > 0.75]
print(null_cols_to_drop)

### Dropping columns with more than 75% null values

In [14]:
# Drop the mostly-empty columns from both the train and the test frame.
# errors='ignore' keeps this cell re-runnable: a second execution (or a
# column that is absent from one of the frames) no longer raises KeyError.
train_df = train_df.drop(columns=null_cols_to_drop, errors='ignore')
test_df = test_df.drop(columns=null_cols_to_drop, errors='ignore')

In [15]:
# Print the percentage of missing values for each column that has gaps
# but was not dropped.
for col in null_cols:
    if col in null_cols_to_drop:
        continue
    print(col, train_df[col].isnull().sum() / len(train_df) * 100, '%')

In [16]:
# Refresh the numeric / categorical name lists, leaving out the dropped columns.
num_cols = list(filter(lambda name: name not in null_cols_to_drop, numeric_cols.columns))
cat_cols = list(filter(lambda name: name not in null_cols_to_drop, categorical_cols.columns))


### Temporal variables


In [17]:
# Year-like columns: numeric features whose name contains 'Yr' or 'Year'.
year_cols = [name for name in num_cols if any(tag in name for tag in ('Yr', 'Year'))]
year_cols

In [21]:
# Median sale price for each year of sale, drawn as a line plot.
numeric_cols['SalePrice'].groupby(numeric_cols['YrSold']).median().plot()
plt.show()

The median sale price shows an unusual drop in the most recent years.

In [22]:
# Scatter each year-like feature against the sale price on a 2x2 grid.
# plt.subplots returns (figure, axes); unpack both so each plot can be sent
# to its own axes instead of relying on pyplot's "current axes" state.
fig, axes = plt.subplots(2, 2, figsize=(12, 12))

for ax, feature in zip(axes.flat, ['GarageYrBlt','YearBuilt','YearRemodAdd', 'YrSold']):
    # seaborn >= 0.12 accepts x and y as keyword arguments only; passing
    # them positionally raises TypeError.
    sns.scatterplot(x=train_df[feature], y=train_df['SalePrice'], ax=ax)

# One layout pass after all panels are drawn is enough.
fig.tight_layout()
plt.show()

# Numerical features are further classified as discrete and continuous

In [23]:
# Discrete features: numeric columns with fewer than 25 distinct values
# (NaN is counted as a value, exactly as .unique() would count it).
des_num = [name for name in num_cols if numeric_cols[name].nunique(dropna=False) < 25]
des_num

### Discrete values vs SalePrice

In [24]:
# Bar plot of the sale price for every discrete feature, three panels per row.
# The number of rows is derived from len(des_num) so the grid never overflows
# (a fixed 6x3 grid fails as soon as there are more than 18 features).
grid_cols = 3
grid_rows = max(1, -(-len(des_num) // grid_cols))  # ceiling division
fig, axes = plt.subplots(grid_rows, grid_cols, figsize=(12, 12), squeeze=False)

for ax, feature in zip(axes.flat, des_num):
    # seaborn >= 0.12 accepts x and y as keyword arguments only.
    sns.barplot(x=train_df[feature], y=train_df['SalePrice'], ax=ax)

# Hide the panels left over when the grid is not completely filled.
for ax in list(axes.flat)[len(des_num):]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()

In [25]:
# Continuous variables: numeric columns that are neither the Id, nor
# discrete, nor year-like.
con_num = [
    name for name in num_cols
    if name != 'Id' and name not in des_num and name not in year_cols
]
con_num

In [26]:
# One 25-bin histogram per continuous feature.
for col in con_num:
    # Series.hist returns the axes it drew on, so label it directly.
    train_df[col].hist(bins=25).set(xlabel=col, ylabel='Count')
    plt.show()

The values do not follow a normal distribution, so let's apply a log transform to visualise the data points.

In [27]:
# Log-log scatter of each continuous feature against the sale price.
data = train_df.copy()
# Take the log of the target once, from the untouched frame. Reading it back
# from `data` inside the loop would log it twice once the loop reaches
# 'SalePrice' itself.
log_price = np.log(train_df['SalePrice'])
for col in con_num:
    # Skip the target (it is the y axis) and any feature containing zeros,
    # for which the log is undefined.
    if col == 'SalePrice' or 0 in data[col].unique():
        continue
    data[col] = np.log(data[col])
    plt.scatter(data[col], log_price)
    # Both axes show log-transformed values, so say so in the labels.
    plt.xlabel('log({0})'.format(col))
    plt.ylabel('log(SalePrice)')
    plt.show()

### Outlier values

In [28]:
# Box plot of the log of each zero-free continuous feature, to expose outliers.
data = train_df.copy()
for col in con_num:
    # log(0) is undefined, so features containing zeros are skipped.
    if 0 in data[col].unique():
        continue
    data[col] = np.log(data[col])
    # Drop NaNs first: plt.boxplot cannot compute quartiles over missing
    # values and would draw an empty box for such a column.
    plt.boxplot(data[col].dropna())
    plt.xlabel(col)
    # The y axis is the log of the feature itself, not the sale price.
    plt.ylabel('log({0})'.format(col))
    plt.show()

## Categorical columns visualisation

In [29]:
# Median sale price per level of every categorical column, one bar chart each.
data = train_df.copy()
for col in cat_cols:
    median_by_level = data.groupby(col)['SalePrice'].median()
    median_by_level.plot.bar()
    plt.show()

# Section 2 : Feature Engineering

### Imputation

In [30]:
# Impute missing categorical values with the literal label 'missing'.
def cat_impute(data, categorical):
    """Replace missing entries in the given categorical columns with 'missing'.

    ``data`` is modified in place and the same object is returned;
    ``categorical`` is the list of column names to fill.
    """
    filled = data[categorical].fillna('missing')
    data[categorical] = filled
    return data

In [31]:
# For numerical values: add a missing-value flag column and fill with a median.
def num_impute(data, numeric, medians=None):
    """Flag and fill missing values in numeric columns.

    For every name in ``numeric`` except ``'SalePrice'`` this adds a
    ``<name>_NAN`` column (1 where the value was missing, 0 otherwise) and
    replaces the missing values of the column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to impute. It is modified in place and also returned.
    numeric : iterable of str
        Names of the numeric columns to process.
    medians : mapping or pandas.Series, optional
        Fill values keyed by column name. Pass the medians of the training
        frame here when imputing a test frame so both frames are filled with
        the same statistic. When omitted, or when a column has no entry, the
        median of ``data`` itself is used (the original behaviour).

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object.
    """
    for col in numeric:
        # The target is never imputed (and is absent from the test frame).
        if col == 'SalePrice':
            continue
        # Record where values were missing before they are overwritten.
        data[col + '_NAN'] = np.where(data[col].isnull(), 1, 0)
        if medians is not None and col in medians:
            fill_value = medians[col]
        else:
            fill_value = data[col].median()
        data[col] = data[col].fillna(fill_value)
    return data

In [32]:
# Temporal variables: express each year column as an age relative to the year sold.
def temp_impute(dataset, missing_age=0):
    """Convert year columns into ages measured from ``YrSold``.

    ``YearBuilt``, ``YearRemodAdd`` and ``GarageYrBlt`` are replaced by
    ``YrSold - <year>``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding ``YrSold`` and the three year columns. It is modified
        in place and also returned.
    missing_age : number, optional
        Age assigned to rows whose year is missing. Defaults to 0.

    Returns
    -------
    pandas.DataFrame
        The same ``dataset`` object.
    """
    for col in ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']:
        # Subtract first and fill afterwards. Filling the missing *year* with
        # 0 before subtracting would yield an "age" equal to the sale year
        # itself (about 2000), an extreme artificial outlier.
        age = dataset['YrSold'] - dataset[col]
        dataset[col] = age.fillna(missing_age)

    return dataset


In [33]:
# Feature engineering works on deep copies, leaving train_df / test_df as they are.
dataset = train_df.copy(deep=True)
dataset_test = test_df.copy(deep=True)

In [34]:
# Convert the year columns of the training frame and display the result.
dataset = temp_impute(dataset)
dataset.loc[:, ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']]

In [35]:
# Apply the same year-column conversion to the test frame.
dataset_test = temp_impute(dataset_test)


Imputation of categorical and numerical values

In [36]:
# The year columns were already handled by temp_impute, so take them out of
# both name lists before running the generic imputers.
year_features = ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt','YrSold']
num_cols = [name for name in num_cols if name not in year_features]
cat_cols = [name for name in cat_cols if name not in year_features]

# Numeric imputation first, then categorical, for train and test alike.
dataset = cat_impute(num_impute(dataset, num_cols), cat_cols)
dataset_test = cat_impute(num_impute(dataset_test, num_cols), cat_cols)

In [37]:
# Display the training frame after imputation.
dataset

### Log transformation of the following variables, based on the EDA; this brings them closer to a normal distribution and helps the model fit

In [38]:
# Log-transform the skewed features and the target in the training frame.

log_cols = ['LotFrontage','LotArea','1stFlrSF','GrLivArea','SalePrice']

for col in log_cols:
    # Columns that are not present in the frame are silently left out.
    if col in dataset.columns:
        dataset[col] = np.log(dataset[col])
dataset

In [39]:
# Log-transform the same skewed features in the test frame. There is no
# 'SalePrice' here because the test frame has no target column.

log_cols_test = ['LotFrontage','LotArea','1stFlrSF','GrLivArea']

# Iterate over the list defined in this cell. The previous version read
# `log_cols` from the training cell and left `log_cols_test` unused.
for col in log_cols_test:
    if col in dataset_test.columns:
        dataset_test[col] = np.log(dataset_test[col])
dataset_test

# Feature Scaling

In [40]:
from sklearn.preprocessing import MinMaxScaler

# Scale every numeric feature except the row Id and the target.
feature_scale = [feature for feature in num_cols if feature not in ['Id','SalePrice']]
scaler = MinMaxScaler()
# fit() learns the per-column min and max from the feature matrix alone.
# The list of column names previously passed as a second argument landed in
# the ignored `y` parameter and has been removed.
scaler.fit(dataset[feature_scale])

In [41]:
# Apply the scaler fitted on the training frame to both frames.
for frame in (dataset, dataset_test):
    frame[feature_scale] = scaler.transform(frame[feature_scale])

In [42]:
# Display the training frame after scaling.
dataset

### Label encoder for categorical values

In [43]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(cat_cols)
dataset[cat_cols] = le.transform(cat_cols)

In [44]:
dataset_test[cat_cols] = le.transform(cat_cols)

### vector transform of dependent and independent variables

In [47]:
# Target vector and feature matrices as plain NumPy arrays.
Y = dataset['SalePrice'].to_numpy()
X = dataset.drop(columns=['Id', 'SalePrice']).to_numpy()
X_test = dataset_test.drop(columns=['Id']).to_numpy()

In [48]:
# Turn the target into a single-column 2-D array for scikit-learn.
Y = np.reshape(Y, (-1, 1))

# Fitting linear regression

In [49]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
# Hold out 33% of the labelled rows for validation; random_state fixes the split.
# Note: lowercase x_test / y_test are this hold-out split, not the uppercase
# X_test array that was built from dataset_test.
X_train,x_test,Y_train,y_test = train_test_split(X,Y,test_size=0.33, random_state= 1 )
# Baseline model: ordinary least squares fitted on the training split.
reg = LinearRegression()
reg.fit(X_train, Y_train)

In [50]:
# Baseline model: score on the hold-out split, scaled to a percentage
# (LinearRegression.score reports R^2).
reg.score(x_test,y_test)*100

### Cross-validation score
with the R² metric

In [55]:
from sklearn.model_selection import cross_val_score
# 8-fold cross-validated R^2 of a fresh linear model, reported as a percentage.
scores = cross_val_score(estimator=LinearRegression(), X=X, y=Y, scoring='r2', cv=8)
100 * scores.mean()

In [56]:
# Fitted coefficients of the baseline linear model.
reg.coef_

Some of the features have very little significance, judging by their coefficients.

## Feature Elimination

In [58]:
# Keep DataFrame / Series form here so the selected column names stay available.
X_RFE = dataset.drop(columns=['Id', 'SalePrice'])
Y_RFE = dataset['SalePrice']

In [59]:
from sklearn.feature_selection import RFECV
# Recursive feature elimination with 10-fold cross-validation: features are
# removed one at a time (step=1) and candidate subsets are scored by R^2.
rfecv = RFECV(estimator=LinearRegression(), step=1, cv=10, scoring='r2')
rfecv.fit(X_RFE,Y_RFE)

In [60]:
# Number of features RFECV selected.
print(f'Optimal number of features: {rfecv.n_features_}')

In [61]:
    # Names of the features RFECV kept (support_ is its boolean selection mask).
    # NOTE(review): this cell is indented; presumably it only runs because
    # IPython dedents cell input. Confirm, or remove the leading spaces.
    X_RFE.columns[rfecv.support_]

In [62]:
# Keep only the columns RFECV selected (boolean mask over the columns).
X_RFE = X_RFE.loc[:, rfecv.support_]

In [63]:
# 11-fold cross-validated R^2 of a linear model on the reduced feature set.
scores = cross_val_score(estimator=LinearRegression(), X=X_RFE, y=Y_RFE, scoring='r2', cv=11)
scores.mean()

#### There seems to be no significant improvement with recursive feature elimination, so the base model will be used

In [65]:
# Predict on the Kaggle test features with the baseline model.
predict = reg.predict(X_test)
# The model was trained on log(SalePrice), so exp() maps predictions back to prices.
predict_sale = pd.DataFrame(np.exp(predict))
res  = dataset_test['Id']
r = pd.DataFrame(res)
# Assign the flattened array by position. Assigning the one-column DataFrame
# instead aligns on the index and silently produces NaN prices whenever
# dataset_test does not carry a default 0..n-1 index.
r['SalePrice'] = np.exp(predict).ravel()


In [66]:
# Display the submission frame (Id and predicted SalePrice).
r

In [67]:
# Write the submission file, leaving out the DataFrame index.
r.to_csv('submission.csv',index=False)