Import Libraries

In [461]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

Checking data

In [462]:
# Load the training and test CSV files from the house-prices input directory.
train = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')
test = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')


Five Number Summary

In [463]:
train.head()

In [464]:
test.head()

In [465]:
print('train data shape: ', train.shape)
print('test data shape: ', test.shape)

In [466]:
# DataFrame.info() writes its report to stdout and returns None, so it is
# called on its own; wrapping it in print() appended a stray "None".
print('trainig data info:')
train.info()

In [467]:
# DataFrame.info() writes its report to stdout and returns None, so it is
# called on its own; wrapping it in print() appended a stray "None".
print('testing data info : ')
test.info()

Summary of training dataset

In [468]:
print('Summary of trainig data: \n')
train.describe()

Summary of testing data set

In [469]:
print('Summary of test data: \n')
test.describe()

Plotting Sale Price Distribution 

In [470]:
print('Sale Price Distribution')
# Histogram of the target with a kernel-density overlay.
sns.displot(train['SalePrice'], kde=True)
plt.ylabel('Frequency')
plt.title('SalePrice distribution')
# plt.show() renders the figure; the original plt.plot() drew nothing and
# only echoed an empty list as the cell output.
plt.show()

Plotting Q-Q Plot of Sale Price 

In [471]:
from scipy import stats
plt.figure()
# Probability (Q-Q) plot of SalePrice, drawn through pyplot on the figure just created.
res = stats.probplot(train['SalePrice'], plot = plt)

So, our target variable (SalePrice) is right-skewed, and we will apply a log transformation to it. Before that, we need to make sure that all of its values are greater than zero.

In [472]:
# Collect every SalePrice that is zero or negative; an empty result means the
# log transform can be applied to the whole column.
negative_values = [price for price in train['SalePrice'] if price <= 0]

negative_values  # shown as the cell output


Checking missing values in % in training dataset

In [473]:
# Training columns that contain at least one missing value.
# (The original `> 1` test silently skipped columns with exactly one NaN.)
features_with_na_train = [features for features in train.columns if train[features].isnull().sum() > 0]
for feature in features_with_na_train:
    # isnull().mean() is a fraction in [0, 1]; scale by 100 so the "%" label is accurate.
    print(feature, np.round(train[feature].isnull().mean() * 100, 2), '% missing values')
    

Checking missing values in % in testing dataset

In [474]:
# Test columns that contain at least one missing value.
# (The original `> 1` test silently skipped columns with exactly one NaN.)
features_with_na_test = [features for features in test.columns if test[features].isnull().sum() > 0]
for feature in features_with_na_test:
    # The precision now sits inside np.round(); the original closed the call
    # early, so the share was rounded to 0 decimals and a literal 2 was printed.
    # The fraction is also scaled by 100 to match the "%" label.
    print(feature, np.round(test[feature].isnull().mean() * 100, 2), '% missing values')

Checking missing values and plotting same

In [475]:
# Number of missing values per training column, largest first.
missing_values_train = train.isnull().sum().sort_values(ascending=False)
missing_values_train

Plotting missing values in training dataset:

In [476]:
# Ten training columns with the most missing values (the Series is sorted descending).
x_axis = missing_values_train[:10]
y_axis = missing_values_train[:10].index
plt.figure(figsize=(10, 8))
# x/y are passed by keyword: seaborn deprecated positional data arguments in
# 0.11 and rejects them from 0.12 on.
sns.barplot(x=x_axis.values, y=y_axis)
plt.title('missing values in training data')
# plt.show() renders the figure; plt.plot() with no arguments draws nothing.
plt.show()

Checking test dataset missing values and Plotting the same 

In [477]:
# Number of missing values per test column, largest first (same ordering as
# the training summary), so that slicing the head yields the worst columns.
missing_values_test = test.isnull().sum().sort_values(ascending=False)
missing_values_test

In [478]:
# Ten test columns with the most missing values. Sorting is done here so the
# plot does not depend on the order in which missing_values_test was built.
top_missing_test = missing_values_test.sort_values(ascending=False)[:10]
x_axis = top_missing_test
y_axis = top_missing_test.index
plt.figure(figsize=(10, 8))
# x/y are passed by keyword: seaborn deprecated positional data arguments in
# 0.11 and rejects them from 0.12 on.
sns.barplot(x=x_axis.values, y=y_axis)
plt.title('plotting missing values in test dataset')
# plt.show() renders the figure; plt.plot() with no arguments draws nothing.
plt.show()

In [479]:
train.shape, test.shape

Since there are lots of missing values in training data set we need to find relationship between sale price and the features

In [480]:
for feature in features_with_na_train:
    # Work on a fresh copy so the indicator replaces the column for this plot only.
    data_train = train.copy()
    # 1 where the value is missing, 0 where it is present.
    data_train[feature] = data_train[feature].isnull().astype(int)
    # Median SalePrice of the "present" group versus the "missing" group.
    median_price = data_train.groupby(feature)['SalePrice'].median()
    median_price.plot.bar(color=['red', 'blue'])

    plt.title(feature)
    plt.show()

Numerical variables in Train dataset:

In [481]:
# A column is treated as numerical when its dtype is anything other than object ('O').
numerical_features_train = [name for name, dtype in train.dtypes.items() if dtype != 'O']
print('Number of Numerical Feature of train data: ', len(numerical_features_train))
train[numerical_features_train].head()

Numerical variables in Test dataset

In [482]:
# Numerical (non-object) columns of the test set. The dtype is read from
# `test` itself; the original looked it up in `train`, so the list described
# the wrong frame.
numerical_features_test = [feature for feature in test.columns if test[feature].dtypes != 'O']
print('numerical features of test data: ', len(numerical_features_test))
test[numerical_features_test].head()

Temporal features in train dataset

In [483]:
# Year-like columns are recognised by 'Yr' or 'Year' appearing in the column name.
temporal_features_train = [
    feature for feature in numerical_features_train
    if any(token in feature for token in ('Yr', 'Year'))
]
temporal_features_train

In [484]:
for feature in temporal_features_train:
    print(feature, train[feature].unique())

temporal feature in test dataset

In [485]:
# Year-like columns are recognised by 'Yr' or 'Year' appearing in the column name.
temporal_features_test = [
    feature for feature in numerical_features_test
    if any(token in feature for token in ('Yr', 'Year'))
]
temporal_features_test

In [486]:
for feature in temporal_features_test:
    print(feature, test[feature].unique())

Let's analyze the temporal (year) variables.
We will check whether there is a relation between the year the house was sold and the sale price in the train dataset.

In [487]:
train.shape, test.shape

In [488]:
train.groupby('YrSold')['SalePrice'].median().plot()
plt.xlabel('Year Sold')
plt.ylabel('Median House Price')
plt.title('House price vs YearSold')

Here we will compare the difference between All years feature with SalePrice

In [489]:
for feature in temporal_features_train:
    if feature == 'YrSold':
        # YrSold is the reference year itself, so there is no difference to plot.
        continue
    data_train = train.copy()

    # Replace the year by the number of years elapsed up to the sale.
    data_train[feature] = data_train['YrSold'] - data_train[feature]
    plt.scatter(data_train[feature], data_train['SalePrice'])

    plt.xlabel(feature)
    plt.ylabel('SalePrice')
    plt.show()

In [490]:
train.shape, test.shape

Numerical variables are usually of two types: continuous and discrete. Here we extract the discrete features of the training data.

In [491]:
# Discrete = numerical column with fewer than 25 distinct values (NaN counted
# as a value) that is neither a year column nor the Id.
excluded_train = temporal_features_train + ['Id']
discrete_features_train = [
    feature for feature in numerical_features_train
    if train[feature].nunique(dropna=False) < 25 and feature not in excluded_train
]
print('Discrete Variables Count: {}'.format(len(discrete_features_train)))

In [492]:
discrete_features_train

In [493]:
train[discrete_features_train].head()

In [494]:
# plt.get_cmap is used instead of matplotlib.cm.get_cmap, which was deprecated
# in Matplotlib 3.7 and removed in 3.9.
cmap = plt.get_cmap('Spectral')
for feature in discrete_features_train:
    data_train = train.copy()
    # Median SalePrice for each distinct value of the discrete feature.
    data_train.groupby(feature)['SalePrice'].median().plot.bar(cmap=cmap, stacked=False)
    plt.xlabel(feature)
    plt.ylabel('SalePrice')
    plt.title(feature)
    plt.show()

In [495]:
train.shape, test.shape

Discrete features in test dataset

In [496]:
# Discrete = numerical column with fewer than 25 distinct values (NaN counted
# as a value) that is neither a year column nor the Id.
excluded_test = temporal_features_test + ['Id']
discrete_features_test = [
    feature for feature in numerical_features_test
    if test[feature].nunique(dropna=False) < 25 and feature not in excluded_test
]
discrete_features_test

In [497]:
test[discrete_features_test].head()

Continuous features of training dataset

In [498]:
# Continuous = every numerical column that is not discrete, not a year column and not the Id.
non_continuous_train = set(discrete_features_train) | set(temporal_features_train) | {'Id'}
continuos_feature_train = [
    feature for feature in numerical_features_train
    if feature not in non_continuous_train
]
print('continuos feature Count {}'.format(len(continuos_feature_train)))
continuos_feature_train

In [499]:
train[continuos_feature_train].head()

Plotting continuous features

In [500]:
for feature in continuos_feature_train:
    data_train = train.copy()
    # One 25-bin histogram per continuous feature.
    ax = data_train[feature].hist(bins=25)
    ax.set_xlabel(feature)
    ax.set_ylabel('Count')
    ax.set_title(feature)
    plt.show()

Continuous features of test dataset

In [501]:
# Continuous = every numerical column that is not discrete, not a year column and not the Id.
non_continuous_test = set(temporal_features_test) | set(discrete_features_test) | {'Id'}
continuos_feature_test = [
    feature for feature in numerical_features_test
    if feature not in non_continuous_test
]
print('Total number of continous feature in test data: {}'.format(len(continuos_feature_test)))
continuos_feature_test

Applying log transformation to the continuous features of the training data

In [502]:
for feature in continuos_feature_train:
    data_train = train.copy()
    if 0 in data_train[feature].unique():
        # log(0) is undefined, so features containing a zero are skipped.
        continue
    # Log-transform both the feature and the target on the throwaway copy, then plot them.
    data_train[feature] = np.log(data_train[feature])
    data_train['SalePrice'] = np.log(data_train['SalePrice'])
    plt.scatter(data_train[feature], data_train['SalePrice'])
    plt.xlabel(feature)
    plt.ylabel('SalePrice')
    plt.title(feature)
    plt.show()

In [503]:
train.shape, test.shape

Applying log transformation to the continuous features of the testing data

In [504]:
# NOTE(review): each iteration log-transforms one column on a throwaway copy of
# `test` and then discards it; nothing is plotted or stored, so this cell
# leaves `test` unchanged. Its only lasting effect is that `data_test` stays
# bound to the copy made in the last iteration.
for feature in continuos_feature_test:
    data_test = test.copy()
    if 0 in data_test[feature].unique():
        # Features containing a zero are skipped (log(0) is undefined).
        pass
    else:
        data_test[feature] = np.log(data_test[feature])
         

Outliers in continuous features of training dataset

In [505]:
for feature in continuos_feature_train:
    data_train = train.copy()
    if 0 in data_train[feature].unique():
        # log(0) is undefined, so features containing a zero are skipped.
        continue
    # Box plot of the log-transformed feature to make outliers visible.
    data_train[feature] = np.log(data_train[feature])
    data_train.boxplot(column=feature)
    plt.title(feature)
    plt.ylabel(feature)
    plt.show()

Outliers in continuous features of test dataset

In [506]:
for feature in continuos_feature_test:
    data_test = test.copy()
    if 0 in data_test[feature].unique():
        # log(0) is undefined, so features containing a zero are skipped.
        continue
    # Box plot of the log-transformed feature to make outliers visible.
    data_test[feature] = np.log(data_test[feature])
    data_test.boxplot(column=feature)
    plt.title(feature)
    plt.ylabel(feature)
    plt.show()

Extracting categorical features from training dataset

In [507]:
# Categorical columns are those stored with object dtype. The dtype is read
# from `train` directly; the original used `data_train`, a scratch copy left
# over from an earlier plotting loop, which made this cell depend on hidden state.
categorical_feature_train = [feature for feature in train.columns if train[feature].dtypes == 'O']
categorical_feature_train

Extracting categorical features from test dataset

In [508]:
# Categorical columns are those stored with object dtype. The dtype is read
# from `test` directly; the original used `data_test`, a scratch copy left
# over from an earlier plotting loop, which made this cell depend on hidden state.
categorical_feature_test = [feature for feature in test.columns if test[feature].dtypes == 'O']
categorical_feature_test

In [509]:
test[categorical_feature_train].head()

In [510]:
train[categorical_feature_test].head()

In [511]:
train.shape, test.shape

Finding Number of categories in each categorical features in training dataset

In [512]:
# Number of distinct categories per training feature. The count is taken from
# `train`; the original read `test` here, so it reported test-set cardinalities
# under a training-set heading.
for feature in categorical_feature_train:
    print('The feature is {} and number of categories are {}'.format(feature, len(train[feature].unique())))

In [513]:
for feature in categorical_feature_train:
    data_train = train.copy()
    # Median SalePrice for each category of the feature.
    median_by_category = data_train.groupby(feature)['SalePrice'].median()
    median_by_category.plot.bar()
    plt.xlabel(feature)
    plt.ylabel('SalePrice')
    plt.title(feature)
    plt.show()

Finding number categories in each categorical feature of testing dataset

In [514]:
test[categorical_feature_test].head()

In [515]:
train.shape, test.shape

In [516]:
for feature in categorical_feature_test:
    print('The feature is {} and number of categories are {}'.format(feature, len(test[feature].unique())))

Finding missing values training dataset

In [517]:
# Object-dtype training columns with more than one missing value.
categorical_feature_nan_train = [feature for feature in train.columns if train[feature].isnull().sum() > 1 and train[feature].dtypes == 'O']
for feature in categorical_feature_nan_train:
    # isnull().mean() is a fraction in [0, 1]; scale by 100 so the "%" label is accurate.
    print('{}: {}% missing values'.format(feature, np.round(train[feature].isnull().mean() * 100, 2)))

Finding missing values in testing dataset

In [518]:
# Object-dtype test columns with more than one missing value.
categorical_feature_nan_test = [feature for feature in test.columns if test[feature].isnull().sum() > 1 and test[feature].dtypes == 'O']
for feature in categorical_feature_nan_test:
    # isnull().mean() is a fraction in [0, 1]; scale by 100 so the "%" label is accurate.
    print('{}: {}% missing values'.format(feature, np.round(test[feature].isnull().mean() * 100, 2)))

Finding missing values in numerical feature of training dataset

In [519]:
# Non-object training columns with more than one missing value.
# The `> 1` threshold is deliberately kept: the imputation cell iterates this
# list and adds one indicator column per entry, so widening it would change
# the feature set.
numerical_features_nan_train = [feature for feature in train.columns if train[feature].isnull().sum() > 1 and train[feature].dtypes != 'O']
for feature in numerical_features_nan_train:
    # isnull().mean() is a fraction in [0, 1]; scale by 100 so the "%" label is accurate.
    print('{}: {}% missing values'.format(feature, np.round(train[feature].isnull().mean() * 100, 2)))

Finding missing values in numerical feature of testing dataset

In [520]:
# Non-object test columns with more than one missing value.
# The `> 1` threshold is deliberately kept: the imputation cell iterates this
# list and adds one indicator column per entry, so widening it would change
# the feature set.
numerical_features_nan_test = [feature for feature in test.columns if test[feature].isnull().sum() > 1 and test[feature].dtypes != 'O']
for feature in numerical_features_nan_test:
    # isnull().mean() is a fraction in [0, 1]; scale by 100 so the "%" label is accurate.
    print('{}: {}% missing values are'.format(feature, np.round(test[feature].isnull().mean() * 100, 2)))

In [521]:
train.shape, test.shape

Replacing missing values with the median value in the training dataset and the testing dataset, respectively

Training dataset

In [522]:
for feature in numerical_features_nan_train:
    median_value = train[feature].median()

    # Record where the value was missing (1) before it is overwritten.
    train[feature + 'nan'] = np.where(train[feature].isnull(), 1, 0)
    # Assign the filled column back. `train[feature].fillna(..., inplace=True)`
    # acts on a temporary Series and does not update `train` under pandas
    # Copy-on-Write (a FutureWarning in pandas 2.x).
    train[feature] = train[feature].fillna(median_value)
# Should be all zeros once the imputation has run.
train[numerical_features_nan_train].isnull().sum()

In [523]:
train.shape, test.shape

In [524]:
train[numerical_features_nan_train].head(15)

In [525]:
train.shape, test.shape

In [526]:
train.head(50)

Testing Dataset

In [527]:
for feature in numerical_features_nan_test:
    median_value = test[feature].median()

    # Record where the value was missing (1) before it is overwritten.
    test[feature + 'nan'] = np.where(test[feature].isnull(), 1, 0)
    # Assign the filled column back. `test[feature].fillna(..., inplace=True)`
    # acts on a temporary Series and does not update `test` under pandas
    # Copy-on-Write (a FutureWarning in pandas 2.x).
    test[feature] = test[feature].fillna(median_value)
# Should be all zeros once the imputation has run.
test[numerical_features_nan_test].isnull().sum()

In [528]:
test[numerical_features_nan_test].head(15)

In [529]:
test.head()

In [530]:
train.shape, test.shape

Temporal feature extraction of training dataset

In [531]:
# Turn each year column into "years elapsed before the sale".
# Re-running this cell would subtract a second time, so run it once per load.
for year_col in ('YearBuilt', 'YearRemodAdd', 'GarageYrBlt'):
    train[year_col] = train['YrSold'].sub(train[year_col])

train[['YearBuilt','YearRemodAdd','GarageYrBlt']].head()

Temporal feature extraction of testing dataset

In [532]:
# Turn each year column into "years elapsed before the sale".
# Re-running this cell would subtract a second time, so run it once per load.
for year_col in ('YearBuilt', 'YearRemodAdd', 'GarageYrBlt'):
    test[year_col] = test['YrSold'].sub(test[year_col])
test[['YearBuilt','YearRemodAdd','GarageYrBlt']].head()

Applying log transformation on numerical feature of training dataset

In [533]:
# Columns (including the target) that are replaced by their natural log.
# Note that this rebinds numerical_features_train to just these five columns.
numerical_features_train = ['LotFrontage', 'LotArea', '1stFlrSF', 'GrLivArea', 'SalePrice']
train[numerical_features_train] = np.log(train[numerical_features_train])

In [534]:
train[['LotFrontage', 'LotArea', '1stFlrSF', 'GrLivArea', 'SalePrice']].head()

Applying log transformation on numerical features of testing dataset

In [535]:
# Feature columns that are replaced by their natural log (the test set has no target).
# Note that this rebinds numerical_features_test to just these four columns.
numerical_features_test = ['LotFrontage', 'LotArea', '1stFlrSF', 'GrLivArea']
test[numerical_features_test] = np.log(test[numerical_features_test])

In [536]:
test[['LotFrontage', 'LotArea', '1stFlrSF', 'GrLivArea']].head()

Handling categorical features of training dataset

In [537]:
train.head()

In [538]:
# Recompute the object-dtype (categorical) columns from the current state of `train`.
categorical_feature_train = [name for name, dtype in train.dtypes.items() if dtype == 'O']
categorical_feature_train

In [539]:
train[categorical_feature_train].head()

In [540]:
train[categorical_feature_train].head()

replacing missing values of categorical features with mode in training dataset

In [541]:
for feature in categorical_feature_train:
    # Treat a single-space string as missing, then fill every gap with the
    # most frequent category of the column.
    cleaned = train[feature].replace(' ', np.nan)
    train[feature] = cleaned.fillna(cleaned.mode().iloc[0])


In [542]:
train[categorical_feature_train].head(5)

In [543]:
train[categorical_feature_train].isnull().sum().head()

Replacing categorical feature missing values in test dataset

In [544]:
for feature in categorical_feature_test:
    # Fill missing entries with the most frequent category of the column.
    most_frequent = test[feature].mode().iloc[0]
    test[feature] = test[feature].fillna(most_frequent)

In [545]:
test[categorical_feature_test].isnull().sum().head()

Label encoding for training dataset

In [546]:
from sklearn.preprocessing import LabelEncoder
lb = LabelEncoder()

In [547]:
for feature in categorical_feature_train:
    train[feature] = lb.fit_transform(train[feature])


In [548]:
train[categorical_feature_train].head()

In [549]:
test.drop(['BsmtFullBathnan', 'BsmtHalfBathnan'], axis =1, inplace=True)

Label encoding for test dataset

In [550]:
for feature in categorical_feature_test:
    test[feature] = test[feature].map(lambda s: '<unknown>' if s not in lb.classes_ else s)
    lb.classes_ = np.append(lb.classes_, '<unknown>')
    test[feature] = lb.transform(test[feature])

In [551]:
test[categorical_feature_test].isnull().sum().head()

In [552]:
test[categorical_feature_test].head()

In [553]:
# Fill any remaining gaps with the column median instead of dropping rows.
# The submission is built from `test`, so dropping a row here removes that
# house from the submission and leaves gaps in the row index.
test = test.fillna(test.median(numeric_only=True))

Feature scaling on train dataset

In [554]:
# Every column except the identifier and the target is scaled.
feature_scale_train = [feature for feature in train.columns if feature not in ['Id', 'SalePrice']]
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# Fits the scaler on the training features. The transformed array is only
# displayed as the cell output here; it is not stored.
scaler.fit_transform(train[feature_scale_train])

Feature scaling on test dataset

In [555]:
# Every test column except the identifier is scaled.
feature_scale_test = [feature for feature in test.columns if feature not in ['Id']]

# Applies the scaler fitted on the training features; the result is displayed, not stored.
# NOTE(review): assumes `test` has the same feature columns, in the same order, as feature_scale_train — confirm.
scaler.transform(test[feature_scale_test])

In [556]:
train.shape, test.shape

In [557]:
train[feature_scale_train].head()

In [558]:
test[feature_scale_test].head()

In [559]:
test.head()

Converting feature scaled values into dataframe

In [560]:
# Scaled training features as a DataFrame, with the (unscaled) target put back in front.
scaled_train = pd.DataFrame(scaler.transform(train[feature_scale_train]), columns=feature_scale_train)
target_train = train[['SalePrice',]].reset_index(drop = True)
new_train = pd.concat([target_train, scaled_train], axis =1)

In [561]:
new_train.head()

In [562]:
# Scaled test features as a DataFrame; it gets a fresh 0..n-1 row index.
scaled_test_values = scaler.transform(test[feature_scale_test])
new_test = pd.DataFrame(scaled_test_values, columns=feature_scale_test)
new_test.head()

## Creating Models for Regression

In [563]:
from sklearn.model_selection import train_test_split

In [564]:
# Feature matrix: everything except the target.
X = new_train.drop(['SalePrice'], axis = 1)
# The target is kept as a 1-D Series. A one-column DataFrame makes
# scikit-learn emit column-vector warnings (RandomForest, SVR) and makes
# LinearRegression return 2-D predictions.
y = new_train['SalePrice']

In [565]:
# Hold out 20% for validation. A fixed random_state makes the split, and every
# score computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [566]:
from sklearn.metrics import r2_score, mean_squared_error

### Model 1 Linear regression

In [567]:
from sklearn.linear_model import LinearRegression
# Ordinary least-squares baseline fitted on the training split.
model_1 = LinearRegression()
model_1.fit(X_train, y_train)


In [568]:
model_1_pred = model_1.predict(X_test)
model_1_pred

In [569]:
# r2_score expects (y_true, y_pred) and is not symmetric. The arguments were
# swapped, which made this score incomparable with those of the other models.
r2_score(y_test, model_1_pred)

In [570]:
mean_squared_error(y_test, model_1_pred)

model 2 Random Forest

In [571]:
from sklearn.ensemble import RandomForestRegressor
# Random forests are stochastic; a fixed seed makes the reported scores reproducible.
model_2 = RandomForestRegressor(random_state=42)
# np.ravel hands the forest a 1-D target whether y_train is a Series or a
# one-column DataFrame, which avoids the column-vector warning.
model_2.fit(X_train, np.ravel(y_train))

In [572]:
model_2_pred = model_2.predict(X_test)
model_2_pred

In [573]:
r2_score(y_test, model_2_pred)

In [574]:
mean_squared_error(y_test, model_2_pred)

Model 3 Decision Tree

In [575]:
from sklearn.tree import DecisionTreeRegressor
# Tie-breaking between equally good splits is random; a fixed seed makes the
# reported scores reproducible.
model_3 = DecisionTreeRegressor(random_state=42)
model_3.fit(X_train, y_train)


In [576]:
model_3_pred = model_3.predict(X_test)
model_3_pred

In [577]:
r2_score(y_test, model_3_pred)

In [578]:
# Score the decision tree's own predictions. The original passed model_2_pred
# and therefore repeated the random-forest error here.
mean_squared_error(y_test, model_3_pred)

Model 4 SVM regressor

kernel SVM Linear

In [579]:
from sklearn.svm import SVR
# Support-vector regressor with a linear kernel, fitted on the training split.
model_4 = SVR(kernel='linear')
model_4.fit(X_train, y_train)

In [580]:
model_4_pred = model_4.predict(X_test)
model_4_pred

In [581]:
r2_score(y_test, model_4_pred)

In [582]:
mean_squared_error(y_test, model_4_pred)

kernel SVM rbf

In [583]:
# Support-vector regressor with an RBF kernel; rebinds model_4, replacing the previous kernel's model.
model_4 = SVR(kernel = 'rbf')
model_4.fit(X_train, y_train)

In [584]:
model_4_pred = model_4.predict(X_test)
model_4_pred

In [585]:
r2_score(y_test, model_4_pred)

In [586]:
mean_squared_error(y_test, model_4_pred)

kernel SVM poly

In [587]:
# Support-vector regressor with a polynomial kernel; rebinds model_4, replacing the previous kernel's model.
model_4 = SVR(kernel = 'poly')
model_4.fit(X_train, y_train)

In [588]:
model_4_pred = model_4.predict(X_test)
model_4_pred

In [589]:
r2_score(y_test, model_4_pred)

In [590]:
mean_squared_error(y_test, model_4_pred)

kernel SVM sigmoid

In [591]:
# Support-vector regressor with a sigmoid kernel; rebinds model_4, replacing the previous kernel's model.
model_4 = SVR(kernel = 'sigmoid')
model_4.fit(X_train, y_train)

In [592]:
model_4_pred = model_4.predict(X_test)
model_4_pred

In [593]:
r2_score(y_test, model_4_pred)

In [594]:
mean_squared_error(y_test, model_4_pred)

Now checking our model on fresh test data set

In [595]:
# NOTE: rebinds X_test from the validation split to the scaled competition
# test set. After this cell runs, the validation metric cells above can no
# longer be re-executed against the held-out split without re-running the split.
X_test = new_test

Model 1 Linear reg (New test Data)

In [596]:
from sklearn.linear_model import LinearRegression
model_1 = LinearRegression()
model_1.fit(X_train, y_train)
# model_1 is rebound here from the fitted estimator to its array of predictions for X_test.
model_1 = model_1.predict(X_test)

In [597]:
model_1

Model 2 RandomForest (New test Data)

In [598]:
from sklearn.ensemble import RandomForestRegressor
# A fixed seed makes the submitted predictions reproducible.
final_forest = RandomForestRegressor(random_state=42)
# np.ravel hands the forest a 1-D target whether y_train is a Series or a
# one-column DataFrame, which avoids the column-vector warning.
final_forest.fit(X_train, np.ravel(y_train))
# `model_2` keeps its original meaning for later cells: the array of
# predictions for X_test, not the estimator.
# NOTE(review): the forest is trained on the 80% split only; consider
# refitting on all labelled data before predicting for the submission.
model_2 = final_forest.predict(X_test)


In [599]:
model_2

Model 3 Decision tree (New test Data)

In [600]:
from sklearn.tree import DecisionTreeRegressor
model_3 = DecisionTreeRegressor()
model_3.fit(X_train, y_train)
# model_3 is rebound here from the fitted estimator to its array of predictions for X_test.
model_3 = model_3.predict(X_test)
model_3

Model 4 SVM: here we use the rbf kernel. Based on the r2 score and mean squared error computed earlier for each SVM kernel, the rbf kernel gave the most accurate results.

In [601]:
model_4 = SVR(kernel = 'rbf')
model_4.fit(X_train, y_train)
# model_4 is rebound here from the fitted estimator to its array of predictions for X_test.
model_4 = model_4.predict(X_test)
model_4

In [602]:
# The submission frame starts as a copy of the first column of `test` only
# (presumably the Id column — verify); every other column is left out.
new_test_sub = test.iloc[:, :1].copy()

In [603]:
saleprice_exp = np.exp(model_2)

In [604]:
sale_price = pd.DataFrame(saleprice_exp)

In [605]:
new_test_sub['SalePrice'] = sale_price

In [606]:
new_test_sub.head()

In [608]:
new_test_sub['SalePrice'].isnull().sum()

In [616]:
new_test_sub.drop(index = [1457,1458], inplace = True)

In [617]:
new_test_sub['SalePrice'].isnull().sum()

In [618]:
new_test_sub.to_csv('submission.csv', index = False)

So we can conclude from the above that random forest has the highest r2 score and also the lowest mean squared error.