# Problem statement - House Price Prediction

# Analytic Approach - Regression

In [1]:
import os
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor

%matplotlib inline
warnings.filterwarnings('ignore')

In [2]:
# Lift pandas' truncation limits so complete frames/series are rendered.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

In [3]:
# Location of the raw CSV. Set the HOUSE_PRICES_CSV environment variable to
# point at the file instead of editing the notebook; the fallback is the
# original machine-specific path.
DATA_PATH = os.environ.get("HOUSE_PRICES_CSV", "D:Dataset/HousePrices.csv")

# Fail early with an actionable message rather than a bare errno.
if not os.path.isfile(DATA_PATH):
    raise FileNotFoundError(
        f"House price dataset not found at {DATA_PATH!r}; "
        "set HOUSE_PRICES_CSV to the location of HousePrices.csv."
    )

df = pd.read_csv(DATA_PATH)

FileNotFoundError: [Errno 2] No such file or directory: 'D:Dataset/HousePrices.csv'

In [None]:
# Preview the first ten rows of the freshly loaded frame.
df.head(10)

In [None]:
# (rows, columns) of the raw frame.
df.shape

In [None]:
# Column labels available in the raw frame.
df.columns

In [None]:
# Summary statistics for the frame's columns.
df.describe()

In [None]:
# Per-column dtype and non-null count overview.
df.info()

# Data Cleaning

## Filling missing values

In [None]:
# Remove the 'Id' column. Re-binding (rather than inplace=True) combined with
# errors='ignore' keeps this cell safe to re-run: a second execution no longer
# raises KeyError once 'Id' is already gone.
df = df.drop(columns='Id', errors='ignore')

In [None]:
# Number of missing entries in every column.
df.isna().sum()

In [None]:
# Count of missing entries per column.
missing_values = df.isna().sum()

# Express each count as a share of the total number of rows, in percent.
row_count = len(df)
missing_percentage = (missing_values / row_count) * 100

# Show the share of missing data for every column.
print(missing_percentage)

In [None]:
# Impute missing LotFrontage values with the column median, then report how
# many nulls are left (cell output).
lot_frontage_median = df['LotFrontage'].median()
df['LotFrontage'] = df['LotFrontage'].fillna(lot_frontage_median)
df['LotFrontage'].isna().sum()

In [None]:
# Inspect the distinct values currently stored in Alley before filling.
print(df['Alley'].unique())

In [None]:
# Replace missing Alley entries with the explicit label 'NA'.
df['Alley'] = df['Alley'].fillna(value='NA')
# Show the categories after the fill, then the remaining null count (cell output).
print(df['Alley'].unique())
df['Alley'].isna().sum()

In [None]:
# Inspect the distinct values currently stored in MasVnrType before filling.
print(df['MasVnrType'].unique())

In [None]:
# Impute missing MasVnrType entries with the most frequent category.
most_common_veneer = df['MasVnrType'].mode()[0]
df['MasVnrType'] = df['MasVnrType'].fillna(most_common_veneer)
# Show the categories after the fill, then the remaining null count (cell output).
print(df['MasVnrType'].unique())
df['MasVnrType'].isna().sum()

In [None]:
# Impute missing MasVnrArea values with the column median, then report how
# many nulls are left (cell output).
veneer_area_median = df['MasVnrArea'].median()
df['MasVnrArea'] = df['MasVnrArea'].fillna(veneer_area_median)
df['MasVnrArea'].isna().sum()

In [None]:
# Inspect the distinct values currently stored in BsmtQual before filling.
print(df['BsmtQual'].unique())

In [None]:
# Replace missing BsmtQual entries with the explicit label 'NA'.
df['BsmtQual'] = df['BsmtQual'].fillna(value='NA')
# Show the categories after the fill, then the remaining null count (cell output).
print(df['BsmtQual'].unique())
df['BsmtQual'].isna().sum()

In [None]:
# Inspect the distinct values currently stored in BsmtCond before filling.
print(df['BsmtCond'].unique())

In [None]:
# Replace missing BsmtCond entries with the explicit label 'NA'.
df['BsmtCond'] = df['BsmtCond'].fillna(value='NA')
# Show the categories after the fill, then the remaining null count (cell output).
print(df['BsmtCond'].unique())
df['BsmtCond'].isna().sum()

In [None]:
# Inspect the distinct values currently stored in BsmtExposure before filling.
print(df['BsmtExposure'].unique())

In [None]:
# Replace missing BsmtExposure entries with the explicit label 'NA'.
df['BsmtExposure'] = df['BsmtExposure'].fillna(value='NA')
# Show the categories after the fill, then the remaining null count (cell output).
print(df['BsmtExposure'].unique())
df['BsmtExposure'].isna().sum()

In [None]:
# Inspect the distinct values currently stored in BsmtFinType1 before filling.
print(df['BsmtFinType1'].unique())

In [None]:
# Replace missing BsmtFinType1 entries with the explicit label 'NA'.
df['BsmtFinType1'] = df['BsmtFinType1'].fillna(value='NA')
# Show the categories after the fill, then the remaining null count (cell output).
print(df['BsmtFinType1'].unique())
df['BsmtFinType1'].isna().sum()

In [None]:
# Inspect the distinct values currently stored in BsmtFinType2 before filling.
print(df['BsmtFinType2'].unique())

In [None]:
# Replace missing BsmtFinType2 entries with the explicit label 'NA'.
df['BsmtFinType2'] = df['BsmtFinType2'].fillna(value='NA')
# Show the categories after the fill, then the remaining null count (cell output).
print(df['BsmtFinType2'].unique())
df['BsmtFinType2'].isna().sum()

In [None]:
# Inspect the distinct values currently stored in Electrical before filling.
print(df['Electrical'].unique())

In [None]:
# Impute missing Electrical entries with the most frequent category.
most_common_electrical = df['Electrical'].mode()[0]
df['Electrical'] = df['Electrical'].fillna(most_common_electrical)
# Show the categories after the fill, then the remaining null count (cell output).
print(df['Electrical'].unique())
df['Electrical'].isna().sum()

In [None]:
# Inspect the distinct values currently stored in FireplaceQu before filling.
print(df['FireplaceQu'].unique())

In [None]:
# Replace missing FireplaceQu entries with the explicit label 'NA'.
df['FireplaceQu'] = df['FireplaceQu'].fillna(value='NA')
# Show the categories after the fill, then the remaining null count (cell output).
print(df['FireplaceQu'].unique())
df['FireplaceQu'].isna().sum()

In [None]:
# Inspect the distinct values currently stored in GarageType before filling.
print(df['GarageType'].unique())

In [None]:
# Replace missing GarageType entries with the explicit label 'NA'.
df['GarageType'] = df['GarageType'].fillna(value='NA')
# Show the categories after the fill, then the remaining null count (cell output).
print(df['GarageType'].unique())
df['GarageType'].isna().sum()

In [None]:
# Impute missing GarageYrBlt values with the column median, then report how
# many nulls are left (cell output).
garage_year_median = df['GarageYrBlt'].median()
df['GarageYrBlt'] = df['GarageYrBlt'].fillna(garage_year_median)
df['GarageYrBlt'].isna().sum()

In [None]:
# Inspect the distinct values currently stored in GarageFinish before filling.
print(df['GarageFinish'].unique())

In [None]:
# Replace missing GarageFinish entries with the explicit label 'NA'.
df['GarageFinish'] = df['GarageFinish'].fillna(value='NA')
# Show the categories after the fill, then the remaining null count (cell output).
print(df['GarageFinish'].unique())
df['GarageFinish'].isna().sum()

In [None]:
# Inspect the distinct values currently stored in GarageQual before filling.
print(df['GarageQual'].unique())

In [None]:
# Replace missing GarageQual entries with the explicit label 'NA'.
df['GarageQual'] = df['GarageQual'].fillna(value='NA')
# Show the categories after the fill, then the remaining null count (cell output).
print(df['GarageQual'].unique())
df['GarageQual'].isna().sum()

In [None]:
# Inspect the distinct values currently stored in GarageCond before filling.
print(df['GarageCond'].unique())

In [None]:
# Replace missing GarageCond entries with the explicit label 'NA'.
df['GarageCond'] = df['GarageCond'].fillna(value='NA')
# Show the categories after the fill, then the remaining null count (cell output).
print(df['GarageCond'].unique())
df['GarageCond'].isna().sum()

In [None]:
# Inspect the distinct values currently stored in PoolQC before filling.
print(df['PoolQC'].unique())

In [None]:
# Replace missing PoolQC entries with the explicit label 'NA'.
df['PoolQC'] = df['PoolQC'].fillna(value='NA')
# Show the categories after the fill, then the remaining null count (cell output).
print(df['PoolQC'].unique())
df['PoolQC'].isna().sum()

In [None]:
# Inspect the distinct values currently stored in Fence before filling.
print(df['Fence'].unique())

In [None]:
# Replace missing Fence entries with the explicit label 'NA'.
df['Fence'] = df['Fence'].fillna(value='NA')
# Show the categories after the fill, then the remaining null count (cell output).
print(df['Fence'].unique())
df['Fence'].isna().sum()

In [None]:
# Inspect the distinct values currently stored in MiscFeature before filling.
print(df['MiscFeature'].unique())

In [None]:
# Replace missing MiscFeature entries with the explicit label 'NA'.
df['MiscFeature'] = df['MiscFeature'].fillna(value='NA')
# Show the categories after the fill, then the remaining null count (cell output).
print(df['MiscFeature'].unique())
df['MiscFeature'].isna().sum()

In [None]:
# Spot-check ten random rows after imputation. The seed is pinned so the
# same rows are shown on every Restart & Run All.
df.sample(10, random_state=0)

## Checking correlation

In [None]:
plt.figure(figsize=(30, 20))  # Set the figure size
colormap = sns.diverging_palette(10, 220, as_cmap = True)

# Pairwise correlations of the numeric/boolean columns only. The frame still
# holds string (object) columns at this point, and DataFrame.corr() raises on
# those in pandas >= 2.0 instead of silently skipping them.
corr_matrix = df.select_dtypes(include=['number', 'bool']).corr()

# Generate a mask for the upper triangle (the matrix is symmetric, so only
# the lower half is drawn)
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

sns.heatmap(corr_matrix, mask=mask, cmap=colormap, annot=True, fmt=".2f", linewidths=0.5, vmax=0.5, annot_kws={'fontsize':9})
plt.title('Correlation Heatmap', fontsize=14)
plt.xticks(rotation=90, ha='right')  # Rotate x-axis labels for better visibility
plt.yticks(rotation=0)
plt.show()

In [None]:
# Correlation matrix over numeric/boolean columns only; calling corr() on a
# frame that still contains string columns raises in pandas >= 2.0.
correlation_matrix = df.select_dtypes(include=['number', 'bool']).corr()

# Extract the absolute correlation of each column with the target variable,
# strongest first, rounded to two decimals
correlations = correlation_matrix['Property_Sale_Price'].abs().sort_values(ascending=False).round(2)

# Display the correlations in decreasing order
print(correlations)

In [None]:
# Minimum absolute correlation with the target a column needs to be kept.
threshold = 0.3

# Columns of the correlation matrix whose link to the sale price is weaker
# than the threshold.
target_strength = correlation_matrix['Property_Sale_Price'].abs()
columns_to_drop = correlation_matrix.columns[target_strength < threshold]

# Build the reduced frame without those columns; df itself is left untouched.
df_dropped = df.drop(columns=columns_to_drop)

# Shape of the reduced frame (original author's note: 19 columns removed).
df_dropped.shape

In [None]:
# Preview the first rows of the reduced frame.
df_dropped.head()

In [None]:
plt.figure(figsize=(16, 12))  # Set the figure size
colormap = sns.diverging_palette(10, 220, as_cmap = True)

# Pairwise correlations of the numeric/boolean columns that survived the
# threshold filter. df_dropped still holds string (object) columns, and
# DataFrame.corr() raises on those in pandas >= 2.0.
corr_matrix = df_dropped.select_dtypes(include=['number', 'bool']).corr()

# Generate a mask for the upper triangle (the matrix is symmetric, so only
# the lower half is drawn)
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

sns.heatmap(corr_matrix, mask=mask, cmap=colormap, annot=True, fmt=".2f", linewidths=0.5, vmax=0.5, annot_kws={'fontsize':11})
plt.title('Correlation Heatmap', fontsize=20)
plt.xticks(rotation=90, ha='right')  # Rotate x-axis labels for better visibility
plt.yticks(rotation=0)
plt.show()

# Checking distribution of data

In [None]:
# Scatter-plot matrix of the reduced frame, with KDE curves on the diagonal.
sns.pairplot(data=df_dropped, diag_kind='kde')

In [None]:
# Draw one histogram (with KDE overlay) per column of df on a two-column grid.
num_columns = 2  # Number of columns for the subplot grid
num_rows = -(-len(df.columns) // num_columns)  # Ceiling division: enough rows for every column

# One distinct colour per plotted column
colors = sns.color_palette('husl', df.shape[1])

# Create a figure and axes for the subplots
fig, axes = plt.subplots(num_rows, num_columns, figsize=(18,275))

# Flatten the axes array for ease of iteration
axes = axes.flatten()

# num_rows is rounded up, so there is always at least one axis per column and
# zip() never truncates the column list.
for i, (column, ax) in enumerate(zip(df.columns, axes)):
    sns.histplot(data=df, x=column, kde=True, color=colors[i], line_kws={'color': 'black'}, ax=ax)
    ax.set_title(column,fontsize=20)

    # Style the ticks through tick_params instead of re-setting the label
    # text: set_*ticklabels(get_*ticklabels()) pins whatever labels exist
    # before the final layout, which can leave numeric axes with blank or
    # stale labels.
    ax.tick_params(axis='x', labelrotation=55, labelsize=11)
    ax.tick_params(axis='y', labelsize=11)
    ax.set_xlabel(column,fontsize=15)
    ax.set_ylabel('Count',fontsize=15)

# Remove any unused subplots (at most one, when the column count is odd)
for unused_ax in axes[len(df.columns):]:
    fig.delaxes(unused_ax)

# Adjust the layout of subplots and spacing
plt.tight_layout()

# Display the subplots
plt.show()

# Checking the value counts of each categorical column

In [None]:

# Select the categorical columns from the DataFrame
categorical_columns = df.select_dtypes(include='object').columns

# Determine the number of rows and columns for the subplot grid
num_columns = 2  # Number of columns for the subplot grid
num_rows = -(-len(categorical_columns) // num_columns)  # Ceiling division: enough rows for every column

# Create a figure and axes for the subplots
fig, axes = plt.subplots(num_rows, num_columns, figsize=(18,200))

# Flatten the axes array for ease of iteration
axes = axes.flatten()

# num_rows is rounded up, so there is always at least one axis per column and
# zip() never truncates the column list.
for column, ax in zip(categorical_columns, axes):
    # Number of non-null entries in the column: the denominator for the labels
    value_counts = df[column].value_counts()
    total_count = value_counts.sum()

    # Create a countplot for the column
    sns.countplot(data=df, x=column, ax=ax)

    # Label every bar with its share of the column. The share is derived from
    # the bar's own height: value_counts() is sorted by frequency while the
    # bars follow countplot's category order, so pairing them by position
    # would put percentages on the wrong bars.
    for p in ax.patches:
        bar_height = p.get_height()
        ax.text(p.get_x() + p.get_width() / 2, bar_height + 5, f"{bar_height / total_count * 100:.1f}%", ha='center',fontsize=10)

    # Style the ticks through tick_params instead of re-setting the label
    # text, which would pin the labels captured before the final layout.
    ax.tick_params(axis='x', labelrotation=55, labelsize=11)
    ax.tick_params(axis='y', labelsize=11)
    ax.set_xlabel(column,fontsize=15)
    ax.set_ylabel('Count',fontsize=15)

# Remove any unused subplots (at most one, when the column count is odd)
for unused_ax in axes[len(categorical_columns):]:
    fig.delaxes(unused_ax)

# Adjust the layout of subplots and spacing
fig.subplots_adjust(hspace=0.3)

# Display the subplots
plt.show()

# Standardization

In [None]:
# Scaler reused for every column; after the loop it holds the statistics of
# the last column only.
std = StandardScaler()

# Numeric features to rescale.
cols = [
    'OverallQual', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
    'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'FullBath',
    'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea',
    'WoodDeckSF', 'OpenPorchSF',
]

# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed later in the notebook, so test-set statistics leak into the
# features. Consider fitting on the training partition only.
for feature in cols:
    single_column = df_dropped[[feature]]
    df_dropped[feature] = std.fit_transform(single_column)

# Encoding

In [None]:
# Expand the remaining categorical columns into indicator columns.
df_dropped = pd.get_dummies(data=df_dropped)

In [None]:
# Column dtypes after encoding.
df_dropped.dtypes

In [None]:
# (rows, columns) after encoding.
df_dropped.shape

# Model building

In [None]:
# Separate the target from the feature matrix.
target_name = 'Property_Sale_Price'
y1 = df_dropped[target_name]
X1 = df_dropped.drop(columns=target_name)

# Hold out 30% of the rows for evaluation; the seed keeps the split fixed.
X_train, X_test, y_train, y_test = train_test_split(
    X1,
    y1,
    test_size=0.3,
    random_state=0,
)

# Linear Regression

In [None]:
# Ordinary least squares baseline; fit() returns the fitted estimator.
model_1 = LinearRegression().fit(X_train, y_train)

# R^2 on the training partition and on the held-out partition.
for split_name, features, target in (('train', X_train, y_train), ('test', X_test, y_test)):
    print(f'{split_name} - ', model_1.score(features, target))

# Decision Tree Regressor

In [None]:
# Decision tree with default hyper-parameters. random_state is pinned so the
# scores are reproducible across runs; 0 matches the train/test split seed.
model_2 = DecisionTreeRegressor(random_state=0)
model_2.fit(X_train,y_train)

# R^2 on the training and held-out partitions
print('train - ', model_2.score(X_train,y_train))
print('test - ', model_2.score(X_test,y_test))

In [None]:
# Decision tree limited to depth 5. random_state is pinned so the scores are
# reproducible across runs; 0 matches the train/test split seed.
model_2 = DecisionTreeRegressor(max_depth=5, random_state=0)
model_2.fit(X_train,y_train)

# R^2 on the training and held-out partitions
print('train - ', model_2.score(X_train,y_train))
print('test - ', model_2.score(X_test,y_test))

In [None]:
# Decision tree with the friedman_mse split criterion, limited to depth 7.
# random_state is pinned so the scores are reproducible across runs; 0
# matches the train/test split seed.
model_2 = DecisionTreeRegressor(criterion="friedman_mse",max_depth=7,random_state=0)
model_2.fit(X_train,y_train)

# R^2 on the training and held-out partitions
print('train - ', model_2.score(X_train,y_train))
print('test - ', model_2.score(X_test,y_test))

# Random Forest Regressor

In [None]:
# Random forest with default hyper-parameters. random_state is pinned so the
# bootstrap samples and feature draws, and hence the scores, are reproducible
# across runs; 0 matches the train/test split seed.
model_3 = RandomForestRegressor(random_state=0)
model_3.fit(X_train,y_train)

# R^2 on the training and held-out partitions
print('train - ', model_3.score(X_train,y_train))
print('test - ', model_3.score(X_test,y_test))

In [None]:
# Random forest: 101 trees, friedman_mse criterion, depth capped at 9.
# random_state is pinned so the scores are reproducible across runs; 0
# matches the train/test split seed.
model_3 = RandomForestRegressor(n_estimators=101,criterion='friedman_mse', max_depth= 9, random_state=0)
model_3.fit(X_train,y_train)

# R^2 on the training and held-out partitions
print('train - ', model_3.score(X_train,y_train))
print('test - ', model_3.score(X_test,y_test))

In [None]:
# Random forest: 101 trees, absolute_error criterion, depth capped at 10.
# random_state is pinned so the scores are reproducible across runs; 0
# matches the train/test split seed.
model_3 = RandomForestRegressor(n_estimators=101,criterion='absolute_error', max_depth= 10, random_state=0)
model_3.fit(X_train,y_train)

# R^2 on the training and held-out partitions
print('train - ', model_3.score(X_train,y_train))
print('test - ', model_3.score(X_test,y_test))

In [None]:
# Random forest: 100 trees, friedman_mse criterion, depth capped at 15.
# random_state is pinned so the scores are reproducible across runs; 0
# matches the train/test split seed.
model_3 = RandomForestRegressor(n_estimators=100,criterion='friedman_mse', max_depth= 15, random_state=0)
model_3.fit(X_train,y_train)

# R^2 on the training and held-out partitions
print('train - ', model_3.score(X_train,y_train))
print('test - ', model_3.score(X_test,y_test))

In [None]:
# Random forest: 80 trees, absolute_error criterion, depth capped at 15.
# random_state is pinned so the scores are reproducible across runs; 0
# matches the train/test split seed.
model_3 = RandomForestRegressor(n_estimators=80,criterion='absolute_error', max_depth=15, random_state=0)
model_3.fit(X_train,y_train)

# R^2 on the training and held-out partitions
print('train - ', model_3.score(X_train,y_train))
print('test - ', model_3.score(X_test,y_test))

# Scores of models :

### 1. Linear Regression : train = 0.97, test =  -1.47

### 2. Decision Tree Regression : train = 0.87, test =  0.77

### 3. Random Forest Regression : train = 0.97, test =  0.91


### Interpretation : The random forest regressor yields the highest test score of the three regression models

