In [3]:
import pandas as pd
import plotly.express as px
import pandas as pd
import numpy as np


1	Id	Unique identifier used to count the records.

2	MSSubClass	 Identifies the type of dwelling involved in the sale.

3	MSZoning	Identifies the general zoning classification of the sale.

4	LotArea	 Lot size in square feet.

5	LotConfig	Configuration of the lot

6	BldgType	Type of dwelling

7	OverallCond	Rates the overall condition of the house

8	YearBuilt	Original construction year

9	YearRemodAdd	Remodel date (same as construction date if no remodeling or additions).

10	Exterior1st	Exterior covering on house

11	BsmtFinSF2	Type 2 finished basement area in square feet.

12	TotalBsmtSF	Total square feet of basement area

13	SalePrice	Sale price of the house; the target variable to be predicted.

In [4]:
# Read the house-price workbook into a DataFrame; the path is relative to the
# notebook's working directory.
data = pd.read_excel(io='Data/HousePricePrediction.xlsx')

In [5]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

(2919, 13)

In [6]:
# Per-column dtypes: object columns are the categorical features, the int64 and
# float64 columns are the numeric ones.
data.dtypes

Id                int64
MSSubClass        int64
MSZoning         object
LotArea           int64
LotConfig        object
BldgType         object
OverallCond       int64
YearBuilt         int64
YearRemodAdd      int64
Exterior1st      object
BsmtFinSF2      float64
TotalBsmtSF     float64
SalePrice       float64
dtype: object

In [7]:
# Count the missing values in every column (isna is the same check as isnull).
data.isna().sum()

Id                 0
MSSubClass         0
MSZoning           4
LotArea            0
LotConfig          0
BldgType           0
OverallCond        0
YearBuilt          0
YearRemodAdd       0
Exterior1st        1
BsmtFinSF2         1
TotalBsmtSF        1
SalePrice       1459
dtype: int64

In [8]:
# Keep only rows with no missing value in any column. SalePrice is empty for
# 1459 of the 2919 rows, so this retains only the records that carry a price.
data = data.dropna(axis=0, how='any')

In [9]:
# Histogram of the target variable to inspect its spread and skew.
fig = px.histogram(
    data_frame=data,
    x='SalePrice',
    title='Distribution of Sale Price',
)
fig.show()

In [12]:
# Split the columns by dtype; both frames are reused by later cells.
numerical_data = data.select_dtypes(include=['int64', 'float64'])
categorical_data = data.select_dtypes(include=['object'])

# MSZoning is the general zoning classification of the sale, not a
# neighborhood, so the chart is titled accordingly.
# Aggregate first so each zoning class is a single bar whose height (and hover
# value) is the number of houses, rather than handing px.bar one row per house.
zoning_counts = (
    categorical_data['MSZoning']
    .value_counts()
    .rename_axis('MSZoning')
    .reset_index(name='count')
)

fig = px.bar(
    zoning_counts,
    x='MSZoning',
    y='count',
    title='Number of Houses by Zoning Classification',
)
fig.show()

In [13]:
# Pie chart of how the houses are split across lot configurations.
fig = px.pie(
    data_frame=categorical_data,
    names='LotConfig',
    title='Distribution of Houses by Lot Configuration',
)
fig.show()

In [14]:
# One bar chart per categorical column showing how many houses fall in each
# category.
for column in categorical_data.columns:
    # Aggregate before plotting so every category is drawn as one bar whose
    # height and hover value are the house count, instead of passing one row
    # per house to px.bar. Bars come out ordered by descending count.
    category_counts = (
        categorical_data[column]
        .value_counts()
        .rename_axis(column)
        .reset_index(name='count')
    )
    fig = px.bar(
        category_counts,
        x=column,
        y='count',
        title=f'Number of Houses by {column}',
    )
    fig.show()

In [15]:
# Pairwise correlations between the numeric columns (DataFrame.corr defaults to
# Pearson). Id is only a record counter, so it is excluded: its correlation
# with the real features carries no meaning and just adds a noisy row/column.
# errors='ignore' keeps the cell working if Id has already been removed.
correlation_matrix = numerical_data.drop(columns=['Id'], errors='ignore').corr()

# Heatmap on a fixed [-1, 1] diverging scale so that 0 sits at the neutral
# colour. text_auto prints each coefficient inside its cell, replacing a manual
# nested loop of fig.add_annotation calls (requires Plotly 5.5 or newer).
fig = px.imshow(
    correlation_matrix,
    x=correlation_matrix.columns,
    y=correlation_matrix.columns,
    color_continuous_scale='RdBu_r',  # red-blue diverging colour scale
    zmin=-1,
    zmax=1,
    aspect="auto",
    text_auto='.2f',
    title='Correlation Heatmap of Numerical Variables',
)

# Small font so the two-decimal labels fit inside the cells.
fig.update_traces(textfont_size=8)

# Layout tweaks for readability: no axis titles, column labels on top.
fig.update_layout(
    xaxis_title="",
    yaxis_title="",
    xaxis={'side': 'top'},
    width=800,
    height=700,
)

fig.show()