In [1]:
import plotly.express as px
from scipy.stats import f_oneway
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
import numpy as np
import tensorflow as tf
from tensorflow import keras

In [2]:
missing_value = ["N/a","Na","Nan","n/a",np.nan]

In [5]:
import pandas as pd
train_df = pd.read_csv('/content/train.csv',na_values = missing_value)
test_df = pd.read_csv('/content/test.csv',na_values = missing_value)

In [6]:
train_df = train_df.drop('Id', axis = 1)


In [7]:
train_df.SalePrice.describe()


count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64

In [8]:
fig = px.histogram(train_df, x="SalePrice")
fig.show()

In [9]:
train_df['SalePrice'] = np.log(train_df['SalePrice'])
fig = px.histogram(train_df, x="SalePrice")
fig.show()

In [10]:
train_df.dtypes.value_counts()

object     43
int64      33
float64     4
dtype: int64

In [11]:
train_df_cat = train_df.select_dtypes(include='object')
train_df_cat['SalePrice'] = train_df['SalePrice']
train_df_num = train_df.select_dtypes(include='number')
train_df_num['SalePrice'] = train_df['SalePrice']

In [12]:
def ANOVA_Test(df, feature):
    category_groups = df.groupby(feature)['SalePrice'].apply(list)
    return f_oneway(*category_groups)

In [13]:
p_vals = {}
for col in train_df_cat.columns:
    p_vals[col] = ANOVA_Test(train_df_cat, col).statistic
p_vals.pop('SalePrice')


Each of the input arrays is constant;the F statistic is not defined or infinite



inf

In [14]:
sorted(p_vals, key=p_vals.get, reverse=True)[:5]

['ExterQual', 'KitchenQual', 'BsmtQual', 'GarageFinish', 'CentralAir']

In [15]:
corr = train_df_num.corr(method='pearson')
corr = corr.SalePrice.apply(abs).to_dict()
corr.pop('SalePrice')
sorted(corr, key=corr.get, reverse=True)[:5]

['OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea', 'TotalBsmtSF']

In [16]:
features = ['OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea', 'TotalBsmtSF', 'ExterQual', 'KitchenQual', 'BsmtQual', 'GarageFinish', 'CentralAir', 'SalePrice']
train_df[features].head(5)

Unnamed: 0,OverallQual,GrLivArea,GarageCars,GarageArea,TotalBsmtSF,ExterQual,KitchenQual,BsmtQual,GarageFinish,CentralAir,SalePrice
0,7,1710,2,548,856,Gd,Gd,Gd,RFn,Y,12.247694
1,6,1262,2,460,1262,TA,TA,Gd,RFn,Y,12.109011
2,7,1786,2,608,920,Gd,Gd,Gd,RFn,Y,12.317167
3,7,1717,3,642,756,TA,Gd,TA,Unf,Y,11.849398
4,8,2198,3,836,1145,Gd,Gd,Gd,RFn,Y,12.429216


In [17]:
fig = make_subplots(rows=2, cols=5)
fig.add_trace(
    go.Scatter(x=train_df[features]['SalePrice'], y=train_df[features]['OverallQual'], mode='markers', name='SalePrice vs OverallQual'),
    row=1, col=1
)

fig.add_trace(
    go.Scatter(x=train_df[features]['SalePrice'], y=train_df[features]['GrLivArea'], mode='markers', name='SalePrice vs GrLivArea'),
    row=1, col=2
)

fig.add_trace(
    go.Scatter(x=train_df[features]['SalePrice'], y=train_df[features]['GarageCars'], mode='markers', name='SalePrice vs GarageCars'),
    row=1, col=3
)

fig.add_trace(
    go.Scatter(x=train_df[features]['SalePrice'], y=train_df[features]['GarageArea'], mode='markers', name='SalePrice vs GarageArea'),
    row=1, col=4
)

fig.add_trace(
    go.Scatter(x=train_df[features]['SalePrice'], y=train_df[features]['TotalBsmtSF'], mode='markers', name='SalePrice vs TotalBsmtSF'),
    row=1, col=5
)

fig.add_trace(
    go.Scatter(x=train_df[features]['SalePrice'], y=train_df[features]['ExterQual'], mode='markers', name='SalePrice vs ExterQual'),
    row=2, col=1
)

fig.add_trace(
    go.Scatter(x=train_df[features]['SalePrice'], y=train_df[features]['KitchenQual'], mode='markers', name='SalePrice vs KitchenQual'),
    row=2, col=2
)

fig.add_trace(
    go.Scatter(x=train_df[features]['SalePrice'], y=train_df[features]['BsmtQual'], mode='markers', name='SalePrice vs BsmtQual'),
    row=2, col=3
)

fig.add_trace(
    go.Scatter(x=train_df[features]['SalePrice'], y=train_df[features]['GarageFinish'], mode='markers', name='SalePrice vs GarageFinish'),
    row=2, col=4
)

fig.add_trace(
    go.Scatter(x=train_df[features]['SalePrice'], y=train_df[features]['CentralAir'], mode='markers', name='SalePrice vs CentralAir'),
    row=2, col=5
)

# update yaxis
fig.update_yaxes(title_text="OverallQual", row=1, col=1)
fig.update_yaxes(title_text="GrLivArea", row=1, col=2)
fig.update_yaxes(title_text="GarageCars", row=1, col=3)
fig.update_yaxes(title_text="GarageArea", row=1, col=4)
fig.update_yaxes(title_text="TotalBsmtSF", row=1, col=5)
fig.update_yaxes(title_text="ExterQual", row=2, col=1)
fig.update_yaxes(title_text="KitchenQual", row=2, col=2)
fig.update_yaxes(title_text="BsmtQual", row=2, col=3)
fig.update_yaxes(title_text="GarageFinish", row=2, col=4)
fig.update_yaxes(title_text="CentralAir", row=2, col=5)

# update xaxis
fig.update_xaxes(title_text="SalePrice", row=1, col=1)
fig.update_xaxes(title_text="SalePrice", row=1, col=2)
fig.update_xaxes(title_text="SalePrice", row=1, col=3)
fig.update_xaxes(title_text="SalePrice", row=1, col=4)
fig.update_xaxes(title_text="SalePrice", row=1, col=5)
fig.update_xaxes(title_text="SalePrice", row=2, col=1)
fig.update_xaxes(title_text="SalePrice", row=2, col=2)
fig.update_xaxes(title_text="SalePrice", row=2, col=3)
fig.update_xaxes(title_text="SalePrice", row=2, col=4)
fig.update_xaxes(title_text="SalePrice", row=2, col=5)

fig.update_layout(height=1000, width=2000, title_text="Side By Side Subplots")
fig.show()