In [2]:
import numpy as np 
import pandas as pd
import os
import scipy
from scipy.stats import shapiro
from scipy.stats import normaltest
from scipy.stats import anderson
from matplotlib import pyplot as plt
from statsmodels.graphics.gofplots import qqplot



In [3]:
# !pip install git+https://github.com/shakedzy/dython.git

In [4]:
# Load the housing table from the Kaggle input directory.
# NOTE(review): the dataset name suggests train and test rows are stored together — confirm.
tt = pd.read_csv('/kaggle/input/housing-train-and-test/tt.csv')

In [5]:
# Discard the State column and store Zip as object dtype so it is later
# grouped with the qualitative (object-typed) features instead of the numeric ones.
# Both drop() and astype() already return new frames, so no extra copy is needed.
tt = (
    tt
    .drop(columns='State')
    .astype({'Zip': 'object'})
)

In [6]:
# Take the rows up to and including index label 47438 as the training frame.
# `.loc` slicing is label-based and inclusive of the end label; the copy keeps
# later changes to `train` from touching `tt`.
# NOTE(review): 47438 is a hard-coded boundary — presumably the last training row; confirm.
train = tt.loc[:47438].copy()

In [7]:
# Show the last rows of the training slice to check where it ends.
train.tail()

Unnamed: 0,Id,Sold_Price,Type,Year_built,Heating,Cooling,Parking,Lot,Bedrooms,Bathrooms,...,Zip,Days_Listed,Days_Sold,Address_RdType,Address_NoLen,Summary_Len,Schools_Score,Schools_Distance,Total_Appliances,Hookups
47434,47434,159000.0,SingleFamily,1965.0,Central,Central,Garage,20908.8,3,2.0,...,922,179,0,Blvd,4,291,3.0,2.1,2,Y
47435,47435,255000.0,MobileManufactured,1999.0,Central,Central,Garage,,3,2.0,...,940,222,0,,4,799,6.0,2.8,0,Y
47436,47436,2300000.0,SingleFamily,1919.0,Central,Central,No_Garage,6756.0,3,2.0,...,900,91,2132,Ave,4,805,5.666667,0.966667,5,N
47437,47437,500000.0,SingleFamily,2017.0,Central,Central,Garage,5945.0,3,3.0,...,957,213,190,Ln,4,985,7.333333,0.9,0,N
47438,47438,760000.0,SingleFamily,1948.0,Central,,Garage,8250.0,2,1.0,...,907,120,0,Ave,4,570,8.666667,0.933333,0,N


# Separate quantitative and qualitative features

In [8]:
# Split the column names by dtype: object-typed columns are treated as
# qualitative, everything else as quantitative.
quantitative = [name for name, dtype in train.dtypes.items() if dtype != 'object']
# Sold_Price and Id are non-object columns but are excluded from the
# quantitative feature list.
quantitative.remove('Sold_Price')
quantitative.remove('Id')
qualitative = [name for name, dtype in train.dtypes.items() if dtype == 'object']

In [9]:
# Display both feature lists to check the split.
quantitative, qualitative

(['Year_built',
  'Lot',
  'Bedrooms',
  'Bathrooms',
  'Full_bathrooms',
  'Total_interior_livable_area',
  'Total_spaces',
  'Garage_spaces',
  'Tax_assessed_value',
  'Annual_tax_amount',
  'Listed_Price',
  'Last_Sold_Price',
  'Days_Listed',
  'Days_Sold',
  'Address_NoLen',
  'Summary_Len',
  'Schools_Score',
  'Schools_Distance',
  'Total_Appliances'],
 ['Type',
  'Heating',
  'Cooling',
  'Parking',
  'Flooring',
  'Zip',
  'Address_RdType',
  'Hookups'])

# Testing for normality

In [10]:
def normality_tests(df, quantitative):
    Feature = []
    Shapiro_Wilk = []
    K2_Test = []
    Anderson_Darling = []
    for x in quantitative:
        Feature.append(x)
        Shapiro_Wilk.append(shapiro(df[x].dropna())[1])
        K2_Test.append(normaltest(df[x].dropna())[1])
        result = anderson(df[x].dropna())
        Anderson_Darling.append('Reject H0' if result.statistic < result.critical_values[2] else 'Failed to reject H0')
    print(f"H0: Distribution is Gaussian. If p > 0.05 means it is very likely, not certain, the distribution is Gaussian.\nShapiro-Wilk p-value may not be accurate when N > 5000. The DataFrame has {train.shape[0]} values.\nAnderson-Darling at 0.05 signifigance level.")
    out = pd.DataFrame(data=[Shapiro_Wilk, K2_Test, Anderson_Darling], index=['Shapiro-Wilk','K^2 Test','Anderson-Darling'], columns=Feature)
    out = out.transpose().copy()
    out = out.style.applymap(lambda x: "background-color: red" if type(x) != str and x<0.05 or type(x) == str and x =='Reject H0' else "background-color: green")
    return out

In [11]:
# Run the normality tests on every quantitative column of the training frame.
normality_tests(train, quantitative)



H0: Distribution is Gaussian. If p > 0.05 means it is very likely, not certain, the distribution is Gaussian.
Shapiro-Wilk p-value may not be accurate when N > 5000. The DataFrame has 47439 values.
Anderson-Darling at 0.05 signifigance level.


Unnamed: 0,Shapiro-Wilk,K^2 Test,Anderson-Darling
Year_built,0.0,0.0,Failed to reject H0
Lot,0.0,0.0,Failed to reject H0
Bedrooms,0.0,0.0,Failed to reject H0
Bathrooms,0.0,0.0,Failed to reject H0
Full_bathrooms,0.0,0.0,Failed to reject H0
Total_interior_livable_area,0.0,0.0,Failed to reject H0
Total_spaces,0.0,0.0,Failed to reject H0
Garage_spaces,0.0,0.0,Failed to reject H0
Tax_assessed_value,0.0,0.0,Failed to reject H0
Annual_tax_amount,0.0,0.0,Failed to reject H0


In [12]:
def normality_graphic(x):
    """Draw a histogram and a Q-Q plot for one numeric series.

    Missing values are removed first: the automatic bin selection used by the
    histogram cannot compute a range when the data contain NaN.

    Parameters
    ----------
    x : array-like
        Numeric values to plot (for example a DataFrame column).

    Returns
    -------
    tuple
        ``(patches, qq)`` where ``patches`` is the third element returned by
        ``plt.hist`` and ``qq`` is the object returned by ``qqplot``.

    Raises
    ------
    ValueError
        If ``x`` has no non-missing values.
    """
    values = np.asarray(x, dtype=float)
    values = values[~np.isnan(values)]
    if values.size == 0:
        raise ValueError("normality_graphic needs at least one non-missing value")
    hist_result = plt.hist(values, bins='auto')
    # NOTE(review): line='45' draws the y = x reference line, which presumably only
    # lines up with the points when the data are on a standard-normal scale — confirm
    # whether a standardized line or fit=True is wanted for raw columns.
    qq = qqplot(values, line='45')
    return hist_result[2], qq

# TODO: widen the histogram so the full range of values is visible, e.g. choose the number of bins as a function of the count of unique values and the value range.

In [None]:
# Histogram and Q-Q plot for the Lot column of the training frame.
normality_graphic(train.Lot)