# Homogenity test: Levene's test:

- H0 = variances are equal
- H1 = variances are not equal

In [1]:
# import the required libraries
import pandas as pd
import numpy as np
from scipy import stats

# make 2 lists with non equal variances
f_p = [100, 220, 240, 120, 80, 450, 100, 231, 330] # fruit price
m_p = [35000, 200000, 80000, 45000, 400000, 4000, 5500,15000] # mobile price

# apply levene test
# st, p = 
stats.levene(f_p, m_p)

# # interpret the p value
# alpha = 0.05
# if p > alpha:
#     print('Sample variances are equal (fail to reject H0)')
# else:
#     print('Sample variances are not equal (reject H0)')


LeveneResult(statistic=4.193449296896273, pvalue=0.05850852313340556)

# Chi Square test:
1. Goodness of fit.
2. Independence test
3. Homogenity test / correlation for categorical data

- H0 = Null hypothesis = The two categorical variables have no relationship (independent)
- H1 = Alternative hypothesis = There is a relationship (dependent) between two categorical variables

In [2]:
# make a data frame with categorical columns
df = pd.read_csv("car_data.csv")

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,make,fuel_type,aspiration,num_of_doors,body_style,drive_wheels,engine_location,wheel_base,length,...,engine_type,num_of_cylinders,engine_size,fuel_system,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,1,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,...,dohc,four,130,mpfi,9.0,111,5000,21,27,13495
1,2,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,...,dohc,four,130,mpfi,9.0,111,5000,21,27,16500
2,3,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,171.2,...,ohcv,six,152,mpfi,9.0,154,5000,19,26,16500
3,4,audi,gas,std,four,sedan,fwd,front,99.8,176.6,...,ohc,four,109,mpfi,10.0,102,5500,24,30,13950
4,5,audi,gas,std,four,sedan,4wd,front,99.4,176.6,...,ohc,five,136,mpfi,8.0,115,5500,18,22,17450


In [4]:
print(df['fuel_type'].unique())
print(df['body_style'].unique())

['gas' 'diesel']
['convertible' 'hatchback' 'sedan' 'wagon' 'hardtop']


In [5]:
# apply chi square test to check relation between fuel type and body style
st, p, dof, e = stats.chi2_contingency(pd.crosstab(df['fuel_type'], df['body_style']))

# # interpret the p value
alpha = 0.05
print( ' p value =', p)
if p > alpha:
    print('There is no relationship between fuel type and body style (fail to reject H0)')
else:
    print('There is a relationship between fuel type and body style (reject H0)')

 p value = 0.038304347063053835
There is a relationship between fuel type and body style (reject H0)


# t-test:
- Independent sample t-test / 1 sample t-test
- Two sample t-test
  - Paired
  - Un-paired
- ANOVA
  - 1 way
  - 2 way
- MANOVA
  - 1 way
  - 2 way
- ANCOVA
- MANCOVA

In [7]:
df['horsepower'] = (df['horsepower'] != '?')

In [8]:
df['horsepower'] = df['horsepower'].astype('int64')

- H0 = Sample mean is equal to 100
- H1 = Sample mean is not equal to 100 

In [9]:
# Apply one sample t test
# import the required libraries
import pandas as pd
import numpy as np
from scipy import stats

# apply t test
st, p = stats.ttest_1samp(df[['horsepower']], 100)

# # interpret the p value
alpha = 0.05
if p > alpha:
    print('Sample mean is equal to 100 (fail to reject H0)')
else:
    print('Sample mean is not equal to 100 (reject H0)')
    

Sample mean is not equal to 100 (reject H0)


In [None]:
# define ANOVA
def anova_test(df, group, target):
    # apply ANOVA test
    st, p = stats.f_oneway(df[df[group] == 'gas'][target], df[df[group] == 'diesel'][target])
    # interpret the p value
    alpha = 0.05
    if p > alpha:
        print('There is no relationship between fuel type and body style (fail to reject H0)')
    else:
        print('There is a relationship between fuel type and body style (reject H0)')

# Correlation
1. Correlation ( pearson, etc)
2. Regression 

In [19]:
# correaltion test
df.corr()

Unnamed: 0.1,Unnamed: 0,wheel_base,length,width,height,curb_weight,engine_size,compression_ratio,horsepower,city_mpg,highway_mpg
Unnamed: 0,1.0,0.129729,0.170636,0.052387,0.25596,0.071962,-0.03393,0.150276,-0.047803,0.01594,0.011255
wheel_base,0.129729,1.0,0.874587,0.795144,0.589435,0.776386,0.569329,0.249786,0.043896,-0.470414,-0.544082
length,0.170636,0.874587,1.0,0.841118,0.491029,0.877728,0.68336,0.158414,-0.041138,-0.670909,-0.704662
width,0.052387,0.795144,0.841118,1.0,0.27921,0.867032,0.735433,0.181129,-0.029787,-0.642704,-0.677218
height,0.25596,0.589435,0.491029,0.27921,1.0,0.295572,0.067149,0.261214,0.035625,-0.04864,-0.107358
curb_weight,0.071962,0.776386,0.877728,0.867032,0.295572,1.0,0.850594,0.151362,0.006892,-0.757414,-0.797465
engine_size,-0.03393,0.569329,0.68336,0.735433,0.067149,0.850594,1.0,0.028971,-0.012168,-0.653658,-0.67747
compression_ratio,0.150276,0.249786,0.158414,0.181129,0.261214,0.151362,0.028971,1.0,0.036136,0.324701,0.265201
horsepower,-0.047803,0.043896,-0.041138,-0.029787,0.035625,0.006892,-0.012168,0.036136,1.0,0.033757,-0.003595
city_mpg,0.01594,-0.470414,-0.670909,-0.642704,-0.04864,-0.757414,-0.653658,0.324701,0.033757,1.0,0.971337


## Parametric tests:
- 1. T test
- 2. Z test
- 3. ANOVA
- 4. Chi square test
- 5. Correlation test
- 6. Regression test

## Non-parametric tests
- 1. Mann Whitney U test
- 2. Kruskal Wallis H test
- 3. Friedman test
- 4. Chi square test
- 5. Spearman rank correlation test
- 6. Kendall rank correlation test

In [15]:
# apply ANOVA test
Factors = df[['make','fuel_type','body_style']]
Response = df['price']
Covariate = df['horsepower']