In [1]:
# The Wald test (also called the Wald Chi-Squared Test) 
# is a way to find out if explanatory variables in a model are significant.

#The test can be used for a multitude of different models including those with
#binary variables or continuous variables.

# If the Wald test shows that the parameters for certain explanatory variables are zero, 
# you can remove the variables from the model.

# If the test shows the parameters are not zero, you should include the variables in the model.

# Explanatory Variable & Response Variable


An explanatory variable is a type of independent variable. The two terms are often used interchangeably, but there is a subtle difference between them. A variable is independent when it is not affected at all by any other variable. When it is not certain that a variable is truly independent, it is called an explanatory variable.

In [2]:
# Explanatory : One or more variables, which explains output
# The response variable is the focus of a question in a study or experiment. 
# An explanatory variable is one that explains changes in that variable.

In [None]:
# Statsmodels

In [6]:
# Data manipulation
import numpy as np
import pandas as pd
# Plotting graphs
import matplotlib.pyplot as plt
# Regression models (formula interface)
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Location of the LGD loss data. A raw string keeps the Windows backslashes
# literal; the previous literal mixed escaped backslashes with the invalid
# escape sequence "\L", which newer Python versions warn about.
# NOTE(review): this is a machine-specific absolute path -- adjust as needed.
LGD_CSV_PATH = r'C:\Users\HP\Downloads\LGD_DATA.csv'

# CSV File - Read
df = pd.read_csv(LGD_CSV_PATH)

# pd.get_dummies converts a categorical variable into dummy (indicator) columns,
# one column per distinct value of the source column.
df_Gender = pd.get_dummies(df['Gender'])
df_Married = pd.get_dummies(df['Married'])

# Join both sets of dummy columns to the main dataframe in a single concat.
df_new = pd.concat([df, df_Gender, df_Married], axis=1)

# Keep only the columns used by the model and give the two columns whose names
# contain spaces formula-friendly names. 'F' and 'Single' are presumably the
# dummy columns created above from 'Gender' and 'Married'.
dfClean = df_new[
    ['Age', 'Number of Vehicles', 'F', 'Single', 'Losses in Thousands']
].rename(columns={'Number of Vehicles': 'nVeh', 'Losses in Thousands': 'LTh'})

# Ordinary least squares: losses explained by age, number of vehicles,
# the 'F' dummy and the 'Single' dummy.
results = smf.ols('LTh ~ Age + nVeh + F + Single', data=dfClean).fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                    LTh   R-squared:                       0.302
Model:                            OLS   Adj. R-squared:                  0.302
Method:                 Least Squares   F-statistic:                     1652.
Date:                Sat, 23 Feb 2019   Prob (F-statistic):               0.00
Time:                        08:47:53   Log-Likelihood:            -1.0360e+05
No. Observations:               15290   AIC:                         2.072e+05
Df Residuals:                   15285   BIC:                         2.072e+05
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    639.2675      6.657     96.035      0.0

In [7]:
# Wald test for each term of the fitted OLS model held in `results`
# (fitted in the regression cell); the resulting table of F statistics and
# p-values is displayed as this cell's output.
results.wald_test_terms()

<class 'statsmodels.stats.contrast.WaldTestResults'>
                     F                     P>F  df constraint  df denom
Intercept  9222.760335                     0.0              1   15285.0
Age        4335.206519                     0.0              1   15285.0
nVeh          2.254576      0.1332406337770106              1   15285.0
F           745.975340  2.077775699530809e-160              1   15285.0
Single     1547.284981                1.9e-322              1   15285.0

In [10]:
from scipy.stats import wald

# NOTE: scipy.stats.wald is the Wald continuous probability distribution,
# not the Wald hypothesis test applied to the regression terms.
# Request its first four moments in one call: mean, variance, skewness, kurtosis.
wald_moments = wald.stats(moments='mvsk')
mean, var, skew, kurt = wald_moments

# Display the variance as the cell output.
var

array(1.)

In [None]:
# Hypothesis Testing
# mean, var, skew, kurt = wald.stats(moments='mvsk')
# Useful in Linear Regression etc.

In [11]:
# T-test


In [13]:
# Likelihood Ratio test
# Does the model that includes the variable(s) in question tell us more about the outcome (or response) variable than a model that 
# does not include the variable(s)?

# Likelihood ratio test to test whether restricted model is correct

In [14]:
# Likelihood Ratio Test (Wilks Test),

In [22]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy import stats
# Compatibility shim kept from the original cell: exposes the chi-squared
# survival function under the name `scipy.stats.chisqprob`.
# NOTE(review): presumably needed by library code that still looks up the old
# name -- confirm whether it can be dropped. `lrtest` below no longer uses it.
stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)


def lrtest(llmin, llmax, dof=1):
    """Likelihood ratio test between two nested models.

    Parameters
    ----------
    llmin : float
        Log-likelihood of the restricted (smaller) model.
    llmax : float
        Log-likelihood of the unrestricted (larger) model.
    dof : int, optional
        Number of extra parameters in the larger model, i.e. the degrees of
        freedom of the chi-squared reference distribution. Defaults to 1,
        which matches the previously hard-coded behaviour.

    Returns
    -------
    tuple
        ``(lr, p)``: the statistic ``2 * (llmax - llmin)`` and its upper-tail
        chi-squared probability with ``dof`` degrees of freedom.
    """
    lr = 2 * (llmax - llmin)
    # Call the survival function directly instead of going through the
    # monkeypatched `stats.chisqprob` attribute, so the function also works
    # when the shim above has not been installed.
    p = stats.chi2.sf(lr, dof)
    return lr, p

# Load the "dietox" example dataset of the "geepack" R package through
# statsmodels and show its first rows.
data = sm.datasets.get_rdataset("dietox", "geepack").data
print(data.head())
# Null model: pig weight explained by time only, with observations grouped by pig.
md = smf.mixedlm("Weight ~ Time", data, groups=data["Pig"])
# reml=False requests a maximum-likelihood fit (the summary reports "Method: ML").
mdf = md.fit(reml=False)
print(mdf.summary())
# Log-likelihood of the null model.
llf = mdf.llf


# Alternative model: time and litter as predictors of pig weight, same grouping.
mdlitter = smf.mixedlm("Weight ~ Time + Litter", data, groups=data["Pig"])
mdflitter = mdlitter.fit(reml=False)
print(mdflitter.summary())
# Log-likelihood of the alternative model.
llflitter = mdflitter.llf

# Compare the two fits: smaller model's log-likelihood first, larger model's second.
# NOTE(review): lrtest assumes the larger model has exactly one extra parameter;
# that holds only if Litter enters the formula as a single numeric term -- confirm.
lr, p = lrtest(llf, llflitter)
# Prints the LR statistic (2 decimals) followed by its p-value (4 decimals).
print('LR test, p value: {:.2f}, {:.4f}'.format(lr, p))

     Weight       Feed  Time   Pig  Evit  Cu  Litter
0  26.50000        NaN     1  4601     1   1       1
1  27.59999   5.200005     2  4601     1   1       1
2  36.50000  17.600000     3  4601     1   1       1
3  40.29999  28.500000     4  4601     1   1       1
4  49.09998  45.200001     5  4601     1   1       1
         Mixed Linear Model Regression Results
Model:            MixedLM Dependent Variable: Weight    
No. Observations: 861     Method:             ML        
No. Groups:       72      Scale:              11.3525   
Min. group size:  11      Likelihood:         -2402.9325
Max. group size:  12      Converged:          Yes       
Mean group size:  12.0                                  
--------------------------------------------------------
             Coef.  Std.Err.    z    P>|z| [0.025 0.975]
--------------------------------------------------------
Intercept    15.724    0.783  20.083 0.000 14.189 17.258
Time          6.943    0.033 208.071 0.000  6.877  7.008
Group Va

In [17]:
#The llf attribute is generated for each model—this is the log likelihood statistic. 
#The likelihood ratio test then compares the log likelihood values and tests
#whether the alternative model is
#significantly different to the null model.

In [18]:
# Since the likelihood ratio test was not statistically significant, the litter
# the pig was born in does not add explanatory power for its weight. We therefore
# fail to reject the null model and keep it in preference to the alternative model:
# the growth in pig weight is sufficiently explained by time.

In [19]:
#The likelihood ratio test compares how well a model with a potential predictor explains an
#outcome, compared to a model without the predictor. That is, the test indicates 
#whether a potential predictor is associated with an outcome.
# If the predictor is not associated with the outcome,
# we fail to reject the null model and prefer it over the alternative model.

In [23]:
# Pearson's chi-squared test is an example of a statistical hypothesis test
# for independence between categorical variables.

In [24]:
#        Science,	Math,	Art
#Male         20,      30,    15
#Female       20,      15,    30

# Historically, 
#a table summarization of two categorical variables in this form is called a contingency table.
#stat, p, dof, expected = chi2_contingency(table)

#Historically, 
#a table summarization of two categorical variables in this form is called a contingency table.

In [25]:
# Chi-squared test of independence on a contingency table whose rows have
# similar proportions.
from scipy.stats import chi2, chi2_contingency

# Observed counts: two rows by three columns.
table = [
    [10, 20, 30],
    [6, 9, 17],
]
print(table)

# Test statistic, p-value, degrees of freedom and expected counts.
stat, p, dof, expected = chi2_contingency(table)
print('dof=%d' % dof)
print(expected)

# Decision 1: compare the statistic with the critical value at the chosen
# probability level.
prob = 0.95
critical = chi2.ppf(prob, dof)
print('probability=%.3f, critical=%.3f, stat=%.3f' % (prob, critical, stat))
print('Dependent (reject H0)' if abs(stat) >= critical else 'Independent (fail to reject H0)')

# Decision 2: compare the p-value with the significance level.
alpha = 1.0 - prob
print('significance=%.3f, p=%.3f' % (alpha, p))
print('Dependent (reject H0)' if p <= alpha else 'Independent (fail to reject H0)')

[[10, 20, 30], [6, 9, 17]]
dof=2
[[10.43478261 18.91304348 30.65217391]
 [ 5.56521739 10.08695652 16.34782609]]
probability=0.950, critical=5.991, stat=0.272
Independent (fail to reject H0)
significance=0.050, p=0.873
Independent (fail to reject H0)
