# Chi-square testing with scipy

In [21]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import math


In [22]:
# Sample under test: 40 two-decimal values, laid out eight per row.
data = np.array([
    0.74, 0.58, 0.57, 0.81, 0.98, 0.69, 0.53, 0.93,
    0.73, 0.62, 0.66, 0.13, 0.12, 0.56, 0.98, 0.15,
    0.47, 0.72, 0.18, 0.46, 0.42, 0.31, 0.24, 0.17,
    0.95, 0.01, 0.36, 0.88, 0.01, 0.37, 0.37, 0.67,
    0.60, 0.13, 0.14, 0.56, 0.41, 0.46, 0.32, 0.36,
])
len(data)

40

In [23]:
# Number of classes: square root of the sample size, rounded up.
c = int(np.ceil(np.sqrt(data.size)))
c

7

## Observed frequencies for a given number of classes

In [24]:
def getOF( dataArr, c , maxVal):
    """Count how many values fall into each of ``c`` equal-width classes.

    The classes have width ``delta = maxVal / c``. Class ``i`` counts the
    values ``v`` with ``i*delta < v <= (i+1)*delta``; the first class also
    takes every value at or below its upper edge. Values above the last
    edge are not counted.

    Parameters
    ----------
    dataArr : array-like
        One-dimensional sample. A sorted copy is used; the input is not
        modified.
    c : int
        Number of classes.
    maxVal : float
        Upper end of the range covered by the classes.

    Returns
    -------
    numpy.ndarray
        Integer array of length ``c`` with the observed frequency per class.
        Classes that receive no value (including when the sample is empty
        or is exhausted before the last class) are reported as 0.
    """
    dataArr = np.sort(dataArr)
    delta = maxVal / c
    # Upper edge of every class, computed as (i+1)*delta.
    upperEdges = np.arange(1, c + 1) * delta
    # cumulative[i] = number of sorted values <= upperEdges[i]; counting this
    # way never indexes past the end of the data, whichever class the largest
    # value lands in.
    cumulative = np.searchsorted(dataArr, upperEdges, side="right")
    # Per-class counts are the differences between consecutive cumulative counts.
    return np.diff(cumulative, prepend=0).astype(int)


In [25]:
# Observed class counts for the sample, with the classes spanning up to 1.
observedF = getOF(dataArr=data, c=c, maxVal=1)
observedF

array([6, 4, 8, 7, 6, 4, 5])

## Expected Frequencies for a given number of classes in uniform distribution

In [26]:
def getEF( c , n):
    """Expected frequency of each of ``c`` classes for ``n`` uniform samples.

    Every class is equally likely under a uniform distribution, so each one
    expects ``n / c`` observations. Returns an array of length ``c``.
    """
    perClass = n / c
    return np.array([perClass for _ in range(c)])

In [27]:
# Expected count per class for a sample of this size.
expectedF = getEF(c=c, n=data.size)
expectedF

array([5.71428571, 5.71428571, 5.71428571, 5.71428571, 5.71428571,
       5.71428571, 5.71428571])

## Chi2 test

In [28]:
# Chi-square statistic: sum over classes of (observed - expected)^2 / expected.
chi2Calc = (np.square(observedF - expectedF) / expectedF).sum()
chi2Calc

2.3499999999999996

In [29]:
# Critical value: 0.95 quantile of the chi-square distribution with
# c-1 degrees of freedom.
criticVal = stats.chi2.ppf(0.95, c - 1)
print("criticVal: ", criticVal)

criticVal:  12.591587243743977


In [30]:
# p-value: probability, under the chi-square distribution with c-1 degrees of
# freedom, of a statistic at least as large as chi2Calc. The survival
# function is used instead of 1 - cdf because the subtraction loses
# precision when the p-value is very small.
pVal = stats.chi2.sf(chi2Calc, df=c-1)
print("pVal: ", pVal)

pVal:  0.8848628827373295


## chi2 with Scipy

In [31]:
# Same test in a single call: scipy returns the statistic and its p-value.
stats.chisquare(f_obs=getOF(data, c, 1), f_exp=getEF(c, data.size))


Power_divergenceResult(statistic=2.3499999999999996, pvalue=0.8848628827373295)

# Poker testing

In [32]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import math


## Classification

Numbers are classified according to the number of repeated decimals they have.  
They can have either 0 or 1 repetitions (if both decimals are equal).

In [33]:
# Classify every value: 1.0 if its first two decimals are equal, else 0.0.
# data * 100 can land just below the intended integer in binary floating
# point (e.g. 0.29 * 100 == 28.999999999999996), which would make a plain
# floor pick the wrong digit; rounding away that noise first keeps the
# digits exact while still truncating any further decimals.
hundredths = np.floor(np.round(data * 100, 9)).astype(int)
firstDec = hundredths // 10 % 10
secondDec = hundredths % 10
# Kept as floats so the flags and their sum behave as before downstream.
classifiedData = (firstDec == secondDec).astype(float)
print(classifiedData)
print(classifiedData.sum())

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
2.0


In [34]:
# Observed frequencies, one row per class:
#   row 0 -> numbers whose two decimals differ
#   row 1 -> numbers whose two decimals are equal
OF = pd.DataFrame(np.array([data.size - classifiedData.sum(), classifiedData.sum()]))
print(OF)

      0
0  38.0
1   2.0


Expected Frequencies:
For two decimals, the probability of both being equal is:
$$\frac{10 \cdot 1}{10 \cdot 10} = \frac{1}{10} = 0.1$$
Then, for $40$ values, the expected number of repetitions is $4$.

In [35]:
# Expected frequencies for 40 values: 36 with different decimals (40 - 4)
# and 4 with equal decimals, in the same row order as the observed ones.
EF = pd.DataFrame(data=[36, 4])
print(EF)

    0
0  36
1   4


## Chi square test

In [36]:
# Chi-square test computed directly by scipy.
stats.chisquare(f_obs=OF, f_exp=EF)

Power_divergenceResult(statistic=array([1.11111111]), pvalue=array([0.29184055]))

In [37]:
# Manual statistic: column-wise sum of (observed - expected)^2 / expected.
chi2Calc = (OF - EF).pow(2).div(EF).sum()
print("chi2Calc: ", chi2Calc[0])

chi2Calc:  1.1111111111111112


In [38]:
# Critical value: 0.95 quantile of the chi-square distribution with
# 1 degree of freedom.
criticVal = stats.chi2.ppf(0.95, 1)
print("criticVal: ", criticVal)

criticVal:  3.841458820694124


In [39]:
# p-value: probability, under the chi-square distribution with 1 degree of
# freedom, of a statistic at least as large as chi2Calc. The survival
# function is used instead of 1 - cdf because the subtraction loses
# precision when the p-value is very small.
p_value = stats.chi2.sf(chi2Calc, df=1)
print("p_value: ", p_value)

p_value:  [0.29184055]


p_value is greater than 0.05, which is equivalent to saying that
our chi2Calc is less than the critical value criticVal, so the
data set passes the poker test.

# Kolmogorov-Smirnov

In [41]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import math
