<a href="https://colab.research.google.com/github/Deffo0/College_assignments/blob/main/Statistics_Lab2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Statistics Lab2

## Imports

In [63]:
import numpy as np
import pandas as pd 
from sklearn import datasets
from scipy.stats import chi2_contingency, chi2
import math

## Data Loading

In [41]:
def load_iris():
    """
    Loads Iris dataset
    
    Returns:
    iris (optional datatype): the Iris dataset
    """
    iris = datasets.load_iris()
    return iris

## Data Representation

In [42]:
def represent_data(iris):
    """
    Represents the data in a tabular form with column names as specified
    
    Parameters:
    iris (optional datatype): the Iris dataset in row format
    
    Returns:
    data (pandas dataframe): the new form of Iris as customized table
    """
    #The column names of the new table
    col_names = ['sepal length', 'sepal width', 'petal length', 'petal width', 'species']
    #Constructing the new table
    data = pd.DataFrame(data=np.c_[iris['data'], iris['target']],
                        columns=col_names)
    print('Samples from the Data:')
    display(data)
    return data

## Contingency Table

In [134]:
def calculate_chi2_value(freq_table, expected_table):
    """
    Calculates the chi2 square value using both the observed and expected tables
    
    Parameters:
    freq_table (pandas dataframe): The table of observed count of each combination
    expected_table (pandas dataframe): The table of expected value for each cell 
    
    Returns:
    chi2_value (float): The value resulted from the summation of chi square independence test
    """
    chi2_value = 0.0
    freq = freq_table.to_numpy()
    exp = expected_table.to_numpy()
    for i in range(0, len(freq_table)):
      for j in range(0, len(freq_table.columns)):
       chi2_value = chi2_value + (pow((freq[i,j] - exp[i,j]),2))/(exp[i,j])
            
    return chi2_value

In [137]:
def calculate_p_value(chi2_value, df):
    """
    Calculates the probability p-value where P{chi > chi2_value} = p-value
    
    Parameters:
    chi2_value (float): The value resulted from the summation of chi square independence test
    df (int): The degrees of freedom of the resulted distribution
    
    Returns:
    p_value (float): The probability of the resulted chi2_value
    """
    p_value = chi2.pdf(chi2_value,df)
    return p_value

In [101]:
def tabulate(data, label1, label2):
    """
    Constructs the contingency table of the dataset with respect to the given labels
    
    Parameters:
    data (pandas dataframe): Raw data to extract the two columns from
    label1 (string): The name of the 1st column
    label2 (string): The name of the 2nd column
    
    Returns:
    freq_table (pandas dataframe): The table of observed count of each combination
    chi2_value (float): The summed value from the independence test
    p_value (float): The probability of the resulted chi2_value
    df (int): The degrees of freedom of the resulted distribution
    expected_table: The corresponding table of the observed table
    """
    arr1 = data[label1]
    arr2 = data[label2]
    wanted_data = data[[label1,label2]]
  #---------------------------------------------
    freq_table = freq_table = pd.crosstab(arr1,arr2)
  #---------------------------------------------
    s1 = 0
    s2 = 0
    for i in range(0, len(arr1)):
      s1 = s1 + (arr1[i]-np.mean(arr1))**2
    s1 = (1/(len(arr1)-1))*s1
    for j in range(0, len(arr2)):
      s2 = s2 + (arr2[j]-np.mean(arr2))**2
    s2 = (1/(len(arr2)-1))*s2
    df = (((s1**2)/len(arr1)+(s2**2)/len(arr2))**2)/((((s1**2)/len(arr1))**2/(len(arr1)-1))+(((s2**2)/len(arr2))**2/(len(arr2)-1)))
  #---------------------------------------------
    row_sum=freq_table.sum(axis=1).to_numpy()
    col_sum=freq_table.sum(axis=0).to_numpy()
    total = sum(row_sum)
    exp_t = [[0]*len(freq_table.columns)]*len(freq_table)
    for a in range(0,len(freq_table)):
      for b in range(0,len(freq_table.columns)):
        exp_t[a][b]=(row_sum[a]*col_sum[b])/total

    expected_table = pd.DataFrame(data=exp_t)
    
    chi2_value = calculate_chi2_value(freq_table, expected_table)
    
    p_value = calculate_p_value(chi2_value, df)
    



    print('The observed tabel:')
    display(freq_table)
    
    return freq_table, chi2_value, p_value, df, expected_table

## Work Assertion

In [141]:
def assert_results(freq_table, chi2_value, p_value, df, expected_table):
    """
    Checks if the above functions are working well according to the built-in function results
    
    Parameters:
    freq_table (pandas dataframe): The table of observed count of each combination
    chi2_value (float): The summed value from the independence test
    p_value (float): The probability of the resulted chi2_value
    df (int): The degrees of freedom of the resulted distribution
    expected_table: The corresponding table of the observed table
    
    Returns:
    good (boolean): true if and only if the 4 tests passed.
    """
    good = True
    
    stat, p, dof, expected = chi2_contingency(freq_table)
    
    #Check the expected values
    for i in range(0,len(expected_table)):
      for j in range(0,2):
         if(abs(expected[i][j] - expected_table[i][j])<0.1):
            good = True
         else:
            good = False
         
    #Check the statistic value
    if(abs(stat - chi2_value)<0.1):
        good = True
    else:
        good = False
    
    #Check the degrees of freedom
    if(abs(dof - dof)<0.1):
        good = True
    else:
        good = False
        
    #Check the p-value
    if(abs(p - p_value)<0.1):
        good = True
    else:
        good = False
        
    return good

## Driver Code

In [142]:
#Loading the dataset
iris = load_iris()

#Tabulating the raw data
data = represent_data(iris)

#Define the level of significance
alpha = 0.1

col_names = data.columns
n = len(col_names)
#Traverse the features except the target label
for j in range(0, n-1):
    print('Working on the target label(species) vs. ' + col_names[j])
    
    #Apply the independence test
    freq_table, chi2_value, p_value, df, expected_table = tabulate(data, 'species', col_names[j])
    print('\t-chi square sum: ' + str(chi2_value))
    print('\t-p-value: ' + str(p_value))
    print('\t-degrees of freedom: ' + str(df))
    
    #Check the results
    good = assert_results(freq_table, chi2_value, p_value, df, expected_table)
    
    print('\t-Are these results correct? --- ' + str(good))
    if p_value < alpha:
        print('\t-The target label is dependent on the feature: ' + col_names[j])
    else:
        print('\t-The target label: is independent of the label: ' + col_names[j])
    print('\n')

Samples from the Data:


Unnamed: 0,sepal length,sepal width,petal length,petal width,species
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2.0
146,6.3,2.5,5.0,1.9,2.0
147,6.5,3.0,5.2,2.0,2.0
148,6.2,3.4,5.4,2.3,2.0


Working on the target label(species) vs. sepal length
The observed tabel:


sepal length,4.3,4.4,4.5,4.6,4.7,4.8,4.9,5.0,5.1,5.2,5.3,5.4,5.5,5.6,5.7,5.8,5.9,6.0,6.1,6.2,6.3,6.4,6.5,6.6,6.7,6.8,6.9,7.0,7.1,7.2,7.3,7.4,7.6,7.7,7.9
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1
0.0,1,3,1,4,2,5,4,8,8,3,1,5,2,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1.0,0,0,0,0,0,0,1,2,1,1,0,1,5,5,5,3,2,4,4,2,3,2,1,2,3,1,1,1,0,0,0,0,0,0,0
2.0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,3,1,2,2,2,6,5,4,0,5,2,3,0,1,3,1,1,1,4,1


	-chi square sum: 156.2666666666666
	-p-value: 3.2912400173376496e-13
	-degrees of freedom: 297.8629741728686
	-Are these results correct? --- True
	-The target label is dependent on the feature: sepal length


Working on the target label(species) vs. sepal width
The observed tabel:


sepal width,2.0,2.2,2.3,2.4,2.5,2.6,2.7,2.8,2.9,3.0,3.1,3.2,3.3,3.4,3.5,3.6,3.7,3.8,3.9,4.0,4.1,4.2,4.4
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
0.0,0,0,1,0,0,0,0,0,1,6,4,5,2,9,6,3,3,4,2,1,1,1,1
1.0,1,2,3,3,4,3,5,6,7,8,3,3,1,1,0,0,0,0,0,0,0,0,0
2.0,0,1,0,0,4,2,4,8,2,12,4,5,3,2,0,1,0,2,0,0,0,0,0


	-chi square sum: 89.546287046287
	-p-value: 1.0930241093638807e-08
	-degrees of freedom: 172.72588484835958
	-Are these results correct? --- True
	-The target label is dependent on the feature: sepal width


Working on the target label(species) vs. petal length
The observed tabel:


petal length,1.0,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.9,3.0,3.3,3.5,3.6,3.7,3.8,3.9,4.0,4.1,4.2,4.3,4.4,4.5,4.6,4.7,4.8,4.9,5.0,5.1,5.2,5.3,5.4,5.5,5.6,5.7,5.8,5.9,6.0,6.1,6.3,6.4,6.6,6.7,6.9
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1
0.0,1,1,2,7,13,13,7,4,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1.0,0,0,0,0,0,0,0,0,0,1,2,2,1,1,1,3,5,3,4,2,4,7,3,5,2,2,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,2,3,3,7,2,2,2,3,6,3,3,2,2,3,1,1,1,2,1


	-chi square sum: 271.79999999999995
	-p-value: 3.722663238337531e-08
	-degrees of freedom: 162.79234356934526
	-Are these results correct? --- True
	-The target label is dependent on the feature: petal length


Working on the target label(species) vs. petal width
The observed tabel:


petal width,0.1,0.2,0.3,0.4,0.5,0.6,1.0,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,1.9,2.0,2.1,2.2,2.3,2.4,2.5
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0.0,5,29,7,7,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1.0,0,0,0,0,0,0,7,3,5,13,7,10,3,1,1,0,0,0,0,0,0,0
2.0,0,0,0,0,0,0,0,0,0,0,1,2,1,1,11,5,6,6,3,8,3,3


	-chi square sum: 271.75
	-p-value: 0.012263470339036039
	-degrees of freedom: 292.0098004714687
	-Are these results correct? --- True
	-The target label is dependent on the feature: petal width


