# To calculate Covariance and perform a Chi-square test using both random and provided data.

In [85]:
import numpy as np
import pandas as pd

In [86]:
# making random arrays

array1 = np.random.rand(10)
array2 = np.random.rand(10)

In [87]:
array1, array2

(array([0.77393837, 0.41609724, 0.33851878, 0.11808794, 0.27158816,
        0.48653684, 0.36275012, 0.38356158, 0.27167097, 0.67959458]),
 array([0.71273069, 0.48386306, 0.8216963 , 0.13012237, 0.78279415,
        0.50614554, 0.25877091, 0.28854636, 0.24008784, 0.60921083]))

In [88]:
mean_arr1 = np.mean(array1)
mean_arr2 = np.mean(array2)

mean_arr1, mean_arr2

(0.41023445754067395, 0.4833968040244637)

In [89]:
n = len(array1)
n

10

In [90]:
def covariance(arr1, arr2):

    n = len(arr1)
    array1 = np.array(arr1)
    array2 = np.array(arr2)

    mean_arr1 = np.mean(array1)
    mean_arr2 = np.mean(array2)

    sum = 0
    for i in range(n):
        sum += (arr1[i] - mean_arr1) * (arr2[i] - mean_arr2)

    return sum/(n-1)


In [91]:
cov_val = covariance(array1, array2)

cov_val

0.022894531388349484

### Making contingency table


In [92]:
table = [[3,7],[4,9]]
table = np.array(table)
table

array([[3, 7],
       [4, 9]])

In [93]:
import scipy.stats as stats

def chi_square(table):

  row = len(table)
  col = len(table[0])

  row_sum = []
  col_sum = []

  for i in range(row):
    row_sum.append(np.sum(table[i]))

  for i in range(col):
    col_sum.append(np.sum(table[:,i]))

  expected_table = np.zeros((row,col))

  for i in range(row):
    for j in range(col):
      expected_table[i][j] = (row_sum[i] * col_sum[j]) / np.sum(table)


  chi_square_val = 0
  for i in range(row):
    for j in range(col):
      chi_square_val += (table[i][j] - expected_table[i][j])**2 / expected_table[i][j]

  # Degrees of Freedom
    dof = (row - 1) * (col - 1)

    # p-value
    p_value = stats.chi2.sf(chi_square_val, dof)

    return chi_square_val, p_value



### hypothesis testing

In [94]:
# let alpha val = 0.05

alpha = 0.05

chi_squared_val, p_value = chi_square(table)


In [95]:
print(f"Chi-Square Statistic: {chi_squared_val}" )
print(f"p-value: {p_value}")

Chi-Square Statistic: 0.0008928571428571507
p-value: 0.9761621859960804


In [96]:
if p_value < alpha:
    print("Reject the null hypothesis")
else:
    print("Fail to reject the null hypothesis")

Fail to reject the null hypothesis


cannot prove alternate hypothesis, assume independent

## Implementing on a dataset

In [97]:
data = pd.read_csv("loan_data.csv")


In [98]:
data.head()

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,1,debt_consolidation,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0
1,1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0
2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0
3,1,debt_consolidation,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0
4,1,credit_card,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0


In [99]:
data.isna().sum()

Unnamed: 0,0
credit.policy,0
purpose,0
int.rate,0
installment,0
log.annual.inc,0
dti,0
fico,0
days.with.cr.line,0
revol.bal,0
revol.util,0


data seems consistent so skipping pre-processing

### lets find if interest rates affect loan repayment status

columns:- int.rate, not.fully.paid

In [100]:
columns = ["int.rate","installment"]
data[columns]

Unnamed: 0,int.rate,installment
0,0.1189,829.10
1,0.1071,228.22
2,0.1357,366.86
3,0.1008,162.34
4,0.1426,102.92
...,...,...
9573,0.1461,344.76
9574,0.1253,257.70
9575,0.1071,97.81
9576,0.1600,351.58


In [101]:
int_rate = data["int.rate"].values
installments = data["installment"].values

In [107]:
def min_max_normalize(arr):
    arr = np.array(arr)

    min_val = np.min(arr)
    max_val = np.max(arr)

    normalized_arr = (arr - min_val) / (max_val - min_val)

    return normalized_arr

In [108]:
int_rate = min_max_normalize(int_rate)
installments = min_max_normalize(installments)

cov_val = covariance(int_rate, installments)

cov_val

0.010617334369853003

In [109]:
contigency_table = [int_rate, installments]

contigency_table = np.array(contigency_table)

contigency_table

array([[0.37659847, 0.3011509 , 0.48401535, ..., 0.3011509 , 0.63938619,
        0.50639386],
       [0.87988794, 0.22991552, 0.37988253, ..., 0.08885091, 0.36335414,
        0.90620572]])

In [112]:
chi_squared_val , p_value = chi_square(contigency_table)

alpha = 0.05

print(f"Chi-squared value: {chi_squared_val}")
print(f"P-value: {1 - p_value}")

Chi-squared value: 355.4413638427395
P-value: 0.0


In [113]:
if p_value < alpha:
    print("Reject the null hypothesis")
else:
    print("Fail to reject the null hypothesis")

Fail to reject the null hypothesis


cannot prove alternate hypothesis, assume independent