In [133]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [207]:
df = pd.read_csv("./data/titanic.csv", delimiter=",")

In [208]:
contingency = pd.crosstab(index=df['Pclass'], columns=df['Survived'])

In [209]:
pd.crosstab(index=df['Pclass'], columns=df['Survived'], margins=True)

Survived,0,1,All
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,80,136,216
2,97,87,184
3,372,119,491
All,549,342,891


In [213]:
from scipy.stats import chi2_contingency

In [138]:
chi2, pvalue, degrees, expected = chi2_contingency(contingency)

In [139]:
chi2, degrees, pvalue

(102.88898875696056, 2, 4.549251711298793e-23)

In [210]:
N = contingency.sum()
Ni = contingency.sum(axis=1)
Nj = contingency.sum(axis=0)

In [211]:
Fij = (Ni.values.reshape(contingency.shape[0], 1) * Nj.values) / N.values.sum()

In [212]:
Fij

array([[133.09090909,  82.90909091],
       [113.37373737,  70.62626263],
       [302.53535354, 188.46464646]])

In [169]:
chi2 = (contingency.values - Fij) ** 2 / Fij

In [170]:
chi2.sum()

102.88898875696056

In [145]:
dof = (len(Ni) - 1) * (len(Nj) - 1)

In [146]:
dof

2

In [94]:
chi2.sum() < 6

False

In [52]:
corpus = [
    "Le petit chat boit du lait.",
    "Le petit chien boit de l’eau.",
    "La vache boit de l’eau mais ne boit pas de lait."
]

In [53]:
corpus

['Le petit chat boit du lait.',
 'Le petit chien boit de l’eau.',
 'La vache boit de l’eau mais ne boit pas de lait.']

In [54]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

In [56]:
vectorizer.get_feature_names_out()

array(['boit', 'chat', 'chien', 'de', 'du', 'eau', 'la', 'lait', 'le',
       'mais', 'ne', 'pas', 'petit', 'vache'], dtype=object)

In [66]:
occurrences = X.sum(axis=0) # occ matrix

In [195]:
import pandas as pd

df = pd.DataFrame(
    data=occurrences,
    columns=vectorizer.get_feature_names_out(),
    index=["Alex"]
)

display(df)

Unnamed: 0,boit,chat,chien,de,du,eau,la,lait,le,mais,ne,pas,petit,vache
Alex,4,1,1,3,1,2,1,2,2,1,1,1,2,1


In [196]:
Z = pd.DataFrame(
    data=[[2, 0, 1, 2]],
    columns=["oiseau", "chat", "boit", "eau"],
    index=["Unk"]
)

In [197]:
z = pd.concat([Z, df]).fillna(0)

In [200]:
z = z.T

In [201]:
z.sum()

Unk      5.0
Alex    23.0
dtype: float64

In [202]:
N = z.sum()
Ni = z.sum(axis=1)
Nj = z.sum(axis=0)

In [203]:
Fij = (Ni.values.reshape(z.shape[0], 1) * Nj.values) / N.values.sum()

In [204]:
Fij

array([[0.35714286, 1.64285714],
       [0.17857143, 0.82142857],
       [0.89285714, 4.10714286],
       [0.71428571, 3.28571429],
       [0.17857143, 0.82142857],
       [0.53571429, 2.46428571],
       [0.17857143, 0.82142857],
       [0.17857143, 0.82142857],
       [0.35714286, 1.64285714],
       [0.35714286, 1.64285714],
       [0.17857143, 0.82142857],
       [0.17857143, 0.82142857],
       [0.17857143, 0.82142857],
       [0.35714286, 1.64285714],
       [0.17857143, 0.82142857]])

In [205]:
chi2 = (z.fillna(0).values - Fij) ** 2 / Fij

In [206]:
chi2.sum()

15.728695652173911

In [214]:
chi2, pvalue, degrees, expected = chi2_contingency(z)

In [215]:
chi2

15.728695652173911

In [74]:
from collections import Counter
analyze = vectorizer.build_analyzer()

In [108]:
result = Counter()
for text in corpus:
    result += Counter(analyze(text))

result

Counter({'le': 2,
         'petit': 2,
         'chat': 1,
         'boit': 4,
         'du': 1,
         'lait': 2,
         'chien': 1,
         'de': 3,
         'eau': 2,
         'la': 1,
         'vache': 1,
         'mais': 1,
         'ne': 1,
         'pas': 1})

In [80]:
combined = result + Counter(analyze("L’oiseau mange la fourmi."))

In [81]:
portion = (len(result) / len(combined))

In [105]:
khicarre = 0
for mot, occs in combined.items():
    auteur_occs = result[mot]
    contestes_occs = combined[mot]
    
    auteur_occs_attendues = occs * portion
    contestes_occs_attendues = occs * (1 - portion)

    khicarre += (auteur_occs - auteur_occs_attendues) ** 2 / auteur_occs_attendues
    khicarre += (contestes_occs - contestes_occs_attendues) ** 2 / contestes_occs_attendues

In [107]:
khicarre, portion

(107.32142857142856, 0.8235294117647058)