In [37]:
import pandas as pd
import os

import scipy.stats as stats
import numpy as np

input_dir = "../data_clean/"

In [38]:
df_person = pd.read_pickle(os.path.join(input_dir, 'df_person.pkl'))
df_bio = pd.read_pickle(os.path.join(input_dir, 'df_bio.pkl'))
df_note = pd.read_pickle(os.path.join(input_dir, 'df_note.pkl'))
df_visit = pd.read_pickle(os.path.join(input_dir, 'df_visit.pkl'))
df_condition = pd.read_pickle(os.path.join(input_dir, 'df_condition.pkl'))
df_facteur_risque = pd.read_pickle(os.path.join(input_dir, 'df_facteur_risque.pkl'))

df_facteur_risque.info()

<class 'pandas.core.frame.DataFrame'>
Index: 959 entries, 0 to 993
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   person_id    959 non-null    float64
 1   imc          959 non-null    float64
 2   age          940 non-null    float64
 3   cancer_sein  959 non-null    object 
 4   alcool       959 non-null    object 
 5   fumeur       959 non-null    bool   
dtypes: bool(1), float64(3), object(2)
memory usage: 45.9+ KB


In [None]:
contingency_table = pd.crosstab(df_facteur_risque['cancer_sein'], df_facteur_risque['alcool'])

chi2, p, dof, expected = stats.chi2_contingency(contingency_table)

print("Alcool")
print("Résultats du test du Chi² :")
print(f"Chi² = {chi2:.4f}")
print(f"p-value = {p:.4f}")

contingency_table = pd.crosstab(df_facteur_risque['cancer_sein'], df_facteur_risque['fumeur'])

chi2, p, dof, expected = stats.chi2_contingency(contingency_table)

print("\n")
print("Fumeur")
print("Résultats du test du Chi² :")
print(f"Chi² = {chi2:.4f}")
print(f"p-value = {p:.4f}")

<class 'pandas.core.frame.DataFrame'>
Index: 20 entries, 6 to 891
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   person_id    20 non-null     float64
 1   imc          20 non-null     float64
 2   age          20 non-null     float64
 3   cancer_sein  20 non-null     object 
 4   alcool       20 non-null     object 
 5   fumeur       20 non-null     bool   
dtypes: bool(1), float64(3), object(2)
memory usage: 980.0+ bytes
None
Alcool
Résultats du test du Chi² :
Chi² = 4.1555
p-value = 0.0415


Fumeur
Résultats du test du Chi² :
Chi² = 0.0149
p-value = 0.9028


In [31]:
# Groupes
groupe_sain = df_facteur_risque[df_facteur_risque['cancer_sein'] == 0]
groupe_cancer = df_facteur_risque[df_facteur_risque['cancer_sein'] == 1]

# Age
m_age_sain = groupe_sain['age'].mean()
m_age_cancer = groupe_cancer['age'].mean()
std_age_sain = groupe_sain['age'].std()
std_age_cancer = groupe_cancer['age'].std()
n_sain = groupe_sain['age'].count()
n_cancer = groupe_cancer['age'].count()

conf_int_age_sain = stats.t.interval(0.95, df=n_sain-1, loc=m_age_sain, scale=std_age_sain/np.sqrt(n_sain))
conf_int_age_cancer = stats.t.interval(0.95, df=n_cancer-1, loc=m_age_cancer, scale=std_age_cancer/np.sqrt(n_cancer))

t_age, p_age = stats.ttest_ind(groupe_sain['age'].dropna(), groupe_cancer['age'].dropna(), equal_var=False)

m_imc_sain = groupe_sain['imc'].mean()
m_imc_cancer = groupe_cancer['imc'].mean()
std_imc_sain = groupe_sain['imc'].std()
std_imc_cancer = groupe_cancer['imc'].std()
n_imc_sain = groupe_sain['imc'].count()
n_imc_cancer = groupe_cancer['imc'].count()

conf_int_imc_sain = stats.t.interval(0.99, df=n_imc_sain-1, loc=m_imc_sain, scale=std_imc_sain/np.sqrt(n_imc_sain))
conf_int_imc_cancer = stats.t.interval(0.99, df=n_imc_cancer-1, loc=m_imc_cancer, scale=std_imc_cancer/np.sqrt(n_imc_cancer))

t_imc, p_imc = stats.ttest_ind(groupe_sain['imc'].dropna(), groupe_cancer['imc'].dropna(), equal_var=False)

print("ÂGE")
print(f"IC 99% âge sains : {conf_int_age_sain[0]:.2f} - {conf_int_age_sain[1]:.2f}")
print(f"IC 99% âge cancer : {conf_int_age_cancer[0]:.2f} - {conf_int_age_cancer[1]:.2f}")
print(f"Différence significative ? {'Oui' if p_age < 0.05 else 'Non'} (p-value = {p_age:.4f})")

print("\nIMC")
print(f"IC 99% IMC sains : {conf_int_imc_sain[0]:.2f} - {conf_int_imc_sain[1]:.2f}")
print(f"IC 99% IMC cancer : {conf_int_imc_cancer[0]:.2f} - {conf_int_imc_cancer[1]:.2f}")
print(f"Différence significative ? {'Oui' if p_imc < 0.05 else 'Non'} (p-value = {p_imc:.4f})")


ÂGE
IC 99% âge sains : 43.00 - 47.88
IC 99% âge cancer : 62.49 - 65.42
Différence significative ? Oui (p-value = 0.0000)

IMC
IC 99% IMC sains : 20.14 - 20.80
IC 99% IMC cancer : 24.41 - 24.93
Différence significative ? Oui (p-value = 0.0000)


In [32]:
path = input_dir + "fumeur_results.txt"

valeurs = {}
with open(path, "r", encoding="utf-8") as f:
    for ligne in f:
        cle, val = ligne.split(":", 1)  
        valeurs[cle.strip()] = int(val.strip())

tp = valeurs.get("True Positifs", 0) 
fp = valeurs.get("Faux Positifs", 0)
fn = valeurs.get("Faux Négatifs", 0)

precision = tp / (tp + fp)
rappel = tp / (tp + fn)

print(f"Précision NLP fumeur: {precision*100:.4f}%")
print(f"Rappel NLP fumeur: {rappel*100:.4f}%")

Précision NLP fumeur: 90.0000%
Rappel NLP fumeur: 81.8182%


In [33]:
path = input_dir + "alcool_results.txt"

valeurs = {}
with open(path, "r", encoding="utf-8") as f:
    for ligne in f:
        cle, val = ligne.split(":", 1)  
        valeurs[cle.strip()] = int(val.strip())

tp = valeurs.get("True Positifs", 0) 
fp = valeurs.get("Faux Positifs", 0)
fn = valeurs.get("Faux Négatifs", 0)

precision = tp / (tp + fp) if (tp + fp) > 0 else 0
rappel = tp / (tp + fn) if (tp + fn) > 0 else 0

print(f"Précision NLP alcool: {precision*100:.4f}%")
print(f"Rappel NLP alcool: {rappel*100:.4f}%")

Précision NLP alcool: 0.0000%
Rappel NLP alcool: 0.0000%
