In [None]:
pip install mlxtend apyori

In [None]:
import pandas as pd, csv
from apyori import apriori, load_transactions
from mlxtend.frequent_patterns import apriori as mlx_apriori, fpgrowth
from mlxtend.preprocessing import TransactionEncoder

# Import des données

In [None]:
# Load the survey extract from the working directory and preview its first rows.
diabetes = pd.read_csv(filepath_or_buffer='diabetes_012_health_indicators_BRFSS2015.csv')
diabetes.head(n=5)

Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


# Traitement des données

## Création d'un champ calculé à partir de Veggies et Fruits

In [None]:
def determine_category(row):
    """Label a respondent's diet from the ``Fruits`` and ``Veggies`` flags.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by ``'Fruits'`` and ``'Veggies'`` (e.g. a DataFrame
        row). A flag counts as set only when it compares equal to 1.

    Returns
    -------
    str
        ``'fruits and vegetables'``, ``'fruits'``, ``'vegetables'`` or
        ``'no fruits nor vegetables'``.
    """
    eats_fruits = bool(row['Fruits'] == 1)
    eats_veggies = bool(row['Veggies'] == 1)

    # One label per (fruits, vegetables) combination.
    labels = {
        (True, True): 'fruits and vegetables',
        (True, False): 'fruits',
        (False, True): 'vegetables',
        (False, False): 'no fruits nor vegetables',
    }
    return labels[(eats_fruits, eats_veggies)]

# No row filtering happens in this cell (the Diabetes_012 filter that used to sit here was disabled).
# Derive the diet label for every row, then place the new column right after 'Veggies'.
diabetes['Alimentation'] = diabetes.apply(determine_category, axis=1)
ordered_columns = [name for name in diabetes.columns if name != 'Alimentation']
ordered_columns.insert(ordered_columns.index('Veggies') + 1, 'Alimentation')
diabetes = diabetes[ordered_columns]

## Création d'un nouveau champ pour le BMI

In [None]:
def categorize_bmi(bmi):
    """Map a numeric BMI to a weight-class label.

    The value is multiplied by 100 and compared against ascending upper
    bounds; the first bound it falls below decides the label.

    Parameters
    ----------
    bmi : number
        Body mass index.

    Returns
    -------
    str
        ``'underweight'`` (< 18.50), ``'normal weight'`` (< 25.00),
        ``'overweight'`` (< 30.00), ``'obese'`` (< 99.99), otherwise
        ``'Invalid BMI'`` (this also covers NaN, which fails every comparison).
    """
    scaled = bmi * 100
    upper_bounds = (
        (1850, 'underweight'),
        (2500, 'normal weight'),
        (3000, 'overweight'),
        (9999, 'obese'),
    )
    for bound, label in upper_bounds:
        if scaled < bound:
            return label
    return 'Invalid BMI'

# Derive the weight-class label from the numeric BMI, then place it right after 'BMI'.
diabetes['BodyMassIndex'] = diabetes['BMI'].apply(categorize_bmi)
ordered_columns = [name for name in diabetes.columns if name != 'BodyMassIndex']
ordered_columns.insert(ordered_columns.index('BMI') + 1, 'BodyMassIndex')
diabetes = diabetes[ordered_columns]

## Filtrage des lignes (Diabetes_012 ≥ 2) et suppression des colonnes inutiles

In [None]:
# Keep only the rows whose Diabetes_012 value is at least 2
# (presumably the diabetic class -- confirm against the dataset codebook).
# .copy() makes the subset an independent frame, so the in-place drop and the
# column assignments done in later cells do not act on a slice of the full
# frame (which would raise SettingWithCopyWarning).
diabetes_filtered = diabetes[diabetes['Diabetes_012'] >= 2].copy()
diabetes = diabetes_filtered
diabetes.head()

Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,BodyMassIndex,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
8,2.0,1.0,1.0,1.0,30.0,obese,1.0,0.0,1.0,0.0,...,1.0,0.0,5.0,30.0,30.0,1.0,0.0,9.0,5.0,1.0
10,2.0,0.0,0.0,1.0,25.0,overweight,1.0,0.0,0.0,1.0,...,1.0,0.0,3.0,0.0,0.0,0.0,1.0,13.0,6.0,8.0
13,2.0,1.0,1.0,1.0,28.0,overweight,0.0,0.0,0.0,0.0,...,1.0,0.0,4.0,0.0,0.0,1.0,0.0,11.0,4.0,6.0
17,2.0,0.0,0.0,1.0,23.0,normal weight,1.0,0.0,0.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,1.0,7.0,5.0,6.0
23,2.0,1.0,0.0,1.0,27.0,overweight,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,13.0,5.0,4.0


In [None]:
# Remove the raw columns replaced by derived labels (Fruits, Veggies, BMI)
# together with the other indicators that are not kept for rule mining.
unused_columns = [
    'Diabetes_012', 'Fruits', 'Veggies', 'BMI', 'NoDocbcCost',
    'PhysHlth', 'MentHlth', 'CholCheck', 'PhysActivity',
]
diabetes.drop(columns=unused_columns, inplace=True)
diabetes.head()

Unnamed: 0,HighBP,HighChol,BodyMassIndex,Smoker,Stroke,HeartDiseaseorAttack,Alimentation,HvyAlcoholConsump,AnyHealthcare,GenHlth,DiffWalk,Sex,Age,Education,Income
8,1.0,1.0,obese,1.0,0.0,1.0,fruits and vegetables,0.0,1.0,5.0,1.0,0.0,9.0,5.0,1.0
10,0.0,0.0,overweight,1.0,0.0,0.0,fruits and vegetables,0.0,1.0,3.0,0.0,1.0,13.0,6.0,8.0
13,1.0,1.0,overweight,0.0,0.0,0.0,vegetables,0.0,1.0,4.0,1.0,0.0,11.0,4.0,6.0
17,0.0,0.0,normal weight,1.0,0.0,0.0,no fruits nor vegetables,0.0,1.0,2.0,0.0,1.0,7.0,5.0,6.0
23,1.0,0.0,overweight,0.0,0.0,0.0,fruits and vegetables,0.0,1.0,1.0,0.0,0.0,13.0,5.0,4.0


In [None]:
# Preview the first rows of the current frame.
diabetes.head(n=5)

Unnamed: 0,HighBP,HighChol,BodyMassIndex,Smoker,Stroke,HeartDiseaseorAttack,Alimentation,HvyAlcoholConsump,AnyHealthcare,GenHlth,DiffWalk,Sex,Age,Education,Income
8,1.0,1.0,obese,1.0,0.0,1.0,fruits and vegetables,0.0,1.0,5.0,1.0,0.0,9.0,5.0,1.0
10,0.0,0.0,overweight,1.0,0.0,0.0,fruits and vegetables,0.0,1.0,3.0,0.0,1.0,13.0,6.0,8.0
13,1.0,1.0,overweight,0.0,0.0,0.0,vegetables,0.0,1.0,4.0,1.0,0.0,11.0,4.0,6.0
17,0.0,0.0,normal weight,1.0,0.0,0.0,no fruits nor vegetables,0.0,1.0,2.0,0.0,1.0,7.0,5.0,6.0
23,1.0,0.0,overweight,0.0,0.0,0.0,fruits and vegetables,0.0,1.0,1.0,0.0,0.0,13.0,5.0,4.0


## Remplacement des valeurs par leur signification

In [None]:
# Rename the abbreviated column names to more readable ones.
column_renames = {
    'HighBP': 'BloodPressure',
    'HighChol': 'Cholesterol',
    'HvyAlcoholConsump': 'AlcoholConsumption',
    'AnyHealthcare': 'Healthcare',
    'GenHlth': 'GeneralHealth',
    'DiffWalk': 'DifficultiesToWalk',
}
diabetes.rename(columns=column_renames, inplace=True)

# Numeric code -> human-readable label, per (renamed) column.
# Each label becomes an item in the mined transactions, so labels must be
# unique across columns.
value_labels = {
    'BloodPressure': {0: 'normal blood pressure', 1: 'high blood pressure'},
    # Fixed: the source column is 'HighChol', so 1 means HIGH cholesterol,
    # in line with the 'HighBP' mapping above. The two labels were previously
    # swapped, which mislabelled every cholesterol item in the mined rules.
    'Cholesterol': {0: 'normal cholesterol', 1: 'high cholesterol'},
    'Smoker': {0: 'does not smoke', 1: 'smokes'},
    'Stroke': {0: 'never had stroke', 1: 'had stroke'},
    'HeartDiseaseorAttack': {0: 'no heart disease/attack', 1: 'had/has heart disease/attack'},
    'AlcoholConsumption': {0: 'normal alcohol consumption', 1: 'heavy alcohol consumption'},
    'Healthcare': {0: 'not covered by healthcare', 1: 'covered by healthcare'},
    'GeneralHealth': {
        1: 'excellent health', 2: 'very good health', 3: 'good health',
        4: 'fair health', 5: 'poor health',
    },
    'DifficultiesToWalk': {0: 'no difficulty to walk', 1: 'difficulties to walk'},
    'Sex': {0: 'female', 1: 'male'},
    'Age': {
        1: '[18-24]', 2: '[25-29]', 3: '[30-34]', 4: '[35-39]', 5: '[40-44]',
        6: '[45-49]', 7: '[50-54]', 8: '[55-59]', 9: '[60-64]', 10: '[65-69]',
        11: '[70-74]', 12: '[75-79]', 13: '80+',
    },
    'Education': {
        1: 'never attended school', 2: 'grades 1 through 8', 3: 'grades 9 through 11',
        4: 'grade 12 or GED', 5: 'college 1 year to 3 years', 6: 'college 4 years or more',
    },
    'Income': {
        1: '<10 000', 2: '[10 000-14 999]', 3: '[15 000-19 999]', 4: '[20 000-24 999]',
        5: '[25 000-34 999]', 6: '[35 000-49 999]', 7: '[50 000-74 999]', 8: '75 000+',
    },
}

# .replace leaves any value without a mapping untouched (no NaN is introduced).
for column, labels in value_labels.items():
    diabetes[column] = diabetes[column].replace(labels)

In [None]:
def enumerate_columns_values(df):
    r"""Print one LaTeX table row per column, listing the column's distinct values.

    Each printed line has the form ``<column> & <v1>, <v2>, ...\\ \hline``.
    Values appear in the order returned by ``Series.unique()``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are enumerated.

    Returns
    -------
    None
        The rows are written to standard output.
    """
    for column in df.columns:
        # Distinct values of the column, rendered as a comma-separated string.
        unique_values_str = ", ".join(map(str, df[column].unique()))
        # Every backslash is escaped explicitly: the former "\h" was an invalid
        # escape sequence that only worked by accident (and emits a warning).
        print(f"{column} & {unique_values_str}\\\\ \\hline")


# Print the LaTeX rows listing the distinct values of every column of the frame.
enumerate_columns_values(df=diabetes)

BloodPressure & high blood pressure, normal blood pressure\\ \hline
Cholesterol & normal cholesterol, high cholesterol\\ \hline
BodyMassIndex & obese, overweight, normal weight, underweight\\ \hline
Smoker & smokes, does not smoke\\ \hline
Stroke & never had stroke, had stroke\\ \hline
HeartDiseaseorAttack & had/has heart disease/attack, no heart disease/attack\\ \hline
Alimentation & fruits and vegetables, vegetables, no fruits nor vegetables, fruits\\ \hline
AlcoholConsumption & normal alcohol consumption, heavy alcohol consumption\\ \hline
Healthcare & covered by healthcare, not covered by healthcare\\ \hline
GeneralHealth & poor health, good health, fair health, very good health, excellent health\\ \hline
DifficultiesToWalk & difficulties to walk, no difficulty to walk\\ \hline
Sex & female, male\\ \hline
Age & [60-64], 80+, [70-74], [50-54], [65-69], [75-79], [55-59], [30-34], [45-49], [35-39], [40-44], [25-29], [18-24]\\ \hline
Education & college 1 year to 3 years, college 4 yea

In [None]:
"""
f = open("diabetes.csv", "a")
f.write(diabetes.to_csv(sep=";"))
f.close()
"""

'\nf = open("diabetes.csv", "a")\nf.write(diabetes.to_csv(sep=";"))\nf.close()\n'

# Application des algorithmes de data mining

## Application d'Apriori

In [None]:
# Each row of the frame becomes one transaction: the list of its cell values.
transactions = diabetes.values.tolist()

# Run apyori's apriori with the chosen thresholds and materialise the returned
# iterable into a list so it can be counted and traversed several times.
apriori_thresholds = {'min_support': 0.6, 'min_confidence': 0.8}
results = [*apriori(transactions, **apriori_thresholds)]

In [None]:
# Number of records returned by apriori.
record_count = len(results)
print(record_count)

27


### Préparation des résultats à l'exportation

In [None]:
# Save the mined rules as a ';'-separated CSV, in the layout used afterwards by
# the multi-criteria decision method: one rule per line with its support,
# confidence and lift.
with open('apriori_results.csv', mode='w', newline='') as file:
    writer = csv.writer(file, delimiter=';')

    # Header row.
    writer.writerow(['Rule', 'Support', 'Confidence', 'Lift'])

    for record in results:
        # A rule needs at least two items (one premise, one conclusion).
        if len(record.items) < 2:
            continue

        # Export every ordered statistic of the itemset. Reading only the first
        # one silently dropped the other rules derived from the same itemset.
        for stat in record.ordered_statistics:
            # An empty base is not a rule: it only restates the itemset support.
            if not stat.items_base:
                continue

            # items_base is the antecedent (premise) and items_add the
            # consequent (conclusion); they were previously swapped.
            # Items are sorted so the rule text is the same on every run.
            premises = ' + '.join(f"{{{item}}}" for item in sorted(stat.items_base, key=str))
            conclusions = ' + '.join(f"{{{item}}}" for item in sorted(stat.items_add, key=str))
            rule = premises + ' => ' + conclusions

            writer.writerow([rule, record.support, stat.confidence, stat.lift])

In [None]:
# Count the apriori records obtained for every (support, confidence) threshold pair.
support_tresh = [0.5, 0.6, 0.7, 0.8, 0.9, 1]
confidence_tresh = [0.5, 0.6, 0.7, 0.8, 0.9]

# Support varies in the outer loop and confidence in the inner one, so the
# counts are ordered support-major.
number_of_rules_extracted = [
    len(list(apriori(transactions, min_support=supp_trsh, min_confidence=conf_trsh)))
    for supp_trsh in support_tresh
    for conf_trsh in confidence_tresh
]

number_of_rules_extracted

[65,
 62,
 60,
 57,
 53,
 31,
 31,
 29,
 27,
 25,
 18,
 18,
 18,
 16,
 16,
 7,
 7,
 7,
 7,
 7,
 4,
 4,
 4,
 4,
 4,
 0,
 0,
 0,
 0,
 0]

In [None]:
# Comparison table: number of association rules extracted for each threshold pair.

import itertools

# Same support-major order as the list of counts: support outer, confidence inner.
combos = list(itertools.product(support_tresh, confidence_tresh))

df_tresholds_comparison = pd.DataFrame({
    'Support Treshold': [support for support, _ in combos],
    'Confidence Treshold': [confidence for _, confidence in combos],
    'Number of rules extracted': number_of_rules_extracted,
})
df_tresholds_comparison


Unnamed: 0,Support Treshold,Confidence Treshold,Number of rules extracted
0,0.5,0.5,65
1,0.5,0.6,62
2,0.5,0.7,60
3,0.5,0.8,57
4,0.5,0.9,53
5,0.6,0.5,31
6,0.6,0.6,31
7,0.6,0.7,29
8,0.6,0.8,27
9,0.6,0.9,25


## Application de FP Growth

https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/fpgrowth/

In [None]:
from mlxtend.preprocessing import TransactionEncoder

# Learn the set of distinct items, then encode every transaction as a boolean
# row with one column per item.
te = TransactionEncoder()
te.fit(transactions)
te_ary = te.transform(transactions)

df = pd.DataFrame(data=te_ary, columns=te.columns_)
df

Unnamed: 0,75 000+,80+,<10 000,[10 000-14 999],[15 000-19 999],[18-24],[20 000-24 999],[25 000-34 999],[25-29],[30-34],...,normal cholesterol,normal weight,not covered by healthcare,obese,overweight,poor health,smokes,underweight,vegetables,very good health
0,False,False,True,False,False,False,False,False,False,False,...,True,False,False,True,False,True,True,False,False,False
1,True,True,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,True,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,True,False,False,False,True,False
3,False,False,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,True,False,False,True
4,False,True,False,False,False,False,True,False,False,False,...,False,False,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35341,False,False,True,False,False,False,False,False,False,False,...,True,False,False,True,False,False,False,False,True,False
35342,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,True,False,True,False,False,True
35343,False,True,False,False,False,False,True,False,False,False,...,True,False,False,False,True,True,False,False,False,False
35344,False,False,False,False,False,False,True,False,False,False,...,True,False,False,False,False,False,False,True,False,False


In [None]:
from mlxtend.frequent_patterns import fpgrowth

# Frequent itemsets of the one-hot frame at a minimum support of 0.6, reported
# with the item names instead of column positions.
fpgrowth_options = {'min_support': 0.6, 'use_colnames': True}
fpg_results = fpgrowth(df, **fpgrowth_options)
fpg_results

Unnamed: 0,support,itemsets
0,0.976461,(normal alcohol consumption)
1,0.959769,(covered by healthcare)
2,0.907543,(never had stroke)
3,0.752674,(high blood pressure)
4,0.670118,(normal cholesterol)
5,0.777118,(no heart disease/attack)
6,0.628784,(no difficulty to walk)
7,0.937815,"(covered by healthcare, normal alcohol consump..."
8,0.885362,"(normal alcohol consumption, never had stroke)"
9,0.87099,"(covered by healthcare, never had stroke)"


In [None]:
from pyspark.ml.fpm import FPGrowth
from pyspark.sql import SparkSession

# `spark` was never defined in this notebook, so the cell failed with a
# NameError on a fresh kernel. getOrCreate() reuses a running session if the
# environment already provides one, otherwise it starts a new one.
spark = SparkSession.builder.getOrCreate()

# Toy transactions (id, list of items) used to try out Spark's FP-Growth.
# Stored under its own name so the pandas one-hot frame `df` used by the
# mlxtend cells is not overwritten.
spark_df = spark.createDataFrame([
    (0, [1, 2, 5]),
    (1, [1, 2, 3, 5]),
    (2, [1, 2])
], ["id", "items"])

fpGrowth = FPGrowth(itemsCol="items", minSupport=0.5, minConfidence=0.6)
model = fpGrowth.fit(spark_df)

# Display frequent itemsets.
model.freqItemsets.show()

# Display generated association rules.
model.associationRules.show()

# transform examines the input items against all the association rules and
# summarizes the consequents as prediction.
model.transform(spark_df).show()

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=ade5dc4b-c830-41c1-8ac7-10153fd0ab6e' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>