# Présentation des données

In [None]:
# Exercise 1 - Data preparation

# Library imports
import pandas as pd 
import numpy as np 
# Load the dataset into the global `data` DataFrame used by the cells below
# (the relative path is resolved against the current working directory)
data = pd.read_csv("synthetic.csv")

# Preview the first rows of the data
print(data.head())


In [None]:
# 1 - Number of columns (attributes) in the DataFrame.
# shape is (rows, columns), so shape[1] counts every column of `data`.
num_attributes = data.shape[1]

# Display the number of attributes
print(f"Le nombre d'attributs dans le fichier est : {num_attributes}")


In [None]:
# Column data types and missing values.
# NOTE: DataFrame.info() prints its report itself and returns None, so the
# surrounding print() additionally emits a trailing "None" line.
print(data.info())

In [None]:
# List the column names to count the attributes of the model
print(data.columns)
# Recorded result: 14 attributes in the model

In [None]:
# Collect the distinct values found in the 'Class' column
classes_uniques = data['Class'].unique()

# Number of different classes
num_classes = len(classes_uniques)

# Display the number of different classes
print(f"Le nombre de classes différentes dans les données est : {num_classes}")

In [None]:
# How many instances does each class contain?
nbr_instances = data['Class'].value_counts()
print(nbr_instances)

# Recorded output
# Class
# 1    908
# 0    674
# 2    472
# 3    244
# Name: count, dtype: int64

# Les données sont-elles linéairement séparables ?
# Non : si l'on observe le schéma 1, on voit que les données ne sont pas linéairement séparables.
# De plus, si l'on choisit de les ranger par classe, on peut s'apercevoir que … (phrase inachevée, à compléter)


In [None]:
import matplotlib.pyplot as plt # plotting library
# Scatter plot of Attr_A against Attr_B, one point per row, coloured by 'Class'
plt.figure(figsize=(10, 6))
plt.scatter(data['Attr_A'], data['Attr_B'], c=data['Class'], alpha=0.5, cmap='viridis')
# NOTE(review): the axis labels read 'Attribut 1' / 'Attribut 2' while the plotted columns are Attr_A / Attr_B
plt.xlabel('Attribut 1')
plt.ylabel('Attribut 2')
plt.title('Scatter Plot des attributs par classe')
plt.colorbar(label='Classe')
plt.show()

# The plot clearly shows that the raw data cannot be separated linearly
# Author's note (garbled in the original; presumably): using an image is really better


## 5 et 6 (voir compte-rendu.md) 

# 2 Mise en oeuvre des modèles

In [None]:
# Choose an attribute to analyse, for example 'Attr_A'
attribute = 'Attr_A'


# Compute the quartiles (0.25, 0.5 and 0.75 quantiles) of the chosen attribute
quartiles = data[attribute].quantile([0.25, 0.5, 0.75])

# Sort the attribute values and print them
sorted_attribute = data[attribute].sort_values()
print(sorted_attribute)
print(quartiles)
# Display each quartile; the Series returned by quantile() is indexed by the requested levels
print(f"Quartile 1 (Q1) de l'attribut '{attribute}': {quartiles[0.25]}")
print(f"Médiane (Q2) de l'attribut '{attribute}': {quartiles[0.5]}")
print(f"Quartile 3 (Q3) de l'attribut '{attribute}': {quartiles[0.75]}")


# 6 : 

# Partie 2 : Mise en oeuvre des modèles.

# Arbre de décision 


In [None]:
# Arbre de décision

# Entropy computation

def entropie(dataframe, attribut_cible):
    """
    Return the Shannon entropy, in bits, of one column of a DataFrame.

    Entropy measures the uncertainty associated with a random variable; here it
    is estimated from the empirical frequency of each distinct value.

    Parameters:
    dataframe (pd.DataFrame): The DataFrame containing the data.
    attribut_cible (str): The name of the column whose entropy is computed.

    Returns:
    float: The entropy in bits. It is exactly 0.0 for an empty column or a
        column holding a single distinct value.
    """
    # Empirical probability of each distinct value
    compte_classe = dataframe[attribut_cible].value_counts()
    proba = compte_classe / compte_classe.sum()

    # Drop entries that carry no probability mass (zero counts, or NaN from an
    # empty column) instead of adding an epsilon inside the logarithm: the
    # epsilon biased every result and made a pure column slightly negative.
    proba = proba[proba > 0]

    valeur = -(proba * np.log2(proba)).sum()
    # Adding 0.0 turns a possible negative zero into a plain 0.0
    return float(valeur) + 0.0

# Quick check of the function.
# NOTE(review): this measures the entropy of the feature column 'Attr_A', not of the 'Class' target
print(entropie(data, 'Attr_A'))


11.166163082646115
11.166163082645376

11.166163082646115
11.166163082645376

11.166163082646115
11.166163082645376

1.8608867211835993
1.860886721183598

In [None]:
import pandas as pd
import numpy as np

# Helper computing the three quartiles of a given attribute
def calculate_quartiles(data, attribute):
    """
    Return the first quartile, median and third quartile of one column.

    Parameters:
    data (pd.DataFrame): The DataFrame containing the column.
    attribute (str): The name of the column to summarise.

    Returns:
    pd.Series: The three values, indexed by their levels 0.25, 0.5 and 0.75.
    """
    quartile_levels = [0.25, 0.5, 0.75]
    column = data[attribute]
    return column.quantile(quartile_levels)

# Try the function on the loaded DataFrame

print(calculate_quartiles(data, 'Attr_A'))


In [None]:
# Bare expression: the DataFrame is rendered as the output of this notebook cell
data

In [None]:
# Show the rows ordered by Attr_C; the result is not assigned, so `data` keeps its original order
data.sort_values(by="Attr_C")

In [None]:
# data.head()
# NOTE(review): the global name `sorted` shadows Python's built-in sorted() for
# the rest of the session; consider renaming it (e.g. `data_sorted`) once no
# other cell depends on this name.
sorted = data.sort_values(by="Attr_A")
# Sorting only reorders the rows, so this prints the row count of `data`
print(len(sorted))

In [None]:
def gain_information(dataframe, attribut_cible, attribut_test):
    """
    Find the best binary split of one attribute according to information gain.

    Candidate thresholds are the three quartiles of the test attribute plus its
    maximum value. Each threshold t splits the rows into a lower partition
    (attribut_test < t) and an upper partition (attribut_test >= t). Thresholds
    that leave one side empty are ignored because they do not split anything.

    Parameters:
    dataframe (pd.DataFrame): The DataFrame containing the data to partition.
    attribut_cible (str): The target attribute we want to predict.
    attribut_test (str): The attribute whose gain we want to calculate.

    Returns:
    tuple or None: None when the test attribute holds at most one distinct
    value. Otherwise a tuple containing:
        - attribut_test (str): The test attribute.
        - max_gain (float): The maximum information gain obtained (0 when no
          candidate split improves on the unsplit data).
        - best_split_value (float): The split value that provides the best gain,
          or None when no candidate split improves on the unsplit data.
        - best_partitions (tuple): A tuple containing two DataFrames representing
          the lower and upper partitions resulting from the best split, or None
          when no candidate split improves on the unsplit data.
    """
    # Entropy of the target before any split
    entropie_initiale = entropie(dataframe, attribut_cible)

    # A constant attribute cannot separate the rows
    if len(dataframe[attribut_test].unique()) <= 1:
        return None

    # Partitions are taken from the sorted frame so they come back ordered by the attribute
    sorted_data = dataframe.sort_values(by=attribut_test)
    column = sorted_data[attribut_test]
    total_instances = len(sorted_data)  # loop-invariant row count

    # Quartiles keep the number of evaluated thresholds small. The maximum is
    # added to cover the top of the range; the minimum is left out because
    # "attribute < minimum" is always an empty partition.
    candidates = calculate_quartiles(sorted_data, attribut_test).to_list()
    candidates.append(column.max())
    # Coinciding thresholds are evaluated once, in their original order
    thresholds = list(dict.fromkeys(candidates))

    max_gain = 0
    best_split_value = None
    best_partitions = None

    for split_value in thresholds:
        lower_partition = sorted_data[column < split_value]
        upper_partition = sorted_data[column >= split_value]

        # A one-sided split has a gain of zero in theory; skipping it prevents
        # floating-point noise from electing it as the best split.
        if lower_partition.empty or upper_partition.empty:
            continue

        # Entropy of each side, weighted by its share of the rows
        lower_weight = len(lower_partition) / total_instances
        upper_weight = len(upper_partition) / total_instances
        weighted_entropy = (lower_weight * entropie(lower_partition, attribut_cible)) + \
                           (upper_weight * entropie(upper_partition, attribut_cible))

        current_gain = entropie_initiale - weighted_entropy

        # Strict comparison: on equal gains the first threshold wins
        if current_gain > max_gain:
            max_gain = current_gain
            best_split_value = split_value
            best_partitions = (lower_partition, upper_partition)

    return attribut_test, max_gain, best_split_value, best_partitions


# Testing the function with an example attribute
# 'Attr_H' is the attribute under test and 'Class' is the target
test_gain_info = gain_information(data, 'Class', 'Attr_H')
# Bare expression: rendered as the output of this notebook cell
test_gain_info

In [None]:
def find_best_gain(dataframe, attribut_cible):
    """
    Pick the attribute and threshold giving the highest information gain.

    Every column except the target is evaluated with gain_information; columns
    for which it returns None are ignored.

    Parameters:
    dataframe (pd.DataFrame): The input data as a pandas DataFrame.
    attribut_cible (str): The target attribute that we want to predict (e.g. 'Class').

    Returns:
    tuple: A tuple containing the following elements:
        - best_attribute (str): The attribute with the highest gain, or None
          when no attribute reaches a gain above 0.
        - best_gain (float): The highest information gain observed (0 when
          nothing improves).
        - best_split_value (float): The split value that produces the best gain.
        - best_partitions (tuple): The two partitions resulting from the best split.
    """
    # (attribute, gain, split value, partitions) of the best candidate so far
    winner = (None, 0, None, None)

    candidate_columns = [name for name in dataframe.columns if name != attribut_cible]
    for column_name in candidate_columns:
        outcome = gain_information(dataframe, attribut_cible, column_name)
        if outcome is None:
            # Nothing to split on for this column
            continue

        gain = outcome[1]
        # Strict comparison: on equal gains the earliest column is kept
        if gain > winner[1]:
            winner = (column_name, gain, outcome[2], outcome[3])

    return winner


# Look for the best split over every non-target column of the full dataset (rendered as the cell output)
find_best_gain(data, 'Class')

In [None]:
from sklearn.model_selection import train_test_split

def split_data(dataframe, attribut_cible, test_size=0.2, random_state=42):
    """
    Split a DataFrame into training and testing features and targets.

    Parameters:
    dataframe (pd.DataFrame): The DataFrame containing the data to split.
    attribut_cible (str): The target attribute we want to predict.
    test_size (float): Forwarded to train_test_split as the share of the data
        kept for testing. Default is 0.2.
    random_state (int): Forwarded to train_test_split to make the shuffling
        reproducible. Default is 42.

    Returns:
    tuple: A tuple containing:
        - X_train (pd.DataFrame): The training features.
        - X_test (pd.DataFrame): The testing features.
        - y_train (pd.Series): The training target attribute.
        - y_test (pd.Series): The testing target attribute.
    """
    # The target column on one side, every other column on the other
    target = dataframe[attribut_cible]
    features = dataframe.drop(columns=[attribut_cible])

    # train_test_split returns [X_train, X_test, y_train, y_test]
    parts = train_test_split(
        features,
        target,
        test_size=test_size,
        random_state=random_state,
    )
    return tuple(parts)

In [None]:
def calculate_metrics(y_true, y_pred):
    """
    Compute accuracy, precision, recall and F1-score for a set of predictions.

    Precision, recall and F1-score are computed with average='weighted'.

    Parameters:
    y_true (pd.Series): The true target values.
    y_pred (pd.Series): The predicted target values.

    Returns:
    dict: A dictionary with the keys 'accuracy', 'precision', 'recall' and
        'f1_score', each mapped to the corresponding score.
    """
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

    # Shared averaging option of the three per-class scores
    weighted = {'average': 'weighted'}

    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, **weighted),
        'recall': recall_score(y_true, y_pred, **weighted),
        'f1_score': f1_score(y_true, y_pred, **weighted),
    }