In [1]:
'''my_functions.ipynb'''

'my_functions.ipynb'

In [2]:

import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram

#########################################################################################################
####################      1   -  TRAITEMENT DES DONNEES                             #####################
#########################################################################################################

def score_nutriaddi ( data ):
    """Add an additive-count group and a combined nutrition/additive score.

    The frame is modified in place and is also returned. Two columns are
    written:

    * ``additives_grp``: a string code derived from ``additives_n``
      (0 -> "", 1 -> "1", 2..5 -> "2", 6..10 -> "3", 11 and above -> "4").
      Rows matching none of these ranges are left unset.
    * ``score_nutriaddi``: ``nutrition_grade_fr`` concatenated with
      ``additives_grp``, evaluated on the rows where the grade and the
      additive count are not both missing.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``nutrition_grade_fr`` and ``additives_n``.

    Returns
    -------
    pandas.DataFrame
        The very object that was passed in.
    """
    n_additives = data['additives_n']
    # (row selector, group code) pairs, applied in this order.
    group_rules = (
        (n_additives == 0, ""),
        (n_additives == 1, "1"),
        ((n_additives >= 2) & (n_additives <= 5), "2"),
        ((n_additives >= 6) & (n_additives <= 10), "3"),
        (n_additives >= 11, "4"),
    )
    for rows, code in group_rules:
        data.loc[rows, 'additives_grp'] = code

    # Rows where at least one of the two source variables is known.
    usable = ~(data["nutrition_grade_fr"].isna() & data["additives_n"].isna())
    grade_part = data.loc[usable, "nutrition_grade_fr"]
    additive_part = data.loc[usable, "additives_grp"]
    data["score_nutriaddi"] = grade_part + additive_part
    return data
    
def multiple_boxplot ( data , labels ) :
    """Draw several horizontal box plots on one wide figure and show it.

    Parameters
    ----------
    data :
        Passed unchanged to ``plt.boxplot`` (one box per dataset).
    labels :
        Tick labels, one per box.
    """
    box_style = dict(
        labels=labels,
        vert=False,
        whis=1.5,
        patch_artist=True,
        showmeans=True,
        widths=0.4,
    )
    plt.figure(figsize=(12, 2))
    plt.boxplot(data, **box_style)
    plt.ylabel('observed value')
    plt.title('Multiple Box Plot ')
    plt.show()

def unique_boxplot(data , label ) :
    """Print summary statistics of one variable and show its box plot.

    Missing values are dropped first. The size, mean, median and
    inter-quartile range are printed, then a horizontal box plot is drawn.

    Parameters
    ----------
    data : pandas.Series
        Values of the variable.
    label : sequence
        Passed as the box plot tick labels; its first element is also used
        in the figure title.
    """
    observed = data[data.notna().values]

    mean = round(observed.mean(), 2)
    median = round(observed.median(), 2)
    iqr = round(observed.quantile(0.75) - observed.quantile(0.25), 2)
    print(f"{label} >> Size : {observed.shape} - Mean : {mean} - Med : {median} -  IQR : {iqr} ")

    plt.figure(figsize=(12, 2))
    plt.boxplot(observed, labels=label, vert=False, whis=1.5, showmeans=True, widths=0.4)
    plt.ylabel('')
    plt.title(f'Boxplot : {label[0]}', size=15)
    plt.show()

def all_boxplots(data):
    """Show one box plot (via ``unique_boxplot``) per ``float64`` column of ``data``."""
    is_float = (data.dtypes == "float64").values
    float_data = data.iloc[:, is_float]
    for name in float_data.columns:
        unique_boxplot(float_data[name], [name])

def hist_lign_col ( data , kde=False):
    """Show the distribution of the missing-value ratio per row and per column.

    The left panel is a histogram of the ratio of missing values of each row
    (53 bins, blue); the right panel does the same for each column (35 bins,
    red).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose missing values are analysed.
    kde : bool, default False
        When true the panels are drawn with ``sns.distplot`` (and ``kde`` is
        forwarded to it); otherwise a plain ``plt.hist`` is used.
    """
    # (subplot position, axis of the mean, number of bins, colour, y label)
    panels = (
        (1, 1, 53, 'b', 'Nb_Lignes'),
        (2, 0, 35, 'r', 'Nb_Variables'),
    )
    plt.figure(figsize=(16, 6))
    for position, axis, n_bins, shade, y_title in panels:
        plt.subplot(1, 2, position)
        missing_ratio = data.isna().mean(axis=axis).values
        if kde:
            sns.distplot(missing_ratio, bins=n_bins, color=shade, kde=kde)
        else:
            plt.hist(missing_ratio, bins=n_bins, color=shade, alpha=0.6)
        plt.xlabel('valeur manquantes (%)', size=16)
        plt.ylabel(y_title, size=16)
    plt.show()

def random_replace ( data , mask,std = 0.5) :
    """Draw non-negative random replacement values for the entries flagged by ``mask``.

    Values are sampled from a normal distribution centred half-way between the
    median and the mean of ``data``, with a standard deviation of ``std`` times
    the spread between the 20 % and 80 % quantiles. The absolute value of each
    draw is returned.

    Parameters
    ----------
    data : pandas.Series
        Reference values used to calibrate the distribution.
    mask :
        Boolean selector; ``mask.sum()`` gives the number of values to draw.
    std : float, default 0.5
        Multiplier applied to the 20 %-80 % quantile spread.

    Returns
    -------
    numpy.ndarray
        ``mask.sum()`` non-negative random numbers.
    """
    centre = 0.5 * (data.median() + data.mean())
    spread = std * (data.quantile(0.8) - data.quantile(0.2))
    n_draws = mask.sum()
    draws = np.random.normal(centre, spread, n_draws)
    return abs(draws)

def ecart(data):
    """Return the successive differences between the sorted values of ``data``.

    Missing values (NaN) are discarded, the remaining values are sorted in
    ascending order and the gap between each pair of neighbours is returned.

    Parameters
    ----------
    data : array-like
        List, ``numpy.ndarray``, ``pandas.Series`` or ``pandas.DataFrame`` of
        numbers. Multi-dimensional input is flattened.

    Returns
    -------
    numpy.ndarray
        One-dimensional array of ``n - 1`` non-negative gaps, where ``n`` is
        the number of non-missing values (empty when ``n < 2``).
    """
    # The previous version referenced an undefined name (`datas`) in both
    # branches and therefore always raised NameError.
    values = np.asarray(data, dtype=float).ravel()
    values = np.sort(values[~np.isnan(values)])
    return np.diff(values)

def concentration_gini ( data) :
    """Return the Gini concentration index of ``data``.

    The Lorenz curve is built from the sorted, non-missing values (with a
    leading 0), its area is estimated with the trapezoidal rule over ``n``
    equal-width steps, and the index is ``2 * (0.5 - area)``.

    Parameters
    ----------
    data : array-like
        Numeric values (e.g. a ``pandas.Series``); NaN entries are ignored.

    Returns
    -------
    float
        The Gini index: 0 when all values are equal, ``1 - 1/n`` when a single
        observation holds the whole total. ``nan`` when there is no usable
        value or when the values sum to zero.
    """
    values = np.asarray(data, dtype=float).ravel()
    values = np.sort(values[~np.isnan(values)])
    total = values.sum()
    if values.size == 0 or total == 0:
        # The Lorenz curve is undefined without a positive total.
        return np.nan

    lorenz = np.append(0, np.cumsum(values) / total)
    # Trapezoidal area under the n + 1 Lorenz points: there are n intervals of
    # width 1/n, so the divisor is the number of observations (the previous
    # version divided by n + 1, which biased the index upwards).
    area = (lorenz.sum() - 0.5 * lorenz[-1] - 0.5 * lorenz[0]) / values.size
    return 2 * (0.5 - area)

def concentration_brice ( data ):
    """Return the concentration measure defined by Brice KENGNI ZANGUIM.

    The measure is the square root of the product of two terms:

    * the exponential of the difference between the product of the pairwise
      gaps of the 20 %, 50 % and 80 % quantiles of the data (scaled by the
      cube of the data range) and the same product for the quantile levels
      themselves;
    * the variance of the data divided by ``range**2 / 12``.

    Parameters
    ----------
    data : pandas.Series
        Numeric values; missing entries are ignored.

    Returns
    -------
    float
        The concentration measure.
    """
    level_hi, level_mid, level_lo = 0.8, 0.5, 0.2   # quantile levels taken into account
    data = data[data.notna().values]

    value_hi = np.quantile(data, level_hi)
    value_mid = np.quantile(data, level_mid)
    value_lo = np.quantile(data, level_lo)
    extent = data.max() - data.min()

    level_term = -(level_hi - level_mid) * (level_hi - level_lo) * (level_mid - level_lo)
    value_term = (value_hi - value_mid) * (value_hi - value_lo) * (value_mid - value_lo) / extent**3
    box_over_domain = np.exp(level_term + value_term)

    # Placeholder factor: the alternative (range / size) is currently disabled.
    peer_to_peer_distance_mean = 1
    distance_to_min_mean = (data - data.min()).var() / (extent**2 / 12.)
    return np.sqrt(box_over_domain * peer_to_peer_distance_mean * distance_to_min_mean)

def concentration ( data , mode = 'brice' , around = 5 ):
    """Compute a concentration measure for every column whose name ends with ``_100g``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the variables.
    mode : {'brice', 'gini', 'both'}, default 'brice'
        ``'brice'`` uses ``concentration_brice``, ``'gini'`` uses
        ``concentration_gini`` and ``'both'`` uses their product. Any other
        value yields an empty result.
    around : int, default 5
        Number of decimals kept by ``round``.

    Returns
    -------
    dict
        Column name -> rounded concentration measure.
    """
    measures = {
        'brice': concentration_brice,
        'gini': concentration_gini,
        'both': lambda serie: concentration_brice(serie) * concentration_gini(serie),
    }
    measure = measures.get(mode)

    out = {}
    for name in data.columns:
        if name[-5:] != '_100g':
            continue
        serie = data[name]
        if measure is not None:
            out[name] = round(measure(serie), around)
    return out

def regression_lineaire ( data , graphic = True) :
    """Fit ``y = a + b * x`` by ordinary least squares on the first two columns.

    Rows containing a missing value are dropped. The first column of ``data``
    is the explanatory variable ``x`` and the second the response ``y``. The
    shapes of the design matrix and of the response are printed.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose first two columns are numeric.
    graphic : bool, default True
        When true, show the scatter plot together with the fitted line drawn
        from ``x = 0`` to the largest observed ``x``.

    Returns
    -------
    numpy.matrix
        2 x 1 matrix ``[[intercept], [slope]]``.
    """
    data = data.dropna()
    x_name, y_name = data.columns[0], data.columns[1]
    x_values, y_values = data[x_name], data[y_name]

    X = np.matrix([np.ones(data.shape[0]), x_values]).T   # design matrix [1, x]
    Y = np.matrix(y_values).T
    print(X.shape , Y.shape)
    # Normal equations: (X'X)^-1 X'Y
    param = np.linalg.inv(X.T.dot(X)).dot(X.T).dot(Y)

    if graphic :
        # Plain floats: mixing a 1x1 matrix with a scalar in the same list (as
        # the previous version did) builds a ragged sequence that recent numpy
        # versions refuse to convert.
        intercept, slope = param.item(0), param.item(1)
        x_max = max(x_values)
        plt.figure(figsize=(15,10))
        plt.title(f"Regression linéaire  : '{x_name}'  VS  '{y_name}'" , size = 22)
        plt.plot([0, x_max], [intercept, intercept + slope * x_max], ls='--', lw=4)
        plt.xlabel(x_name, size=20)
        plt.ylabel(y_name, size=20)
        plt.plot(x_values, y_values, 'go', markersize=4)
        plt.show()
    return param

def seuil_valeurs_manquantes( data = None ) :
    """Show how the usable data set shrinks as the missing-value threshold varies.

    For each threshold between 0 and 1 (step 0.01), only the columns whose
    ratio of missing values is strictly below the threshold are kept, then
    every row that still contains a missing value is dropped. The number of
    remaining rows and columns (total, ``float64`` and ``object``) is recorded
    and plotted against the threshold on a twin-axis figure.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame to analyse. When omitted, the module-level variable ``data_0``
        is used (it must then be defined by the caller's environment).

    Returns
    -------
    pandas.DataFrame
        Indexed by threshold, with the columns ``Nb_Individus``, ``Nb_Var``,
        ``Var_float`` and ``Car_Obj``.
    """
    source = data_0 if data is None else data

    coupure = np.linspace(0,1,101)
    result = pd.DataFrame(index=coupure)
    for seuil in coupure:
        kept = source.iloc[:, (source.isna().mean() < seuil).values]
        kept = kept.dropna(axis=0)
        result.loc[seuil,'Nb_Individus'], result.loc[seuil,'Nb_Var'] = kept.shape[0] , kept.shape[1]
        result.loc[seuil,'Var_float'] = (kept.dtypes=='float64').sum()
        result.loc[seuil,'Car_Obj'] = (kept.dtypes=='object').sum()

    # A single figure, created once the loop is over (the previous version
    # opened a new, empty figure at every iteration).
    fig, ax = plt.subplots(figsize=(15,10))

    # Left axis: population size
    ax.plot(result.index ,result['Nb_Individus'], color='b',ls='solid',lw=5,alpha=0.7)
    ax.set_xlabel('Seuil_Valeur manquantes (%)',size=20)
    ax.set_ylabel('Taille_pop',size=20)
    for tick in ax.get_yticklabels():
        tick.set_color('b')

    # Right axis: number of variables (all / float64 / object)
    ax1 = ax.twinx()
    ax1.plot(result.index ,result['Nb_Var'], color='brown',ls='dashdot',lw=4,alpha=0.7)
    ax1.plot(result.index ,result['Var_float'], color='r',ls='dashed',lw=3,alpha=0.7)
    ax1.plot(result.index ,result['Car_Obj'], color='g',ls='dashed',lw=3,alpha=0.7)
    ax1.set_ylabel('Nb_Variables' , size =20)
    for tick in ax1.get_yticklabels():
        tick.set_color('r')
    plt.show()
    return result

def data_segmentation( datas, observable, population_a, population_b, nbr_intervals  ) :
    """La fonction permet de segmenter deux variables quantitatives en plusieurs intervals
    Pour chaque interval de valeurs de la variable 'observable' on va renseigner le nombre 
    d'individus des variables 'population_a' et 'population_b' ayant des valeurs comprises dans l'interval en question
    """
    datas = datas.dropna()
    # Creation des données à utiliser
    bottom, top = min(datas[population_a].min(), datas[population_b].min()) , max(datas[population_a].max(), datas[population_b].max())
    interval = np.round( np.linspace(bottom,top,nbr_intervals) ,1 )
    interval[-1] += 0.1
    if interval[-1] < top : interval = np.append(interval , top+0.1)
    return  pd.DataFrame( {observable   : [f"[{interval[i]} ; {interval[i+1]}]" for i in range( interval.size -1) ],
                           population_a : [ datas.loc[(datas[population_a]>=interval[i])&(datas[population_a]<interval[i+1]) ,\
                                                         population_a].size for i in range( interval.size -1)],
                           population_b : [ datas.loc[(datas[population_b]>=interval[i])&(datas[population_b]<interval[i+1]) ,\
                                                        population_b].size for i in range( interval.size -1)]
                               })

def pyramide_graphique( datas, observable, population_a, population_b, segmenter = False, nbr_intervals=10, kind ="barh" , save = True) :
    """Draw a two-sided (pyramid) horizontal bar chart comparing two populations.

    Parameters
    ----------
    datas : pandas.DataFrame
        Either already segmented data (one row per interval, with the columns
        ``observable``, ``population_a`` and ``population_b``) or raw data to
        be segmented first (see ``segmenter``).
    observable : str
        Column holding the interval labels; also used in the titles.
    population_a, population_b : str
        Columns holding the bar lengths of the left (green) and right (red)
        sides.
    segmenter : bool, default False
        When true, ``datas`` is first passed through ``data_segmentation``.
    nbr_intervals : int, default 10
        Forwarded to ``data_segmentation`` when ``segmenter`` is true.
    kind : str, default "barh"
        Only ``"barh"`` is implemented; any other value produces a figure
        that contains the title alone.
    save : bool, default True
        When true, the figure is written to ``pyramide.jpeg``.
    """
    # Segment the data on request
    if segmenter:
        datas = data_segmentation(datas, observable, population_a, population_b, nbr_intervals=nbr_intervals)

    if kind == "barh":
        fig, axes = plt.subplots(ncols=2, sharey=True, figsize=(18, 10))
        fig.patch.set_facecolor('white')
        left, right = axes

        y = range(0, len(datas))
        left.barh(y, datas[population_a], align='center', color='green')
        right.barh(y, datas[population_b], align='center', color='red')
        left.set(title=population_a , xlabel="QUANTITE" , ylabel= f"INTERVAL DE {observable.upper()}")
        right.set(title=population_b, xlabel="QUANTITE")

        # Grid settings and y-axis labels; the left side is mirrored
        right.grid(visible=True ,color='white', linestyle='-', linewidth=3,)
        left.set(yticks=y, yticklabels=datas[observable])
        left.invert_xaxis()
        left.grid(visible=True ,color='white', linestyle='-', linewidth=3,)
    else:
        # TODO: other kinds (e.g. "area") are not implemented yet.
        fig = plt.figure()

    # The title is attached to the figure that holds the bars. The previous
    # version called plt.figtext before the figure was created, so the title
    # ended up on a separate figure and was missing from the saved image.
    fig.text(.5,.9,f"\nPyramides de {observable}\n", fontsize=15, ha='center')

    if save : fig.savefig("pyramide.jpeg")
    plt.show()

def K_NN_imputer_opptimize ( data , max_neighbors = 7 ) :
    """Grid-search the KNNImputer ``n_neighbors`` parameter for each float column.

    Non-DataFrame input is first turned into a DataFrame with one ``Var_<i>``
    column per input column. ``+inf`` entries are replaced by NaN, then only
    the columns and rows with less than 30 % of missing values are kept. For
    every remaining float column, a ``KNNImputer`` + ``SGDClassifier``
    pipeline is grid-searched (5-fold CV) over ``n_neighbors`` in
    ``1..max_neighbors`` with that column as target and the other float
    columns as features; the best parameters are printed.

    The function prints intermediate frames and returns ``None``: the imputed
    array computed at the end of each iteration is not stored.

    NOTE(review): ``KNNImputer``, ``make_pipeline``, ``SGDClassifier`` and
    ``GridSearchCV`` are not imported in the visible part of this file;
    confirm they are in scope where this function is used.
    NOTE(review): a classifier is fitted on float targets that may still
    contain NaN; confirm this is the intended estimator.
    """
    if type(data) != type(pd.DataFrame()) :
        data = pd.DataFrame({ f"Var_{i}" : data[:,i] for i in range(data.shape[1]) })
    """La fonction permet de réaliser un KNN Imputer sur un dataFrame en optimisant le paramètre 'Nombre de plus proches voisins' """
    print(data,"\n\n")
    data[data.isin([np.inf])] = np.nan  # +inf entries are replaced by missing values (in place on `data`)
    data = data.loc[:,data.isna().mean() < 0.3]      # keep the columns with less than 30 % of missing values
    data = data.loc[data.isna().mean(axis=1) < 0.3,:]  # then keep the rows with less than 30 % of missing values
    print(data)
    cols = np.array(data.select_dtypes(include=[float]).columns)   # names of the float columns
    for i in cols :
        KNNImputer(missing_values=np.nan)  # NOTE(review): the created object is not used
        X = data[cols[~np.isin(cols,i)]]  # every float column except i
        Y = data[i]
        model = make_pipeline(KNNImputer() , SGDClassifier()) # the pipeline chains a KNN imputer and the linear SGDClassifier
        params = {  # parameter grid to explore
                  "knnimputer__n_neighbors" : [ i+1 for i in range(max_neighbors) ] }
        grid = GridSearchCV(model , param_grid=params , cv = 5 )
        grid.fit(X,Y)
        print(grid.best_params_)
        imputer = KNNImputer(n_neighbors=grid.best_params_["knnimputer__n_neighbors"])
        imputer.fit_transform(X)  # NOTE(review): the imputed values are discarded

def couleur_léatoire_hex() :
    """Return a random colour as a ``#RRGGBB`` upper-case hexadecimal string.

    The six hexadecimal digits are drawn independently, so any of the
    16**6 colours can be produced (the previous version sampled without
    replacement and could never return a colour with a repeated digit, such
    as ``#FF0000``).
    """
    digits = np.random.choice(list("0123456789ABCDEF"), 6)
    return "#" + "".join(digits)

def camemberg ( labels = ["Brice","Romeo","Marthe","Franck","Arnaud"] , sizes=[28,18,23,30,15] ,titre = "CAMENBERG") : 
    """Draw a pie chart with random colours; the largest slice(s) are pulled out.

    Parameters
    ----------
    labels : sequence of str
        One label per slice.
    sizes : sequence of numbers
        Size of each slice.
    titre : str
        Title of the chart.

    The chart is drawn on the current figure; ``plt.show`` is not called.
    """
    labels = np.array(labels)
    sizes = np.array(sizes)

    # One random colour per slice
    colors = [couleur_léatoire_hex() for _ in range(sizes.size)]

    # Offset of each slice from the centre: only the largest one(s) stand out
    explode = np.zeros(sizes.size)
    explode[sizes == sizes.max()] = 0.08

    plt.pie(
        sizes,
        explode=explode,
        labels=labels,
        colors=colors,
        autopct='%1.1f%%',
        shadow=True,
        startangle=90,
    )
    plt.title(titre, color='brown', size=18)
    plt.axis('equal')
    
#########################################################################################################
####################      2   -  ANALYSE DES COMPOSANTES PRINCIPALES                #####################
####################      3   -  K-MEANS                                            #####################
####################      4   -  CLASSIFICATION HIERARCHIQUE                        #####################
#########################################################################################################

def display_circles(pcs, n_comp, pca, axis_ranks, labels=None, label_rotation=0,
                        lims=None , fgsize = (12,12),label_num=False,labl_arrow_size = 14):
    """Draw one correlation circle per requested factorial plane.

    Parameters
    ----------
    pcs : numpy.ndarray
        2-D array indexed as ``pcs[component, variable]`` (presumably
        ``pca.components_``; confirm against the caller).
    n_comp : int
        Number of computed components; planes whose second axis is not below
        this number are skipped.
    pca :
        Fitted object exposing ``explained_variance_ratio_``.
    axis_ranks : iterable of (int, int)
        Pairs of component indices, one figure per pair.
    labels : sequence, optional
        Variable names written at the tip of each arrow.
    label_rotation : float, default 0
        Rotation of the labels.
    lims : (xmin, xmax, ymin, ymax), optional
        Explicit plot limits. Otherwise the unit square is used when there
        are fewer than 30 variables, and the data extent beyond that.
    fgsize : tuple, default (12, 12)
        Figure size.
    label_num : bool, default False
        Write the index of the variable instead of its label.
    labl_arrow_size : int, default 14
        Font size of the labels.
    """
    for d1, d2 in axis_ranks:
        if not d2 < n_comp:
            continue

        few_variables = pcs.shape[1] < 30
        vsbl = 0.3   # transparency shared by arrows and labels

        # new figure
        fig, ax = plt.subplots(figsize=fgsize)

        # plot limits
        if lims is not None :
            xmin, xmax, ymin, ymax = lims
        elif few_variables :
            xmin, xmax, ymin, ymax = -1, 1, -1, 1
        else :
            xmin, xmax = min(pcs[d1,:]), max(pcs[d1,:])
            ymin, ymax = min(pcs[d2,:]), max(pcs[d2,:])

        # arrows; from 30 variables on, plain segments are drawn instead
        # (see https://matplotlib.org/api/_as_gen/matplotlib.pyplot.quiver.html)
        if few_variables :
            origin = np.zeros(pcs.shape[1])
            plt.quiver(origin, np.zeros(pcs.shape[1]),
                       pcs[d1,:], pcs[d2,:], angles='xy', scale_units='xy', scale=1,
                       color="green", width = 0.005, alpha=vsbl)
        else:
            segments = [[[0,0],[x,y]] for x,y in pcs[[d1,d2]].T]
            ax.add_collection(LineCollection(segments, axes=ax, alpha=vsbl, color='black'))

        # variable names, slightly jittered, only inside the plot limits
        if labels is not None:
            for i,(x, y) in enumerate(pcs[[d1,d2]].T):
                fluct = np.random.normal(0,0.02,x.shape)
                if xmin <= x <= xmax and ymin <= y <= ymax :
                    caption = f"{i}" if label_num else labels[i]
                    plt.text(x+fluct, y+fluct, caption, fontsize=labl_arrow_size, ha='center', va='center',
                             rotation=label_rotation, color="magenta", alpha=vsbl)

        # unit circle
        circle = plt.Circle((0,0), 1, facecolor='none', edgecolor='green')
        plt.gca().add_artist(circle)

        # apply the limits
        plt.xlim(xmin, xmax)
        plt.ylim(ymin, ymax)

        # horizontal and vertical axes
        plt.plot([-1, 1], [0, 0], color='grey', ls='--')
        plt.plot([0, 0], [-1, 1], color='grey', ls='--')

        # axis names with the percentage of explained inertia
        inertia = pca.explained_variance_ratio_
        plt.xlabel('F{} ({}%)'.format(d1+1, round(100*inertia[d1],1)))
        plt.ylabel('F{} ({}%)'.format(d2+1, round(100*inertia[d2],1)))

        plt.title(f"Cercle des corrélations - directions principales F{d1+1} et F{d2+1}", size=15)
        plt.show(block=False)
        
def display_factorial_planes(X_projected, n_comp, pca, axis_ranks, labels=None, alpha=0.6, 
                                          illustrative_var=None,show_label_element = True , fgsize = (12,8)):
    """Scatter the projected individuals on each requested factorial plane.

    Parameters
    ----------
    X_projected : numpy.ndarray
        2-D array indexed as ``X_projected[individual, component]``.
    n_comp : int
        Number of computed components; planes whose second axis is not below
        this number are skipped.
    pca :
        Fitted object exposing ``explained_variance_ratio_``.
    axis_ranks : iterable of (int, int)
        Pairs of component indices, one figure per pair.
    labels : sequence, optional
        One entry per individual, used both as the point colours (``c``
        argument of ``plt.scatter``) and, when ``show_label_element`` is
        true, as the text written on each point.
    alpha : float, default 0.6
        Transparency of the points.
    illustrative_var : sequence, optional
        One category per individual; each category is scattered separately
        and listed in the legend.
    show_label_element : bool, default True
        Whether to write ``labels`` on the points.
    fgsize : tuple, default (12, 8)
        Figure size.
    """
    point_colors = None if labels is None else np.asarray(labels)
    groups = None if illustrative_var is None else np.array(illustrative_var)

    for d1,d2 in axis_ranks:
        if not d2 < n_comp:
            continue

        # new figure
        plt.figure(figsize=fgsize)

        # points
        if groups is None:
            plt.scatter(X_projected[:, d1], X_projected[:, d2], alpha=alpha, c = labels)
        else:
            for value in np.unique(groups):
                selected = groups == value
                # Colours restricted to the selected individuals: passing the
                # full list (as the previous version did) does not match the
                # number of points of the subset.
                subset_colors = None if point_colors is None else point_colors[selected]
                plt.scatter(X_projected[selected, d1], X_projected[selected, d2],
                            alpha=alpha, label=value, c = subset_colors)
            plt.legend()

        # text written on the points
        if labels is not None and show_label_element:
            for i,(x,y) in enumerate(X_projected[:,[d1,d2]]):
                plt.text(x, y, labels[i], fontsize='14', ha='center',va='center', alpha=0.5)

        # symmetric limits with a 10 % margin
        boundary = np.max(np.abs(X_projected[:, [d1,d2]])) * 1.1
        plt.xlim([-boundary,boundary])
        plt.ylim([-boundary,boundary])

        # horizontal and vertical axes
        plt.plot([-100, 100], [0, 0], color='grey', ls='--')
        plt.plot([0, 0], [-100, 100], color='grey', ls='--')

        # axis names with the percentage of explained inertia
        plt.xlabel('F{} ({}%)'.format(d1+1, round(100*pca.explained_variance_ratio_[d1],1)))
        plt.ylabel('F{} ({}%)'.format(d2+1, round(100*pca.explained_variance_ratio_[d2],1)))

        plt.title(f"Projection des individus- directions principales F{d1+1} et F{d2+1}",size=15)
        plt.show(block=False)

def display_scree_plot(pca , fgsize = (12,8)):
    """Show the scree plot of a fitted PCA.

    Bars give the percentage of inertia explained by each axis and the red
    line gives the cumulated percentage.

    Parameters
    ----------
    pca :
        Fitted object exposing ``explained_variance_ratio_``.
    fgsize : tuple, default (12, 8)
        Figure size.
    """
    scree = pca.explained_variance_ratio_*100
    ranks = np.arange(len(scree)) + 1   # axis ranks start at 1

    plt.figure(figsize=fgsize)
    plt.bar(ranks, scree)
    plt.plot(ranks, scree.cumsum(), c="red", marker='o')
    plt.xlabel("rang de l'axe d'inertie")
    plt.ylabel("pourcentage d'inertie")
    plt.title("Eboulis des valeurs propres")
    plt.show(block=False)

def plot_dendrogram(Z, names):
    """Show a left-oriented dendrogram.

    Parameters
    ----------
    Z :
        Linkage matrix passed to ``scipy.cluster.hierarchy.dendrogram``.
    names : sequence
        Labels of the leaves.
    """
    tree_options = dict(labels=names, orientation="left")
    plt.figure(figsize=(10, 15))
    plt.title('Hierarchical Clustering Dendrogram')
    plt.xlabel('distance')
    dendrogram(Z, **tree_options)
    plt.show()