In [1]:
%%javascript
// Replace the output area's `_should_scroll` check with a version that always
// answers false, whatever the number of lines (presumably so that long outputs
// are never placed in a scroll box -- confirm against the notebook front-end).
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

# Projet Open Food Facts - Partie 2: Exploration d'un jeu de données

In [2]:
# Scientific libraries
import numpy as np
import scipy as sp
from scipy import stats
import pandas as pd

# File formats
import json

# Plots
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Widgets
import ipywidgets as widgets
from ipywidgets import AppLayout, interactive_output, fixed

# Perso
import clean_lib as cl # Librairie perso de nettoyage des données
#import stats_lib as sl # Librairie perso de statistiques
#from functions import *  # Fonction OCR pour l'ACP

import importlib
importlib.reload(cl)
#importlib.reload(sl)

# `display` and `HTML` are public API of `IPython.display`; importing `display`
# from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
# Inject a CSS rule giving elements of class `container` 90% of the page width
display(HTML("<style>.container { width:90% !important; }</style>"))


In [3]:
# Load the cleaned data: the column 'Unnamed: 0' becomes the row index,
# which is then named 'index'.
data_orig = (
    pd.read_csv("df_cleaned.csv", sep=',', low_memory=False)
    .set_index('Unnamed: 0', drop=True, append=False, verify_integrity=False)
    .rename_axis('index')
)

# data_orig.shape

In [4]:
############################################################
# Variable categorization
# Helper class to group variables
############################################################  

# Independent copy of the loaded data; it is the DataFrame whose columns are
# offered as candidate members when the categories are defined below.
df_cat = data_orig.copy()

############################################################
# Class definition
############################################################ 

class DfCategory():
    """Named group of DataFrame variables with optional aggregation and total links.

    The group is described by the DataFrame ``self.m``, one row per member:

    * ``var_name``: name of the member (a column of the DataFrame given to ``add_members``);
    * ``aggregate``: name of the member this one is aggregated with when both relate to
      the same concept; equal to ``var_name`` otherwise;
    * ``total``: name of the member representing the total of the category
      (NaN when the category has none).
    """

    # Column layout of self.m (numpy / pandas come from the module-level imports)
    _COLUMNS = ['var_name', 'aggregate', 'total']

    def __init__(self, cat_name):
        """Stores the category name and creates an empty member table."""
        self.cat_name = cat_name
        self.m = pd.DataFrame(columns = self._COLUMNS).astype('object')

    def add_members(self, df, index = (), exclusions = (), inclusions = ()):
        """Adds columns of 'df' to the category.

        Parameters:
            df: DataFrame whose columns are the candidate members.
            index: empty, or a pair (first, last) of column positions; every column
                whose position lies in [first, last] is added.
            exclusions: column names to leave out even when inside the boundaries.
            inclusions: column names to add even when outside the boundaries.

        Members already present (same 'var_name') are kept as they are.
        Raises KeyError when the boundaries go past the last column of 'df'.
        """
        if len(index) != 0:
            positions = set(range(index[0], index[1] + 1))
        else:
            positions = set()

        # One candidate row per column of df; each candidate is its own aggregate
        names = list(df.columns)
        candidates = pd.DataFrame({'var_name': names,
                                   'aggregate': names,
                                   'total': [np.nan] * len(names)},
                                  columns = self._COLUMNS, dtype = 'object')

        excluded = candidates.index[candidates['var_name'].isin(list(exclusions))]
        included = candidates.index[candidates['var_name'].isin(list(inclusions))]
        positions = (positions - set(excluded)) | set(included)

        # .loc does not accept a set; a sorted list also makes the member order deterministic
        new_rows = candidates.loc[sorted(positions)].copy()

        if self.m.empty:
            self.m = new_rows
        else:
            self.m = pd.concat([self.m, new_rows], axis = 0)
        self.m = self.m.drop_duplicates('var_name')

    def remove_members(self, remove_list):
        """Removes the members listed in 'remove_list' and returns how many were removed.

        Names that are not (or no longer) members are ignored, so a name repeated
        in 'remove_list' is counted once.
        """
        n = 0
        for member in remove_list:
            if member not in set(self.m['var_name']):
                continue
            # If member represented the total of the category, the 'total' column is cleared
            if member in set(self.get_var_total()):
                self.m['total'] = np.nan
            # Members aggregated under the removed one become their own aggregate again
            linked = (self.m['aggregate'] == member)
            self.m.loc[linked, 'aggregate'] = self.m.loc[linked, 'var_name']
            self.m = self.m.drop(self.m.index[self.m['var_name'] == member])
            n += 1
        return n

    def aggregate(self, agg_v, agg_l):
        """Links each member of the list 'agg_l' to the aggregation member 'agg_v'.

        Raises AssertionError when 'agg_v' is not itself in 'agg_l'.
        """
        assert agg_v in agg_l, "cat.aggregate: agg_v shall be in agg_l"
        self.m.loc[self.m['var_name'].isin(agg_l), 'aggregate'] = agg_v

    def set_total(self, var_total):
        """Links all members of the category to 'var_total', the total of the category.

        A NaN value clears the total. Otherwise 'var_total' must be a member that is
        aggregated with itself; an AssertionError is raised if it is not.
        """
        # pd.isna() recognises any NaN value, not only the np.nan singleton
        if not pd.isna(var_total):
            own_aggregate = self.m.loc[self.m['var_name'] == var_total, 'aggregate']
            assert (not own_aggregate.empty) and own_aggregate.eq(var_total).all(),\
            "A member can be set to represent the total only if he is aggregated with himself"
        self.m['total'] = var_total

    def get_info(self):
        """Returns a dictionary holding the category name and the member table self.m."""
        return {'category name': self.cat_name,
                'members': self.m}

    def get_members(self):
        """Returns the names of the members of the category (pandas Series)."""
        return self.m['var_name']

    def get_var_total(self):
        """Returns the member representing the total and the members aggregated with it.

        Type: pandas Series; empty when the category has no member or no total.
        """
        if self.m.empty:
            return self.m['var_name']
        # 'total' holds the same value on every row: the first row is enough
        var_total = self.m['total'].iloc[0]
        return self.m.loc[self.m['aggregate'].isin([var_total]), 'var_name']

    def get_sub_members(self):
        """Returns the members that do not represent the total of the category (pandas Series)."""
        var_total = self.get_var_total()
        return self.m.loc[~self.m['aggregate'].isin(var_total), 'var_name']

    def get_sum_members(self):
        """Returns the members to use to compute the total of the category (pandas Series).

        When a total is defined: the total member and its aggregated members;
        otherwise: the result of get_sub_members().
        """
        var_total = self.get_var_total()
        if var_total.empty:
            return self.get_sub_members()
        return var_total

############################################################
# Defines categories
############################################################ 

# 'categories_main' lists the categories whose members are mutually exclusive and can be summed
categories_main = []
# 'categories_others' lists the other categories
categories_others = []

def set_up_cat(cat_name, df, index, exclusions = (), inclusions = (), total = np.nan, cats = None):
    """Creates a DfCategory, fills it from 'df' and registers it.

    Parameters:
        cat_name: name given to the category.
        df, index, exclusions, inclusions: forwarded to DfCategory.add_members().
        total: forwarded to DfCategory.set_total() (NaN: no total).
        cats: list to which the new category is appended; nothing is registered
            when it is None. (A literal [] default would be one list shared by
            every call, silently accumulating categories.)

    Returns the new category.
    """
    c = DfCategory(cat_name)
    c.add_members(df, index, exclusions, inclusions)
    c.set_total(total)
    if cats is not None:
        cats.append(c)
    return c

# Category of the columns at positions 0 to 32 of df_cat, minus the listed ones;
# it is later used as the list of qualitative variables.
exclusions = ['url', 'created_t', 'created_datetime', 'last_modified_t', 'last_modified_datetime', 'image_url', 'image_small_url']
quali = set_up_cat('info', df_cat, index = (0,32),\
                  exclusions = exclusions, inclusions = [], total = np.nan, cats = categories_others)

# Category of the columns at positions 33 to 52 of df_cat, minus the listed ones.
# NOTE(review): this category is also named 'info', like the previous one --
# looks like a copy/paste leftover; confirm the intended name.
exclusions = ['sum_ingredients', 'fat_components', 'sugar_components', 'nutrition-score-uk_100g']
quanti = set_up_cat('info', df_cat, index = (33,52),\
                  exclusions = exclusions, inclusions = [], total = np.nan, cats = categories_others)

# Dashboard

In [5]:
############################################################
############################################################
# WIDGET DE FILTRAGE
############################################################
############################################################

############################################################
# To Dos: Devs - Bugs - Checks - Améliorations
############################################################

# Dev: ajouter un intervalle de confiance de normalité de la distribution

# Bug: [Plus sûr que ce soit d'actualité] - le filtre sur tags n'est pas parfait: 
#     - l'item 'folic acid)' ne déclenche pas la mise à jour du graph: il doit s'agir d'une mésinterprétation de la parenthèse dans la gestion des regex

# Check: vérifier l'utilisation de pd.concat() dans le code

# Amélioration: insérer les tests statistiques dans les graphiques correspondants

# Amélioration: trouver un moyen de réduire l'utilisation de variable globales
# Amélioration: dans certaines fonctions, th est encore paramétré en dur
# Amélioration: quali - indiquer le total représenté par les barres sans autres
# Amélioration: quali - optimiser la vitesse d'affichage de la valeur sur chaque barre
# Amélioration: quali - rajouter le footer informatif (var_info + missing)
# Amélioration: quali - identifier les différents niveaux de 'categories_fr', recréer la taxonomie

# Amélioration: quanti - widgetiser l'axe des x et calculer automatiquement la taille des bins
# Amélioration: quanti - rajouter l'information de la matrice de corrélation
# Amélioration: quanti - rajouter une grid
# Amélioration: optimiser le rafraîchissement des graphs en le ramenant au strict nécessaire



############################################################
# Inputs
############################################################
# Pick up any edit made to the personal cleaning library since it was imported
importlib.reload(cl)

data_g = data_orig.copy() # Anchor DataFrame: the unfiltered reference state
df_g = data_g.copy() # Working DataFrame: the one that gets filtered and plotted
#df_par_g = data_g.copy()  # Not a global variable anymore
cat_others_g = '__Autres__' # Label of the catch-all category

filtered_vars_g = [] # Stores the filtered variables

# Names of the dashboard widgets to close before they are created again, so that
# re-running the cells does not leave stale widgets behind.
# 'filter_erase_w', 'filter_df_changed_plot_w', 'x_range_plot_w' and
# 'active_x_range_w' are created by the dashboard cells as well and were missing.
widgets_list=['filter_w', 
              'filter_var_w',
              'filter_erase_w',
              'filter_cat_w',
              'filter_trigger_w', 
              'filter_text_w', 
              'filter_reset_w', 
              'filter_df_changed_w', 
              'filter_df_changed_plot_w',
              'filter_buttons_w', 
              'filter_info_w',
              'select_w', 
              'select_plot_w', 
              'selection_type_w', 
              'n_bar_w', 
              'x_range_w', 
              'x_range_plot_w',
              'active_x_range_w',
              'hue_w', 
              'fig_args_w', 
              'plot_w', 
              'fig_w', 
              'data_Xplor_w', 
              'pca_trigger_w', 
              'DataXplor_tab_w',
              'header_button_w', 
              'left_button_w', 
              'center_button_w', 
              'right_button_w', 
              'footer_button_w']

for w in widgets_list:
    # Names not defined yet (first run) are skipped, as is anything without close()
    widget_w = globals().get(w)
    if widget_w is not None and hasattr(widget_w, 'close'):
        widget_w.close()
        
        
############################################################
# Defines the categories of each variable and stores them
############################################################         
        
# Helper function
def check_how_tags(var):
    """Chooses how the pareto distribution of the variable 'var' is computed.

    Returns 'pareto_multiplex' when the name ends with '_tags', '_fr' or '_text',
    and 'pareto' otherwise.
    """
    import re

    multiplex_suffix = re.compile('(?:_tags|_fr|_text)$')
    return 'pareto_multiplex' if multiplex_suffix.search(var) else 'pareto'

# Global dictionary giving, for each qualitative variable, the selectable
# categories ('options') and the currently selected ones ('value').
# It is rebuilt from the data and saved when BUILD_VARIABLES_CATS_G is True,
# and read back from 'variables_cats_g.json' otherwise.
BUILD_VARIABLES_CATS_G = False
variables_cats_g = {}

if BUILD_VARIABLES_CATS_G:
    for var in list(quali.get_members()):
        how = check_how_tags(var)
        df_par_g, log, pareto_log = cl.clean_variables(df_g, variables = [var], how = how,
                                                        th = 20, cat_name = cat_others_g, verbose = False)
        cats_of_var = pareto_log[var][var].to_list()
        # Everything is selected at start: 'value' is the very same list as 'options'
        variables_cats_g[var] = {'options': cats_of_var, 'value': cats_of_var}
    # Entry for the empty selection of the variable box
    variables_cats_g[''] = {'value': (), 'options': ()}

    with open('variables_cats_g.json', 'w') as fp:
        json.dump(variables_cats_g, fp)
else:
    with open('variables_cats_g.json', 'r') as fp:
        variables_cats_g = json.load(fp)


############################################################
# Defines some helper widgets
############################################################   
def create_expanded_button(description, button_style, h, w):
    """Returns a Button with the given text and style, laid out with height 'h' and width 'w'."""
    button_layout = widgets.Layout(height=h, width=w)
    return widgets.Button(description=description,
                          button_style=button_style,
                          layout=button_layout)

# One auto-sized button per region name (header, left, center, right, footer)
header_button_w = create_expanded_button(description='DataXplor', button_style='success', h='auto', w='auto')
left_button_w = create_expanded_button(description='Left', button_style='info', h='auto', w='auto')
center_button_w = create_expanded_button(description='Center', button_style='warning', h='auto', w='auto')
right_button_w = create_expanded_button(description='Right', button_style='info', h='auto', w='auto')
footer_button_w = create_expanded_button(description='Footer', button_style='success', h='auto', w='auto')
    
    
############################################################
# Defines the function for the filtering widget
############################################################

def _contains_any(values, cats):
    """Boolean Series: True where the comma-separated string holds at least one item of 'cats'."""
    import re

    cats = list(cats)
    if not cats:
        return pd.Series(False, index = values.index)
    # Each category is escaped so that characters such as ')' are matched literally,
    # and must be a whole item: delimited by commas or by the ends of the string.
    pattern = "|".join(f"(?:^|,){re.escape(str(c))}(?:,|$)" for c in cats)
    # na=False: a missing cell matches no category
    return values.str.contains(pattern, regex = True, na = False)


def filter_df_helper(var, new_cats, old_cats):
    """Filters the global DataFrame df_g on the variable 'var'.

    Parameters:
        var: name of the filtered variable.
        new_cats: categories to keep; may contain cat_others_g, which stands for
            every value not listed in 'old_cats'.
        old_cats: the categories that were offered for selection.

    Rows are kept in their original order.
    Returns True when df_g has been changed (rows were removed), False otherwise.
    """
    global df_g
    n_before = len(df_g)
    new_cats = list(new_cats)

    if check_how_tags(var) == 'pareto':
        # Finds who are the others under cat_others_g
        if cat_others_g in new_cats:
            new_cats.remove(cat_others_g)
            new_cats.extend(df_g.loc[~df_g[var].isin(old_cats), var].unique())
        keep = df_g[var].isin(new_cats)
    else:
        keep = _contains_any(df_g[var], new_cats)
        # The others are the rows holding none of the offered categories
        if cat_others_g in new_cats:
            keep = keep | ~_contains_any(df_g[var], old_cats)

    df_g = df_g[keep]
    # Filtering only removes rows: the frame changed if and only if it got shorter
    return len(df_g) != n_before


def filter_df():
    """Rebuilds the global df_g: starts from data_g and applies the filter of every filtered variable."""
    global df_g
    df_g = data_g
    for name in filtered_vars_g:
        selection = variables_cats_g[name]
        filter_df_helper(name, selection['value'], selection['options'])

# Compute the options list from which to select the categories, update the dataframe if needed
# def compute_filters(var, new_cats, old_cats):
#     global df_g #,df_par_g #Not a global variable anymore
#     #print('\nvar en entrée de fonction: ', var)
#     #print('cats en entrée de fonction: ', new_cats)
#     if new_cats == ():
#         #print("Passage dans l'init")
#         how = check_how_tags(var)
#         #df_g = data_g.copy()
        
#         # After some test time, it would be nice to change df_par_g to df_par because this variable is not global anymore
#         df_par_g, log, pareto_log = cl.clean_variables(df_g, variables = [var], how = how,
#                                                         th = 20, cat_name = cat_others_g,verbose = False)
#         #df_g = df_par_g.copy() # Not a bug, required as a intermediate anchor, but alters too much df_g
#         return(pareto_log[var][var].to_list())
    
#     else:
#         #print("Passage dans l'update")
#         #df_g = data_g.copy()
#         filter_df(var, new_cats, old_cats)
#         return(old_cats)

def compute_shape():
    """Returns the (rows, columns) shape of the global working DataFrame df_g."""
    current_shape = df_g.shape
    return current_shape

############################################################
#Defines the widget to filter a variable
############################################################
# Box where the variable to filter is typed / chosen among the qualitative variables
filter_var_w = widgets.Combobox(value = '',
                                placeholder = 'Choisir une variable',
                                options = list(quali.get_members()),
                                description = '',
                                ensure_option = True,
                                disabled = False)


# Button used to empty filter_var_w
filter_erase_w = widgets.Button(description = 'Vider la cellule !',
                                disabled = False,
                                button_style = '',  # 'success', 'info', 'warning', 'danger' or ''
                                icon = 'eraser')    # FontAwesome name without the `fa-` prefix

def erase_selection_box(*args):
    """Click handler: empties the variable box, then the selection and options of the category list."""
    filter_var_w.value = ''
    # The selection is cleared before the options it refers to
    filter_cat_w.value = ()
    filter_cat_w.options = ()


filter_erase_w.on_click(erase_selection_box)

############################################################
# Defines the widget to filter the categories
############################################################
# Multiple-selection list of the categories of the chosen variable (empty at start)
filter_cat_w = widgets.SelectMultiple(value = (),
                                      options = (),
                                      rows = 20,
                                      description = '',
                                      disabled = False)

def update_cats_on_var(*args):
    """Observer of filter_var_w: loads the categories of the chosen variable into filter_cat_w.

    Nothing happens while the variable box is empty.
    """
    chosen_var = filter_var_w.value
    if chosen_var == '':
        return
    stored = variables_cats_g[chosen_var]
    # Options first, then the stored selection among them
    filter_cat_w.options = stored['options']
    filter_cat_w.value = stored['value']
filter_var_w.observe(update_cats_on_var, 'value')

############################################################
# Defines the widget to trigger filtering
############################################################
# Button that applies the current selection as a filter
filter_trigger_w = widgets.Button(description = 'Filtrer',
                                  disabled = False,
                                  button_style = 'info',  # 'success', 'info', 'warning', 'danger' or ''
                                  icon = 'filter')        # FontAwesome name without the `fa-` prefix
def trigger_filter(*args):
    """Click handler: stores the selected categories, refilters df_g and signals the change.

    Nothing happens when no variable or no category is selected.
    """
    chosen_var = filter_var_w.value
    chosen_cats = filter_cat_w.value
    if (chosen_var == '') or (chosen_cats == ()):
        return

    variables_cats_g[chosen_var]['value'] = chosen_cats

    # A variable counts as filtered only while part of its categories is deselected
    all_selected = (chosen_cats == filter_cat_w.options)
    is_registered = chosen_var in filtered_vars_g
    if is_registered and all_selected:
        filtered_vars_g.remove(chosen_var)
    elif (not is_registered) and (not all_selected):
        filtered_vars_g.append(chosen_var)

    filter_df()
    filter_text_w.value = str(compute_shape())
    # Flipping this value between 0 and 1 is what notifies the listeners
    filter_df_changed_w.value = (filter_df_changed_w.value + 1) % 2

filter_trigger_w.on_click(trigger_filter)

############################################################
# Defines the widget to display the dataset infos
############################################################
# Text field showing the shape of the working DataFrame
filter_text_w = widgets.Text(value = str(compute_shape()),
                             placeholder = "Pas d'information",
                             description = '',
                             disabled = False)

############################################################
# Defines the widget to reset the filtering widgets and the database
############################################################
filter_reset_w = widgets.Button(description = 'Réinitialiser',
                                disabled = False,
                                button_style = 'info',  # 'success', 'info', 'warning', 'danger' or ''
                                icon = 'database')      # FontAwesome name without the `fa-` prefix
def reset_filter(*args):
    """Click handler: restores df_g from data_g, empties the filter widgets and forgets every filter."""
    global df_g, filtered_vars_g
    df_g = data_g.copy()
    filter_var_w.value = ''
    filter_cat_w.value = ()
    filter_cat_w.options = ()
    filter_text_w.value = str(compute_shape())
    # Every category of every variable is selected again
    for name in filter_var_w.options:
        stored = variables_cats_g[name]
        stored['value'] = stored['options']
    filtered_vars_g = []
    # Flipping this value between 0 and 1 is what notifies the listeners
    filter_df_changed_w.value = (filter_df_changed_w.value + 1) % 2
filter_reset_w.on_click(reset_filter)

############################################################
# Defines a widget to monitor when changes occur within the DataFrame df_g

# Changes are triggered by events monitored by
# filter_var_w.observe(update_cats_on_var, 'value')
# filter_trigger_w.on_click(trigger_filter)
# filter_reset_w.on_click(reset_filter)
############################################################
# Counter flipped by the filter handlers, and a second one initialised from it
filter_df_changed_w = widgets.IntText(value = 0, description = 'Any:', disabled = False)
filter_df_changed_plot_w = widgets.IntText(value = filter_df_changed_w.value,
                                           description = 'Any:', disabled = False)

############################################################
# Defines the main filtering widget
############################################################
# Row holding the 'filter' and 'reset' buttons
filter_buttons_w = widgets.HBox([filter_trigger_w, filter_reset_w])

# Labelled display of the dataset size
filter_info_w = widgets.VBox([widgets.Label(value = 'Taille du jeu de données'),
                              filter_text_w])

# Complete filtering panel, stacked from top to bottom
filter_w = widgets.VBox(
    [
        widgets.HBox([widgets.Label(value = 'Sélectionner pour filtrer:'), filter_erase_w]),
        filter_var_w,
        filter_cat_w,
        filter_buttons_w,
        filter_info_w,
    ],
    layout = {'width': 'max-content'},
)

In [6]:
############################################################
############################################################
# WIDGET DE SELECTION ET REPRESENTATION + STATS
############################################################
############################################################


############################################################
# Inputs
############################################################
# Pick up any edit made to the personal cleaning library since it was imported
importlib.reload(cl)

n_cats_to_display_g = 20 # Threshold 'th' given to cl.clean_variables() by scatter_plot() and box_plot()
hue_g = None # Used as a default parameter in scatter_plot() 
hue_order_g = None # Used once in the interactive_output plot_w and as a default parameter in scatter_plot() 
height_g = 3 # Used once in the interactive_output plot_w and as a default parameter in scatter_plot()
aspect_g = 3 # Used once in the interactive_output plot_w and as a default parameter in scatter_plot() 
fit_g = stats.norm # Used once in the interactive_output plot_w

############################################################
# Defines plot functions and their associated statistics if any
############################################################

# Distribution plot
def dist_plot(var, fit, x_min, x_max, active_x_range):
    """Prints normality tests for the variable 'var' of df_g and draws its distribution.

    Parameters:
        var: name of the numerical variable.
        fit: distribution handed to sns.distplot() as its 'fit' argument.
        x_min, x_max: x-axis limits, applied only when 'active_x_range' is true.
        active_x_range: when false, the x range slider x_range_w is initialised from
            the drawn axis instead.

    The p-values printed are averaged over 20 random samples of 500 values when
    more than 500 values are available; otherwise all values are tested once.
    """
    n_draws = 20
    sample_size = 500

    # Missing values take no part in the tests nor in the plot
    values = df_g[var].dropna()

    # stats.shapiro() needs at least 3 observations
    if values.shape[0] < 3:
        print(f"Attention, pas assez de valeurs renseignées pour '{var}'.")
        return

    # Computes the stats: conformity test with the normal distribution
    ############################################################
    if values.shape[0] > sample_size:
        draws = [values.sample(n = sample_size) for _ in range(n_draws)]
    else:
        # Every draw would hold the very same values: one test is enough
        draws = [values]

    shapiro_p = [stats.shapiro(draw).pvalue for draw in draws]
    # NOTE(review): the normal parameters are estimated from the tested sample itself
    kolmogorov_p = [stats.kstest(draw, 'norm', args=(draw.mean(), draw.std())).pvalue
                    for draw in draws]

    print("Test statistique d'ajustement")
    print("Hypothèse: distribution normale")
    print(f"Test de Shapiro: pvalue = {np.mean(shapiro_p) * 100: .2f}%")
    print(f"Test de Kolmogorov: pvalue ={np.mean(kolmogorov_p) * 100: .2f}%")

    # Computes the graph
    ############################################################
    fig, ax = plt.subplots(figsize = (10, 5))
    g = sns.distplot(values, rug = False, hist = True, fit = fit, ax = ax)
    g.set_title("Densité de probabilité")

    if not active_x_range:
        # First drawing: the slider is set up to cover the whole x range of the graph
        new_range = g.get_xlim()
        x_range_w.min = new_range[0]
        x_range_w.max = new_range[1]
        x_range_w.value = new_range
    else:
        # Re-drawing after a change of the slider: the graph follows the selected range
        g.set_xlim(left = x_min, right = x_max)
        

# Count plot
def count_plot(var, n_cats_to_display = 20, kind = 'bar', height = 5, aspect = 3):
    """Draws the bar chart of the '%' column of the pareto table of the variable 'var'.

    Parameters:
        var: name of the qualitative variable.
        n_cats_to_display: threshold 'th' given to cl.clean_variables().
        kind: ignored, the chart is always drawn with kind 'bar'.
        height, aspect: size parameters given to sns.catplot().
    """
    max_length = 20

    def shorten(label):
        # Labels longer than max_length - 3 characters are cut and end with '...'
        cut = max_length - 3
        return label[0:cut] + '...' if (len(label) > cut) else label

    def mean_of(values):
        return np.mean(values)

    # Computes the data to plot: only the pareto log of the variable is used
    _, _, pareto_log = cl.clean_variables(df_g.copy(), variables = [var], how = check_how_tags(var),
                                          th = n_cats_to_display, cat_name = cat_others_g, verbose = False)
    pareto_table = pareto_log[var]

    # Computes the graph
    kind = 'bar'
    g = sns.catplot(x = var, y = '%', kind = kind, data = pareto_table, estimator = mean_of,
                    height = height, aspect = aspect)

    # Writes the rounded height of each bar just above it
    for bar in g.ax.patches:
        bar_top = bar.get_height()
        bar_center = bar.get_x() + bar.get_width() / 2.
        g.ax.annotate(format(bar_top, '.0f'), (bar_center, bar_top),
                      ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points')

    # Shortened, then rotated, x tick labels
    g.set_xticklabels([shorten(label) for label in pareto_log[var][var]])
    g.set_xticklabels(rotation = 90)
    g.fig.suptitle("Diagramme en bâton", y=1.01)


# Scatter plot
def scatter_plot(x, y, hue = hue_g, hue_order = hue_order_g, kind = 'scatter', height = height_g, aspect = aspect_g):
    """Prints the Pearson correlation of 'x' and 'y' in df_g and shows their scatter plot.

    Parameters:
        x, y: names of the two numerical variables.
        hue: name of the variable used to colour the points; None or '' for no colouring.
        hue_order, kind, height, aspect: accepted for compatibility with the callers,
            not used by the plotly figure.
    """
    if not hue:
        # No colouring requested (None, the default, or '' from the hue box)
        hue = None
        data_plot = df_g
        category_orders = {}
    else:
        # Transforms the 'hue' variable with cl.clean_variables(); cat_others_g is the
        # name given to the grouping category.
        data_plot, _, _ = cl.clean_variables(df_g.copy(), variables = [hue], how = 'pareto',
                                             th = n_cats_to_display_g, cat_name = cat_others_g, verbose = False)
        category_orders = {hue: list(data_plot[hue].cat.categories)}

    # Compute stats: the correlation is computed with missing values replaced by the median
    corr, _ = stats.pearsonr(df_g[x].fillna(df_g[x].median()), df_g[y].fillna(df_g[y].median()))
    print('La corrélation entre les deux variables est de: %.3f' % corr)

    # Computes the graph
    g = px.scatter(x = x, y = y, data_frame = data_plot, color = hue,
                   category_orders = category_orders, hover_name = 'product_name', title = "Nuage de points")
    g.show("notebook")

    


# Box plot
def box_plot(x, y, kind = 'box', height = 5, aspect = 3):
    """Prints a one-way ANOVA of 'y' across the categories of 'x' and draws the box plots.

    Parameters:
        x: name of the qualitative variable.
        y: name of the numerical variable.
        kind, height, aspect: parameters given to sns.catplot().
    """
    # Transforms the 'x' variable with cl.clean_variables(); cat_others_g is the name
    # given to the grouping category.
    data_plot, log, pareto_log = cl.clean_variables(df_g.copy(), variables = [x], how = 'pareto',
                                                    th = n_cats_to_display_g, cat_name = cat_others_g, verbose = False)

    # Computes stats: ANOVA
    cats = pareto_log[x][x].to_numpy()
    samples = [data_plot.loc[data_plot[x] == cat, y].dropna() for cat in cats]
    # A category without any value of 'y' cannot take part in the test
    samples = [sample for sample in samples if not sample.empty]

    print("Test statistique d'analyse de la variance")
    print(f"Hypothèse: '{x}' n'a pas d'influence sur '{y}'")
    if len(samples) < 2:
        # stats.f_oneway() needs at least two samples
        print("ANOVA: non calculable (moins de deux catégories renseignées)")
    else:
        f_val, p_val = stats.f_oneway(*samples)
        print(f"ANOVA: pvalue = {p_val * 100: .2f}%")

    # Computes the graph
    g = sns.catplot(x = x, y = y, kind = kind, data = data_plot,
                    height = height, aspect = aspect, showmeans = True)

    # Shortens long xticks labels: at most max_length characters, ending with '...'
    max_length = 20
    g.set_xticklabels([label[0:max_length-3] + '...'
                       if (len(label) > max_length - 3) else label
                       for label in pareto_log[x][x]])

    # Rotates xticks labels
    g.set_xticklabels(rotation = 90)
    g.fig.suptitle("Boites à moustaches", y=1.01)

    

############################################################
# Defines the function to select the appropriate plots according to the variables selected 
############################################################
def plot(var, df_has_changed, **kwargs): 
    """Draws the graph matching the type of selection stored in selection_type_w.

    Parameters:
        var: sequence of the selected variable names.
        df_has_changed: not read; it exists so that a change of its value re-runs the function.
        **kwargs: plot arguments; read according to the selection type:
            'count': 'n_bar', 'height', 'aspect';
            'dist': 'fit', 'x_range';
            'scatter': 'hue', 'hue_order', 'height', 'aspect';
            'box': 'height', 'aspect'.

    Returns an empty tuple when no graph is drawn, None otherwise.
    """
    lim_inf_lines = 10
    n_rows = df_g.shape[0]

    if n_rows == 0:
        print("Attention, le jeu de données est vide.")
        return ()

    if n_rows <= lim_inf_lines:
        print("Attention, le nombre de lignes du jeu de données n'est pas significatif.")
        return ()

    selection = selection_type_w.value

    if selection == 'both variables qualitative':
        print("Pas de graphe avec deux variables qualitatives")
        return ()

    elif selection == 'too much variables':
        print("Pas de graphe avec plus de deux variables")
        return ()

    elif selection == 'count':
        count_plot(var[0], kwargs['n_bar'], 'bar', kwargs['height'], kwargs['aspect'])

    elif selection == 'dist':
        x_min, x_max = kwargs['x_range'][0], kwargs['x_range'][1]
        dist_plot(var[0], kwargs['fit'], x_min, x_max, active_x_range_w.value)

    elif selection == 'scatter':
        scatter_plot(var[0], var[1], kwargs['hue'], kwargs['hue_order'], 'scatter',
                     kwargs['height'], kwargs['aspect'])

    elif selection == 'box':
        # The qualitative variable goes on the x axis, whichever was selected first
        if caracterize_var(var[0]) == 'Qualitative':
            x, y = var[0], var[1]
        else:
            x, y = var[1], var[0]
        box_plot(x, y, 'box', kwargs['height'], kwargs['aspect'])

    else:
        # Reports the selection type itself, not the widget object holding it
        print("Pas de graph pour ce type de sélection :", selection)
        return ()

############################################################
# Defines the widget to select the variables to plot
############################################################
select_l = list(quali.get_members()) + list(quanti.get_members()) # Selection list of variables to be explored

# Labelled multiple-selection list of the variables to visualise.
# The number of visible lines is the 'rows' trait ('row' is not a trait of SelectMultiple).
select_w = widgets.VBox([widgets.Label(value = 'Selectionner pour visualiser:'),
                         widgets.SelectMultiple(
                             value = ['countries_fr'],
                             options = select_l,
                             rows = 20,
                             description = '',
                             disabled = False,
                             layout = {'width': 'max-content', 'height': '100%'},)],
                        layout={'width': 'max-content', 'height': '100%'})

# Defines a widget that will be used in the plot interactive_output
# to trigger the plot function upon change
select_plot_w = widgets.SelectMultiple(value = select_w.children[1].value,
                                       options = select_w.children[1].options,)

# Defines the widget that records what type of graph to display
# according to the type of variables selected
selection_type_w = widgets.Text(value='count', disabled=False)


############################################################
# Defines the widgets to select plot arguments
############################################################

# Count plot - Selects the number of bars to display
# Count plot - slider choosing the number of bars to display (2 to 40, 20 at start)
n_bar_w = widgets.IntSlider(
    value=20,
    min=2,
    max=40,
    step=1,
    description="",
    disabled=False,
    continuous_update=False,
    orientation='horizontal',
    readout=True,
    readout_format='d',
    layout = {'display': 'flex'},
)

# Dist plot - range slider choosing the x-axis range to display (hidden at start)
x_range_w = widgets.FloatRangeSlider(
    value=[0, 100],
    min=0,
    max=10000,
    step=0.1,
    description='',
    disabled=False,
    continuous_update=False,
    orientation='horizontal',
    readout=True,
    readout_format='.1f',
    layout = {'display': 'none'},
)

# Dist plot - Defines the widget that triggers the interactive plot function 
# when the x-axis range changes; it starts with the value and bounds of x_range_w
x_range_plot_w = widgets.FloatRangeSlider(
    value=x_range_w.value,
    min=x_range_w.min,
    max=x_range_w.max,
)

def update_x_range_plot(*args):
    """Forward the visible x-range selection to the plot-bound range widget.

    Observer callback of x_range_w: any positional argument (the traitlets
    change notification) is ignored. Nothing is copied while the range
    selection is flagged inactive by active_x_range_w.
    """
    if not active_x_range_w.value:
        # Range selection is switched off: keep the plot-bound range as it is
        return
    x_range_plot_w.value = x_range_w.value
        
# Every change of the visible range slider goes through update_x_range_plot(),
# which decides whether it is forwarded to the plot-bound slider.
x_range_w.observe(update_x_range_plot, 'value')

# Dist plot - Defines the widget that records whether the x range selection
# is active or not.
# NB: it is inactive for the first time the figure is plotted, active then
# (update_fig_args() sets it to False before re-plotting and back to True after).
active_x_range_w = widgets.Checkbox(value=False,
                                    description='',
                                    disabled=False,
                                    indent=False,)


# Defines the widget to select the hue coloration of the graph.
# The options are the members of the 'quali' category; ensure_option=True
# restricts the typed value to those options. Hidden at start-up;
# update_fig_args() shows it when two numerical variables are selected.
hue_w = widgets.Combobox(
    value = '',
    placeholder = "Choisir l'angle d'exploration:",
    options = list(quali.get_members()),
    description='',
    ensure_option=True,
    disabled=False,
    layout = {'display': 'none'},
)


############################################################
# Defines the figure widget
############################################################

# Defines the figure widget.
# The order of the children matters: update_fig_args() addresses them by
# position (0: label, 1: n_bar_w, 2: x_range_w, 3: hue_w).
fig_args_w = widgets.VBox([widgets.Label(value = 'Nombre de barres:'),
                          n_bar_w,
                          x_range_w,
                          hue_w],
                          layout={'width': 'max-content', 'height': 'auto'})

# Caracterizes variables: helper function for update_fig_args()
def caracterize_var(var):
    """Classify a column of the global DataFrame `df_g` for plotting purposes.

    Parameters
    ----------
    var : hashable
        Label of a column of `df_g`.

    Returns
    -------
    str
        'Numerical' when the column has any integer or floating-point dtype
        (int32, float32, pandas nullable Int64/Float64, ... and not only the
        exact 'int64' / 'float64' dtypes), 'Qualitative' otherwise
        (object, category, bool, datetime, ...).

    Raises
    ------
    KeyError
        If `var` is not a column of `df_g`.
    """
    dtype = df_g[var].dtype
    # bool is neither an integer nor a float dtype for pandas, so boolean
    # columns remain 'Qualitative'
    if pd.api.types.is_integer_dtype(dtype) or pd.api.types.is_float_dtype(dtype):
        return('Numerical')
    return('Qualitative')

# Updates the figure's features according to the variables selected
# This will trigger the interactive plot
def update_fig_args(*args):
    """Configure the figure controls for the current selection, then re-plot.

    Observer callback of the variable selector (select_w.children[1]) and of
    filter_df_changed_w; positional arguments (the change notification) are
    ignored.

    Depending on how many variables are selected and on their type (see
    caracterize_var), it sets the label and the visibility of the three
    figure controls and records the kind of graph in selection_type_w:

        1 numerical                 -> 'dist'    (x range slider shown)
        1 qualitative               -> 'count'   (bar number slider shown)
        2 numerical                 -> 'scatter' (hue selector shown)
        1 numerical + 1 qualitative -> 'box'
        2 qualitative               -> 'both variables qualitative'
        more than 2                 -> 'too much variables'

    With an empty selection the controls and selection_type_w are left as
    they were. In every case the selection and the filter flag are finally
    copied into their plot-bound mirrors, which triggers the interactive plot
    if they changed.

    Raises
    ------
    ValueError
        If caracterize_var returns something other than 'Numerical' or
        'Qualitative' for a single selected variable.
    """

    def _configure(label, n_bar = False, x_range = False, hue = False):
        # Sets the label text and shows/hides the controls.
        # fig_args_w children: 0 = label, 1 = n_bar_w, 2 = x_range_w, 3 = hue_w
        fig_args_w.children[0].value = label
        fig_args_w.children[1].layout.display = 'flex' if n_bar else 'none'
        fig_args_w.children[2].layout.display = 'flex' if x_range else 'none'
        fig_args_w.children[3].layout.display = 'flex' if hue else 'none'

    # Clear the current interactive plot
    plot_w.clear_output()

    selected = select_w.children[1].value
    n_selected = len(selected)

    # If more than 2 variables are selected, the app cannot display a graph
    if n_selected > 2:
        _configure('')
        selection_type_w.value = 'too much variables'

    # If 1 variable is selected, the graph to plot is a dist plot or a count plot
    elif n_selected == 1:
        var_type = caracterize_var(selected[0])
        if var_type == 'Numerical':
            _configure('Plage des abcisses:', x_range = True)
            selection_type_w.value = 'dist'
        elif var_type == 'Qualitative':
            # Same wording as the initial label of fig_args_w
            _configure('Nombre de barres:', n_bar = True)
            selection_type_w.value = 'count'
        else:
            raise ValueError(f"Invalid variable type: {var_type!r}")

    # If 2 variables are selected the graph to plot is a scatter plot or a box plot
    elif n_selected == 2:
        # The order of the two variables does not matter, only the set of types
        var_types = {caracterize_var(selected[0]), caracterize_var(selected[1])}
        if var_types == {'Numerical'}:
            _configure('', hue = True)
            selection_type_w.value = 'scatter'
        elif var_types == {'Numerical', 'Qualitative'}:
            _configure('')
            selection_type_w.value = 'box'
        elif var_types == {'Qualitative'}:
            _configure('')
            selection_type_w.value = 'both variables qualitative'

    active_x_range_w.value = False  # The range selection is set to inactive and will be set-up by the plot function
    select_plot_w.value = selected # The update of select_plot_w will trigger the interactive plot (if it has changed)
    filter_df_changed_plot_w.value = filter_df_changed_w.value # The update of filter_df_changed_plot_w will trigger the interactive plot (if it has changed)
    active_x_range_w.value = True # The range selection has been set up by the plot function and is set to active
    
    
# update_fig_args() runs whenever the variable selection or the filter flag changes
select_w.children[1].observe(update_fig_args, 'value')
filter_df_changed_w.observe(update_fig_args, 'value')

# Plot interactive output
# It is triggered two ways:
# 1) When a new graph has to be plotted: happens when the dataframe changes or the variables to plot change.
#    This is triggered by 'select_plot_w' or 'filter_df_changed_plot_w' which are driven by the function 'update_fig_args'
# 2) For a given figure, when its visualization features change.
#    This is triggered by 'x_range_plot_w', 'n_bar_w' and 'hue_w'
# Arguments wrapped in fixed() are passed to plot() as constants: their
# values (fit_g, hue_order_g, height_g, aspect_g) are captured here, once.
plot_w = widgets.interactive_output(plot,
                         {'var': select_plot_w,
                          'df_has_changed': filter_df_changed_plot_w,
                          'fit': fixed(fit_g),
                          'x_range': x_range_plot_w,
                          'n_bar': n_bar_w,
                          'hue' : hue_w,
                          'hue_order' : fixed(hue_order_g),
                          'height' : fixed(height_g),
                          'aspect' : fixed(aspect_g),})
# Figure widget: the figure controls stacked above the plot output
fig_w = widgets.VBox([fig_args_w, plot_w])


############################################################
# Defines the application widget
############################################################
# Layout: header and footer buttons, filters on the left, variable selector
# in the center, figure on the right.
# NOTE(review): the middle entry of pane_heights is '60' without a unit while
# the two others are in px - presumably AppLayout reads a bare number as a
# relative size; confirm that this is intended.
dataXplor_w = AppLayout(header=header_button_w,
          left_sidebar=filter_w,
          center=select_w,
          right_sidebar=fig_w,
          footer=footer_button_w,
          pane_widths=[4.5, 4.5, 10],
          pane_heights=['60px', '60', '10px'])

In [None]:
############################################################
############################################################
# WIDGET DE PCA
############################################################
############################################################


############################################################
# Computes the PCA
############################################################
def compute_pca(df_g, pca_cat = quanti):
    """Run a normalized PCA on the numerical variables of a DataFrame.

    Missing values are replaced by the column mean, then the variables are
    centered and scaled to unit variance before the PCA. As many components
    as possible are computed: min(number of rows, number of variables).

    Parameters
    ----------
    df_g : pandas.DataFrame
        Data to analyse; it is not modified.
    pca_cat : object with a get_members() method
        Category whose members are the candidate PCA variables. Only the
        members that are columns of `df_g` and hold at least one observed
        value are used.

    Returns
    -------
    df_pcs : pandas.DataFrame
        `df_g` with its index reset, followed by the columns
        PC1..PCn (coordinates of the individuals), Q1..Qn (quality of
        representation of each individual on each component) and
        CTR1..CTRn (contribution of each individual to each component).
    pca : sklearn.decomposition.PCA
        The fitted PCA.
    pcs : numpy.ndarray
        Coordinates of the individuals, shape (rows, n).
    vect_components : numpy.ndarray
        Components scaled by the square root of their eigenvalue, shape
        (n, variables): the coordinates used by the correlation circle.
    pca_features : pandas.Index
        Names of the variables actually used, in the column order of
        `vect_components`.
    """
    from sklearn.preprocessing import StandardScaler
    from sklearn.impute import SimpleImputer
    from sklearn.decomposition import PCA

    def _safe_ratio(num, den):
        # Element-wise num / den, giving 0 instead of NaN where den is 0
        den = np.broadcast_to(den, num.shape)
        return np.divide(num, den, out=np.zeros_like(num, dtype=float), where=(den != 0))

    # Set-up
    df = df_g.copy()

    # Selects variables (called pca_features here)
    pca_features = df.columns.intersection(pca_cat.get_members())
    # A column without any observed value has no mean: the imputer would
    # discard it and the feature names would no longer line up with the
    # columns of the PCA input, so such columns are removed up-front.
    pca_features = df[pca_features].dropna(axis=1, how='all').columns

    # Fills_in nan with the mean, centers, normalizes
    pca_in = SimpleImputer(missing_values=np.nan, strategy='mean').fit_transform(df[pca_features])
    pca_in = StandardScaler(with_std = True).fit_transform(pca_in)

    # How many components to compute with the PCA
    n_comp = min(pca_in.shape)

    # PCA
    pca = PCA(n_components=n_comp)
    pcs = pca.fit_transform(pca_in)

    # Transition matrix from the variables to the components:
    # each component is scaled by the square root of its eigenvalue
    eigval = ((pca.singular_values_**2) / len(pcs)).reshape(-1, 1)
    vect_components = pca.components_ * np.sqrt(eigval)

    # Quality of representation of individuals along each principal component
    # (share of the squared norm of the individual carried by the component)
    pcs_sq = pcs * pcs
    ind_qty = _safe_ratio(pcs_sq, pcs_sq.sum(axis=1).reshape(-1, 1))

    # Contribution of each individual to each principal component
    # (share of the component's sum of squares due to the individual)
    ind_ctr = _safe_ratio(pcs_sq, pcs_sq.sum(axis=0).reshape(1, -1))

    # Concatenates the original DataFrame, the principal components, the
    # qualities and the contributions. The index is reset once only, so that
    # all the blocks are aligned on the same 0..n-1 positional index.
    component_numbers = range(1, n_comp + 1)
    df_pcs = pd.concat([df.reset_index(),
                        pd.DataFrame(pcs, columns = [f"PC{i}" for i in component_numbers]),
                        pd.DataFrame(ind_qty, columns = [f"Q{i}" for i in component_numbers]),
                        pd.DataFrame(ind_ctr, columns = [f"CTR{i}" for i in component_numbers])],
                       axis = 1)

    return(df_pcs, pca, pcs, vect_components, pca_features)

############################################################
# Computes the eboulis and their cumulative sum
############################################################
def eboulis(df_has_changed):
    """Display the scree plot of the global fitted `pca`.

    One bar per principal component (labelled PC1, PC2, ...), whose height is
    the percentage of the total inertia explained by that component.

    Parameters
    ----------
    df_has_changed : any
        Not read; it only exists so that interactive_output re-runs the
        function when the bound widget value changes.
    """
    ratios = pca.explained_variance_ratio_
    d = pd.DataFrame({'Facteur': [f"PC{i+1}" for i in range(len(ratios))],
                      # The column is labelled in %, the ratios are fractions of 1
                      '% Inertie': ratios * 100})
    g = px.bar(d, x = 'Facteur', y = '% Inertie', height = 600, width = 600,
               title = "Eboulis de la contribution des facteurs à l'inertie")
    g.show("notebook")


def eboulis_cum(df_has_changed):
    """Display the cumulative scree plot of the global fitted `pca`.

    One bar per principal component (labelled PC1, PC2, ...), whose height is
    the percentage of the total inertia explained by that component and all
    the previous ones.

    Parameters
    ----------
    df_has_changed : any
        Not read; it only exists so that interactive_output re-runs the
        function when the bound widget value changes.
    """
    ratios = pca.explained_variance_ratio_
    d_cum = pd.DataFrame({'Facteur': [f"PC{i+1}" for i in range(len(ratios))],
                          # Only the inertia is accumulated, not the factor labels
                          '% Inertie': np.cumsum(ratios) * 100})
    g_cum = px.bar(d_cum, x = 'Facteur', y = '% Inertie', height = 600, width = 600,
                   title = "Eboulis cumulés")
    g_cum.show("notebook")


############################################################
# Plots the individuals in the factorial plane
############################################################
def fact_plan(fact_ind, df_has_changed):
    """Scatter the individuals of the global `df_pcs` on one factorial plane.

    The points are colored by 'pnns_groups_2'; the hover title shows the
    product name and how well the individual is represented on the plane
    (sum of its two Q columns, in %). Both axes share the same symmetric range.

    Parameters
    ----------
    fact_ind : tuple of int
        0-based indices of the two principal components to draw
        (e.g. (0, 1) draws PC1 against PC2).
    df_has_changed : any
        Not read; it only exists so that interactive_output re-runs the
        function when the bound widget value changes.

    Reads the globals `df_pcs` and `pca`. Displays the figure, or a one-line
    message when the PCA holds fewer components than requested.
    """
    import plotly.express as px
    import plotly.graph_objects as go  # only used by the commented-out threshold lines below

    # Set-up
    hover = 'product_name'
    # hover = 'code'
    PC = (fact_ind[0]+1, fact_ind[1]+1)
    x_col, y_col = f"PC{PC[0]}", f"PC{PC[1]}"
    q_x, q_y = f"Q{PC[0]}", f"Q{PC[1]}"

    # The PCA computes min(rows, variables) components, which can be fewer
    # than what the tab asks for
    if (x_col not in df_pcs.columns) or (y_col not in df_pcs.columns):
        print(f"Plan factoriel {x_col} / {y_col} indisponible : "
              f"l'ACP ne fournit que {len(pca.explained_variance_ratio_)} composante(s).")
        return

    data_plot = df_pcs.copy()

    # Filters the DataFrame
#     th = ((pcs**2).sum(axis=0) / len(data_plot))
#     th = np.sqrt(th)
#     ind = (data_plot[f"CTR{PC[0]}"] >= (1/len(data_plot))) | (data_plot[f"CTR{PC[1]}"] >= (1/len(data_plot)))
#     data_plot = data_plot[ind]

    # Graph boundaries: same symmetric range on both axes
    max_xy = max(data_plot[x_col].abs().max(), data_plot[y_col].abs().max())
    range_x = (-1.1 * max_xy, 1.1 * max_xy)
    range_y = range_x

    # Axis labels
    axis_labels = {f"PC{i}": f"PC{i} ({pca.explained_variance_ratio_[i-1] * 100:.1f}%)" for i in PC}

    # Adds an information column to the DataFrame (plotly specific).
    # fillna/astype keep the text usable when the name is missing or not a string.
    quality = (data_plot[q_x] + data_plot[q_y]) * 100
    data_plot['hover'] = (data_plot[hover].fillna('').astype(str) +
                          ";\nreprésenté à: " +
                          quality.round(2).astype('str') + "%")

    fig_fact = px.scatter(
        data_plot,
        x = x_col,
        y = y_col,
        labels=axis_labels,
        color='pnns_groups_2',
        hover_name = 'hover',
        range_x = range_x,
        range_y = range_y,
        title = f"Plan factoriel PC{PC[0]} / PC{PC[1]}",
        render_mode='webgl'
    )

#     fig_fact.add_traces(go.Scatter(x=np.linspace(-max_xy, max_xy, 1000), y=np.full((1000,), th[PC[1]-1]), mode='lines'))
#     fig_fact.add_traces(go.Scatter(x=np.linspace(-max_xy, max_xy, 1000), y=np.full((1000,), -th[PC[1]-1]), mode='lines'))
#     fig_fact.add_traces(go.Scatter(y=np.linspace(-max_xy, max_xy, 1000), x=np.full((1000,), th[PC[0]-1]), mode='lines'))
#     fig_fact.add_traces(go.Scatter(y=np.linspace(-max_xy, max_xy, 1000), x=np.full((1000,), -th[PC[0]-1]), mode='lines'))
    fig_fact.update_layout(width=820, height=600)

    fig_fact.show("notebook")



############################################################
# Plots the correlation circle
############################################################
def corr_circle(fact_ind, df_has_changed):
    """Display the correlation circle of the PCA variables on one factorial plane.

    Each variable of the global `pca_features` is drawn as an arrow from the
    origin to its coordinates in the global `vect_components`, inside the
    unit circle.

    Parameters
    ----------
    fact_ind : tuple of int
        0-based indices of the two principal components to draw
        (e.g. (0, 1) draws PC1 against PC2).
    df_has_changed : any
        Not read; it only exists so that interactive_output re-runs the
        function when the bound widget value changes.

    Reads the globals `vect_components`, `pca_features` and `pca`. Displays
    the figure, or a one-line message when the PCA holds fewer components
    than requested.
    """
    import plotly.express as px
    import plotly.graph_objects as go

    PC = (fact_ind[0]+1, fact_ind[1]+1)

    # The PCA computes min(rows, variables) components, which can be fewer
    # than what the tab asks for
    n_available = vect_components.shape[0]
    if max(PC) > n_available:
        print(f"Cercle des corrélations PC{PC[0]} / PC{PC[1]} indisponible : "
              f"l'ACP ne fournit que {n_available} composante(s).")
        return

    # Builds the DataFrame to plot: one row per variable, one column per axis
    circle_labels = [f"PC{i}" for i in PC]
    df_circle = pd.DataFrame(vect_components[[PC[0]-1, PC[1]-1], :].T, columns = circle_labels)
    df_circle['pca_features'] = pca_features

    # Axis labels
    axis_labels = {f"PC{i}": f"PC{i} ({pca.explained_variance_ratio_[i-1] * 100:.1f}%)" for i in PC}

    # Plots the DataFrame
    fig_circle = px.scatter(df_circle, x = circle_labels[0], y = circle_labels[1], hover_name="pca_features", labels=axis_labels,
                     title = f"Cercle des corrélations - PC{PC[0]} / PC{PC[1]}")
    fig_circle.update_xaxes(range=[-1.1, 1.1], zeroline = True)
    fig_circle.update_yaxes(range=[-1.1, 1.1], zeroline = True)
    fig_circle.update_layout(width=600, height=600)

    # Adds the unit circle
    fig_circle.update_layout(
        shapes=[dict(type="circle", xref="x", yref="y", x0=-1, y0=-1, x1=1, y1=1, line_color="LightSeaGreen",),])

    # Adds one arrow per variable, from the origin to its coordinates
    # (ax/ay are the tail of the arrow, expressed in data coordinates)
    for x, y in zip(df_circle[circle_labels[0]], df_circle[circle_labels[1]]):
        fig_circle.add_annotation(x=x, y=y, ax=0, ay=0, xref='x', yref='y', axref='x', ayref='y', text='', showarrow=True,
                           arrowhead=2, arrowsize=1.2, arrowwidth=2, arrowcolor='black')

    fig_circle.show("notebook")
    
    
############################################################
# Defines the widget that triggers the replotting of the PCA
############################################################
# Counter bound to the 'df_has_changed' argument of every PCA plot:
# trigger_pca() flips it between 0 and 1 so that interactive_output redraws.
df_pcs_changed_plot_w = widgets.IntText(value=0,description='Any:',disabled=False)


############################################################
# Defines the widget that records a replotting request from the user
############################################################
# Button labelled "(Re)Lancer l'ACP"; its click handler is registered below
# with pca_trigger_w.on_click().
pca_trigger_w = widgets.Button(
    description="(Re)Lancer l'ACP",
    disabled=False,
    button_style='info', # 'success', 'info', 'warning', 'danger' or ''
    #tooltip='Click me',
    icon='filter' # (FontAwesome names without the `fa-` prefix)
)

def trigger_pca(*args):
    """Recompute the PCA on the current `df_g` and ask the PCA plots to redraw.

    Click handler of pca_trigger_w; positional arguments (the button) are
    ignored. All five results of compute_pca() are stored in the notebook
    globals, including `pcs`, so that none of them is left over from a
    previous run. The plots are then refreshed by flipping
    df_pcs_changed_plot_w between 0 and 1.
    """
    global df_pcs, pca, pcs, vect_components, pca_features
    df_pcs, pca, pcs, vect_components, pca_features = compute_pca(df_g, pca_cat = quanti)
    df_pcs_changed_plot_w.value = (df_pcs_changed_plot_w.value + 1) % 2

# Each click on the button recomputes the PCA and refreshes the PCA plots
pca_trigger_w.on_click(trigger_pca)



############################################################
# Initializes the visualization widget
############################################################
def init_DataXplor_tab_w(df_g, dataXplor_w, pca_cat, hover = 'product_name'):
    """Build the list of pages of the tabbed application.

    Pages, in order: the data explorer `dataXplor_w`, the two scree plots,
    then one page per factorial plane PC1-PC2, PC3-PC4, PC5-PC6 and PC7-PC8
    (factorial plane next to its correlation circle). Every PCA page starts
    with the pca_trigger_w button and is redrawn when df_pcs_changed_plot_w
    changes.

    `df_g`, `pca_cat` and `hover` are accepted but not read here.

    Returns the list of page widgets.
    """

    def _refresh_controls(**constants):
        # Fresh control mapping for one interactive_output: the constants
        # first, then the widget whose change triggers the redraw
        controls = {name: fixed(value) for name, value in constants.items()}
        controls['df_has_changed'] = df_pcs_changed_plot_w
        return controls

    def _page(*plot_functions, **constants):
        # One page: the trigger button followed by one output per plot function
        outputs = [interactive_output(function, _refresh_controls(**constants))
                   for function in plot_functions]
        return widgets.HBox([pca_trigger_w, *outputs])

    pages = [dataXplor_w, _page(eboulis, eboulis_cum)]
    pages.extend(_page(fact_plan, corr_circle, fact_ind=(first, first + 1))
                 for first in range(0, 8, 2))
    return(pages)
                        

############################################################
# Visualization widget
############################################################
# Initial PCA on the current DataFrame; the results are the notebook globals
# read by the PCA plot functions
df_pcs, pca, pcs, vect_components, pca_features = compute_pca(df_g, pca_cat = quanti)
tab_titles = {0:'DataXplor', 1:'PCA: Eboulis', 2:'PC1-PC2', 3:'PC3-PC4', 4:'PC5-PC6', 5:'PC7-PC8'}
DataXplor_tab_w = widgets.Tab()
DataXplor_tab_w.children = init_DataXplor_tab_w(df_g, dataXplor_w, pca_cat = quanti, hover = 'code')
# The titles are set through the public set_title() rather than the private
# '_titles' trait, which recent ipywidgets versions no longer read. The pages
# are attached first because a title can only be given to an existing page.
for tab_index, tab_title in tab_titles.items():
    DataXplor_tab_w.set_title(tab_index, tab_title)


In [None]:
# Displays the tabbed application (data explorer followed by the PCA pages)
DataXplor_tab_w

In [None]:
# UNDER CONSTRUCTION


# Independence test (chi-square) between the two variables x and y
from scipy.stats import chi2, chi2_contingency
prob = 0.95  # confidence level used for the critical value below

x = 'stores'
y = 'energy_100g'
# NOTE(review): y looks like a continuous variable by its name; the
# contingency table below gets one column per distinct value of
# data_plot[y] - confirm that cl.clean_variables(how='pareto') groups the
# values into categories first.
data_plot, log, pareto_log = cl.clean_variables(df_g, variables = [x,y], how = 'pareto',
                                                    th = 5, cat_name = '__Grouping__', verbose = False)

# data_plot = data_plot[~data_plot[x].isin(['__Grouping__'])]
# data_plot = data_plot[~data_plot[y].isin(['__Grouping__'])]

# Contingency table of the observed counts (rows: x, columns: y)
observed = pd.crosstab(data_plot[x], data_plot[y])



# stat: chi-square statistic, p: p-value, dof: degrees of freedom,
# expected: counts expected under independence (same shape as observed)
stat, p, dof, expected = chi2_contingency(observed, correction=True, lambda_=None)

# Value of the chi-square distribution with dof degrees of freedom that is
# exceeded with probability 1 - prob
critical = chi2.ppf(prob, dof)
# Test effectiveness => at least 5 observations per cell in observed

# Contribution of each cell to the chi-square statistic
d2 = (observed - expected)**2  / expected


# sns.heatmap(d2, annot = True, xticklabels = True, yticklabels = True, cmap="YlGnBu")



#interpret test-statistic
# print('probability=%.3f, critical=%.3f, stat=%.3f' % (prob, critical, stat))
# if abs(stat) >= critical:
#     print('Dependent (reject H0)')
# else:
#     print('Independent (fail to reject H0)')

# #interpret p-value
# alpha = 1.0 - prob
# print('\nsignificance=%.3f, p=%.3f' % (alpha, p))
# if p <= alpha:
#     print('Dependent (reject H0)')
# else:
#     print('Independent (fail to reject H0)')