In [21]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Root folders for the input data and for saved results.
# Both end with "/" because file names are appended to them with "+" in later cells.
# NOTE(review): absolute, machine-specific paths — adjust before running elsewhere.
data_path = "/home/clairegayral/Documents/openclassroom/data/"
res_path = "/home/clairegayral/Documents/openclassroom/src/P2_gayral_claire/res/"

## my .py : 
import read_csv
import preprocess
import list_from_data_field 
import missing_values_treatment

My preprocessing consisted of: 
* Reading the data: I only kept the products that have a "nutrition score" for this analysis,
* Dropping the columns with too many missing values
* Selecting the variables of interest: the nutrition variables, along with the name of the product, its reference code, the countries where it is sold, and the name of the person who entered it into the Open Food Facts database.
* Merging the two columns that are about palm oil ingredients.
* Merging duplicates (by product name)
* Removing outliers (based on the semantics of the columns, and by comparison with the other float values using hierarchical clustering)

Mon prétraitement consistait en : 
* Lecture des données : Je n'ai gardé que les données avec le "score nutritionnel" pour cette analyse,
* Suppression des colonnes avec trop de valeurs manquantes
* Sélection des variables d'intérêt : la variable nutritionnelle, avec le nom du produit, sa référence de code, le pays de vente, et le nom de la personne qui l'a introduit dans la base de données 'open food facts'.
* Fusion des deux colonnes qui concernent les ingrédients de l'huile de palme.
* Fusion des doublons (par nom de produit)
* Suppression des valeurs aberrantes (d'après la sémantique des colonnes, et par rapport aux autres valeurs avec une classification hiérarchique).


In [2]:
##
## read data : 
##
# The csv was produced once with read_csv.save_csv()
df_original = pd.read_csv(data_path+"projet2/df_original.csv", 
                          low_memory=False, index_col=0)

# Work on a copy so that df_original stays available as the untouched reference
df = df_original.copy()

##
## select variables 
##
list_of_nutri_facts = list_from_data_field.list_of_nutri_facts
interest_var = pd.Index(["code","product_name","creator","countries",
                         "additives_n","ingredients_from_palm_oil"])
# Add the nutrition-fact columns that actually exist in the table
interest_var = interest_var.append(df.columns.intersection(list_of_nutri_facts))
df = preprocess.select_columns(df, interest_var)


##
## set dtype : 
##
df = preprocess.set_dtypes(df)

##
## merge "from palm oil" 
##
df = preprocess.merge_palm_oil_cols(df)

##
## drop empty columns
##
# Threshold = 75th percentile of the per-column NaN counts of the raw table
nan_repartition = df_original.isna().sum(axis=0)
nan_threshold = nan_repartition.quantile(0.75)
# Apply the drop to the preprocessed frame `df`: passing df_original here would
# throw away the column selection, dtype setting and palm-oil merge done above.
df = preprocess.preprocess_drop_col_nan(df, nan_threshold)


To drop duplicates, I compute a hierarchical clustering on the Euclidean distance matrix of the products that share the same name, and I save the indexes to merge in a dictionary: 

```python
# Illustration for a single product name: group the rows sharing that name into
# clusters and store, per cluster, the index labels of the rows to merge.
# prod_name = "Salade de carottes râpées"
prod_name = "Abondance"
# res maps "<product name><cluster id>" -> index labels of the rows in that cluster
res = {}

x = data[data.product_name == prod_name]
prod_index = x.index
## if there is more than one product with the same name 
if len(prod_index) > 1 : 
    # float values of the rows that share this name
    prod_values = data_float.loc[prod_index,:]
    # square pairwise-distance table, labelled by row index on both axes
    # (nan_euclidean_distances is presumably sklearn's NaN-aware distance — confirm import)
    prod_dist = pd.DataFrame(nan_euclidean_distances(prod_values),
                          columns=prod_index, index = prod_index)
    # NOTE(review): linkage is given the square distance table; scipy documents its
    # input as a condensed distance matrix or an observation matrix, so a 2-D input is
    # read as observations — confirm whether a condensed form (squareform) was intended.
    Z = linkage(prod_dist, "weighted")
    # flat clusters cut at t=1.15; no criterion is passed, so fcluster's default applies
    prod_clustering = pd.Series(fcluster(Z, t=1.15), index = prod_index)
    for k in np.unique(prod_clustering.values):
        index_merge = prod_clustering[prod_clustering==k].index
        # the cluster id is appended to the product name to build the key
        res[prod_name+str(k)]= index_merge

```

In [61]:
##
## Extract float data 
##

# Build a NEW list: `float_var = module.list` followed by `+=` would extend
# list_from_data_field.list_of_nutri_facts in place, growing it on every run.
float_var = list(list_from_data_field.list_of_nutri_facts) + [
    "additives_n", "ingredients_from_palm_oil", "ingredients_from_palm_oil_n"]
# Keep only the float variables that are actually present in df
float_var = df.columns.intersection(float_var).values

# product_name first, then the float columns
data_columns = np.insert(float_var, 0, "product_name")
data = df[data_columns].copy()

##
## Extract duplicates to be merge
##

# The dictionary was computed once and cached on disk with:
# res = get_index_merge_duplicates(data, float_var, 1.15)
# with open(res_path +'index_to_merge_duplicates.pkl', 'wb') as fp:
#     pickle.dump(res, fp, pickle.HIGHEST_PROTOCOL)
# NOTE: pickle.load can execute arbitrary code; only load this locally generated file.
with open(res_path +'index_to_merge_duplicates.pkl', 'rb') as f:
    res = pickle.load(f)
    
##
## Merge duplicates from the dict
##

# NOTE(review): `res` is not passed to this call — confirm how
# drop_and_merge_duplicates obtains the indexes to merge.
data_clean = preprocess.drop_and_merge_duplicates(data)

In [None]:
# Replace every group of duplicated products listed in `res` by a single row
# holding the mean of the group's numeric columns.
data_clean = data.copy()
for merge_index in res.values():
    # groups with a single row have nothing to merge
    if len(merge_index) <= 1:
        continue
    first_index = merge_index[0]
    # numeric_only: product_name is a string column and cannot be averaged
    merged_row = data.loc[merge_index].mean(numeric_only=True)
    # Take the name from the data itself: the keys of `res` are built as
    # name + str(cluster id), so slicing off one character breaks for ids >= 10.
    merged_row["product_name"] = data.at[first_index, "product_name"]
    data_clean = data_clean.drop(index=merge_index)
    # .loc re-creates the row under the first label of the group
    # (.at only addresses a single scalar cell); columns are aligned by name
    data_clean.loc[first_index] = merged_row

In [None]:

import missing_values_treatment

# Outlier clean-up, applied to df in two successive stages:
#   1. rescale_outliers100g_val: from the hypothesis that the variable has been
#      entered in mg instead of g -> rescale
#   2. drop_outliers: drop outlier values and drop products/variables with
#      too many NaNs
df = (
    df
    .pipe(missing_values_treatment.rescale_outliers100g_val)
    .pipe(missing_values_treatment.drop_outliers)
)
df.shape

In [None]:
# One histogram per float variable still present in df, laid out two per row.
plot_vars = df.columns.intersection(float_var)
# Size the grid from the columns actually plotted (not from float_var, which may
# list columns that were dropped); keep at least one row so the grid is valid.
nb_line_plot = max(1, int(np.ceil(len(plot_vars) / 2)))
fig = plt.figure(figsize=(16, 100))

for fig_count, var in enumerate(plot_vars, start=1):
    ax = fig.add_subplot(nb_line_plot, 2, fig_count)
    ax.set_title(var)
    # Drop NaNs once, and use the same values for the bin count and the histogram
    values = df[var].dropna()
    if values.empty:
        # an all-NaN column would give bins=0; leave its axes empty instead
        continue
    # never ask for more bins than there are distinct values
    nb_bins = min(20, values.nunique())
    ax.hist(values, bins=nb_bins, color='steelblue', density=True, edgecolor='none')
plt.show()

In [None]:
# Calls help_to_set_outliers_vals for one column, on a copy of df.
# NOTE(review): possible_val_dict and help_to_set_outliers_vals are neither defined
# nor imported in the visible cells — presumably they come from
# missing_values_treatment; confirm and qualify/import them so the cell runs
# on a fresh kernel.
data = df.copy()
colname = "additives_n"
# values passed to the helper for this column, looked up by column name
possible_vals = possible_val_dict[colname]


help_to_set_outliers_vals(data, colname, possible_vals)

In [None]:
import list_from_data_field
# From here on, float_var refers to the list of characteristics
float_var = list_from_data_field.list_of_characteristics 

# Display the list (the original cell ended on the truncated name `floa`,
# which raised a NameError)
float_var

In [None]:
def merge_palm_oil_cols(data):
    """Collapse the two palm-oil columns of ``data`` into one float column.

    ``ingredients_from_palm_oil_n`` is first turned into a flag: every value
    greater than 0 becomes 1. If ``ingredients_from_palm_oil`` also exists,
    its missing values are filled from that flag and the ``_n`` column is
    dropped; otherwise the ``_n`` column is renamed to
    ``ingredients_from_palm_oil``. The resulting column is cast to float.
    When ``ingredients_from_palm_oil_n`` is absent, ``data`` is returned
    unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to process. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, for convenient reassignment by the caller.
    """
    count_col = "ingredients_from_palm_oil_n"
    flag_col = "ingredients_from_palm_oil"

    if count_col not in data.columns:
        return data

    # Boolean-mask assignment needs .loc; .at only addresses one scalar cell
    data.loc[data[count_col] > 0, count_col] = 1
    if flag_col in data.columns:
        # Assign the filled column back: an inplace fillna on data[flag_col]
        # acts on an intermediate object and may not update `data`
        data[flag_col] = data[flag_col].fillna(data[count_col])
        data.drop(columns=count_col, inplace=True)
    else:
        data.rename(columns={count_col: flag_col}, inplace=True)
    data[flag_col] = data[flag_col].astype("float")
    return data