# Performance by product --> SoM Dashboard

In [1]:
BASE_DIR = '/Users/efraflores/Desktop/EF/Corner/Brands/SoM/data/'
CPG = 'danone'

## Import

In [2]:
# %load basic
import os
import numpy as np
import pandas as pd
pd.options.display.float_format = '{:.2f}'.format
print([x for x in os.listdir(BASE_DIR) if x.endswith(f'{CPG}.csv')])

['som_danone.csv', 'nielsen_danone.csv', 'products_danone.csv', 'nielsen_pred_danone.csv']


In [3]:
som = pd.read_csv(os.path.join(BASE_DIR,f'som_{CPG}.csv'),low_memory=False,
                 sep='\t', encoding='UTF-16')
som.sample(2)

Unnamed: 0,city_name,store_name,category_en,category_id,product_id,product_name,barcodes,package,product_sku,brand_name,...,marca_detalle,marca_generica,gramaje,descripcion_nielsen,segmento,subsegmento_1,subsegmento_2,subsegmento_3,mes,anio
20818,Mérida,Chedraui,Gelatin & Custards,1242,1151087,Gelatina en polvo de agua naranja sobre,735257002476,120 g,735257002476,D'Gari,...,D'Gari,D'Gari,120 G,Gelatina En Polvo De Agua Naranja Sobre,Gelatin & Custards,,,,8,2021
1095,Chihuahua,Alsuper,Fresh Juice & Nectar,31,119532,Néctar de piña,7501013192490,Cartón 1 L,750101319249,Vigor,...,Vigor,Vigor,1.0,Jumex Vigor Jugo Pina Carton 1000 Ml,Bebidas Saborizadas,Bebidas Refrescantes,,,8,2021


In [4]:
df = som[['product_id','barcodes','category_en','product_name',
          'proveedor_general', 'marca_generica', 
          'descripcion_nielsen','segmento']]
df = df.drop_duplicates('product_id').set_index('product_id')
df.sample()

Unnamed: 0_level_0,barcodes,category_en,product_name,proveedor_general,marca_generica,descripcion_nielsen,segmento
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3073216,758104007646,Water,Agua con toque sabor fresa,Danone,Levite,Levite Fresa 750 Ml,Bebidas Saborizadas


In [5]:
nielsen = pd.read_csv(os.path.join(BASE_DIR,f'nielsen_{CPG}.csv'),low_memory=False,
                 sep='\t', encoding='UTF-16')
nielsen.sample(2)

Unnamed: 0,UPC,FORMATO,PROVEEDOR GENERAL,PROVEEDOR ABIERTO,MARCA DETALLE,MARCA GENERICA,GRAMAJE,DESCRIPCION NIELSEN,PRODUCT ID,SEGMENTO,SUBSEGMENTO 1,SUBSEGMENTO 2,SUBSEGMENTO 3
6104,7502247420205,FAM,DEMAS FABRICANTES,DEMAS FABRICANTES,KING CITRUS,KING CITRUS,3.1,KING FRUIT CITRUS NARANJA BOT PLAST 3100ML NAL,,BEBIDAS SABORIZADAS,BEBIDAS REFRESCANTES,,
725,7501040097980,KILO,SIGMA,SIGMA,YOPLAIT,YOPLAIT,0.95,YOPLAIT S/AZUCAR NATURAL BATIDO BOTE 950 GR,2669344.0,BB SOLIDO,BASE BUSINESS,,


## Functions

### Timing and tone

In [6]:
import time
import numpy as np
from IPython.lib.display import Audio

start = time.time()
def time_exp(x):
    minutes, seconds = np.floor(x/60), 60*(x/60-np.floor(x/60))
    print(f"{'{:.0f}'.format(minutes)} minutos con {'{:.2f}'.format(seconds)} segundos")
    
def tono(a = 1000, b = 700, play_time_seconds = 1, framerate = 4410):
    t = np.linspace(0, play_time_seconds, framerate*play_time_seconds)*np.pi
    return Audio(np.sin(a*t) + np.sin(b*t), rate = framerate, autoplay=True)

### Clean text

In [7]:
import re
import unicodedata

def clean_text(text):
    text = unicodedata.normalize('NFD', text).encode('ascii', 'ignore')
    text = re.sub("[^a-zA-Z\s]",'',text.decode('utf-8'),flags=re.UNICODE)
    text = ' '.join(text.split())
    return text.lower()

### TAD

In [8]:
def tad(data):
    X_test = data.iloc[:,:-1].apply(lambda x:' '.join(x.dropna().astype(str)),
                                    axis=1).apply(clean_text).values
    y_test = data['segmento'].values
    product_id_col = data.index
    return X_test.astype(str),y_test,product_id_col

## Model

### Just nielsen categories

In [9]:
df = df.reset_index().merge(nielsen[['UPC','SEGMENTO']], left_on='barcodes', right_on='UPC', how='left')
df = df.drop(['UPC','barcodes'], axis=1).rename(columns={'SEGMENTO':'NIELSEN'}).set_index('product_id')
df.sample()

Unnamed: 0_level_0,category_en,product_name,proveedor_general,marca_generica,descripcion_nielsen,segmento,NIELSEN
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
271829,Pastries,Mediterra 25,,Pastries (C-327),Mediterra 25,Pastries,


In [10]:
df['nielsen_cat'] = (df['NIELSEN'].notnull())*1
val = df[df['nielsen_cat']==0].copy()
val['segmento'].value_counts()

Fresh Juice & Nectar              591
Pastries                          586
Gelatin & Custards                211
Water                             141
Yogurt                            139
Plant-Based Beverages              89
Desserts & Refrigerated Bakery     29
Name: segmento, dtype: int64

In [11]:
df = df[df['nielsen_cat']==1].copy()
df['segmento'].value_counts()

Bebidas Saborizadas    368
Bb Solido              136
Agua Natural           134
Bb Liquido              96
Plant Based             94
Griego                  86
Salud Activa            80
Bebidas Infantiles      38
Gelatinas               29
Light                   26
Specialties             23
Postres Regulares       18
Infantil Liquido        17
Lacteos Fermentados     15
Infantil Solido         15
Name: segmento, dtype: int64

### Train test split

In [12]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df,train_size=0.8,random_state=22)

### Preprocessing

In [13]:
X_train = df_train.iloc[:,:-1].apply(lambda x:' '.join(x.dropna().astype(str)),axis=1
                              ).apply(clean_text).values
y_train = df_train['segmento'].values

### Test

In [14]:
X_test,y_test,_ = tad(df_test)

### Training

In [15]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

tv = CountVectorizer(ngram_range=(1,1),min_df=1, 
                     max_features=10000,lowercase=False)
logreg = LogisticRegression()

model_logreg = Pipeline(steps=[('vectorizer', tv),
                               ('model',logreg)])

print('Accuracy score: ','{:.2%}'.format(model_logreg.fit(X_train,y_train).score(X_test,y_test)))
print('Training accuracy: ','{:.2%}'.format(model_logreg.score(X_train,y_train)))

Accuracy score:  100.00%
Training accuracy:  100.00%


### Confussion matrix

In [16]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,model_logreg.predict(X_test))

array([[22,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0, 17,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0, 27,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  6,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0, 90,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  5,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0, 15,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  2,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  3,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  3,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 16,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  3,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 17,  0],
       [ 0,  0,  0,  0,  0,  0,  0

from sklearn.metrics import confusion_matrix
model = model_logreg
pd.DataFrame(confusion_matrix(y_test,model.predict(X_test)),
             index=model_logreg[1].classes_,columns=model_logreg[1].classes_
            ).style.background_gradient(cmap='Blues')

### Predict

In [17]:
X_val,cs_cat,product_id_col = tad(val)

In [18]:
resultado = val.iloc[:,:-1].join(pd.DataFrame(model_logreg.predict(X_val),
                                              index=product_id_col,
                                              columns=['Nielsen_pred'])).drop(['NIELSEN'], axis=1)
resultado.sample(4)

Unnamed: 0_level_0,category_en,product_name,proveedor_general,marca_generica,descripcion_nielsen,segmento,Nielsen_pred
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1124785,Water,Agua tónica sabor limón,,Fever-Tree,Agua Tónica Sabor Limón,Water,Agua Natural
3953391,Fresh Juice & Nectar,Jugo J5 bliss,,Jugen,Jugo J5 Bliss,Fresh Juice & Nectar,Bebidas Saborizadas
337724,Water,Agua natural artesanal,Marinter,Fiji,Agua Natural Artesanal,Water,Agua Natural
1144039,Fresh Juice & Nectar,Liquido de coco para cocinar organico,,Waitrose,Liquido De Coco Para Cocinar Organico,Fresh Juice & Nectar,Bebidas Saborizadas


## Export

In [19]:
resultado.to_csv(os.path.join(BASE_DIR,f'nielsen_pred_{CPG}.csv'), sep='\t',encoding='utf-16')

## Fin

In [20]:
print('Accuracy score: ','{:.2%}'.format(model_logreg.fit(X_train,y_train).score(X_test,y_test)))
print('Training accuracy: ','{:.2%}'.format(model_logreg.score(X_train,y_train)))

Accuracy score:  100.00%
Training accuracy:  100.00%


In [21]:
confusion_matrix(y_test,model_logreg.predict(X_test))

array([[22,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0, 17,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0, 27,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  6,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0, 90,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  5,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0, 15,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  2,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  3,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  3,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 16,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  3,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 17,  0],
       [ 0,  0,  0,  0,  0,  0,  0

In [22]:
time_exp(time.time() - start)
tono()

0 minutos con 2.32 segundos
