# Performance by product --> SoM Dashboard

In [1]:
BASE_DIR = '/Users/efraflores/Desktop/EF/Corner/Brands/data/'
CPG = 'danone'

## Import

In [2]:
import sys
sys.path.append('/Users/efraflores/Desktop/hub/cornershop/venv/lib/python3.9/site-packages')

In [3]:
# %load basic
import os
import numpy as np
import pandas as pd
pd.options.display.float_format = '{:.2f}'.format
print([x for x in os.listdir(BASE_DIR) if x.endswith(f'{CPG}.csv')])

['som_danone.csv', 'nielsen_danone.csv', 'products_danone.csv', 'nielsen_pred_danone.csv']


In [4]:
som = pd.read_csv(os.path.join(BASE_DIR,f'som_{CPG}.csv'),low_memory=False,
                 sep='\t', encoding='UTF-16')
som.sample(2)

Unnamed: 0,city_name,store_name,category_en,category_id,product_id,product_name,barcodes,package,product_sku,brand_name,...,marca_detalle,marca_generica,gramaje,descripcion_nielsen,segmento,subsegmento_1,subsegmento_2,subsegmento_3,mes,anio
15535,Cancún,Chedraui,Yogurt,24,321655,Alimento lácteo fermentado,7501025511005,5 x 80 ml,,Yakult,...,YAKULT,YAKULT,0.4,YAKULT 5 PZAS. 80 ML C/U = 400 ML NAL.,LACTEOS FERMENTADOS,LACTEOS FERMENTADOS,,,4,2021
50982,Guadalajara,Chedraui,Plant-based Beverages,1516,1253775,Alimento líquido de avena y linaza sin azúcar,7502252484285,946 ml,,Nature's Heart,...,NATURES HEART,,0.946,NATURES HEART ALIM LIQ AVENA LINAZA S/AZUCAR ...,PLANT BASED,PLANT BASED,AVENA-LINAZA,UHT,2,2021


In [5]:
df = som[['product_id','category_en','product_name',
          'proveedor_general', 'marca_generica', 
          'descripcion_nielsen','segmento']]
df = df.drop_duplicates('product_id').set_index('product_id')
df.sample()

Unnamed: 0_level_0,category_en,product_name,proveedor_general,marca_generica,descripcion_nielsen,segmento
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
379443,Yogurt,Yoghurt griego sin azúcar añadida con mango,SIGMA,YOPLAIT GRIEGO,YOPLAIT GRIEGO S/AZUCAR MANGO 145GR,GRIEGO


## Functions

### Timing and tone

In [6]:
import time
import numpy as np
from IPython.lib.display import Audio

start = time.time()
def time_exp(x):
    minutes, seconds = np.floor(x/60), 60*(x/60-np.floor(x/60))
    print(f"{'{:.0f}'.format(minutes)} minutos con {'{:.2f}'.format(seconds)} segundos")
    
def tono(a = 1000, b = 700, play_time_seconds = 1, framerate = 4410):
    t = np.linspace(0, play_time_seconds, framerate*play_time_seconds)*np.pi
    return Audio(np.sin(a*t) + np.sin(b*t), rate = framerate, autoplay=True)

### Clean text

In [7]:
import re
import unicodedata

def clean_text(text):
    text = unicodedata.normalize('NFD', text).encode('ascii', 'ignore')
    text = re.sub("[^a-zA-Z\s]",'',text.decode('utf-8'),flags=re.UNICODE)
    text = ' '.join(text.split())
    return text.lower()

### TAD

In [8]:
def tad(data):
    X_test = data.iloc[:,:-1].apply(lambda x:' '.join(x.dropna().astype(str)),
                                    axis=1).apply(clean_text).values
    y_test = data['segmento'].values
    product_id_col = data.index
    return X_test.astype(str),y_test,product_id_col

## Model

### Just nielsen categories

In [9]:
df['nielsen_cat'] = df['segmento'].apply(lambda x: int(x==x.upper())) 
val = df[df['nielsen_cat']==0].copy()
val['segmento'].value_counts()

Water                             218
Yogurt                            214
Plant-based Beverages             106
Desserts & Refrigerated Bakery     56
Name: segmento, dtype: int64

In [10]:
df = df[df['nielsen_cat']==1].copy()
df['segmento'].value_counts()

AGUA NATURAL           210
BB SOLIDO              176
BEBIDAS SABORIZADAS    172
PLANT BASED            129
BB LIQUIDO             123
GRIEGO                  97
SALUD ACTIVA            96
BEBIDAS INFANTILES      42
GELATINAS               33
SPECIALTIES             30
LIGHT                   29
LACTEOS FERMENTADOS     25
INFANTIL SOLIDO         22
POSTRES REGULARES       21
INFANTIL LIQUIDO        18
Name: segmento, dtype: int64

### Train test split

In [11]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df,train_size=0.8,random_state=22)

### Preprocessing

In [12]:
X_train = df_train.iloc[:,:-1].apply(lambda x:' '.join(x.dropna().astype(str)),axis=1
                              ).apply(clean_text).values
y_train = df_train['segmento'].values

### Test

In [13]:
X_test,y_test,_ = tad(df_test)

### Training

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

tv = CountVectorizer(ngram_range=(1,1),min_df=1, 
                     max_features=10000,lowercase=False)
logreg = LogisticRegression()

model_logreg = Pipeline(steps=[('vectorizer', tv),
                               ('model',logreg)])

print('Accuracy score: ','{:.2%}'.format(model_logreg.fit(X_train,y_train).score(X_test,y_test)))
print('Training accuracy: ','{:.2%}'.format(model_logreg.score(X_train,y_train)))

Accuracy score:  100.00%
Training accuracy:  100.00%


### Confussion matrix

In [15]:
from sklearn.metrics import confusion_matrix
model = model_logreg
pd.DataFrame(confusion_matrix(y_test,model.predict(X_test)),
             index=model_logreg[1].classes_,columns=model_logreg[1].classes_
            ).style.background_gradient(cmap='Blues')

Unnamed: 0,AGUA NATURAL,BB LIQUIDO,BB SOLIDO,BEBIDAS INFANTILES,BEBIDAS SABORIZADAS,GELATINAS,GRIEGO,INFANTIL LIQUIDO,INFANTIL SOLIDO,LACTEOS FERMENTADOS,LIGHT,PLANT BASED,POSTRES REGULARES,SALUD ACTIVA,SPECIALTIES
AGUA NATURAL,43,0,0,0,0,0,0,0,0,0,0,0,0,0,0
BB LIQUIDO,0,22,0,0,0,0,0,0,0,0,0,0,0,0,0
BB SOLIDO,0,0,36,0,0,0,0,0,0,0,0,0,0,0,0
BEBIDAS INFANTILES,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0
BEBIDAS SABORIZADAS,0,0,0,0,34,0,0,0,0,0,0,0,0,0,0
GELATINAS,0,0,0,0,0,8,0,0,0,0,0,0,0,0,0
GRIEGO,0,0,0,0,0,0,15,0,0,0,0,0,0,0,0
INFANTIL LIQUIDO,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0
INFANTIL SOLIDO,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0
LACTEOS FERMENTADOS,0,0,0,0,0,0,0,0,0,6,0,0,0,0,0


### Predict

In [16]:
X_val,cs_cat,product_id_col = tad(val)

In [17]:
resultado = val.iloc[:,:-1].join(pd.DataFrame(model_logreg.predict(X_val),
                                              index=product_id_col,
                                              columns=['Nielsen_pred']))
resultado.sample(4)

Unnamed: 0_level_0,category_en,product_name,proveedor_general,marca_generica,descripcion_nielsen,segmento,Nielsen_pred
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
313847,Yogurt,Yoghurt bebible vitalínea sabor guayaba,Danone,Vitalínea,Yoghurt bebible vitalínea sabor guayaba,Yogurt,LIGHT
1014858,Yogurt,Yogurt para beber con mango 250ml,without CPG,Villa de Patos,Yogurt para beber con mango 250ml,Yogurt,SALUD ACTIVA
2805783,Water,Agua,without CPG,Ougaku,Agua,Water,AGUA NATURAL
3465268,Yogurt,Yogurt griego,Fage,Fage,Yogurt griego,Yogurt,GRIEGO


## Export

In [18]:
resultado.to_csv(os.path.join(BASE_DIR,f'nielsen_pred_{CPG}.csv'),
                 sep='\t',encoding='utf-16')

## Fin

In [19]:
print('Accuracy score: ','{:.2%}'.format(model_logreg.fit(X_train,y_train).score(X_test,y_test)))
print('Training accuracy: ','{:.2%}'.format(model_logreg.score(X_train,y_train)))

Accuracy score:  100.00%
Training accuracy:  100.00%


In [20]:
time_exp(time.time() - start)
tono()

0 minutos con 2.06 segundos
