In [30]:
import shutil
import apiquery
import pandas as pd
import sys
import seaborn as sns
import os
import torch
import numpy as np
import random
DATA_PATH = '../01.Data'
shutil.copy("apiquery_pyc.py", "apiquery.pyc")
module_path = "../src"
if module_path not in sys.path:
    sys.path.append(module_path)
    
from utils.training import *
from utils.encoding import *
from utils.utils import *
from models.models import Roberta_Model
from dataset.dataset import BNPParibasText
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100
import math
from collections import Counter
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
import math
import time
import matplotlib.pyplot as plt
import transformers
pd.set_option('display.max_rows', 900)


In [31]:
def get_embedding(data_loader, model, device):
    from tqdm.notebook import tqdm
    # Put the model in eval mode
    model.to(device)
    model.eval()
    # List for store final predictions
    final_predictions = []
    with torch.no_grad():
        tk0 = tqdm(data_loader, total=len(data_loader))
        for b_idx, data in enumerate(tk0):
            for key,value in data.items():
                data[key] = value.to(device)
            predictions = model._embeddings(data['ids'],data['mask'])
            predictions = predictions.cpu()
            final_predictions.append(predictions)
    return np.vstack(final_predictions)

In [2]:
MAX_LENGTH   = 16
PRETRAINED   = 'roberta-base'
SEED         = 42

In [32]:
%%time
df_train     = pd.read_csv(os.path.join("../01.Data",'fold.csv'))
y_submission = pd.read_csv(os.path.join(DATA_PATH,'y_test_submission_example.tsv'), index_col='Index', encoding='utf-8', sep='\t')

CPU times: user 2.39 s, sys: 160 ms, total: 2.55 s
Wall time: 2.55 s


In [33]:
%%time
COLUMN_NAME  = 'product_name' 
tokenizer     = transformers.RobertaTokenizer.from_pretrained(PRETRAINED)
train_dataset = BNPParibasText(df_train,MAX_LENGTH,tokenizer,COLUMN_NAME)
model         = Roberta_Model(pretrained_model=PRETRAINED)
train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size  = 32,
        pin_memory  = True,
        num_workers = 72
    )
emb_sentence_train = get_embedding(train_loader, model, 'cuda')
df_train[[f'emb_{COLUMN_NAME}_{i}' for i in range(emb_sentence_train.shape[1])]] = emb_sentence_train

HBox(children=(FloatProgress(value=0.0, max=3189.0), HTML(value='')))


CPU times: user 1min 19s, sys: 13.7 s, total: 1min 32s
Wall time: 1min 31s


In [34]:
COLUMN_NAME  = 'ingredients_text' 
tokenizer     = transformers.RobertaTokenizer.from_pretrained(PRETRAINED)
train_dataset = BNPParibasText(df_train,MAX_LENGTH,tokenizer,COLUMN_NAME)
model         = Roberta_Model(pretrained_model=PRETRAINED)
train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size  = 32,
        pin_memory  = True,
        num_workers = 72
    )
emb_sentence_train = get_embedding(train_loader, model, 'cuda')
df_train[[f'emb_{COLUMN_NAME}_{i}' for i in range(emb_sentence_train.shape[1])]] = emb_sentence_train

HBox(children=(FloatProgress(value=0.0, max=3189.0), HTML(value='')))




In [35]:
columns_modeling = ['additives_n','ingredients_from_palm_oil_n',
                    'ingredients_that_may_be_from_palm_oil_n','target',
                    'states_en_brands','states_en_categories','states_en_characteristics','states_en_expiration date',
                    'states_en_general_complete','states_en_ingredients','pnns_groups_1','pnns_groups_2',
                    'states_en_packaging','states_en_packaging-code-','states_en_photo_upload',
                    'states_en_photo_validate','states_en_product name','states_en_quantity','diff_t'] + [f'emb_product_name_{i}' for i in range(emb_sentence_train.shape[1])] + [f'emb_ingredients_text_{i}' for i in range(emb_sentence_train.shape[1])]
columns_label = df_train[columns_modeling].select_dtypes(include=['object']).columns.to_list()
print(columns_label)

['states_en_brands', 'states_en_categories', 'states_en_characteristics', 'states_en_expiration date', 'states_en_general_complete', 'states_en_ingredients', 'pnns_groups_1', 'pnns_groups_2', 'states_en_packaging', 'states_en_packaging-code-', 'states_en_photo_upload', 'states_en_photo_validate', 'states_en_product name', 'states_en_quantity']


In [36]:
df_train,dict_le = label_encoding(df_train,label_cols = columns_label, drop_original = True, missing_new_cat = True)

Mode: Missing as new category
Label Encoding:  label_states_en_brands
Label Encoding:  label_states_en_categories
Label Encoding:  label_states_en_characteristics
Label Encoding:  label_states_en_expiration date
Label Encoding:  label_states_en_general_complete
Label Encoding:  label_states_en_ingredients
Label Encoding:  label_pnns_groups_1
Label Encoding:  label_pnns_groups_2
Label Encoding:  label_states_en_packaging
Label Encoding:  label_states_en_packaging-code-
Label Encoding:  label_states_en_photo_upload
Label Encoding:  label_states_en_photo_validate
Label Encoding:  label_states_en_product name
Label Encoding:  label_states_en_quantity


In [37]:
params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': {'rmse'},
        'num_leaves':12,
        'learning_rate': 0.08,
        "min_child_samples": 150,
        "max_depth" : 5,
        'feature_fraction':  0.5,
        "bagging_freq": 1,
        'bagging_fraction': 0.75,
        "is_unbalance" : False,
        'force_col_wise':True,
        'num_threads':18,
        #"scale_pos_weight":5 -> Generally  is the ratio of number of negative class to the positive class.
        'bagging_seed':42,
        'lambda_l1':1.5,
        'lambda_l2':1,
        'verbose': 1

}
cat_columns = [i for i in df_train.columns.to_list() if i.startswith('label_')]
columns_modeling_last = list(set(columns_modeling)-set(columns_label)) + ['fold'] + cat_columns 

In [None]:
results,models,importances,oof,feature_list = Training_Lightgbm(df_train[columns_modeling_last],params,fold_column = 'fold',target_column = 'target',cat_vars = cat_columns ,metric = 'RMSE',early_stopping = 200,max_boost_round = 8000)

Columns: ['emb_ingredients_text_456', 'emb_product_name_287', 'emb_ingredients_text_67', 'emb_ingredients_text_572', 'emb_product_name_525', 'emb_ingredients_text_90', 'emb_ingredients_text_511', 'emb_product_name_39', 'emb_product_name_238', 'emb_ingredients_text_582', 'emb_ingredients_text_559', 'emb_ingredients_text_431', 'emb_product_name_339', 'emb_product_name_718', 'emb_product_name_253', 'emb_product_name_270', 'emb_product_name_155', 'emb_product_name_715', 'emb_product_name_600', 'emb_product_name_633', 'emb_ingredients_text_501', 'emb_product_name_459', 'emb_ingredients_text_448', 'emb_product_name_291', 'emb_product_name_103', 'emb_ingredients_text_712', 'emb_product_name_164', 'emb_ingredients_text_373', 'emb_ingredients_text_185', 'emb_product_name_83', 'emb_ingredients_text_65', 'emb_ingredients_text_574', 'emb_product_name_70', 'emb_product_name_246', 'emb_ingredients_text_45', 'emb_ingredients_text_425', 'emb_ingredients_text_193', 'emb_product_name_162', 'emb_ingredie

[LightGBM] [Info] Total Bins 392061
[LightGBM] [Info] Number of data points in the train set: 81622, number of used features: 1554




[LightGBM] [Info] Start training from score 9.171473
Training until validation scores don't improve for 200 rounds
[50]	training's rmse: 6.73533	valid_1's rmse: 6.785
[100]	training's rmse: 6.31829	valid_1's rmse: 6.40844
[150]	training's rmse: 6.09265	valid_1's rmse: 6.22278
[200]	training's rmse: 5.93063	valid_1's rmse: 6.10217
[250]	training's rmse: 5.8009	valid_1's rmse: 6.01463
[300]	training's rmse: 5.69057	valid_1's rmse: 5.94288
[350]	training's rmse: 5.59228	valid_1's rmse: 5.88203
[400]	training's rmse: 5.50517	valid_1's rmse: 5.83152
[450]	training's rmse: 5.42533	valid_1's rmse: 5.78964
[500]	training's rmse: 5.35268	valid_1's rmse: 5.75596
[550]	training's rmse: 5.28386	valid_1's rmse: 5.72312
[600]	training's rmse: 5.21896	valid_1's rmse: 5.69588
[650]	training's rmse: 5.15768	valid_1's rmse: 5.6717
[700]	training's rmse: 5.09901	valid_1's rmse: 5.64839
[750]	training's rmse: 5.04504	valid_1's rmse: 5.62748
[800]	training's rmse: 4.99039	valid_1's rmse: 5.60894
[850]	trai

[5600]	training's rmse: 2.58233	valid_1's rmse: 5.19768
[5650]	training's rmse: 2.56784	valid_1's rmse: 5.19698
[5700]	training's rmse: 2.55336	valid_1's rmse: 5.19516
[5750]	training's rmse: 2.53982	valid_1's rmse: 5.1931
[5800]	training's rmse: 2.52659	valid_1's rmse: 5.19205
[5850]	training's rmse: 2.51309	valid_1's rmse: 5.1907
[5900]	training's rmse: 2.49984	valid_1's rmse: 5.18968
[5950]	training's rmse: 2.48659	valid_1's rmse: 5.18915
[6000]	training's rmse: 2.47338	valid_1's rmse: 5.18801
[6050]	training's rmse: 2.45996	valid_1's rmse: 5.18728
[6100]	training's rmse: 2.44737	valid_1's rmse: 5.18653
[6150]	training's rmse: 2.43402	valid_1's rmse: 5.18567
[6200]	training's rmse: 2.42097	valid_1's rmse: 5.18502
[6250]	training's rmse: 2.40857	valid_1's rmse: 5.18375
[6300]	training's rmse: 2.39644	valid_1's rmse: 5.18285
[6350]	training's rmse: 2.38387	valid_1's rmse: 5.1821
[6400]	training's rmse: 2.37172	valid_1's rmse: 5.18156
[6450]	training's rmse: 2.35935	valid_1's rmse: 5.1

[LightGBM] [Info] Total Bins 392061
[LightGBM] [Info] Number of data points in the train set: 81622, number of used features: 1554




[LightGBM] [Info] Start training from score 9.169930
Training until validation scores don't improve for 200 rounds
[50]	training's rmse: 6.72244	valid_1's rmse: 6.79149
[100]	training's rmse: 6.30199	valid_1's rmse: 6.4285
[150]	training's rmse: 6.07629	valid_1's rmse: 6.24614
[200]	training's rmse: 5.91538	valid_1's rmse: 6.12885
[250]	training's rmse: 5.78898	valid_1's rmse: 6.04429
[300]	training's rmse: 5.68124	valid_1's rmse: 5.97361
[350]	training's rmse: 5.58563	valid_1's rmse: 5.91422
[400]	training's rmse: 5.49996	valid_1's rmse: 5.86465
[450]	training's rmse: 5.41982	valid_1's rmse: 5.82423
[500]	training's rmse: 5.34593	valid_1's rmse: 5.78579
[550]	training's rmse: 5.27667	valid_1's rmse: 5.74995
[600]	training's rmse: 5.2154	valid_1's rmse: 5.7252
[650]	training's rmse: 5.15456	valid_1's rmse: 5.69831
[700]	training's rmse: 5.0959	valid_1's rmse: 5.67393
[750]	training's rmse: 5.04226	valid_1's rmse: 5.65278
[800]	training's rmse: 4.98931	valid_1's rmse: 5.63341
[850]	trai

[4950]	training's rmse: 2.78453	valid_1's rmse: 5.22317
[5000]	training's rmse: 2.76885	valid_1's rmse: 5.22191
[5050]	training's rmse: 2.75311	valid_1's rmse: 5.22076
[5100]	training's rmse: 2.73694	valid_1's rmse: 5.21952
[5150]	training's rmse: 2.72123	valid_1's rmse: 5.21758
[5200]	training's rmse: 2.70561	valid_1's rmse: 5.2165
[5250]	training's rmse: 2.68991	valid_1's rmse: 5.21365
[5300]	training's rmse: 2.67564	valid_1's rmse: 5.21269
[5350]	training's rmse: 2.66122	valid_1's rmse: 5.21186
[5400]	training's rmse: 2.64718	valid_1's rmse: 5.21022
[5450]	training's rmse: 2.63279	valid_1's rmse: 5.2081
[5500]	training's rmse: 2.61818	valid_1's rmse: 5.20761
[5550]	training's rmse: 2.60311	valid_1's rmse: 5.20621
[5600]	training's rmse: 2.58832	valid_1's rmse: 5.20383
[5650]	training's rmse: 2.57454	valid_1's rmse: 5.20167
[5700]	training's rmse: 2.55994	valid_1's rmse: 5.2002
[5750]	training's rmse: 2.54587	valid_1's rmse: 5.19868
[5800]	training's rmse: 2.53205	valid_1's rmse: 5.1

[LightGBM] [Info] Total Bins 392061
[LightGBM] [Info] Number of data points in the train set: 81622, number of used features: 1554




[LightGBM] [Info] Start training from score 9.171253
Training until validation scores don't improve for 200 rounds
[50]	training's rmse: 6.735	valid_1's rmse: 6.78152
[100]	training's rmse: 6.31889	valid_1's rmse: 6.40721
[150]	training's rmse: 6.09399	valid_1's rmse: 6.22354
[200]	training's rmse: 5.93206	valid_1's rmse: 6.09876
[250]	training's rmse: 5.80006	valid_1's rmse: 6.00057
[300]	training's rmse: 5.68863	valid_1's rmse: 5.92727
[350]	training's rmse: 5.58861	valid_1's rmse: 5.86612
[400]	training's rmse: 5.5014	valid_1's rmse: 5.81639
[450]	training's rmse: 5.42157	valid_1's rmse: 5.77348
[500]	training's rmse: 5.34865	valid_1's rmse: 5.73725
[550]	training's rmse: 5.28173	valid_1's rmse: 5.70425
[600]	training's rmse: 5.22054	valid_1's rmse: 5.6757
[650]	training's rmse: 5.16094	valid_1's rmse: 5.65029
[700]	training's rmse: 5.10201	valid_1's rmse: 5.62446
[750]	training's rmse: 5.04849	valid_1's rmse: 5.60392
[800]	training's rmse: 4.99419	valid_1's rmse: 5.58417
[850]	trai

## Evaluando

In [19]:
df_test      = pd.read_csv(os.path.join(DATA_PATH,'test_preprocessed.csv'))
df_test['target'] = -1

COLUMN_NAME  = 'product_name' 

test_dataset = BNPParibasText(df_test,MAX_LENGTH,tokenizer,COLUMN_NAME)
model         = Roberta_Model(pretrained_model=PRETRAINED)
test_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size  = 32,
        pin_memory  = True,
        num_workers = 72
    )
emb_sentence_test = get_embedding(test_loader, model, 'cuda')
df_test[[f'emb_{COLUMN_NAME}_{i}' for i in range(emb_sentence_test.shape[1])]] = emb_sentence_test
df_test = apply_label_encoder(df_test,dict_le,drop_original = True, missing_new_cat = True)



COLUMN_NAME  = 'ingredients_text' 

test_dataset = BNPParibasText(df_test,MAX_LENGTH,tokenizer,COLUMN_NAME)
model         = Roberta_Model(pretrained_model=PRETRAINED)
test_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size  = 32,
        pin_memory  = True,
        num_workers = 72
    )
emb_sentence_test = get_embedding(test_loader, model, 'cuda')
df_test[[f'emb_{COLUMN_NAME}_{i}' for i in range(emb_sentence_test.shape[1])]] = emb_sentence_test
df_test = apply_label_encoder(df_test,dict_le,drop_original = True, missing_new_cat = True)

HBox(children=(FloatProgress(value=0.0, max=798.0), HTML(value='')))


Mode: Missing as new category
Applying Label Encoding:  label_states_en_brands
Applying Label Encoding:  label_states_en_categories
Applying Label Encoding:  label_states_en_characteristics
Applying Label Encoding:  label_states_en_expiration date
Applying Label Encoding:  label_states_en_general_complete
Applying Label Encoding:  label_states_en_ingredients
Applying Label Encoding:  label_pnns_groups_1
Applying Label Encoding:  label_pnns_groups_2
Applying Label Encoding:  label_states_en_packaging
Applying Label Encoding:  label_states_en_packaging-code-
Applying Label Encoding:  label_states_en_photo_upload
Applying Label Encoding:  label_states_en_photo_validate
Applying Label Encoding:  label_states_en_product name
Applying Label Encoding:  label_states_en_quantity


In [20]:
probs = 0
for i in models:
    probs = probs + (i.predict(df_test[feature_list]))
    
    print('fin_predict')
y_test_pred = probs/5.0
print(f'Real: ',math.sqrt(mean_squared_error(y_test_pred,df_test['Target'].values)))

fin_predict
fin_predict
fin_predict
fin_predict
fin_predict
Real:  5.748087686708285


In [21]:
y_submission['target'] = y_test_pred
y_submission.head()

Unnamed: 0_level_0,target
Index,Unnamed: 1_level_1
37320,14.072644
3913,20.0384
112180,10.424357
128820,12.515797
16037,19.302766


In [22]:
#Enviar los resultados
#apiquery.submit_api(y_submission,
#       competition_name='food',
#        subname='test_v2', # Pueden cambiar esto sin problemas, poner el nombre que quieran.
#        holdout_key='None',
#        update_ldb=True,
#        username="Insight ML - DD" # Poner el nombre de su equipo como un string. 
                                  # El mejor de los resultados dentro de sus envios es el que aparecera en la tabla de posiciones.
#)

requests number 1
200
{'Date': 'Tue, 18 May 2021 20:58:56 GMT', 'Content-Type': 'application/json', 'Content-Length': '495', 'Connection': 'keep-alive', 'X-Request-ID': '9VDYQEXOTIL4RSGH', 'Access-Control-Allow-Origin': '*', 'Access-Control-Allow-Methods': 'POST', 'Access-Control-Allow-Headers': 'authorization,content-type'}


{'competition_name': 'food',
 'file_path': 'none',
 'message': 'Submission validated.',
 'name': 'Insight ML - DD',
 'result_csv_file': 'test_v2',
 'score': 5.748294411988524,
 'score2': None,
 'score3': None,
 'sub_name': 'test_v2',
 'sub_uid': '8ff2732f-f618-4572-912d-bfd4d0799d1d',
 'submission_time': '2021/05/18, 20:58:56'}