In [1]:
import os
import csv
import math
import random
import time

import numpy as np
import pandas as pd
import torch.nn.functional as F

from tqdm import tqdm
import torch

import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from transformers import AdamW
from transformers import AutoTokenizer
from transformers import AutoModel
from transformers import AutoConfig
from transformers import get_cosine_schedule_with_warmup
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
import gc
from torch.cuda import amp
gc.enable()

In [None]:
'''import gdown

url = 'https://drive.google.com/uc?id=1B6_rtcmGRy49hqpwoJT-_Ujnt6cYj5Ba'

output = 'file.npy'

gdown.download(url, output, quiet=False)'''

In [None]:
# Load the raw training table. Backslash is the escape character, quote characters are
# treated as ordinary text, and error_bad_lines=False asks pandas not to raise on
# malformed rows.
df = pd.read_csv(
    "../input/amazon-ml-challenge-2021-hackerearth/train.csv",
    escapechar="\\",
    quoting=csv.QUOTE_NONE,
    error_bad_lines=False,
)

In [None]:
# Drop rows without a title and exact duplicate rows. The index is renumbered LAST so
# that it stays contiguous (resetting before drop_duplicates leaves gaps behind).
df = (
    df.dropna(subset=['TITLE'])
      .drop_duplicates()
      .reset_index(drop=True)
)
# Show a preview only; rendering the full frame bloats the notebook output.
df.head()

In [None]:
# Re-encode the raw BROWSE_NODE_ID values as consecutive integers, numbered in order of
# first appearance. `map_values` (raw id -> class index) is kept so the submission cell
# can invert it.
train = list(df['BROWSE_NODE_ID'])
map_values = {}
for node_id in train:
    # setdefault only inserts on first sight, so the next free index is the dict size.
    map_values.setdefault(node_id, len(map_values))
counter = len(map_values)
df['BROWSE_NODE_ID'] = [map_values[node_id] for node_id in train]

In [None]:
# NOTE(review): the split object created here is deleted straight away without being
# iterated, so no folds are produced by this cell, and `skf` is not referenced again in
# the visible notebook. Looks like a leftover experiment - confirm before removing.
skf = StratifiedKFold(n_splits=50)
split_df = skf.split(df,df['BROWSE_NODE_ID'])
del split_df

In [None]:
# Number of distinct class indices after re-encoding.
len(df["BROWSE_NODE_ID"].unique())

In [None]:
# Distinct class count again, this time computed through a Python set.
len(set(df['BROWSE_NODE_ID']))

In [None]:
# Fixed seed so the subsample and the shuffle below are reproducible across kernel restarts.
SPLIT_SEED = 42

# Keep only the 5% "test" side of the split as the working set; the 95% side is discarded.
df_train, df_test, x, y = train_test_split(
    df,
    df['BROWSE_NODE_ID'],
    test_size=0.05,
    random_state=SPLIT_SEED,
)
del df_train
del x
del df

df = df_test
df = df.reset_index(drop=True)

#create fold
# GroupKFold with TITLE as the group key: rows sharing the same title always land in the
# same fold, so identical titles never straddle the train/validation boundary.
kf = model_selection.GroupKFold(n_splits=8)
df['kfold'] = -1
df = df.sample(frac=1, random_state=SPLIT_SEED).reset_index(drop=True)
y = df.BROWSE_NODE_ID.values
for f, (t_, v_) in enumerate(kf.split(X=df, y=y, groups=df.TITLE.values)):
    df.loc[v_, 'kfold'] = f

# Persist the fold assignment; run() reads this file back.
df.to_csv("new.csv", index=False)

In [14]:
# Run-wide settings.
# NOTE(review): NUM_FOLDS is not referenced anywhere in the visible notebook (the fold
# cell builds 8 GroupKFold folds), and LitDataset pads to 128 tokens by default while
# test_infer pads to MAX_LEN - confirm both mismatches are intended.
NUM_FOLDS = 5
NUM_EPOCHS = 3
BATCH_SIZE = 16
MAX_LEN = 64
ROBERTA_PATH = "../input/roberta-base"
TOKENIZER_PATH = "../input/roberta-base"
# Falls back to CPU when no GPU is visible.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Module-level tokenizer shared by LitDataset and test_infer.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)

In [6]:
class LitDataset(Dataset):
    """Pre-tokenised product titles, optionally paired with class targets.

    All titles are tokenised once in ``__init__`` with the module-level ``tokenizer``
    and padded/truncated to ``max_len`` tokens; ``__getitem__`` only converts the stored
    lists into tensors.

    Args:
        df: frame with a ``TITLE`` column and, unless ``inference_only`` is set, a
            ``BROWSE_NODE_ID`` column holding integer class indices.
        inference_only: when True no targets are read and items are
            ``(input_ids, attention_mask)`` pairs.
        max_len: pad/truncate length in tokens. Defaults to 128, the value that used to
            be hard-coded here, so existing callers behave the same; pass the notebook's
            ``MAX_LEN`` to match the length used by the inference dataset.
    """

    def __init__(self, df, inference_only=False, max_len=128):
        super().__init__()

        self.df = df
        self.inference_only = inference_only
        self.text = df.TITLE.tolist()
        #self.text = [text.replace("\n", " ") for text in self.text]

        if not self.inference_only:
            self.target = torch.tensor(df.BROWSE_NODE_ID.values, dtype=torch.long)

        self.MAX_LEN = max_len
        self.encoded = tokenizer.batch_encode_plus(
            self.text,
            max_length=self.MAX_LEN,
            padding='max_length',
            return_attention_mask=True,
            truncation=True
        )

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.df)

    def __getitem__(self, index):
        """Return ``(input_ids, attention_mask[, target])`` for row ``index``."""
        input_ids = torch.tensor(self.encoded['input_ids'][index])
        attention_mask = torch.tensor(self.encoded['attention_mask'][index])

        if self.inference_only:
            return (input_ids, attention_mask)

        return (input_ids, attention_mask, self.target[index])

In [7]:
# a = LitDataset(df)
# temp = a.__getitem__(900)
# temp

In [8]:
class WeightedLayerPooling(nn.Module):
    """Weighted average over the hidden-state layers from ``layer_start`` onwards.

    ``layer_weights`` holds one weight per pooled layer. When none is supplied, a
    learnable parameter of ``num_hidden_layers + 1 - layer_start`` ones is created.
    """

    def __init__(self, num_hidden_layers, layer_start = 8, layer_weights = None):
        super().__init__()
        self.layer_start = layer_start
        self.num_hidden_layers = num_hidden_layers
        if layer_weights is None:
            pooled_layer_count = num_hidden_layers + 1 - layer_start
            layer_weights = nn.Parameter(
                torch.tensor([1] * pooled_layer_count, dtype=torch.float)
            )
        self.layer_weights = layer_weights

    def forward(self, all_hidden_states):
        """Pool a 4-D stack of hidden states along its first (layer) axis.

        The first axis is sliced from ``layer_start``; the remaining three axes are
        passed through unchanged.
        """
        selected_layers = all_hidden_states[self.layer_start:, :, :, :]
        # Reshape the 1-D weights to (layers, 1, 1, 1) so they line up with the stack.
        per_layer_weight = self.layer_weights[:, None, None, None].expand(selected_layers.size())
        weighted_sum = (per_layer_weight * selected_layers).sum(dim=0)
        return weighted_sum / self.layer_weights.sum()
    
class LitModel(nn.Module):
    """Encoder loaded from ``ROBERTA_PATH`` followed by layer pooling and an MLP head.

    The encoder is configured to return all hidden states, with hidden dropout set to
    0.0. The states from index 9 onwards are averaged by ``WeightedLayerPooling``, the
    vector at sequence position 0 is taken, and the ``attention`` MLP (768 inputs,
    9919 outputs) turns it into class logits.

    NOTE: the sub-module names and their registration order (``roberta``, ``pooler``,
    ``attention``) determine the checkpoint keys and the parameter order, so they are
    kept exactly as they were.
    """

    def __init__(self):
        super().__init__()

        roberta_config = AutoConfig.from_pretrained(ROBERTA_PATH)
        overrides = {
            "output_hidden_states": True,
            "hidden_dropout_prob": 0.0,
            "layer_norm_eps": 1e-7,
        }
        roberta_config.update(overrides)

        self.roberta = AutoModel.from_pretrained(ROBERTA_PATH, config=roberta_config)
        self.pooler = WeightedLayerPooling(
            roberta_config.num_hidden_layers,
            layer_start=9,
            layer_weights=None,
        )

        # Despite the attribute name this is a plain feed-forward classification head.
        head_layers = [
            nn.Linear(768, 1536),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(1536, 3072),
            nn.ReLU(),
            nn.Linear(3072, 6144),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(6144, 9919),
        ]
        self.attention = nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask):
        """Return one row of 9919 logits per input sequence."""
        encoder_out = self.roberta(input_ids=input_ids,
                                   attention_mask=attention_mask)
        # Element 2 of the encoder output is stacked as the per-layer hidden states
        # (requested via output_hidden_states above).
        stacked_states = torch.stack(encoder_out[2])
        pooled_states = self.pooler(stacked_states)
        first_token = pooled_states[:, 0]
        return self.attention(first_token)
    

In [9]:
def ce_loss(pred, truth, smoothing=False, trg_pad_idx=-1, eps=0.1):
    """Summed (not averaged) cross-entropy between logits and integer targets.

    Args:
        pred: 2-D logits; class scores run along dim 1.
        truth: integer class indices, flattened internally to one per row of ``pred``.
        smoothing: when True, the one-hot target keeps ``1 - eps`` on the true class and
            spreads ``eps`` evenly over the remaining classes.
        trg_pad_idx: when ``>= 0``, rows whose target equals this index are excluded.
        eps: label-smoothing mass, only used when ``smoothing`` is True.

    Returns:
        A scalar tensor: the sum of the loss over all contributing rows.
    """
    flat_truth = truth.contiguous().view(-1)
    target_dist = torch.zeros_like(pred).scatter(1, flat_truth.view(-1, 1), 1)

    if smoothing:
        num_classes = pred.size(1)
        target_dist = target_dist * (1 - eps) + (1 - target_dist) * eps / (num_classes - 1)

    elementwise = -target_dist * F.log_softmax(pred, dim=1)

    if trg_pad_idx < 0:
        # No padding index configured: every row contributes.
        return elementwise.sum()

    per_row = elementwise.sum(dim=1)
    keep_mask = flat_truth.ne(trg_pad_idx)
    return per_row.masked_select(keep_mask).sum()


def loss_fn(output, target):
    """Training criterion: ``ce_loss`` with label smoothing switched off."""
    return ce_loss(output, target, smoothing=False, eps=0.1)

In [None]:
"""class LitModel(nn.Module):
    def __init__(self):
        super().__init__()

        config = AutoConfig.from_pretrained(ROBERTA_PATH)
        '''config.update({"output_hidden_states": False,
                       "hidden_dropout_prob": 0.0,
                       "layer_norm_eps": 1e-7})     '''                  
        
        self.roberta = AutoModel.from_pretrained(ROBERTA_PATH, config = config)  
            
        self.attention = nn.Sequential(            
            nn.Linear(768, 512),            
            nn.ReLU(),                       
            nn.Linear(512, 9919),
        )        

  
        

    def forward(self, input_ids, attention_mask):
        output = self.roberta(input_ids=input_ids,
                                      attention_mask=attention_mask)     
        pooled = output[1]
        #print(pooled.shape)
        #print(roberta_output[0].shape)
        out = self.attention(pooled)
        #print(out.shape)
        return out"""

In [10]:
def train_fn(data_loader, model, optimizer, device, scheduler=None):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        data_loader: yields ``(input_ids, attention_mask, target)`` batches.
        model: called as ``model(input_ids=..., attention_mask=...)``.
        optimizer: stepped once per batch.
        device: device the batch tensors are moved to.
        scheduler: optional LR scheduler, stepped once per batch when provided.

    Returns:
        The mean of the per-batch ``loss_fn`` values (0.0 for an empty loader). The same
        number is printed, as before.
    """
    model.train()

    num_batches = len(data_loader)
    tk0 = tqdm(data_loader, total=num_batches)
    total_loss = 0.0
    for ids, attention, target in tk0:
        ids = ids.to(device, dtype=torch.long)
        attention = attention.to(device, dtype=torch.long)
        target = target.to(device, dtype=torch.long)

        model.zero_grad()
        outputs = model(
            input_ids=ids,
            attention_mask=attention,
        )
        loss = loss_fn(outputs, target)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        # The signature advertises scheduler=None, so stepping must be optional.
        if scheduler is not None:
            scheduler.step()

    # Guard the division so an empty loader reports 0.0 instead of raising.
    average_loss = total_loss / num_batches if num_batches else 0.0
    print(f"average_loss is {average_loss}")
    return average_loss

        


In [11]:
def eval_fn(data_loader, model, device):
    """Run ``model`` over ``data_loader`` without gradients and collect its outputs.

    Args:
        data_loader: yields ``(input_ids, attention_mask, target)`` batches.
        model: called as ``model(input_ids=..., attention_mask=...)``.
        device: device the batch tensors are moved to.

    Returns:
        ``(final_outputs, final_targets)``: a list with one CPU logits tensor per
        sample, and a list with one integer target per sample, in loader order.
    """
    model.eval()

    final_targets = []
    final_outputs = []
    tk0 = tqdm(data_loader, total=len(data_loader))
    with torch.no_grad():
        for ids, attention, target in tk0:
            ids = ids.to(device, dtype=torch.long)
            attention = attention.to(device, dtype=torch.long)

            outputs = model(
                input_ids=ids,
                attention_mask=attention,
            )

            final_targets.extend(target.detach().cpu().numpy().tolist())
            # Move the logits off the accelerator BEFORE storing them: previously the
            # CPU copy was computed and thrown away while the device tensor itself was
            # accumulated, pinning every batch of logits in GPU memory. Rows are still
            # tensors, so callers that apply torch ops per row keep working.
            final_outputs.extend(outputs.detach().cpu())

    return final_outputs, final_targets
        


In [12]:
def create_optimizer(model):
    """Build an AdamW optimizer with per-tensor settings for the encoder.

    Parameters are selected by NAME rather than by position in ``named_parameters()``,
    so the grouping does not silently shift if the head changes size:

    * tensors under ``roberta.pooler.`` are left out (they were also outside every
      positional slice used previously);
    * every other ``roberta.`` tensor gets its own group: weight decay 0.0 if its name
      contains "bias" else 0.01, and a learning rate of 2e-5 / 5e-5 / 1e-4 depending on
      its position within the encoder's parameter list;
    * all remaining tensors (layer-pooling weights and the MLP head) share one group
      that uses AdamW's default hyper-parameters.

    Returns:
        The configured ``AdamW`` instance.
    """
    head_params = []
    encoder_params = []
    for name, params in model.named_parameters():
        if "roberta.pooler." in name:
            continue
        if "roberta." in name:
            encoder_params.append((name, params))
        else:
            head_params.append(params)

    parameters = [{"params": head_params}]

    for layer_num, (name, params) in enumerate(encoder_params):
        weight_decay = 0.0 if "bias" in name else 0.01

        # Thresholds are indices into the encoder's parameter list.
        # NOTE(review): presumably 69 and 133 mark the start of encoder layers 4 and 8
        # (5 embedding tensors + 16 tensors per layer) - confirm for other backbones.
        if layer_num >= 133:
            lr = 1e-4
        elif layer_num >= 69:
            lr = 5e-5
        else:
            lr = 2e-5

        parameters.append({"params": params,
                           "weight_decay": weight_decay,
                           "lr": lr})

    return AdamW(parameters)

In [None]:
def _validation_accuracy(pred, target):
    """Percentage of rows whose arg-max logit equals the target (0.0 when empty)."""
    if len(target) == 0:
        return 0.0
    # as_tensor accepts both tensors (any device) and plain nested lists.
    hits = sum(
        int(torch.as_tensor(logits).argmax()) == int(label)
        for logits, label in zip(pred, target)
    )
    return 100.0 * hits / len(target)


def run(fold):
    """Train on every fold except ``fold`` and validate on ``fold``.

    Reads ``./new.csv`` (which must contain a ``kfold`` column), trains for
    ``NUM_EPOCHS`` epochs with batches of ``BATCH_SIZE`` on ``DEVICE``, and writes the
    weights of the epoch with the HIGHEST validation accuracy to
    ``model_{fold + 1}.pth``.

    Returns:
        The best validation accuracy in percent, or None if no epoch was run.
    """
    dfx = pd.read_csv("./new.csv")
    model_path = f"model_{fold + 1}.pth"
    df_train = dfx[dfx.kfold != fold].reset_index(drop=True)
    df_valid = dfx[dfx.kfold == fold].reset_index(drop=True)

    train_data_loader = torch.utils.data.DataLoader(
        LitDataset(df_train),
        batch_size=BATCH_SIZE,
        num_workers=4,
        shuffle=True
    )

    # Validation needs no gradients, so there is no reason to feed one row at a time.
    valid_data_loader = torch.utils.data.DataLoader(
        LitDataset(df_valid),
        batch_size=BATCH_SIZE,
        num_workers=2
    )

    device = torch.device(DEVICE)
    model = LitModel()
    model.to(device)

    # One scheduler step is taken per batch, so count batches rather than rows / 16.
    num_train_steps = len(train_data_loader) * NUM_EPOCHS

    optimizer = create_optimizer(model)
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=50,
        num_training_steps=num_train_steps
    )

    print(f"Training is Starting for fold={fold}")

    best_val_acc = None
    best_epoch = None
    for epoch in range(NUM_EPOCHS):
        print("===================start training===================")
        train_fn(train_data_loader, model, optimizer, device, scheduler=scheduler)
        print("===================validation=======================")
        pred, target = eval_fn(valid_data_loader, model, device)
        print(len(pred), len(target))

        acc = _validation_accuracy(pred, target)

        # Higher accuracy is better; `is None` (not truthiness) so that a legitimate
        # 0.0 is not mistaken for "no best yet".
        if best_val_acc is None or acc > best_val_acc:
            best_val_acc = acc
            best_epoch = epoch
            torch.save(model.state_dict(), model_path)
            print(f"New best_val_acc: {best_val_acc:0.4}")
        else:
            print(f"Still best_val: {best_val_acc:0.4}",
                    f"(from epoch {best_epoch})")

        print(f"epochs {epoch} , validation accuracy is {acc}")

    return best_val_acc
        

In [None]:
# Train with fold 0 held out for validation; the checkpoint is written to model_1.pth.
run(fold = 0)

In [None]:
# Train with fold 1 held out for validation; the checkpoint is written to model_2.pth.
run(fold=1)

In [None]:
# Train with fold 2 held out for validation; the checkpoint is written to model_3.pth.
run(fold=2)

In [27]:
# Release any stale test frame left over from an earlier run. `test_df` is only defined
# further down, so a plain `del` raises NameError on Restart & Run All.
globals().pop("test_df", None)

In [28]:
# Release the training frame; `map_values`, which was built from it, stays in scope for
# the submission cell.
del df

In [2]:
# Load the test table with the same parsing options as the training table, then give
# rows without a title the placeholder 'a' so that every row can be tokenised.
test_df = pd.read_csv(
    "../input/amazon-ml-challenge-2021-hackerearth/test.csv",
    escapechar="\\",
    quoting=csv.QUOTE_NONE,
    error_bad_lines=False,
)
test_df = test_df.fillna({'TITLE': 'a'})
# test_dataset = LitDataset(test_df, inference_only=True)


In [30]:
# `test_dataset` is never created in the visible notebook (its construction is commented
# out), so a plain `del` raises NameError; drop it only if it happens to exist.
globals().pop("test_dataset", None)

In [34]:
# Display the loaded test frame.
test_df

Unnamed: 0,PRODUCT_ID,TITLE,DESCRIPTION,BULLET_POINTS,BRAND
0,1,"Command 3M Small Kitchen Hooks, White, Decorat...",Sale Unit: PACK,[INCLUDES - 9 hooks and 12 small indoor strips...,Command
1,2,O'Neal Jump Hardware JAG Unisex-Adult Glove (B...,Synthetic leather palm with double-layer thumb...,[Silicone printing for a better grip. Long las...,O'Neal
2,3,"NFL Detroit Lions Portable Party Fridge, 15.8 ...",Boelter Brands lets you celebrate your favorit...,[Runs on 12 Volt DC Power or 110 Volt AC Power...,Boelter Brands
3,4,Panasonic Single Line KX-TS880MX Corded Phone ...,Features: 50 Station Phonebook Corded Phone Al...,Panasonic Landline Phones doesn't come with a ...,Panasonic
4,5,Zero Baby Girl's 100% Cotton Innerwear Bloomer...,"Zero Baby Girl Panties Set. 100% Cotton, Breat...","[Zero Baby Girl Panties, Pack of 6, 100% Cotto...",Zero
...,...,...,...,...,...
110770,110771,AAHNA E MALL OneBlade Hybrid Trimmer Shaver An...,<p>1-All In One Hyper Advanced Smart Rechargea...,"[Unique One Blade can style, trim and shave, w...",Generic
110771,110772,Grin Health N99 Anti Pollution Reusable Washab...,"<p>SIZE GUIDE : M - (35- 65 Kg), L - (49- 72 K...",[PROTECTION: Filtration rate up to ≥99 percent...,Grin Health
110772,110773,Asian Army Pink Ultra reusable respirator clot...,Asian HyperProtect A95 masks have been enginee...,[Reusable and environment friendly: These mask...,ASIAN
110773,110774,IM Safe 3 Ply Non-Woven Disposable Surgical Fa...,This 3 Ply Disposable face mask is manufacture...,[3 Ply Mask: Genuine 3 Ply Mask. 25 GSM Spun B...,Intermarket


In [53]:
class test_infer(Dataset):
    """Inference dataset: tokenised titles paired with their PRODUCT_ID.

    Titles are tokenised once up front with the module-level ``tokenizer`` and
    padded/truncated to ``MAX_LEN`` tokens.
    """

    def __init__(self, df):
        titles = df.TITLE.tolist()
        self.df = df
        self.title = titles
        self.id = df.PRODUCT_ID.values
        self.encoded = tokenizer.batch_encode_plus(
            titles,
            max_length=MAX_LEN,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
        )

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.df)

    def _as_tensor(self, field, index):
        """Tensor built from row ``index`` of one encoded field."""
        return torch.tensor(self.encoded[field][index])

    def __getitem__(self, index):
        """Return ``(input_ids, attention_mask, product_id)`` for row ``index``."""
        return (
            self._as_tensor('input_ids', index),
            self._as_tensor('attention_mask', index),
            self.id[index],
        )
    

In [55]:
def predict_fn(data_loader, model, device):
    """Run ``model`` over ``data_loader`` and collect ids and logits for every row.

    Args:
        data_loader: yields ``(input_ids, attention_mask, product_id)`` batches.
        model: called as ``model(input_ids=..., attention_mask=...)``.
        device: device the input tensors are moved to.

    Returns:
        ``(product_ids, final_outputs)``: a list with one product id per row and a list
        with one CPU logits tensor per row, both in loader order and of equal length.
        Both are empty for an empty loader.
    """
    model.eval()

    all_product_ids = []
    final_outputs = []
    tk0 = tqdm(data_loader, total=len(data_loader))
    with torch.no_grad():
        for ids, attention, product_id in tk0:
            ids = ids.to(device, dtype=torch.long)
            attention = attention.to(device, dtype=torch.long)

            outputs = model(
                input_ids=ids,
                attention_mask=attention,
            )

            # Accumulate ids across batches; previously only the LAST batch's ids were
            # returned, so the result was far shorter than the list of outputs.
            all_product_ids.extend(torch.as_tensor(product_id).tolist())
            # Store CPU copies so the logits do not pile up in accelerator memory.
            final_outputs.extend(outputs.detach().cpu())

    return all_product_ids, final_outputs

In [50]:
model = LitModel()
# map_location lets a checkpoint saved from a GPU run load on a CPU-only kernel, which
# is exactly the case DEVICE falls back to.
model.load_state_dict(torch.load('model_1.pth', map_location=DEVICE))
model.to(DEVICE)

Some weights of the model checkpoint at ../input/roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


LitModel(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), 

In [49]:
# Smoke test: build the inference dataset and look at its first item.
temp = test_infer(test_df)
temp[0]

tensor([    0, 46785,   155,   448,  7090, 11580, 14943,    29,     6,   735,
            6,  1502, 28590, 33571,  3130,     6, 18609,   374,     6, 18609,
         4995,     6,   361, 14943,    29,     6,   316, 21836,  3275,     6,
        19268,    12, 31331,   111, 12641,  4111,    12, 12015,     2,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1])


(tensor([    0, 46785,   155,   448,  7090, 11580, 14943,    29,     6,   735,
             6,  1502, 28590, 33571,  3130,     6, 18609,   374,     6, 18609,
          4995,     6,   361, 14943,    29,     6,   316, 21836,  3275,     6,
         19268,    12, 31331,   111, 12641,  4111,    12, 12015,     2,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1]),
 tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 1)

In [None]:
# Batch the test set (default loader order, no shuffling) and score it with the loaded model.
inference_dataset = test_infer(test_df)
dataLoader = torch.utils.data.DataLoader(inference_dataset, num_workers=4, batch_size=16)

product_id, final_outputs = predict_fn(dataLoader, model, DEVICE)

 55%|█████▍    | 3787/6924 [03:40<03:08, 16.63it/s]

In [None]:


# Invert the training-time mapping: class index -> original BROWSE_NODE_ID.
map_rev = {v: k for k, v in map_values.items()}

out = []
for pre in final_outputs:
    # torch.max(pre, dim=0) returns a (values, indices) pair, which can never be a key
    # of map_rev; take the arg-max and turn it into a plain int instead.
    pre_ind = int(torch.as_tensor(pre).argmax())
    out.append(map_rev[pre_ind])

# zip() silently truncates to the shorter input. If the id list does not cover every
# prediction, fall back to the ids of test_df: the inference loader is not shuffled, so
# predictions come back in test_df row order.
submission_ids = list(product_id)
if len(submission_ids) != len(out):
    submission_ids = test_df["PRODUCT_ID"].tolist()

df_sub = pd.DataFrame({"PRODUCT_ID": submission_ids, "BROWSE_NODE_ID": out})
df_sub.to_csv("sub1.csv", index=False)