In [None]:
# Make Google Drive visible to this Colab runtime; the data files and model
# checkpoints used throughout the notebook live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install transformers
     

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd

# Read the labelled training set from Drive and report how many rows it has.
TRAIN_CSV_PATH = "/content/drive/MyDrive/PS3_train.csv"
df = pd.read_csv(TRAIN_CSV_PATH)

n_train_rows = df.shape[0]
print('Number of training sentences: {:,}\n'.format(n_train_rows))

# Show a random handful of rows as the cell output for a quick sanity check.
df.sample(10)

Number of training sentences: 35,112



Unnamed: 0,content,title,uid,target_ind
26470,Standard flat washers are an economical choice...,"Nylon 6/6 Flat Washer, #6, 0.15&#034; ID, 0.31...",B000FN15PQ,363
7920,Our finest shirt fabric: Imperial 100s cotton ...,Amazon.com: Imperial 100s European Straight Co...,B00008JP98,131
22618,PS-12180NB 12v 18Ah Lead Acid Battery 12VOLT,Powersonic PS-12180NB 12v 18Ah Lead Acid Battery,B0002ILJZU,401
7599,Danea Gorbett draws on her background in psych...,Adopted Teens Only: A Survival Guide to Adoles...,0595325831,6
34198,Bio-balls do not require replacement unless da...,Marineland PA11486 Canister Filter Bio-Balls P...,B000NRXB5G,378
25655,I really recommend it... Have fun learning usi...,HEBREW in 10 minutes a day®,0944502253,5
28289,"""...accessible...comprehensive but largely jar...",The Observing Guide to the Messier Marathon: A...,0521803861,16
20631,A century of Transylvanian tranquility is abou...,Super Nintendo Super Castlevania IV,B000035XZD,74
11376,"Distractingly loose but clever, this 1987 come...",Innerspace [VHS] (1987),B000055YXK,103
29879,This polyoxymethylene spur gear with 20-degree...,"Spur Gear, 20 Degree Pressure Angle, Polyoxyme...",B000FMUNDC,357


In [None]:
# Build the model input text by concatenating title and content.
# Missing values are replaced with '' first: `str + NaN` yields NaN, and a NaN
# row would later fail inside the tokenizer instead of using the field that is present.
# NOTE(review): title and content are joined without a separator (the sample
# output shows "...Jean: ClothingPremium quality..."). Adding one would change
# the inputs relative to existing checkpoints, so it is left as-is -- confirm.
df['info'] = df['title'].fillna('') + df['content'].fillna('')
sentences = df['info'].values
labels = df['target_ind'].values

In [None]:
from transformers import RobertaTokenizer

# Tokenizer matching the 'roberta-base' checkpoint that is fine-tuned below.
# NOTE(review): `do_lower_case` is a BERT-style option; RoBERTa's vocabulary is
# cased, so this flag presumably has no effect -- confirm before relying on it.
tokenizer = RobertaTokenizer.from_pretrained(
    'roberta-base',
    do_lower_case=True,
)

In [None]:
import torch

# Tokenize every training text in a single batched call instead of one
# `encode_plus` call per row followed by `torch.cat`. The result is the same:
# each text is wrapped in special tokens, truncated/padded to exactly 256
# tokens, and returned as PyTorch tensors with one row per input text.
encoded_batch = tokenizer(
    sentences.tolist(),            # tokenizer expects a list of str, not an ndarray
    add_special_tokens=True,
    max_length=256,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
)

input_ids = encoded_batch['input_ids']
attention_masks = encoded_batch['attention_mask']
labels = torch.tensor(labels)

# Show the first example before and after tokenization as a sanity check.
print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])

Original:  Amazon.com: Wrangler Men's Rugged Wear Relaxed Fit Jean: ClothingPremium quality five pocket jean from Wrangler Rugged Wear. This Relaxed Fit Jean is made from 100% cotton denim for durability with extra room in the seat and thigh for comfort.	Men's Wrangler Trail Trekker Relaxed Fit Jeans Set out on a long hike, or kick back for an afternoon full of watching college football from the comfort of your own home. These Wrangler Trail Trekker Relaxed Fit Jeans are up for anything you are! Check 'em out: 100% cotton denim construction; Relaxed 5 pocket style; Easy entry, extra deep front pockets; Solid brass YKK zip fly; Leather waistband patch; Fit easily over boots; Machine wash / dry. Imported. State Color and Size! Get yours today! Men's Wrangler 36" Inseam Trail Trekker Relaxed Fit Jeans
Token IDs: tensor([    0, 25146,     4,   175,    35, 23511, 32268,  4011,    18, 29599,
         4462, 16915, 38924,   196, 14950,  5363,    35, 36221, 46025,  1318,
          292,  7524,  

In [None]:
from torch.utils.data import TensorDataset, random_split

# Bundle token ids, attention masks and labels so they are indexed together.
dataset = TensorDataset(input_ids, attention_masks, labels)

# 90% of the examples are used for training, the remainder for validation.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

# Split with a dedicated, seeded generator. The global seeds are only set in
# the training cell, i.e. after this split, so without a generator the split
# changes on every run. Because training resumes from a saved checkpoint, a
# changing split lets earlier training examples end up in the validation set.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(
    dataset, [train_size, val_size], generator=split_generator
)

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

31,600 training samples
3,512 validation samples


In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# Number of examples per batch for both training and validation.
batch_size = 16

# Training examples are visited in random order; validation order is fixed.
train_sampler = RandomSampler(train_dataset)
val_sampler = SequentialSampler(val_dataset)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, sampler=train_sampler)
validation_dataloader = DataLoader(val_dataset, batch_size=batch_size, sampler=val_sampler)

In [None]:

from transformers import RobertaForSequenceClassification, AdamW, RobertaConfig

# Pick the GPU when one is available and fall back to the CPU otherwise, so
# this cell does not crash on a CPU-only runtime the way `model.cuda()` does.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Pretrained RoBERTa encoder with a freshly initialised classification head.
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels = 500,               # presumably the number of distinct target_ind classes -- confirm
    output_attentions = False,      # do not return attention weights
    output_hidden_states = False,   # do not return all hidden states
)

# Move the model to the selected device; as the last expression of the cell
# this also displays the model architecture, like the original `model.cuda()`.
model.to(device)

In [None]:

from transformers import get_linear_schedule_with_warmup

# Use PyTorch's AdamW: the AdamW class shipped with transformers is deprecated
# in favour of it. weight_decay=0.0 is passed explicitly because that was the
# default of the transformers implementation, whereas torch defaults to 0.01.
optimizer = torch.optim.AdamW(model.parameters(),
                              lr = 2e-5,
                              eps = 1e-8,
                              weight_decay = 0.0)

# Number of passes over the training set.
epochs = 4

# One scheduler step is taken per training batch, so the schedule spans
# (batches per epoch) * (epochs) steps.
total_steps = len(train_dataloader) * epochs

# Learning rate decays linearly from `lr` to 0 over `total_steps`, without warmup.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

In [None]:
import numpy as np

def flat_accuracy(preds, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    preds:  2-D array of per-class scores, one row per example.
    labels: array of integer class ids; it is flattened before comparison.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    expected_classes = labels.flatten()
    hits = predicted_classes == expected_classes
    return np.sum(hits) / len(expected_classes)
     

In [None]:
import time
import datetime

def format_time(elapsed):
    """Format a duration given in seconds as an h:mm:ss string.

    The value is rounded to the nearest whole second before formatting.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)


In [None]:
import random
import numpy as np

# Run on the GPU when available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Seed every random number generator used during training.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# One summary dict per epoch (losses, validation accuracy, timings).
training_stats = []

# Wall-clock start of the whole run.
total_t0 = time.time()

# Resume from a previously saved checkpoint. map_location makes the load work
# regardless of which device the checkpoint was saved from.
model.load_state_dict(
    torch.load('/content/drive/MyDrive/weights_roberta_epochs11.pth',
               map_location=device)
)

for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()

    # Running sum of batch losses and list of per-batch accuracies for this epoch.
    total_train_loss = 0
    total_train_accuracy = []

    model.train()

    for step, batch in enumerate(train_dataloader):

        # Progress report every 40 batches (skipping the very first one, when
        # no accuracy has been recorded yet).
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)

            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
            print(f'acc = {(np.mean(total_train_accuracy))}')

        # Each batch is (input ids, attention mask, labels); move it to the device.
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Clear gradients left over from the previous step.
        model.zero_grad()

        # Passing labels makes the model return the loss alongside the logits.
        result = model(b_input_ids,
                       token_type_ids=None,
                       attention_mask=b_input_mask,
                       labels=b_labels,
                       return_dict=True)

        loss = result.loss
        logits = result.logits

        total_train_loss += loss.item()

        loss.backward()

        # Clip the gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

        # Advance the learning-rate schedule once per batch.
        scheduler.step()

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        total_train_accuracy.append(flat_accuracy(logits, label_ids))

    avg_train_loss = total_train_loss / len(train_dataloader)

    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))

    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    t0 = time.time()

    model.eval()

    # Accumulate example-weighted sums so that a shorter final batch does not
    # count as much as a full one: averaging per-batch means directly would
    # bias both metrics whenever the validation size is not a multiple of
    # the batch size.
    total_eval_accuracy = 0
    total_eval_loss = 0
    total_eval_examples = 0

    for batch in validation_dataloader:

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # No gradients are needed for evaluation.
        with torch.no_grad():
            result = model(b_input_ids,
                           token_type_ids=None,
                           attention_mask=b_input_mask,
                           labels=b_labels,
                           return_dict=True)

        loss = result.loss
        logits = result.logits

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        n_batch_examples = len(label_ids)

        # result.loss is taken to be the mean over the batch, so multiplying by
        # the batch size turns it back into a per-example sum.
        total_eval_loss += loss.item() * n_batch_examples
        total_eval_accuracy += flat_accuracy(logits, label_ids) * n_batch_examples
        total_eval_examples += n_batch_examples

    avg_val_accuracy = total_eval_accuracy / total_eval_examples
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    avg_val_loss = total_eval_loss / total_eval_examples

    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

    # Save a checkpoint after every epoch.
    # NOTE(review): the run resumes from ...epochs11.pth but checkpoints are
    # named epochs{epoch_i+10}, so the second epoch overwrites the file that was
    # loaded above -- confirm the intended numbering.
    torch.save(model.state_dict(), f'/content/drive/MyDrive/weights_roberta_epochs{epoch_i+10}.pth')
print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


In [None]:
import pandas as pd

# Load the unlabelled test set. This rebinds `df`, which held the training
# data until now; the submission cell reads `df['uid']` from this frame.
df = pd.read_csv("/content/drive/MyDrive/PS3_test.csv")

print('Number of test sentences: {:,}\n'.format(df.shape[0]))

# Same text construction as for training (title immediately followed by
# content). Missing values are replaced with '' first, because `str + NaN`
# yields NaN and a NaN row would fail inside the tokenizer.
df['info'] = df['title'].fillna('') + df['content'].fillna('')

sentences = df['info'].values

# The test set has no labels; all-zero placeholders keep the dataset layout
# (ids, masks, labels) identical to the one used for training.
labels = torch.zeros(len(df), dtype=torch.long)

# Tokenize all test texts in one batched call instead of one `encode_plus`
# call per row followed by `torch.cat`. Settings match the training data:
# special tokens added, truncated/padded to exactly 256 tokens.
encoded_batch = tokenizer(
    sentences.tolist(),            # tokenizer expects a list of str, not an ndarray
    add_special_tokens=True,
    max_length=256,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
)

input_ids = encoded_batch['input_ids']
attention_masks = encoded_batch['attention_mask']

batch_size = 16

prediction_data = TensorDataset(input_ids, attention_masks, labels)

# Keep the original row order so predictions line up with `df['uid']`.
prediction_dataloader = DataLoader(prediction_data, batch_size=batch_size, shuffle=False)


Number of test sentences: 8,106



In [None]:
import torch.nn.functional as F

# Run on the GPU when available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Optionally restore a specific checkpoint before predicting:
#model.load_state_dict(torch.load('/content/drive/MyDrive/weights_roberta_epochs6.pth'))
print('Predicting labels for {:,} test sentences...'.format(len(input_ids)))

model.eval()

# Per-batch class probabilities and the (placeholder) labels of each batch.
predictions, true_labels = [], []

for batch in prediction_dataloader:

    # Each batch is (input ids, attention mask, labels); move it to the device.
    b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        result = model(b_input_ids,
                       token_type_ids=None,
                       attention_mask=b_input_mask,
                       return_dict=True)

    logits = result.logits

    # Convert logits to class probabilities and move them to the CPU right
    # away, so the accumulated results do not stay in GPU memory.
    predictions.append(F.softmax(logits, dim=1).cpu())

    label_ids = b_labels.to('cpu').numpy()
    true_labels.append(label_ids)

print('    DONE.')

Predicting labels for 8,106 test sentences...
    DONE.


In [None]:
# Stack the per-batch probability tensors into one array (one row per test
# example) and take the most probable class of each row as the prediction.
probs_robert = np.array(torch.cat(predictions).cpu())
preds_robert = probs_robert.argmax(axis=1)

In [None]:
# Pair every test uid with its predicted class index and write the result to Drive.
# NOTE(review): `index=False` is not passed, so the DataFrame index is written
# as an extra unnamed first column -- confirm the expected submission format.
submission_weights = pd.DataFrame({
    'uid': list(df['uid']),
    'target_ind': preds_robert,
})
submission_weights.to_csv('/content/drive/MyDrive/submission_robert_10.csv')