In [None]:
!pip install torchtext==0.10.0
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, Dataset

from sklearn.metrics import roc_auc_score

import re


from tqdm.notebook import tqdm

from typing import *
import string

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')

In [None]:
from transformers import DistilBertTokenizer, AdamW
from transformers import DistilBertModel, DistilBertConfig, DistilBertForSequenceClassification

In [None]:
SEED = 42
EPOCHS = 2
SEQ_SIZE = 150
BATCH_SIZE = 32
PRE_TRAINED_MODEL_NAME = "distilbert-base-uncased"

In [None]:
tokenizer = DistilBertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [None]:
print((tokenizer.pad_token, tokenizer.pad_token_id), (tokenizer.sep_token, tokenizer.sep_token_id), 
      (tokenizer.cls_token, tokenizer.cls_token_id), (tokenizer.unk_token, tokenizer.unk_token_id))

('[PAD]', 0) ('[SEP]', 102) ('[CLS]', 101) ('[UNK]', 100)


In [None]:
#importing files
train = pd.read_csv('/content/drive/My Drive/DL/DL_Final_Project/source/train.csv')
test = pd.read_csv('/content/drive/My Drive/DL/DL_Final_Project/source/test.csv')
valid = pd.read_csv('/content/drive/My Drive/DL/DL_Final_Project/source/valid.csv')

In [None]:
train_texts = train['text'].values
train_labels = train['class'].values
test_texts = test['text'].values
test_labels = test['class'].values
valid_texts = valid['text'].values
valid_labels = valid['class'].values

In [None]:
train_texts[0]

'why why why questions keep piling every time read another one books subject again maybe im inherently defeatist surrendering utterly character loved deeply truly became mass rapist unwilling whore sure gone sorts mental gymnastics get far purchasing hardcover books every one mammoth tomes came actually managing summon zombies enthusiasm realize theres new book coming every time reading yet another ask damn question why first fair star book parts love easily definable wouldnt difficult task cut paste sections together one unified whole may reach hundred hundred fifty pages max sections ask start novel digressions manny interactions police finally end parts could actually take less fifth actual novel rest worthless whiny relationship shit boring sex drama whiny relationship shit boring sex boring sex entirely fair tiny bit less drama previous novels still filler many fucking lovers one woman fuck maintain deep meaningful relationships managing make every single reader lkhs books complet

In [None]:
class GoodreadsDataset(Dataset):

    def __init__(self, comments, targets, tokenizer, max_len):
        assert len(comments) == len(targets)
        self.comments = comments
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.comments)

    def __getitem__(self, item):
        comment = str(self.comments[item])
        target = self.targets[item]

        encoding = self.tokenizer.encode_plus(comment,
                                              add_special_tokens=True,
                                              max_length=self.max_len,
                                              return_token_type_ids=False,
                                              pad_to_max_length=True,
                                            #   padding='max_length',
                                              return_attention_mask=True,
                                              return_tensors='pt',
                                             )
        return {'review_text': comment,
                'input_ids': encoding['input_ids'].flatten(),
                'attention_mask': encoding['attention_mask'].flatten(),
                'targets': torch.tensor(target, dtype=torch.long)}

In [None]:
def create_data_loader(df: pd.DataFrame, tokenizer, max_len: int, batch_size: int):
    ds = GoodreadsDataset(comments=df.text.to_numpy(),
                        targets=df['class'].to_numpy(),
                        tokenizer=tokenizer,
                        max_len=max_len)

    return DataLoader(ds, batch_size=batch_size)


In [None]:
def set_seed(seed):
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)

In [None]:
set_seed(SEED)

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [None]:
config = DistilBertConfig.from_pretrained(PRE_TRAINED_MODEL_NAME)
config.num_labels = len(train_labels)
config.problem_type = "single_label_classification"
config.classifier_dropout = 0.2
config.return_dict = True

In [None]:
model = DistilBertForSequenceClassification(config)
model = model.to(device)
optim = torch.optim.Adam(model.parameters(), lr=2e-5)

In [None]:
train_dataloader = create_data_loader(df=train, tokenizer=tokenizer, max_len=SEQ_SIZE, batch_size=BATCH_SIZE)
val_dataloader = create_data_loader(df=valid, tokenizer=tokenizer, max_len=SEQ_SIZE, batch_size=1)
test_dataloader = create_data_loader(df=test, tokenizer=tokenizer, max_len=SEQ_SIZE, batch_size=1)

In [None]:
def train_epoch_for_hf(model, data_loader: DataLoader, device: torch.device, optimizer):
    """
    hf = huggingface.
    """
    model.train()

    for batch in tqdm(data_loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        targets = batch["targets"].to(device)
        
        optimizer.zero_grad()

        with torch.set_grad_enabled(True):
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=targets)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

In [26]:
def evaluate_for_hf(model, data_loader: DataLoader, device: torch.device):
    model.eval()
    losses = []
    score = None

    for idx, batch in enumerate(tqdm(data_loader)):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        targets = batch["targets"].to(device)
        with torch.set_grad_enabled(False):
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=targets)
            if idx == 0:
                score =  outputs.logits.cpu()
            else:
                score = torch.cat((score, outputs.logits.cpu()))
            losses.append(outputs.loss.item())
    return score, np.mean(losses)

In [None]:
optimizer = AdamW(model.parameters(), lr=2e-5)



In [None]:
train.dtypes

class             int64
text             object
review_length     int64
dtype: object

In [25]:
best_val_loss = 9999.
print('====START TRAINING====')
for epoch in tqdm(range(EPOCHS)):
     print('-' * 10)
     train_epoch_for_hf(model=model, data_loader=train_dataloader, optimizer=optimizer, device=device)
     _, tr_loss = evaluate_for_hf(model=model, data_loader=train_dataloader, device=device)
     val_pred, val_loss = evaluate_for_hf(model=model, data_loader=val_dataloader, device=device)
     y_pred_np = val_pred.numpy()
     val_auc = roc_auc_score(valid['class'].to_numpy(), y_pred_np)
     if val_loss < best_val_loss:
         best_val_loss = val_loss
         torch.save(model.state_dict(), 'distill_bert.pt')
     print(f'Epoch {epoch + 1}/{EPOCHS}', f'train loss: {tr_loss:.4},', f'val loss: {val_loss:.4},', f'val auc: {val_auc:.4}')

====START TRAINING====


  0%|          | 0/2 [00:00<?, ?it/s]

----------


  0%|          | 0/703 [00:00<?, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


  0%|          | 0/703 [00:00<?, ?it/s]

RuntimeError: ignored

In [None]:
model = DistilBertForSequenceClassification(config)
model.load_state_dict(torch.load('../input/jigsav-distill-bert/distill_bert.pt'))
model = model.to(device)

In [None]:
test_pred, test_loss = evaluate_for_hf(model=model, data_loader=test_dataloader, device=device)
print('====TEST RESULT====')
print(f'Log loss: {test_loss:.5}')
y_pred_np = test_pred.numpy()
test_auc = roc_auc_score(df_test[y_label].to_numpy(), y_pred_np)
print(f'ROC AUC: {test_auc:.5}')

  0%|          | 0/1926 [00:00<?, ?it/s]



In [None]:
final_model = DistilBertForSequenceClassification(config)
model.load_state_dict(torch.load('../input/jigsav-final-distill-bert/final_distill_bert.pt'))
final_model = model.to(device)

Epoch: 0001/0003 | Batch 0000/1405 | Loss: 0.6895
