In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

In [2]:
import torch
from torch import nn
from torch.optim import Adam
from transformers import BertTokenizer
from transformers import BertModel
from tqdm import tqdm
from pathlib import Path
from typing import List, Dict, Any

In [3]:
# --- Paths -----------------------------------------------------------------
IN_PATH = '/kaggle/input/'
DATA_PATH = f'{IN_PATH}nlp-txt-classification/'
OUT_PATH = '/kaggle/working/'  # path for output directory

# --- Reproducibility -------------------------------------------------------
# One seed is pushed into every RNG this notebook touches.
SEED = 42
# NOTE(review): setting PYTHONHASHSEED at runtime presumably only affects
# child processes, not the already-running interpreter -- confirm if hash
# ordering matters here.
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True  # ask cuDNN for deterministic algorithms

# --- Model / run settings --------------------------------------------------
MODEL_NAME = 'bert-base-cased'
NUM_CLASSES = 5
MAX_VOCAB_SIZE = 250_000
BATCH_SIZE = 64
debug = 0

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [4]:
# Shell escape: run `nvidia-smi` to show which GPU(s), if any, this session sees.
!nvidia-smi

In [5]:
# Read the unlabeled test split from the competition data directory.
test_df = pd.read_csv(f'{DATA_PATH}test.csv')
test_df.head()

In [6]:
# Read the training split, keep only the text and label columns, and drop
# any row where either of them is missing.
train_df = (
    pd.read_csv(f'{DATA_PATH}train.csv')
    .loc[:, ['Text', 'Sentiment']]
    .dropna()
)
train_df.head()

In [7]:
# Distinct label strings present in the training data.
train_df.Sentiment.unique()

In [8]:
# Share of each label in the training data (fractions, not raw counts).
train_df['Sentiment'].value_counts(normalize=True)

In [9]:
# Bar chart of the number of training rows per label.
rows_per_sentiment = train_df.groupby(['Sentiment']).size()
rows_per_sentiment.plot.bar()

In [10]:
# Load the tokenizer that matches the pretrained checkpoint named by MODEL_NAME.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

In [11]:
# Tokenize one sentence, padded/truncated to 10 tokens, and show each field
# of the encoding the tokenizer returns.
example_text = 'Pretrained model using a masked language modeling (MLM) objective'
bert_input = tokenizer(
    example_text,
    padding='max_length',
    max_length=10,
    truncation=True,
    return_tensors="pt",
)

for field_name in ('input_ids', 'token_type_ids', 'attention_mask'):
    print(bert_input[field_name])

In [12]:
# Tokenizer used by the dataset and prediction code below.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

# Sentiment string -> integer class id, ordered from most negative (0)
# to most positive (4).
labels = {
    sentiment: class_id
    for class_id, sentiment in enumerate((
        'Extremely Negative',
        'Negative',
        'Neutral',
        'Positive',
        'Extremely Positive',
    ))
}

class Dataset(torch.utils.data.Dataset):
    """Map-style dataset of (tokenized text, integer label) pairs.

    Every row is tokenized eagerly in the constructor with the module-level
    ``tokenizer`` (padded/truncated to 512 tokens), and every sentiment
    string is translated to its class id through the module-level ``labels``
    mapping.
    """

    def __init__(self, train_df):
        """Build the dataset from a frame with 'Text' and 'Sentiment' columns."""
        encoded_labels = []
        for sentiment in train_df['Sentiment']:
            encoded_labels.append(labels[sentiment])
        self.labels = encoded_labels

        encoded_texts = []
        for raw_text in train_df['Text']:
            encoding = tokenizer(
                raw_text,
                padding='max_length',
                max_length=512,
                truncation=True,
                return_tensors="pt",
            )
            encoded_texts.append(encoding)
        self.texts = encoded_texts

    def classes(self):
        """Return the list of integer labels, one per row."""
        return self.labels

    def __len__(self):
        """Number of rows in the dataset."""
        return len(self.labels)

    def get_batch_labels(self, idx):
        """Return the label stored at ``idx`` wrapped in a NumPy array."""
        selected = self.labels[idx]
        return np.array(selected)

    def get_batch_texts(self, idx):
        """Return the tokenizer output stored at ``idx``."""
        return self.texts[idx]

    def __getitem__(self, idx):
        """Return ``(tokenized_text, label)`` for position ``idx``."""
        return self.get_batch_texts(idx), self.get_batch_labels(idx)

In [14]:
# Shuffle the rows once with a fixed seed, then cut 80% / 20% into the
# training and validation frames.
# Positional slicing gives the same two pieces as np.split(frame, [k]) but
# avoids calling np.split on a DataFrame, which relies on the deprecated
# DataFrame.swapaxes in recent pandas releases.
shuffled_df = train_df.sample(frac=1, random_state=SEED)
split_at = int(.8 * len(train_df))
train_df_, val_df_ = shuffled_df.iloc[:split_at], shuffled_df.iloc[split_at:]

print(len(train_df_), len(val_df_))

In [15]:
class BertClassifier(nn.Module):
    """Pretrained BERT encoder followed by dropout and a linear classifier.

    Args:
        num_classes: Number of output classes (width of the logit vector).
        dropout: Dropout probability applied to BERT's pooled output.
    """

    def __init__(self, num_classes: int, dropout: float = 0.5):
        super().__init__()
        self.bert = BertModel.from_pretrained(MODEL_NAME)
        self.dropout = nn.Dropout(dropout)
        # Take the encoder width from the loaded checkpoint's config instead
        # of hard-coding 768, so other BERT sizes work unchanged.
        self.linear = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_id, mask):
        """Return raw (unnormalised) class logits for the given inputs.

        Args:
            input_id: Token ids passed to BERT as ``input_ids``.
            mask: Attention mask passed to BERT as ``attention_mask``.

        Returns:
            The output of the linear head: one logit per class for each input.
        """
        _, pooled_output = self.bert(
            input_ids=input_id, attention_mask=mask, return_dict=False
        )
        dropout_output = self.dropout(pooled_output)
        # No activation on the logits: CrossEntropyLoss expects unnormalised
        # scores, and a ReLU here would clamp negative logits to zero and
        # block their gradients.
        return self.linear(dropout_output)

In [16]:
def train(model, train_data, val_data, learning_rate, epochs, batch_size=2):
    """Fine-tune ``model`` and print per-epoch train/validation metrics.

    Args:
        model: Module called as ``model(input_id, mask)`` that returns class
            logits.
        train_data: Frame with 'Text' and 'Sentiment' columns used for training.
        val_data: Frame with the same columns used for validation.
        learning_rate: Learning rate passed to Adam.
        epochs: Number of full passes over ``train_data``.
        batch_size: Rows per batch for both loaders (default 2, the value
            that used to be hard-coded).

    Returns:
        None. Metrics are printed once per epoch. The model is moved to the
        GPU when one is available and is left in eval mode on return.
    """
    train_set = Dataset(train_data)
    val_set = Dataset(val_data)
    train_dataloader = torch.utils.data.DataLoader(
        train_set, batch_size=batch_size, shuffle=True
    )
    val_dataloader = torch.utils.data.DataLoader(val_set, batch_size=batch_size)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = Adam(model.parameters(), lr=learning_rate)

    # Guard the averages below against empty frames.
    n_train = max(len(train_data), 1)
    n_val = max(len(val_data), 1)

    for epoch_num in range(epochs):
        # Enable dropout for the optimisation pass. This also switches the
        # encoder back to train mode (from_pretrained returns it in eval mode).
        model.train()
        total_acc_train = 0
        total_loss_train = 0.0

        for train_input, train_label in tqdm(train_dataloader):
            train_label = train_label.to(device=device, dtype=torch.long)
            # The dataset stores each encoding with a leading size-1 axis,
            # so batches carry an extra axis at position 1; drop it for both
            # tensors so they have matching shapes.
            mask = train_input['attention_mask'].squeeze(1).to(device)
            input_id = train_input['input_ids'].squeeze(1).to(device)

            optimizer.zero_grad()
            output = model(input_id, mask)
            batch_loss = criterion(output, train_label)
            batch_loss.backward()
            optimizer.step()

            # The criterion returns the batch mean; weight it by the batch
            # size so dividing by the row count gives a true per-row average.
            total_loss_train += batch_loss.item() * train_label.size(0)
            total_acc_train += (output.argmax(dim=1) == train_label).sum().item()

        # Disable dropout so validation metrics are deterministic.
        model.eval()
        total_acc_val = 0
        total_loss_val = 0.0

        with torch.no_grad():
            for val_input, val_label in val_dataloader:
                val_label = val_label.to(device=device, dtype=torch.long)
                mask = val_input['attention_mask'].squeeze(1).to(device)
                input_id = val_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)

                batch_loss = criterion(output, val_label)
                total_loss_val += batch_loss.item() * val_label.size(0)
                total_acc_val += (output.argmax(dim=1) == val_label).sum().item()

        print(' | '.join([
            f'Epochs: {epoch_num + 1}',
            f'Train Loss: {total_loss_train / n_train:.3f}',
            f'Train Accuracy: {total_acc_train / n_train:.3f}',
            f'Val Loss: {total_loss_val / n_val:.3f}',
            f'Val Accuracy: {total_acc_val / n_val:.3f}',
        ]))

In [17]:
# One output logit per entry in the `labels` mapping.
model = BertClassifier(num_classes=len(labels))

In [18]:
# First training run: a single epoch at a very small learning rate.
EPOCHS, LR = 1, 1e-6

train(model, train_df_, val_df_, learning_rate=LR, epochs=EPOCHS)

In [19]:
# Second training run: the same `model` object is passed in again, so these
# two epochs continue from the weights produced by the previous run.
EPOCHS, LR = 2, 1e-6

train(model, train_df_, val_df_, learning_rate=LR, epochs=EPOCHS)

In [20]:
def flip_dict(x: Dict[Any, Any]) -> Dict[Any, Any]:
    """Return a new dict that maps each value of ``x`` back to its key.

    Values must be hashable. If several keys share the same value, the key
    that appears last in ``x`` wins. ``x`` itself is not modified.
    """
    return {value: key for key, value in x.items()}

In [21]:
def predict(model, text: str, labels: Dict[int, str]):
    """Classify one text and return its label name.

    Args:
        model: Module called as ``model(input_id, mask)`` that returns class
            logits.
        text: The raw text to classify.
        labels: Mapping from integer class id to label name.

    Returns:
        ``labels[i]`` where ``i`` is the index of the largest logit.

    The forward pass runs in eval mode so dropout cannot randomise the
    prediction. Each sub-module's previous train/eval flag is restored
    afterwards, so calling this between training runs does not change how a
    later run behaves.
    """
    encoded = tokenizer(
        text,
        padding='max_length',
        max_length=512,
        truncation=True,
        return_tensors="pt",
    )
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # Remember every sub-module's mode; the encoder and the head can differ.
    previous_modes = [(module, module.training) for module in model.modules()]
    model.eval()
    try:
        with torch.no_grad():
            mask = encoded['attention_mask'].to(device)
            input_id = encoded['input_ids'].to(device)
            output = model(input_id, mask)
    finally:
        for module, was_training in previous_modes:
            module.training = was_training

    # Plain int so the lookup works with ordinary integer keys.
    idx = int(output.argmax())
    return labels[idx]

In [22]:
# Spot-check: classify the test row whose index label is 1, print its text,
# and display the predicted label.
x = test_df['Text'].loc[1]
pred = predict(model, x, labels=flip_dict(labels))
print(x)
pred

In [23]:
# Class id -> label name, built once and reused for every row.
pred_labels = flip_dict(labels)
# Classify each test text one by one and store the label name beside it.
test_df['Sentiment'] = [
    predict(model, text, labels=pred_labels)
    for text in test_df['Text']
]
test_df.head()

In [24]:
# Keep only the id and predicted label, then write them to submission.csv
# in the current working directory without the index column.
submission = test_df.loc[:, ['id', 'Sentiment']]
submission.to_csv('submission.csv', index=False)

In [32]:
# Persist the fine-tuned weights (state dict only) to the output directory.
torch.save(model.state_dict(), f'{OUT_PATH}BERT_Txt-classification')