# CS-433 Project 2 - Text Classification

This notebook contains all the steps needed to build a complete BERT text classifier.

Pre-processing functions are imported from 'preprocessing.py'.

The model was trained on Google Colab with an NVIDIA Tesla P100 GPU.

# 1. Setup

First, install transformers and datasets.

In [None]:
# Install transformers from the GitHub repository and the datasets package from PyPI.
!pip install git+https://github.com/huggingface/transformers.git
!pip install datasets

Check the current GPU information when training on Google Colab.

In [None]:
'''Show the GPU information reported by nvidia-smi, if a GPU is available'''
# torch.cuda.empty_cache()
!nvidia-smi

The following code connects the notebook to Google Drive.

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

## Import

In [None]:
import random
import csv
from os import mkdir

import numpy as np
import torch
from torch.nn.utils import clip_grad_norm_
from torch import nn
from transformers import Trainer

from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_cosine_schedule_with_warmup, BertTokenizer
from transformers import BertModel
from transformers import TrainingArguments

from tqdm.auto import tqdm, trange
import pandas as pd
tqdm.pandas()

from datasets import Dataset

from sklearn.metrics import f1_score

from preprocessing import *

%load_ext autoreload
%autoreload 2

SEED = 517

## Set path and device

In [None]:
'''Select the GPU when CUDA is available, otherwise fall back to the CPU'''
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
'''Base directory: data files are read from and results are written below this path'''
# path = '/content/drive/My Drive/ml_bert/'
path = '../'

# 2. Preprocess

## 2.1 Load and Clean

In [None]:
'''Preprocess the raw positive/negative tweet files and write them to one labelled TSV file'''

with open(path+'data/train_pos_full.txt', 'r', encoding='utf-8') as pos_file,\
        open(path+'data/train_neg_full.txt', 'r', encoding='utf-8') as neg_file,\
        open(path+'data/train_clean.tsv', 'w', encoding='utf-8') as tsv_file:
    # Header row, then all negative tweets (label 0) followed by all positive tweets (label 1).
    print('label\ttweet', file=tsv_file)
    for label, source, desc in (('0', neg_file, 'Neg'), ('1', pos_file, 'Pos')):
        # total is only the expected line count used for the progress bar.
        for line in tqdm(source, total=1250000, desc=desc):
            print(label + '\t' + preprocess(line), file=tsv_file)

To save time, we can also directly load the already prepared train_clean.tsv.

In [None]:
'''Load the cleaned training data (columns: label, tweet) from the TSV file'''
# The TSV is written with plain print() and no quoting, so quote characters inside
# a tweet must be read literally. With the default quoting, a tweet containing a
# double quote can swallow the following lines into a single field.
train_df = pd.read_csv(path+'data/train_clean.tsv', delimiter='\t', index_col=False, quoting=csv.QUOTE_NONE)

Dropping duplicates saves training time; NA values lead to errors in the trainer.

In [None]:
'''Remove duplicated rows first, then rows that contain missing values'''
train_df = train_df.drop_duplicates().dropna()

## 2.2 Tokenize

In [None]:
'''Load the tokenizer that belongs to the bert-base-uncased checkpoint'''
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="bert-base-uncased",
)

In [None]:
'''Helper used to tokenize the datasets'''
def tokenize_text(example):
    '''Tokenize example['text'] with the module-level tokenizer.

    Every sequence is truncated and padded to exactly 64 tokens.
    Returns whatever the tokenizer call returns.
    '''
    texts = example['text']
    return tokenizer(texts, max_length=64, truncation=True, padding='max_length')

In [None]:
'''Convert the DataFrame to a Dataset without the pandas index and rename tweet -> text'''
# preserve_index=False keeps the pandas index out of the Dataset, so no
# '__index_level_0__' column is created. Removing that column afterwards fails
# when the index is still a default range, i.e. when no rows were dropped.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False).rename_column('tweet','text')

In [None]:
'''Apply tokenize_text to the training dataset in batches'''
train_dataset = train_dataset.map(
    tokenize_text,
    batched=True,
)

In [None]:
'''Store the tokenized training dataset so tokenization can be skipped next time'''
torch.save(
    train_dataset,
    path+'data/train_dataset.pt',
)

In [None]:
'''to load dataset'''
# Uncomment to reuse the tokenized dataset stored by the save cell instead of
# re-running the tokenization.
# train_dataset = torch.load(path+'data/train_dataset.pt')

In [None]:
'''Split the dataset into 95% training and 5% validation samples'''
train_len = int(0.95*len(train_dataset))
val_len = len(train_dataset) - train_len
# Seed the split with SEED so that the same validation set is rebuilt after a
# restart. With an unseeded split, resuming from a checkpoint would validate on
# samples the model has already been trained on.
train,val = torch.utils.data.random_split(
    train_dataset,
    [train_len,val_len],
    generator=torch.Generator().manual_seed(SEED),
)

## 2.3 Process Test Data

In [None]:
'''Read the test file; each line is split at its first comma into id and tweet'''
rows = []
with open(path+'data/test_data.txt', 'r', encoding='utf-8') as test_file:
    for raw_line in test_file:
        # Split only once so that commas inside the tweet stay part of the tweet.
        rows.append(raw_line.rstrip('\n').split(',', 1))
test_df = pd.DataFrame(rows, columns=['id', 'tweet'])

In [None]:
'''Run preprocess on every test tweet and keep the result as a one-column DataFrame'''
test_data = (
    test_df['tweet']
    .apply(preprocess)
    .to_frame()
)

In [None]:
'''Convert the cleaned test tweets to a Dataset, rename tweet -> text and tokenize'''
# batched=True matches how the training data is tokenized and lets the tokenizer
# process many rows per call instead of one row at a time.
test_data = Dataset.from_pandas(test_data).rename_column('tweet','text').map(tokenize_text, batched=True)

# 3. Trainer

## 3.1 Preparations for Trainer

Now we define parameters and functions for the trainer.

In [None]:
'''Hyper-parameters and settings used by the trainer cells'''

batch_size = 64
# logging_steps counts optimisation steps, not samples. Dividing by the batch size
# logs roughly once per epoch (assuming a single device and no gradient
# accumulation); using len(train) directly would exceed the total number of
# steps of the 10-epoch run, so nothing would ever be logged.
logging_steps = max(1, len(train) // batch_size)
output_dr = path
# Per-class weights for the cross-entropy loss; equal weights for both classes.
class_weight = torch.tensor([0.5,0.5])
LR = 3e-5
# NOTE(review): MAX_GRAD_NORM is not referenced by any other cell shown here.
MAX_GRAD_NORM = 1

In [None]:
'''Trainer subclass that uses a class-weighted cross-entropy loss'''

class WeightedLossTrainer(Trainer):
    '''Trainer whose loss is cross-entropy weighted by the module-level class_weight.'''

    def compute_loss(self, model, inputs, return_outputs = False, num_items_in_batch = None):
        '''Compute the weighted cross-entropy loss for one batch.

        Args:
            model: model called as model(**inputs); its output must provide 'logits'.
            inputs: batch mapping that contains the 'labels' entry.
            return_outputs: when True, return (loss, outputs) instead of only the loss.
            num_items_in_batch: accepted because recent Trainer versions pass it
                to compute_loss; it is not used here.

        Returns:
            The loss, or a (loss, outputs) tuple when return_outputs is True.
        '''
        outputs = model(**inputs)
        logits = outputs.get('logits')
        labels = inputs.get('labels')
        # Put the class weights on the same device as the logits instead of relying
        # on a global device variable.
        loss_func = nn.CrossEntropyLoss(weight = class_weight.to(logits.device))
        loss = loss_func(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [None]:
def compute_metrics(pred):
    '''Return the weighted F1 score of a prediction object.

    Reads pred.label_ids as the reference labels and takes the argmax over the
    last axis of pred.predictions as the predicted classes.
    Returns a dict with the single key 'f1'.
    '''
    predicted_classes = pred.predictions.argmax(-1)
    weighted_f1 = f1_score(pred.label_ids, predicted_classes, average='weighted')
    return {'f1': weighted_f1}

In [None]:
# Evaluation and checkpointing both happen every 20000 steps, and the best
# checkpoint is reloaded when training ends.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(
    output_dir=output_dr,
    num_train_epochs=10,
    learning_rate=LR,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    evaluation_strategy='steps',
    eval_steps=20000,
    logging_steps=logging_steps,
    save_steps=20000,
    push_to_hub=False,
    load_best_model_at_end=True,
)


## 3.2 Training the trainer

In [None]:
'''Start from the pre-trained bert-base-uncased checkpoint with num_labels=2 and move it to the device'''
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels = 2)
model = model.to(device)

'''It is also practical to load the model from the checkpoint in case of a sudden disconnect or crash'''
# model = AutoModelForSequenceClassification.from_pretrained(path+'checkpoint-120000-1219', num_labels = 2)

In [None]:
# Build the trainer from the model, the training arguments, the metric function
# and the train/validation subsets.
trainer = WeightedLossTrainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train,
    eval_dataset=val,
)

In [None]:
# Run the training loop configured by training_args.
trainer.train()

# 4. Predict

In [None]:
'''use the finished model and trainer above, or load model from certain checkpoint as follows''' 
# Uncomment both lines to predict with a saved checkpoint instead of the model
# trained above.
# model = AutoModelForSequenceClassification.from_pretrained(path+'checkpoint-50000', num_labels = 2)
# trainer = WeightedLossTrainer(model = model, args=training_args, train_dataset=train, eval_dataset=val)

In [None]:
'''Run the trainer on the test data and keep the raw model predictions'''
prediction_output = trainer.predict(test_data)
preds = prediction_output.predictions

In [None]:
'''Turn each pair of class scores into a label: -1 when the first score is larger, else 1'''
# Ties go to 1, because -1 is chosen only when the first score is strictly larger.
test_label = [-1 if scores[0] > scores[1] else 1 for scores in preds]

In [None]:
'''Write the ids and predicted labels to submission_bert.csv (columns: Id, Prediction)'''
test_id = test_df['id'].values.tolist()
# newline='' leaves line endings to the csv module; without it, platforms that
# translate '\n' on write produce an extra blank line after every row.
with open(path+'submission_bert.csv', 'w', newline='') as csvfile:
    fieldnames = ['Id', 'Prediction']
    writer = csv.DictWriter(csvfile, delimiter=",", fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(
        {'Id': int(sample_id), 'Prediction': int(label)}
        for sample_id, label in zip(test_id, test_label)
    )