This notebook trains a BERT-style model (CamemBERT or FlauBERT) for sentiment classification and uses it to generate predictions.

In [None]:
import pandas as pd
import numpy as np
import re
import string

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

import torch
torch.cuda.set_device('cuda:1')

from collections import defaultdict

import torch.nn as nn
import torch.nn.functional as F
from torch.utils import data
from torch.utils.data import DataLoader, Dataset
from torch.autograd import Variable

from sklearn.model_selection import GroupKFold, StratifiedKFold, train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import transformers
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup, get_cosine_with_hard_restarts_schedule_with_warmup

from transformers import AutoModel, AutoTokenizer, AutoConfig

# Parameters

In [None]:
# Candidate pretrained checkpoints: first group is CamemBERT, second is FlauBERT.
model_options = [
    [
        'camembert-base',
    ],
    [
        'flaubert/flaubert_small_cased',
        'flaubert/flaubert_base_uncased',
        'flaubert/flaubert_base_cased',
        'flaubert/flaubert_large_cased',
    ],
]

# Checkpoint used by the rest of the notebook: [group index][position in group].
model_name = model_options[0][0]
print(model_name)

In [None]:
# Labelled training data, stored as a semicolon-separated CSV.
train_file = "sentiment_dataset/train_new.csv"

df_train_origin = pd.read_csv(
    train_file,
    sep=";",
)

In [None]:
# Tokenization, data and training hyper-parameters.
max_len = 150        # every encoded tweet is padded/truncated to this many tokens
random_seed = 2020   # seed handed to the cross-validation splitter
device = "cuda:1"    # same GPU selected by torch.cuda.set_device in the import cell
class_names = ["positif", "negatif", "neutre"]
batch_size = 16
epochs = 5

# Pretrained tokenizer and encoder for the selected checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
# Derive the label count from class_names so the two cannot drift apart.
model.config.num_labels = len(class_names)

# Initialisation

In [None]:
class TweetDataset(Dataset):
    """Map-style dataset that tokenizes one tweet per lookup.

    Each item is a dict holding the raw text, the token ids and attention
    mask returned by the tokenizer (flattened to 1-D), and the label as a
    ``torch.long`` tensor.
    """

    def __init__(self, tweets, targets, tokenizer, max_len):
        # Texts and labels are indexed with the same position.
        self.tweets, self.targets = tweets, targets
        self.tokenizer = tokenizer
        # Sequences are padded or truncated to exactly this length.
        self.max_len = max_len

    def __len__(self):
        return len(self.tweets)

    def __getitem__(self, item):
        text = str(self.tweets[item])
        label = self.targets[item]

        encoded = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )

        sample = {'tweet_text': text}
        # flatten() collapses each returned tensor to a single dimension.
        for key in ('input_ids', 'attention_mask'):
            sample[key] = encoded[key].flatten()
        sample['targets'] = torch.tensor(label, dtype=torch.long)
        return sample

    
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False, num_workers=16):
    """Wrap a dataframe in a ``TweetDataset`` and return a ``DataLoader`` over it.

    Args:
        df: dataframe exposing a ``text`` column and a ``target`` column.
        tokenizer: tokenizer forwarded to ``TweetDataset``.
        max_len: token length forwarded to ``TweetDataset``.
        batch_size: number of samples per batch.
        shuffle: whether the loader reshuffles samples every epoch. Defaults
            to ``False`` (rows are served in dataframe order), which is what
            prediction code relies on to align outputs with rows.
        num_workers: number of loader worker processes. Defaults to 16.

    Returns:
        A ``DataLoader`` yielding the dict batches produced by ``TweetDataset``.
    """
    ds = TweetDataset(
        tweets=df.text.to_numpy(),
        targets=df.target.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len,
    )

    return DataLoader(
        ds,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
    )

class SentimentClassifier(nn.Module):
    """Encoder followed by dropout and a linear classification head.

    The head reads the hidden state at sequence position 0 of the encoder's
    first output and maps it to ``n_classes`` logits.

    Args:
        n_classes: number of output logits.
        encoder: optional encoder module. When omitted, the module-level
            ``model`` is used, so every classifier built that way shares the
            same encoder object.
    """

    def __init__(self, n_classes, encoder=None):
        super().__init__()
        self.model = model if encoder is None else encoder
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.model.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return unnormalized class logits for a batch of token ids."""
        # Use the encoder registered on this module, not the global `model`,
        # so .to(), state_dict() and forward all refer to the same weights.
        last_layer = self.model(input_ids=input_ids, attention_mask=attention_mask)[0]
        cls_embedding = last_layer[:, 0, :]
        output = self.drop(cls_embedding)
        return self.out(output)

def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    """Run one training pass over ``data_loader``.

    For every batch: forward pass, loss, backward pass, gradient-norm
    clipping at 1.0, optimizer step, scheduler step, then gradient reset.

    Returns:
        ``(accuracy, mean_loss)``: the number of correct predictions divided
        by ``n_examples`` (as a double tensor) and the NumPy mean of the
        per-batch loss values.
    """
    model = model.train()

    batch_losses = []
    n_correct = 0

    for batch in data_loader:
        ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)
        labels = batch["targets"].to(device)

        logits = model(input_ids=ids, attention_mask=mask)
        batch_loss = loss_fn(logits, labels)
        # Index of the largest logit per row is the predicted class.
        predicted = torch.max(logits, dim=1).indices

        n_correct += (predicted == labels).sum()
        batch_losses.append(batch_loss.item())

        batch_loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    accuracy = n_correct.double() / n_examples
    return accuracy, np.mean(batch_losses)

def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Score ``model`` on ``data_loader`` with gradients disabled.

    Returns:
        ``(accuracy, mean_loss)``: the number of correct predictions divided
        by ``n_examples`` (as a double tensor) and the NumPy mean of the
        per-batch loss values.
    """
    model = model.eval()

    batch_losses = []
    n_correct = 0

    with torch.no_grad():
        for batch in data_loader:
            ids = batch["input_ids"].to(device)
            mask = batch["attention_mask"].to(device)
            labels = batch["targets"].to(device)

            logits = model(input_ids=ids, attention_mask=mask)
            batch_loss = loss_fn(logits, labels)
            # Index of the largest logit per row is the predicted class.
            predicted = torch.max(logits, dim=1).indices

            n_correct += (predicted == labels).sum()
            batch_losses.append(batch_loss.item())

    accuracy = n_correct.double() / n_examples
    return accuracy, np.mean(batch_losses)

def get_predictions(model, data_loader):
    """Run ``model`` over ``data_loader`` and collect its outputs.

    Inputs are moved to the device of the model's first parameter (CPU when
    the model has no parameters), so the function does not depend on a
    module-level ``device`` variable.

    Args:
        model: callable module accepting ``input_ids`` and ``attention_mask``
            keyword arguments and returning per-class logits.
        data_loader: iterable of batches with ``tweet_text``, ``input_ids``,
            ``attention_mask`` and ``targets`` entries.

    Returns:
        ``(texts, predictions, prediction_probs, real_values)``: the list of
        all input texts in loader order, and three CPU tensors holding the
        predicted class index, the softmax probabilities and the targets for
        every sample. An empty loader yields an empty list and empty tensors.
    """
    model = model.eval()

    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    texts = []
    pred_batches = []
    prob_batches = []
    target_batches = []

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(run_device)
            attention_mask = d["attention_mask"].to(run_device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            _, preds = torch.max(outputs, dim=1)
            probs = F.softmax(outputs, dim=1)

            # Accumulate under a separate name: rebinding `texts` to the
            # current batch would discard everything collected so far.
            texts.extend(d["tweet_text"])
            pred_batches.append(preds.cpu())
            prob_batches.append(probs.cpu())
            target_batches.append(d["targets"].cpu())

    if not pred_batches:
        # Nothing to concatenate; return empty results instead of raising.
        return (
            texts,
            torch.empty(0, dtype=torch.long),
            torch.empty(0, 0),
            torch.empty(0, dtype=torch.long),
        )

    predictions = torch.cat(pred_batches)
    prediction_probs = torch.cat(prob_batches)
    real_values = torch.cat(target_batches)
    return texts, predictions, prediction_probs, real_values

def show_confusion_matrix(confusion_matrix):
    """Draw ``confusion_matrix`` as an annotated blue heatmap with labelled axes."""
    axes = sns.heatmap(confusion_matrix, annot=True, fmt="d", cmap="Blues")
    # Row labels stay horizontal; column labels are tilted by 30 degrees.
    for axis, angle in ((axes.yaxis, 0), (axes.xaxis, 30)):
        axis.set_ticklabels(axis.get_ticklabels(), rotation=angle, ha='right')
    plt.ylabel('True sentiment')
    plt.xlabel('Predicted sentiment')

# Training

In [None]:
# Stratified k-fold cross-validation. Every fold fine-tunes from the same
# initial weights and optimizer state; the best checkpoint seen across all
# folds and epochs is written to disk.
n = 4
skf = StratifiedKFold(n_splits=n, random_state=random_seed, shuffle=True)
results = []

sclf = SentimentClassifier(n_classes=len(class_names))
sclf = sclf.to(device)

optimizer = AdamW(sclf.parameters(), lr=1e-5, correct_bias=False)

loss_fn = nn.CrossEntropyLoss().to(device)

# Snapshot the untrained classifier and fresh optimizer so each fold can be
# reset to exactly this starting point.
torch.save(sclf.state_dict(), 'reset_model_state_clrep.bin')
torch.save(optimizer.state_dict(), 'reset_model_optimizer_clrep.bin')

global_best_accuracy = 0

for fold, (train_index, val_index) in enumerate(skf.split(df_train_origin, df_train_origin['target']), 1):
    df_train = df_train_origin.iloc[train_index]
    df_val = df_train_origin.iloc[val_index]

    print('\nFold {} Training Set Shape = {} - Validation Set Shape = {}'.format(fold, df_train.shape, df_val.shape))

    # Restore the snapshot saved above. Starting a fold from a previously
    # trained checkpoint would let it begin with weights that may already
    # have been fitted on this fold's validation rows, inflating the score.
    sclf.load_state_dict(torch.load('reset_model_state_clrep.bin'))
    sclf = sclf.to(device)

    optimizer.load_state_dict(torch.load('reset_model_optimizer_clrep.bin'))

    train_data_loader = create_data_loader(df_train, tokenizer, max_len, batch_size)
    val_data_loader = create_data_loader(df_val, tokenizer, max_len, batch_size)

    # The scheduler is stepped once per batch, so it spans batches * epochs.
    total_steps = len(train_data_loader) * epochs

    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )

    history = defaultdict(list)
    best_accuracy = 0

    for epoch in range(epochs):

        print(f'Epoch {epoch + 1}/{epochs}')
        print('-' * 10)

        train_acc, train_loss = train_epoch(
            sclf,
            train_data_loader,
            loss_fn,
            optimizer,
            device,
            scheduler,
            len(df_train)
        )

        print(f'Train loss {train_loss} accuracy {train_acc}')

        val_acc, val_loss = eval_model(
            sclf,
            val_data_loader,
            loss_fn,
            device,
            len(df_val)
        )

        print(f'Val   loss {val_loss} accuracy {val_acc}')
        print()

        history['train_acc'].append(train_acc)
        history['train_loss'].append(train_loss)
        history['val_acc'].append(val_acc)
        history['val_loss'].append(val_loss)

        if val_acc > best_accuracy:
            best_accuracy = val_acc
            # Only persist weights that beat every earlier fold and epoch.
            if val_acc > global_best_accuracy:
                global_best_accuracy = val_acc
                torch.save(sclf.state_dict(), 'global_best_model_state_new_clrep.bin')

    # float() accepts both the tensor returned by eval_model and the initial
    # int 0 (kept when no epoch scored above zero), unlike .tolist().
    fold_best = float(best_accuracy)

    print("**"*10)
    print("Best val accuracy : "+str(fold_best))
    print("**"*10)
    results.append(fold_best)

print("-"*10)
print(f"{n}-fold CV accuracy result: Mean: {np.mean(results)} Standard deviation:{np.std(results)}")
print("-"*10)
print(f"global best accuracy : {global_best_accuracy}")

In [None]:
# Rebuild the classifier and restore saved weights for evaluation.
# NOTE(review): the training cell writes 'global_best_model_state_new_clrep.bin';
# confirm that 'best_model_state.bin' is the intended checkpoint.
best_state = torch.load('best_model_state.bin')
sclf = SentimentClassifier(len(class_names))
sclf.load_state_dict(best_state)
sclf = sclf.to(device)

In [None]:
# Evaluate the loaded classifier on the validation split of every fold:
# confusion-matrix heatmap, accuracy and a per-class report.
# Pin the label ids so the matrix always has one row/column per class name,
# even if a class is missing from a split or from the predictions.
label_ids = list(range(len(class_names)))

for fold, (train_index, val_index) in enumerate(skf.split(df_train_origin, df_train_origin['target']), 1):
    # Only the validation rows are needed here; no training loader is built.
    df_val = df_train_origin.iloc[val_index]
    val_data_loader = create_data_loader(df_val, tokenizer, max_len, batch_size)

    y_texts, y_pred, y_pred_probs, y_test = get_predictions(sclf, val_data_loader)

    cm = confusion_matrix(y_test, y_pred, labels=label_ids)
    df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)

    # Reuse the shared plotting helper instead of an inline copy of it.
    show_confusion_matrix(df_cm)
    plt.show()

    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred, labels=label_ids, target_names=class_names))

# Predictions

In [None]:
# Rebuild the classifier and restore saved weights before predicting.
# NOTE(review): the training cell writes 'global_best_model_state_new_clrep.bin';
# confirm that 'best_model_state.bin' is the intended checkpoint.
best_state = torch.load('best_model_state.bin')
sclf = SentimentClassifier(len(class_names))
sclf.load_state_dict(best_state)
sclf = sclf.to(device)

In [None]:
# Data to label, stored as a pipe-separated CSV.
predict_file = "sentiment_dataset/dataset_small.csv"
df_predict = pd.read_csv(
    predict_file,
    sep="|",
)

## Preprocessing

In [None]:
import re
import os
import swifter
from tqdm import tqdm
import pickle

# Set the NumExpr thread-limit environment variable for this process.
# NOTE(review): NumExpr reads this variable when it is first imported; pandas
# is imported in the first cell and may already have loaded NumExpr, in which
# case this assignment comes too late to have any effect — confirm.
os.environ['NUMEXPR_MAX_THREADS'] = '45'

In [None]:
# create_data_loader reads a `target` column, so fill it with a placeholder
# label first; it is overwritten with the model's predictions below.
placeholder_label = 2
df_predict["target"] = [placeholder_label] * len(df_predict)

predict_data_loader = create_data_loader(df_predict, tokenizer, max_len, batch_size)
(
    texts,
    predictions,
    prediction_probs,
    real_values,
) = get_predictions(sclf, predict_data_loader)

# Replace the placeholder with the predicted class index of each row.
df_predict["target"] = predictions.tolist()

In [None]:
# Use the "Unnamed: 0" column as the row index and drop it from the columns.
df_predict = df_predict.set_index("Unnamed: 0", drop=True)

In [None]:
# Write the labelled rows as a semicolon-separated CSV with a header row.
df_predict.to_csv(
    "sentiment_dataset/predicted_small.csv",
    sep=";",
    header=list(df_predict.columns),
)