In [303]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import AutoModel, BertTokenizerFast

# specify GPU
device = torch.device("cuda")

In [304]:
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import unidecode
from sklearn.preprocessing import StandardScaler

In [305]:
df = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
label_code = {
    'EAP': 0,
    'HPL': 1,
    'MWS': 2
}
def label_code_apply(x):
    return label_code[x]
df['author'] = df['author'].apply(label_code_apply)
stop_words = set(stopwords.words("english"))


In [306]:
def punct(x):
    return len([e for e in x.lower() if not (e.isalnum() or e.isspace())])

def length(x):
    return len(x)

def preprocess(x):
    x = unidecode.unidecode(x)
    x = ''.join(e for e in x.lower() if (e.isalnum() or e.isspace()))
    # x = ' '.join(lemmatizer.lemmatize(token) for token in x.split(" "))
    # x = ' '.join(lemmatizer.lemmatize(token, "v") for token in x.split(" "))
    x = ' '.join(word for word in x.split(" ") if not word in stop_words)
    return x

In [307]:
df['num_punct'] = df['text'].apply(punct)
df['len'] = df['text'].apply(length)
df['text_processed'] = df['text'].apply(preprocess)

In [308]:
train_text, temp_text, train_labels, temp_labels = train_test_split(df[['text_processed', 'len', 'num_punct']], df['author'],  
                                                                    random_state=2018, 
                                                                    test_size=0.3, 
                                                                    stratify=df['author'])
#train_labels = pd.Series(map(lambda x: label_code[x], train_labels))
#temp_labels = pd.Series(map(lambda x: label_code[x], temp_labels))
# we will use temp_text and temp_labels to create validation and test set
val_text, test_text, val_labels, test_labels = train_test_split(temp_text, temp_labels, 
                                                                random_state=2018, 
                                                                test_size=0.5, 
                                                                stratify=temp_labels)

In [309]:
len_std = StandardScaler()
punct_std = StandardScaler()
len_std.fit(train_text['len'].values.reshape(-1,1))
punct_std.fit(train_text['len'].values.reshape(-1,1))

StandardScaler(copy=True, with_mean=True, with_std=True)

In [310]:
train_text['len'] = len_std.transform(train_text['len'].values.reshape(-1,1))
val_text['len'] = len_std.transform(val_text['len'].values.reshape(-1,1))
test_text['len'] = len_std.transform(test_text['len'].values.reshape(-1,1))

train_text['num_punct'] = punct_std.transform(train_text['num_punct'].values.reshape(-1,1))
val_text['num_punct'] = punct_std.transform(val_text['num_punct'].values.reshape(-1,1))
test_text['num_punct'] = punct_std.transform(test_text['num_punct'].values.reshape(-1,1))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead


In [None]:
# import BERT-base pretrained model
bert = AutoModel.from_pretrained('bert-base-uncased')

# Load the BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [312]:
bert = bert.to(device)
#op1 = bert(emb[:2], attention_mask = mask[:2])

In [313]:
max_seq_len = 50

In [314]:
# tokenize and encode sequences in the training set
tokens_train = tokenizer.batch_encode_plus(
    train_text['text_processed'].tolist(),
    max_length = max_seq_len,
    pad_to_max_length=True,
    truncation=True,
    return_token_type_ids=False
)

# tokenize and encode sequences in the validation set
tokens_val = tokenizer.batch_encode_plus(
    val_text['text_processed'].tolist(),
    max_length = max_seq_len,
    pad_to_max_length=True,
    truncation=True,
    return_token_type_ids=False
)

# tokenize and encode sequences in the test set
tokens_test = tokenizer.batch_encode_plus(
    test_text['text_processed'].tolist(),
    max_length = max_seq_len,
    pad_to_max_length=True,
    truncation=True,
    return_token_type_ids=False
)



In [315]:
train_text['num_punct']

12486   -1.295198
1421    -1.368867
16353   -1.359658
15073   -1.313615
13658   -1.350450
           ...   
9531    -1.313615
13329   -1.359658
9367    -1.322824
16296   -1.359658
2993    -1.368867
Name: num_punct, Length: 13705, dtype: float64

In [316]:
# for train set
train_seq = torch.tensor(tokens_train['input_ids'])
train_mask = torch.tensor(tokens_train['attention_mask'])
train_punct = torch.tensor(train_text['num_punct'].values, dtype=torch.float)
train_len = torch.tensor(train_text['len'].values, dtype=torch.float)
train_y = torch.tensor(train_labels.tolist())

# for validation set
val_seq = torch.tensor(tokens_val['input_ids'])
val_mask = torch.tensor(tokens_val['attention_mask'])
val_punct = torch.tensor(val_text['num_punct'].values, dtype=torch.float)
val_len = torch.tensor(val_text['len'].values, dtype=torch.float)
val_y = torch.tensor(val_labels.tolist())

# for test set
test_seq = torch.tensor(tokens_test['input_ids'])
test_mask = torch.tensor(tokens_test['attention_mask'])
test_punct = torch.tensor(test_text['num_punct'].values, dtype=torch.float)
test_len = torch.tensor(test_text['len'].values, dtype=torch.float)
test_y = torch.tensor(test_labels.tolist())

In [317]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

#define a batch size
batch_size = 32

# wrap tensors
train_data = TensorDataset(train_seq, train_mask, train_punct, train_len, train_y)

# sampler for sampling the data during training
train_sampler = RandomSampler(train_data)

# dataLoader for train set
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# wrap tensors
val_data = TensorDataset(val_seq, val_mask, val_punct, val_len, val_y)

# sampler for sampling the data during training
val_sampler = SequentialSampler(val_data)

# dataLoader for validation set
val_dataloader = DataLoader(val_data, sampler = val_sampler, batch_size=batch_size)

In [318]:
# freeze all the parameters
for param in bert.parameters():
    param.requires_grad = False

In [319]:
class BERT_Arch(nn.Module):

    def __init__(self, bert):
        super(BERT_Arch, self).__init__()
        self.bert = bert 
        # dropout layer
        self.dropout = nn.Dropout(0.1)
        # relu activation function
        self.relu =  nn.ReLU()
        # dense layer 1
        self.fc1 = nn.Linear(768,512)
        # dense layer 2 (Output layer)
        self.punc_emb = nn.Linear(1,12)
        self.len_emb = nn.Linear(1,12)
        self.fc2 = nn.Linear(512 + 12 + 12,3)
        #softmax activation function
        self.softmax = nn.LogSoftmax(dim=1)
    #define the forward pass
    def forward(self, sent_id, mask, punct, l):
        #pass the inputs to the model  
        bert_op = self.bert(sent_id, attention_mask=mask)
        x = self.fc1(bert_op[1])
        punct = self.punc_emb(punct.unsqueeze(1))
        l = self.len_emb(l.unsqueeze(1))
        x = torch.cat((x, punct, l), dim = -1)
        x = self.relu(x)
        x = self.dropout(x)
        # output layer
        x = self.fc2(x)
        return x

In [320]:
# pass the pre-trained BERT to our define architecture
model = BERT_Arch(bert)

# push the model to GPU
model = model.to(device)

In [321]:
# optimizer from hugging face transformers
from transformers import AdamW

# define the optimizer
optimizer = AdamW(model.parameters(), lr = 1e-5)

In [322]:
# loss function
cross_entropy  = nn.CrossEntropyLoss()

# number of training epochs
epochs = 50

In [323]:
# function to train the model
def train():
    model.train()
    total_loss, total_accuracy = 0, 0
      # empty list to save model predictions
    total_preds=[]
      # iterate over batches
    for step,batch in enumerate(train_dataloader):
        # progress update after every 50 batches.
        if step % 50 == 0 and not step == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_dataloader)))
        # push the batch to gpu
        batch = [r.to(device) for r in batch]
        sent_id, mask, punct, l, labels = batch
        # clear previously calculated gradients 
        model.zero_grad()        
        # get model predictions for the current batch
        preds = model(sent_id, mask, punct, l)
        # compute the loss between actual and predicted values
        loss = cross_entropy(preds, labels)
        # add on to the total loss
        total_loss = total_loss + loss.item()
        # backward pass to calculate the gradients
        loss.backward()
        # clip the the gradients to 1.0. It helps in preventing the exploding gradient problem
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        # update parameters
        optimizer.step()
        # model predictions are stored on GPU. So, push it to CPU
        preds=preds.detach().cpu().numpy()
        # append the model predictions
        total_preds.append(preds)

    # compute the training loss of the epoch
    avg_loss = total_loss / len(train_dataloader)

    # predictions are in the form of (no. of batches, size of batch, no. of classes).
    # reshape the predictions in form of (number of samples, no. of classes)
    total_preds  = np.concatenate(total_preds, axis=0)

    #returns the loss and predictions
    return avg_loss, total_preds

In [324]:
# function for evaluating the model
def evaluate():

    print("\nEvaluating...")
    # deactivate dropout layers
    model.eval()
    total_loss, total_accuracy = 0, 0

    # empty list to save the model predictions
    total_preds = []

    # iterate over batches
    for step,batch in enumerate(val_dataloader):

    # Progress update every 50 batches.
        if step % 50 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            # elapsed = format_time(time.time() - t0)
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(val_dataloader)))
        # push the batch to gpu
        batch = [t.to(device) for t in batch]
        sent_id, mask, punct, l, labels = batch
        # deactivate autograd
        with torch.no_grad():
            # model predictions
            preds = model(sent_id, mask, punct, l)
            # compute the validation loss between actual and predicted values
            loss = cross_entropy(preds,labels)
            total_loss = total_loss + loss.item()
            preds = preds.detach().cpu().numpy()
            total_preds.append(preds)
    # compute the validation loss of the epoch
    avg_loss = total_loss / len(val_dataloader) 

    # reshape the predictions in form of (number of samples, no. of classes)
    total_preds  = np.concatenate(total_preds, axis=0)

    return avg_loss, total_preds

In [325]:
# set initial loss to infinite
best_valid_loss = float('inf')

# empty lists to store training and validation loss of each epoch
train_losses=[]
valid_losses=[]

#for each epoch
for epoch in range(epochs):
     
    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))
    
    #train model
    train_loss, _ = train()
    
    #evaluate model
    valid_loss, _ = evaluate()
    
    #save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')
    
    # append training and validation loss
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)
    
    print(f'\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')


 Epoch 1 / 50
  Batch    50  of    429.
  Batch   100  of    429.
  Batch   150  of    429.
  Batch   200  of    429.
  Batch   250  of    429.
  Batch   300  of    429.
  Batch   350  of    429.
  Batch   400  of    429.

Evaluating...
  Batch    50  of     92.

Training Loss: 1.080
Validation Loss: 1.068

 Epoch 2 / 50
  Batch    50  of    429.
  Batch   100  of    429.
  Batch   150  of    429.
  Batch   200  of    429.
  Batch   250  of    429.
  Batch   300  of    429.
  Batch   350  of    429.
  Batch   400  of    429.

Evaluating...
  Batch    50  of     92.

Training Loss: 1.063
Validation Loss: 1.053

 Epoch 3 / 50
  Batch    50  of    429.
  Batch   100  of    429.
  Batch   150  of    429.
  Batch   200  of    429.
  Batch   250  of    429.
  Batch   300  of    429.
  Batch   350  of    429.
  Batch   400  of    429.

Evaluating...
  Batch    50  of     92.

Training Loss: 1.049
Validation Loss: 1.037

 Epoch 4 / 50
  Batch    50  of    429.
  Batch   100  of    429.
  Batc

KeyboardInterrupt: 

In [None]:
#load weights of best model
path = 'saved_weights.pt'
model.load_state_dict(torch.load(path))

In [None]:
# get predictions for test data
torch.cuda.empty_cache()
model.to('cpu')
#del train_data
with torch.no_grad():
    preds = model(test_seq, test_mask)
    preds = preds.detach().cpu().numpy()

In [40]:
preds = np.argmax(preds, axis = 1)
print(classification_report(test_y, preds))

              precision    recall  f1-score   support

           0       0.75      0.67      0.71      1185
           1       0.75      0.62      0.68       846
           2       0.62      0.81      0.70       906

    accuracy                           0.70      2937
   macro avg       0.71      0.70      0.70      2937
weighted avg       0.71      0.70      0.70      2937



In [41]:
# confusion matrix
pd.crosstab(test_y, preds)

col_0,0,1,2
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,795,118,272
1,152,523,171
2,118,54,734


In [23]:
test_seq = test_seq.detach().cpu()

In [34]:
torch.cuda.reset_max_memory_cached()



In [47]:
(df[df['author'] == 2])

Unnamed: 0,id,text,author
3,id27763,How lovely is spring As we looked from Windsor...,2
5,id22965,"A youth passed in solitude, my best years spen...",2
9,id00912,I confess that neither the structure of langua...,2
10,id16737,He shall find that I can feel my injuries; he ...,2
15,id12799,"He had escaped me, and I must commence a destr...",2
...,...,...,...
19563,id10563,Yet from whom has not that rude hand rent away...,2
19566,id00832,"These reflections made our legislators pause, ...",2
19569,id26790,Once my fancy was soothed with dreams of virtu...,2
19570,id14263,"Nay, you may have met with another whom you ma...",2
