# Flair Classification using BERT

In [0]:
# downloading Transformers library by Hugging Face
!pip install transformers



In [0]:
!pip install emoji



## Data Prepocess
For classification simply putting the whole raw data into the model is not enough.<br>
To get Good results we have to first decide what data is to be considered for training and then preprocess the data.

In [0]:
# importing required libraries for data preprocessing
import pandas as pd
import numpy as np
import re
import emoji
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from nltk.corpus import stopwords

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [0]:
# loading the training data csv file
train_df = pd.read_csv('/content/train_data.csv')
train_df.head()

Unnamed: 0,PostUploadTime,Headline,Flair,Rating,UserName,UserPostKarma,UserCommentKarma,Id,PostSource,NumComments,Comments,PostBody
0,1587106000.0,Man Arrested For Allegedly Spitting On Manipur...,Non-Political,111,ReallyRedditLover,1008.0,116.0,g2ph3l,https://www.ndtv.com/mumbai-news/man-arrested-...,7,">The accused was identified as **Amir Khan** ,...",
1,1587106000.0,Bought air tickets during lockdown? Full refun...,Coronavirus,8,ReallyRedditLover,1008.0,116.0,g2pg1d,https://timesofindia.indiatimes.com/business/i...,2,"So, If I booked the ticket on 21st for 26th Ma...",
2,1587105000.0,Bollywood Nazis: A Tale of Two Sisters,Politics,19,iSalaamU,32174.0,14938.0,g2pd5o,https://www.reddit.com/r/india/comments/g2pd5o...,7,Woah so permanent ban right ? Not temporary .....,"Yesterday, Twitter permanently suspended the a..."
3,1587105000.0,"On this occasion of Rongāli bihu, I'd love to ...",Non-Political,10,deboo117,8370.0,2700.0,g2p97t,https://open.spotify.com/playlist/21uBCM64wmKV...,0,,
4,1587104000.0,A Tweet Asking For Muslims And Journalists To ...,Coronavirus,1,,,,g2ovv5,https://www.buzzfeednews.com/article/pranavdix...,1,,[deleted]


In [0]:
# loading validation data csv
valid_df = pd.read_csv('valid_data.csv')
valid_df.head()

Unnamed: 0,PostUploadTime,Headline,Flair,Rating,UserName,UserPostKarma,UserCommentKarma,Id,PostSource,NumComments,Comments,PostBody
0,1587444000.0,I want to be calm like that guy sitting on the...,Photography,4,OtsuKotsu,1209.0,2681.0,g50xtt,https://i.redd.it/kjkywsmcc1u41.jpg,0,,
1,1587444000.0,Coronavirus Proof Greetings,Coronavirus,1,Brilliant_Bharat,1.0,0.0,g50xbz,https://www.brilliantbharat.com/2020/04/19/cor...,0,,
2,1587444000.0,1.7 million dollar cost project in Pakistan.,Business/Finance,1,Deejhons786,1330.0,22.0,g50uin,https://youtu.be/GsUWJ9MTHWc,0,,
3,1587444000.0,Refer and Earn- Rs. 5 per refer - No minimum w...,Business/Finance,1,HashOneandOnly,1.0,0.0,g50q4h,https://www.reddit.com/r/india/comments/g50q4h...,0,,[removed]
4,1587444000.0,Amazon reviews gold ! A 'massager' that clearl...,Non-Political,6,zakiiboy,15811.0,2902.0,g50nld,https://i.redd.it/5tuf59qq91u41.jpg,1,"Yes, the product description is not a giveaway...",


***Function to process text data and remove non neccessary information***<br>
 The flow of text while preprocessing->
  1. converting emojis to text
  2. lower casing
  3. removing punctuations, urls
  4. removing stop words
  5. lemmatization

In [0]:
def textPreProcess(text, rem_stop=False):
  """
  Function to process text data and remove non neccessary information
  1. converting emojis to text
  2. lower casing
  3. removing punctuations, urls
  4. removing stop words
  5. lemmatization 
  Args:
    text (str): text data
    rem_stop (bool): whether to remove stopping words or not
  Return:
    text (str): processed text data
  """
  # converting emojis to text
  text = emoji.demojize(text)
  # removing empty space at start and end of text and lower casing
  text = text.strip().lower()
  # removing punctuation
  PUNCTUATIONS = '[!()\-[\]{};:"\,<>/?@#$%^&.*_~]'
  text = re.sub(PUNCTUATIONS, "", text)
  # removing url links
  urlPattern = re.compile(r'https?://\S+|www\.\S+')
  urlPattern.sub('', text)

  # updating stopping words list
  if rem_stop:
    stopWords = list(stopwords.words('english'))
  else:
    stopWords = []

  # lemmatizing, removing stop word, removing emojis
  lemmaWords=[]
  Lemma=WordNetLemmatizer()
  for word in text.split():
    # removing stop words
    if word not in stopWords:
      lemmaWords.append(Lemma.lemmatize(word.strip()))
  text = " ".join(lemmaWords)

  return text

In [0]:
# checking for all the unique flairs in the data
train_df['Flair'].unique()

array(['Non-Political', 'Coronavirus', 'Politics', 'Policy/Economy',
       'Food', 'Science/Technology', 'Business/Finance', 'Photography',
       'Sports'], dtype=object)

In [0]:
# creating a dictionary(encoder) for converting flair category to integer and reverse
category = {'Non-Political':0, 'Coronavirus':1, 'Politics':2,
            'Policy/Economy':3, 'Food':4, 'Science/Technology':5,
            'Business/Finance':6, 'Photography':7, 'Sports':8}

For training the Text classifier, as feature I will use concat all the text present in the ```Headline```, ```PostBody``` and ```Comments``` as a single ```text``` body. And then apply text preprocessing

In [0]:
def getClasData(df):
  # Concatenating all the text features into one text body
  data = pd.DataFrame(columns=['Text', 'Flair'])
  data['Text'] = df['Headline'].map(str) + df['PostBody'].map(str) + df['Comments'].map(str)
  data['Flair'] = df['Flair'].map(str)

  # apply text processing
  data['Text'] = data['Text'].apply(lambda text: textPreProcess(text))
  data['Flair'] = data['Flair'].apply(lambda flair: category[flair])
  return data

In [0]:
data_trn = getClasData(train_df)
data_vld = getClasData(valid_df)

In [0]:
data_trn.head()

Unnamed: 0,Text,Flair
0,man arrested for allegedly spitting on manipur...,0
1,bought air ticket during lockdown full refund ...,1
2,bollywood nazi a tale of two sistersyesterday ...,2
3,on this occasion of rongāli bihu i'd love to s...,0
4,a tweet asking for muslim and journalist to be...,1


In [0]:
data_vld.head()

Unnamed: 0,Text,Flair
0,i want to be calm like that guy sitting on the...,7
1,coronavirus proof greetingsnannan,1
2,17 million dollar cost project in pakistannannan,6
3,refer and earn r 5 per refer no minimum withdr...,6
4,amazon review gold a 'massager' that clearly i...,0


In [0]:
# save the cleaned processed data int csv files
data_trn.to_csv('clean_train_data.csv')
data_vld.to_csv('clean_valid_data.csv')

## Defining model and preparing dataloader

In [0]:
# Importing required libraries, modules for text classification
import transformers
from transformers import BertForSequenceClassification, BertTokenizer, BertConfig
from transformers import AdamW, AdamWeightDecay
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm
import random

In [0]:
# using pretrained Bert tokenizer which has a vocab size of 300k small case words
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [0]:
# Using pretrained BertModel for text Classification from Hugging face
# num_labels is the number of categories in the data
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=9)
# loading model onto the gpu
model = model.cuda()

In [0]:
class RedditData(Dataset):
  """ """
  def __init__(self, path, maxlen):
    # read csv data file
    self.df = pd.read_csv(path)
    # maximum length of the text to be considered
    self.maxlen = maxlen
    # tokenizer
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

  def __len__(self):
    # returns the lenght of dataFrame
    return len(self.df)

  def __getitem__(self, idx):
    """Returns the text tokens and labels based on the index(idx)"""
    # text data
    text = self.df['Text'].iloc[idx]
    # label(flair) of the text
    label = self.df['Flair'].iloc[idx]

    # get token ids and attention mask
    tokens_dict = self.tokenizer.encode(text =text,
                          add_special_tokens=True,
                          max_length = self.maxlen,
                          pad_to_max_length=True,
                          padding_side = 'left')
    
    token_ids, labels = torch.tensor(tokens_dict['input_ids']), torch.tensor(label)
    attn_masks = torch.tensor(tokens_dict['attention_mask'])

    return token_ids, label, attn_masks

In [0]:
max_len = 350
bs = 16

train_set = RedditData('/content/clean_train_data.csv', max_len)
test_set = RedditData('/content/clean_valid_data.csv', max_len)
# creating dataloader
trainloader = DataLoader(train_set, shuffle = True, batch_size=bs)
testloader = DataLoader(test_set, shuffle = True, batch_size=bs)

In [0]:
# Optimizer for training
lr = 2e-5
optimizer = AdamW(model.parameters(), lr = lr)

In [0]:
# setting device for training
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'

In [0]:
def getAccuracy(preds, labels):
  """
  Function to calculate the Accuracy of the model
  Args:
      preds:  output of the model(predicted categories)
      labels: target values
  """
  # number of predictions
  n = preds.shape[0]
  # detaching from cuda
  preds = preds.detach().to('cpu')
  labels = labels.detach().to('cpu')
  # matching the dimensions of both prediction tensor and label tensor
  preds = preds.argmax(dim=-1).view(n, -1)
  labels = labels.view(n, -1)
  return ((preds==labels).sum()).float()/n

In [0]:
def seed_all(seed_value):
    """Function to random seed all the gpu and cpu variables for 
       regeneration of same results at the time of testing as they were at training
    """
    random.seed(seed_value) # Python
    torch.manual_seed(seed_value) # cpu vars
    
    if torch.cuda.is_available(): 
        torch.cuda.manual_seed(seed_value) # gpu variables

In [0]:
import time
def getTime(clk):
  '''
  function to convert seconds time into proper format
  Args:
    clk: time in seconds
  '''
  print('Time Taken:', time.strftime("%H:%M:%S",time.gmtime(clk)))

### Fine Tuning pretrained model
The model that I am using is pretrained on small cased english sentences from the data on the internet. But inorder to get best result for my data i will fine tune the model on my data

Here is the complete code for the training and evaluating model

In [0]:
EPOCHS = 10
seed = 59

# to regenerate the same results
seed_all(seed)
curr_time = 0
loss_list =[]
acc_list = []

train_time = 0

# complete training+validation time
for epoch in range(EPOCHS):
    print("-------- Epoch: {} --------".format(epoch+1))

    #########################
    #       Training        #
    #########################
    
    # training time in one epoch
    curr_time = time.time()
    model.train()
    batch_loss=0
    pbar = tqdm(enumerate(trainloader), leave = False, total = len(trainloader))
    for idx, (tokens, labels, masks) in pbar:
        tokens = tokens.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        model.zero_grad()
        
        output = model(tokens,
                    labels = labels,
                    attention_mask= masks)
        loss = output[0]
        loss_list.append(loss)
        batch_loss+=loss.item()

        loss.backward()
        optimizer.step()

    ##########################
    #       Validation       #
    ##########################
    model.eval()
    avg_acc=0
    with torch.no_grad():
        pbar = tqdm(enumerate(testloader), leave = False, total = len(testloader))
        for idx, (tokens, labels, masks) in pbar:
            tokens = tokens.to(device)
            labels = labels.to(device)
            masks = masks.to(device)

            output = model(tokens,
                          attention_mask = masks)
            # output=> probabilities
            valid_acc = getAccuracy(output[0], labels)
            avg_acc+=valid_acc
            acc_list.append(valid_acc)

    train_time = time.time() - curr_time

    print('Epoch: {}/{} | Training loss: {:.4f} | Accuracy: {:.4f}'.format(epoch+1, EPOCHS, batch_loss/len(trainloader), avg_acc.item()/len(testloader)))
    getTime(train_time)
    print("")

-------- Epoch: 1 --------


HBox(children=(IntProgress(value=0, max=2207), HTML(value='')))



HBox(children=(IntProgress(value=0, max=25), HTML(value='')))

Epoch: 1/10 | Training loss: 1.2929 | Accuracy: 0.6200
Time Taken: 00:37:26

-------- Epoch: 2 --------


HBox(children=(IntProgress(value=0, max=2207), HTML(value='')))



HBox(children=(IntProgress(value=0, max=25), HTML(value='')))

Epoch: 2/10 | Training loss: 0.8878 | Accuracy: 0.6250
Time Taken: 00:37:24

-------- Epoch: 3 --------


HBox(children=(IntProgress(value=0, max=2207), HTML(value='')))



HBox(children=(IntProgress(value=0, max=25), HTML(value='')))

Epoch: 3/10 | Training loss: 0.5933 | Accuracy: 0.6500
Time Taken: 00:37:25

-------- Epoch: 4 --------


HBox(children=(IntProgress(value=0, max=2207), HTML(value='')))



HBox(children=(IntProgress(value=0, max=25), HTML(value='')))

Epoch: 4/10 | Training loss: 0.3702 | Accuracy: 0.6462
Time Taken: 00:37:24

-------- Epoch: 5 --------


HBox(children=(IntProgress(value=0, max=2207), HTML(value='')))



HBox(children=(IntProgress(value=0, max=25), HTML(value='')))

Epoch: 5/10 | Training loss: 0.2204 | Accuracy: 0.6562
Time Taken: 00:37:25

-------- Epoch: 6 --------


HBox(children=(IntProgress(value=0, max=2207), HTML(value='')))

In [0]:
def saveModel(model, filepath, category):
    """
    Function to save model parameters for further use
    Args:
        model: training model
        filepath: (optional) model parameters are saved at "filepath"
    """
    filepath = filepath+'_model.pth'
        
    torch.save({
            'model' : model,
            'state_dict': model.state_dict(),
            'category': category
            }, filepath)
    print("Model saved at {}".format(filepath))

#### Saving fine tuned model

In [0]:
saveModel(model, '/content/bert_finetuned')

In [0]:
# def validate(model, testloader, acc_list):
#     model.eval()
#     avg_acc=0
#     with torch.no_grad():
#         pbar = tqdm(enumerate(testloader), leave = False, total = len(testloader))
#         for idx, (tokens, labels, masks) in pbar:
#             tokens = tokens.to(device)
#             labels = labels.to(device)
#             masks = masks.to(device)

#             output = model(tokens,
#                           attention_mask = masks)
#             # output=> probabilities
#             valid_acc = getAccuracy(output[0], labels)
#             avg_acc+=valid_acc
#             acc_list.append(valid_acc)
#     print(avg_acc/len(testloader))
#     # print('Epoch: {}/{} | Batch: {:4} | Training loss: {:.4f} | Accuracy: {:.4f}'.format(epoch+1, EPOCHS, idx+1, loss.item(), avg_acc.item()//(idx+1)))
#     # print("")

In [0]:
# validate(model, testloader, acc_list)