## 1: Importing libraries and Loading data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
import nltk
nltk.download("punkt")
import string
nltk.download("stopwords")
from wordcloud import WordCloud
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import sigmoid_kernel
from sklearn.metrics.pairwise import cosine_similarity

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
import torch
from tqdm.notebook import tqdm

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import re
import os
import bz2

In [5]:
print(os.listdir("/content/drive/MyDrive/datasets"))

['test.ft.txt.bz2', 'train.ft.txt.bz2']


In [6]:
train_df=bz2.BZ2File("/content/drive/MyDrive/datasets/train.ft.txt.bz2")
test_df=bz2.BZ2File("/content/drive/MyDrive/datasets/test.ft.txt.bz2")

In [7]:
def load_extract(file):
  texts,labels=[],[]
  for line in file:
    x=line.decode('utf-8')
    labels.append(int(x[9])-1)
    texts.append(x[10:].strip())
  print("Done!")
  return np.array(labels),texts

In [8]:
train_labels,train_texts=load_extract(train_df)
test_labels,test_texts=load_extract(test_df)

Done!
Done!


In [9]:
df1=pd.DataFrame({"category":train_labels,"text":train_texts})
df1

Unnamed: 0,category,text
0,1,Stuning even for the non-gamer: This sound tra...
1,1,The best soundtrack ever to anything.: I'm rea...
2,1,Amazing!: This soundtrack is my favorite music...
3,1,Excellent Soundtrack: I truly like this soundt...
4,1,"Remember, Pull Your Jaw Off The Floor After He..."
...,...,...
3599995,0,Don't do it!!: The high chair looks great when...
3599996,0,"Looks nice, low functionality: I have used thi..."
3599997,0,"compact, but hard to clean: We have a small ho..."
3599998,0,what is it saying?: not sure what this book is...


In [10]:
df1["category"].value_counts()

1    1800000
0    1800000
Name: category, dtype: int64

In [11]:
df=pd.DataFrame(columns=["category","text"])
df

Unnamed: 0,category,text


In [12]:
df=df.append(df1.loc[df1["category"]==0][:10000])
df=df.append(df1.loc[df1["category"]==1][:10000])
df

  df=df.append(df1.loc[df1["category"]==0][:10000])
  df=df.append(df1.loc[df1["category"]==1][:10000])


Unnamed: 0,category,text
6,0,"Buyer beware: This is a self-published book, a..."
10,0,The Worst!: A complete waste of time. Typograp...
13,0,Oh please: I guess you have to be a romance no...
14,0,Awful beyond belief!: I feel I have to write t...
15,0,Don't try to fool us with fake reviews.: It's ...
...,...,...
19551,1,A Tree Grows in Brooklyn: This book was writte...
19552,1,Couldn't put it down.: This was a great book. ...
19553,1,Simply Wonderful!: A Tree Grows in Brooklyn is...
19554,1,Good Read: I read this book last month....shou...


In [13]:
df.reset_index(drop=True,inplace=True)

In [14]:
df

Unnamed: 0,category,text
0,0,"Buyer beware: This is a self-published book, a..."
1,0,The Worst!: A complete waste of time. Typograp...
2,0,Oh please: I guess you have to be a romance no...
3,0,Awful beyond belief!: I feel I have to write t...
4,0,Don't try to fool us with fake reviews.: It's ...
...,...,...
19995,1,A Tree Grows in Brooklyn: This book was writte...
19996,1,Couldn't put it down.: This was a great book. ...
19997,1,Simply Wonderful!: A Tree Grows in Brooklyn is...
19998,1,Good Read: I read this book last month....shou...


In [15]:
possible_labels = df.category.unique()
possible_labels

array([0, 1], dtype=object)

In [16]:
label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index

In [17]:
df.category = df['category'].map(label_dict)

##2:Train-Test Split

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
X_train, X_val, y_train, y_val = train_test_split(df.index.values, 
                                                  df.category.values, 
                                                  test_size=0.15, 
                                                  random_state=42,
                                                  stratify=df.category.values)

In [20]:
df['data_type'] = ['not_set']*df.shape[0]
df

Unnamed: 0,category,text,data_type
0,0,"Buyer beware: This is a self-published book, a...",not_set
1,0,The Worst!: A complete waste of time. Typograp...,not_set
2,0,Oh please: I guess you have to be a romance no...,not_set
3,0,Awful beyond belief!: I feel I have to write t...,not_set
4,0,Don't try to fool us with fake reviews.: It's ...,not_set
...,...,...,...
19995,1,A Tree Grows in Brooklyn: This book was writte...,not_set
19996,1,Couldn't put it down.: This was a great book. ...,not_set
19997,1,Simply Wonderful!: A Tree Grows in Brooklyn is...,not_set
19998,1,Good Read: I read this book last month....shou...,not_set


In [21]:
df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'

In [22]:
df.groupby(['category', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,text
category,data_type,Unnamed: 2_level_1
0,train,8500
0,val,1500
1,train,8500
1,val,1500


##3:Loading Tokenizer and Encoding our data

In [23]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m50.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 KB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m100.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.28.1


In [24]:
from transformers import BertTokenizer
from torch.utils.data import TensorDataset

In [25]:
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased', #bert-large-uncased
    do_lower_case=True
)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [26]:
encoded_data_train = tokenizer.batch_encode_plus(
    df[df.data_type=='train'].text.values,#get all the rows used for training 
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=256,
    return_tensors='pt'
)

encoded_data_val = tokenizer.batch_encode_plus(
    df[df.data_type=='val'].text.values,
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=256,
    return_tensors='pt'
)

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(df[df.data_type=='train'].category.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(df[df.data_type=='val'].category.values)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [27]:
dataset_train = TensorDataset(input_ids_train, 
                              attention_masks_train,
                              labels_train)

dataset_val = TensorDataset(input_ids_val, 
                            attention_masks_val,
                           labels_val)

##4:Setting up BERT pretrained model

In [28]:
from transformers import BertForSequenceClassification

In [29]:
model = BertForSequenceClassification.from_pretrained(
                                      'bert-base-uncased', 
                                      num_labels = len(label_dict),
                                      output_attentions = False,
                                      output_hidden_states = False
                                     )

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

##5:Creating Data loaders

In [30]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

In [31]:
dataset_train

<torch.utils.data.dataset.TensorDataset at 0x7fa8fcc1d6a0>

In [32]:
batch_size = 4

dataloader_train = DataLoader(
    dataset_train,
    sampler=RandomSampler(dataset_train),
    batch_size=batch_size
)

dataloader_val = DataLoader(
    dataset_val,
    sampler=RandomSampler(dataset_val),
    batch_size=32
)

##6:Setting up optimizer and Scheduler

In [33]:
from transformers import AdamW, get_linear_schedule_with_warmup

In [34]:
optimizer = AdamW(
    model.parameters(),
    lr = 1e-5,
    eps = 1e-8
)



In [35]:
epochs = 3

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps = len(dataloader_train)*epochs
)

##7:Defining our Performance metrics


In [36]:
from sklearn.metrics import f1_score

In [37]:
def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average = 'weighted')

In [38]:
def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}
    
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    
    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy:{len(y_preds[y_preds==label])}/{len(y_true)}\n')

##8:Creating our Training Loop

In [39]:
import random

seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [40]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(device)

cuda


In [41]:
def evaluate(dataloader_val):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in tqdm(dataloader_val):
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals

In [42]:
for epoch in tqdm(range(1, epochs+1)):
    model.train()
    loss_train_total = 0
    
    progress_bar = tqdm(dataloader_train, 
                        desc='Epoch {:1d}'.format(epoch), 
                        leave=False, 
                        disable=False)
    
    for batch in progress_bar:
        model.zero_grad()
        batch = tuple(b.to(device) for b in batch)
        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'labels': batch[2]
        }
        
        outputs = model(**inputs)
        loss = outputs[0]
        loss_train_total +=loss.item()
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        optimizer.step()
        scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})     
    
    torch.save(model.state_dict(), f'BERT_ft_Epoch{epoch}.model')
    
    tqdm.write('\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    val_loss, predictions, true_vals = evaluate(dataloader_val)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (weighted): {val_f1}')

torch.save(model,'BertModel')

  0%|          | 0/3 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/4250 [00:00<?, ?it/s]


Epoch {epoch}
Training loss: 0.28948798100098366


  0%|          | 0/94 [00:00<?, ?it/s]

Validation loss: 0.30634036798902015
F1 Score (weighted): 0.9409525980578854


Epoch 2:   0%|          | 0/4250 [00:00<?, ?it/s]


Epoch {epoch}
Training loss: 0.1456877109274364


  0%|          | 0/94 [00:00<?, ?it/s]

Validation loss: 0.3113537324865552
F1 Score (weighted): 0.9469919368846216


Epoch 3:   0%|          | 0/4250 [00:00<?, ?it/s]


Epoch {epoch}
Training loss: 0.05973229712247506


  0%|          | 0/94 [00:00<?, ?it/s]

Validation loss: 0.3461781035622017
F1 Score (weighted): 0.943665258298124


##9:Evaluating our model

In [43]:
accuracy_per_class(predictions, true_vals)

Class: 0
Accuracy:1408/1500

Class: 1
Accuracy:1423/1500



##*For second epoch ,we got max accuracy*