In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.33.3-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m49.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m62.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m55.2 MB/s[0m eta [36m0:00:0

## This experiment is binary text classification: sentiment classification that labels each sentence as positive (1) or negative (0). In this experiment, I used a BERT model to get an embedding of the text and fed those embedding features to a LightGBM classifier, a gradient boosting method chosen for faster training. It got 84% accuracy.

## Import Libraries

In [None]:
import numpy as np
import pandas as pd
import torch
import transformers as ppb
import lightgbm as lgbm
import time
from sklearn.model_selection import train_test_split
import time

## Load dataset

In [None]:
# Headerless, tab-separated SST-2 training file: column 0 is the sentence, column 1 the 0/1 label.
SST2_TRAIN_URL = 'https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv'
df = pd.read_csv(SST2_TRAIN_URL, sep='\t', header=None)

In [None]:
# Show the dataset dimensions as (rows, columns).
df.shape

(6920, 2)

## Use only the first 3,000 samples for faster training

In [None]:
# Keep the first 3000 rows (positional slice) as the working subset.
used_batch = df.iloc[:3000]

In [None]:
# Confirm the size of the working subset as (rows, columns).
used_batch.shape

(3000, 2)

In [None]:
# Count how many samples carry each label (column 1) to check the class balance.
used_batch[1].value_counts()

1    1565
0    1435
Name: 1, dtype: int64

## Load Bert tokenizer and Bert Model

In [None]:
# Model class, tokenizer class and checkpoint name used for the BERT encoder.
model_class = ppb.BertModel
tokenizer_class = ppb.BertTokenizer
pretrained_weights = 'bert-base-uncased'

# Instantiate the tokenizer and the model from the same pretrained checkpoint.
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

In [None]:
# Display the tokenizer's repr to inspect its configuration
# (vocabulary size, maximum model length, padding side, special tokens).
tokenizer

BertTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

## Tokenize the text data

In [None]:
def _encode_sentence(sentence):
    """Return the list of token ids for one sentence, with the special tokens added."""
    return tokenizer.encode(sentence, add_special_tokens=True)


# One variable-length list of token ids per sentence (column 0 holds the text).
tokenized = used_batch[0].apply(_encode_sentence)

In [None]:
# Inspect the encoded sentences: one variable-length list of token ids per row.
tokenized.values

array([list([101, 1037, 18385, 1010, 6057, 1998, 2633, 18276, 2128, 16603, 1997, 5053, 1998, 1996, 6841, 1998, 5687, 5469, 3152, 102]),
       list([101, 4593, 2128, 27241, 23931, 2013, 1996, 6276, 2282, 2723, 1997, 2151, 2445, 12217, 7815, 102]),
       list([101, 2027, 3653, 23545, 2037, 4378, 24185, 1050, 1005, 1056, 4133, 2145, 2005, 1037, 11507, 10800, 1010, 2174, 14036, 2135, 3591, 1010, 2061, 2027, 19817, 4140, 2041, 1996, 7511, 2671, 4349, 3787, 1997, 11829, 7168, 9219, 1998, 28971, 2308, 1999, 8301, 8737, 2100, 4253, 102]),
       ..., list([101, 2433, 20922, 2278, 1010, 2021, 4569, 102]),
       list([101, 2045, 1005, 1055, 2070, 2204, 3430, 1999, 2037, 2466, 2055, 1037, 7027, 7805, 5782, 2062, 2041, 1997, 2166, 1010, 2021, 1996, 3185, 2205, 2411, 23371, 2049, 7787, 2007, 5220, 8146, 1998, 23563, 5019, 102]),
       list([101, 11317, 7545, 1996, 5372, 10652, 2000, 2010, 2535, 2004, 1048, 15185, 4463, 15803, 25269, 2497, 102])],
      dtype=object)

## Find the maximum sentence length and pad every sentence vector to that length

In [None]:
# Length of the longest encoded sentence; `default=0` keeps this well-defined for an empty input.
max_len = max((len(token_ids) for token_ids in tokenized.values), default=0)

# Right-pad every sequence with 0 up to `max_len` so the rows stack into one 2-D array.
padded = np.array([token_ids + [0] * (max_len - len(token_ids)) for token_ids in tokenized.values])

In [None]:
# `padded` is already a NumPy array, so its shape can be read directly: (samples, max_len).
padded.shape

(3000, 66)

## Add an attention mask so the model attends only to the actual tokens, not the padding

In [None]:
# 1 where `padded` holds a non-zero token id, 0 where it holds the padding value 0.
attention_mask = (padded != 0).astype(int)
attention_mask

array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])

## Feed the tokenized text to Bert Model to get Feature Embedding

In [None]:
# torch.as_tensor accepts both NumPy arrays and existing tensors, so re-running this cell after
# `attention_mask` has already been converted does not trigger torch.tensor's copy-construct warning.
input_ids = torch.as_tensor(padded)
attention_mask = torch.as_tensor(attention_mask)

# Put the model in evaluation mode explicitly so dropout cannot perturb the extracted features.
model.eval()

# Gradients are not needed for feature extraction; skipping them saves memory.
with torch.no_grad():
    # The returned object is indexed with [0] in the next cell to obtain the hidden states.
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

In [None]:
# Take the first element of the model output, then keep only sequence position 0
# (the [CLS] token) of every sample as its feature vector for classification.
hidden_states = last_hidden_states[0]
features = hidden_states[:, 0, :].numpy()

In [None]:
# Column 1 of the working subset holds the 0/1 sentiment labels.
labels = used_batch.loc[:, 1]

## Split Dataset

In [None]:
# Fixed seed so the split, and every metric computed from it, is reproducible across runs.
SPLIT_SEED = 42

# 60% train / 40% hold-out, stratified so both parts keep the same class ratio.
train_features, holdout_features, train_labels, holdout_labels = train_test_split(
    features, labels, test_size=0.4, stratify=labels, random_state=SPLIT_SEED)

# Split the hold-out half-and-half into test and validation sets.
test_features, val_features, test_labels, val_labels = train_test_split(
    holdout_features, holdout_labels, test_size=0.5, stratify=holdout_labels, random_state=SPLIT_SEED)

print(len(train_features),len(test_features),len(val_features))

1800 600 600


In [None]:
# Check the number of training labels.
train_labels.shape

(1800,)

## Train LightGBM

In [None]:
# Wrap the splits in LightGBM datasets.
train_data = lgbm.Dataset(train_features, label=train_labels)
test_data = lgbm.Dataset(test_features, label=test_labels)  # not passed to lgbm.train below
val_data = lgbm.Dataset(val_features, label=val_labels)

# Training parameters.
parameters = {
    # 'application' is an alias of 'objective', so only one of the two is given.
    'objective': 'binary',
    'metric': 'auc',
    'boosting': 'dart',
    'num_leaves': 31,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq': 20,
    'learning_rate': 0.05,
    # Suggested by LightGBM's own log message to remove the threading auto-selection overhead.
    'force_col_wise': True,
    # Explicit seed so feature/bagging sampling is reproducible.
    'seed': 42,
    'verbose': 0
}

# perf_counter is the clock intended for measuring elapsed intervals.
start_time = time.perf_counter()
# Train the classifier, evaluating on the validation set.
classifier = lgbm.train(parameters,
                        train_data,
                        valid_sets=[val_data],
                        num_boost_round=5000)
train_time = time.perf_counter() - start_time
print("Training time",train_time)

You can set `force_col_wise=true` to remove the overhead.
Training time 228.6930549144745


## Inference

In [None]:
# Time the prediction with perf_counter, which is suited to short elapsed intervals.
start_time = time.perf_counter()
# Predict on the test features; the result is thresholded into labels in the next cell.
test_pred = classifier.predict(test_features)
time_taken = time.perf_counter() - start_time
print("Inference time of lightgbm",time_taken)

Inference time of lightgbm 0.39127111434936523


In [None]:
# Cut-off applied to the predicted scores to obtain 0/1 labels.
# NOTE(review): the value 0.35 is not derived anywhere in this notebook; if it was tuned,
# it should be tuned on the validation split rather than on the test split.
DECISION_THRESHOLD = 0.35

predictedLabels = (test_pred > DECISION_THRESHOLD).astype(int)
# Show only the first few predictions instead of dumping the whole array.
predictedLabels[:20]

array([0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0,
       0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0,
       1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0,

In [None]:
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
)

# Share of test samples whose thresholded LightGBM prediction matches the true label.
lgbm_accuracy = accuracy_score(test_labels, predictedLabels)
print("\naccuracy_score using lightgbm classifier:", lgbm_accuracy)


accuracy_score using lightgbm classifier: 0.84


In [None]:
import time

## Train using Random Forest

In [None]:
# Aliases for the existing train / validation / test splits.
x_train, y_train = train_features, train_labels
x_val, y_val = val_features, val_labels
x_test, y_test = test_features, test_labels

print(x_train.shape, y_train.shape, x_val.shape, y_val.shape)


from sklearn.ensemble import RandomForestClassifier

# perf_counter is the clock intended for measuring elapsed intervals.
start_time = time.perf_counter()
# Fixed random_state so the forest (and its accuracy) is reproducible across runs.
rf = RandomForestClassifier(random_state=42)
rf.fit(x_train, y_train)
print("Training time of randomforest",time.perf_counter()-start_time)

(1800, 768) (1800,) (600, 768) (600,)
Training time of randomforest 3.9699838161468506


In [None]:
# Time the random-forest prediction with perf_counter, which is suited to short intervals.
start_time = time.perf_counter()
prediction = rf.predict(x_test)
print("inference time of randomforest",time.perf_counter()-start_time)
# Show only the first few predictions instead of dumping the whole array.
prediction[:20]

inference time of randomforest 0.023691892623901367


array([0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0,
       0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0,
       1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0,

In [None]:
# Share of test samples whose random-forest prediction matches the true label.
rf_accuracy = accuracy_score(test_labels, prediction)
print("\naccuracy_score using random forest classifier:", rf_accuracy)


accuracy_score using random forest classifier: 0.785
