In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.40.1-py3-none-any.whl (9.0 MB)
[K     |████████████████████████████████| 9.0 MB 83 kB/s eta 0:00:014
Collecting huggingface-hub<1.0,>=0.19.3
  Downloading huggingface_hub-0.22.2-py3-none-any.whl (388 kB)
[K     |████████████████████████████████| 388 kB 345 kB/s eta 0:00:01
[?25hCollecting filelock
  Downloading filelock-3.13.4-py3-none-any.whl (11 kB)
Collecting tokenizers<0.20,>=0.19
  Downloading tokenizers-0.19.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[K     |████████████████████████████████| 3.6 MB 256 kB/s eta 0:00:01
[?25hCollecting safetensors>=0.4.1
  Downloading safetensors-0.4.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 162 kB/s eta 0:00:01
Collecting fsspec>=2023.5.0
  Downloading fsspec-2024.3.1-py3-none-any.whl (171 kB)
[K     |████████████████████████████████| 171 kB 136 kB/s eta 0:00:01
Installing collected packag

In [None]:
import torch
from transformers import *
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

In [None]:
# Read the IMDB reviews and turn the text label into an integer flag:
# 'positive' becomes 1, every other value becomes 0.
df = pd.read_csv("../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")
df['sentiment'] = df['sentiment'].eq('positive').astype('int64')
df.head()

In [None]:
# Dataset dimensions as (number of rows, number of columns)
df.shape

In [None]:
# Bar plot of how many reviews carry each sentiment label.
# The column is passed as `x=`: seaborn deprecated positional data vectors in
# 0.11, and later releases interpret the first positional argument as `data`.
sns.countplot(x=df.sentiment)
plt.xlabel('sentiments');

The Transformers library provides a wide variety of Transformer models (including BERT). It works with TensorFlow and PyTorch! It also includes prebuilt tokenizers that do the heavy lifting required to prepare input for the BERT model.

In [None]:
# Select bert-base-cased. The value is a relative filesystem path to the
# model files, not a bare model name.
PRE_TRAINED_MODEL_NAME = '../input/bert-base-cased/'

In [None]:
# Load the pre-trained tokenizer (vocabulary) from PRE_TRAINED_MODEL_NAME
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [None]:
# Walk one example sentence through the tokenizer: split it into tokens first,
# then look up the vocabulary id of each token.
sample_txt = 'When was I last outside? I am stuck at home for 2 weeks.'
tokens = tokenizer.tokenize(sample_txt)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

# One print call, one line per item; the labels are padded so the colons align.
print(
  f' Sentence: {sample_txt}',
  f'   Tokens: {tokens}',
  f'Token IDs: {token_ids}',
  sep='\n',
)

In [None]:
# Separator token string and its id
tokenizer.sep_token, tokenizer.sep_token_id

In [None]:
# Classification ([CLS]) token string and its id
tokenizer.cls_token, tokenizer.cls_token_id

In [None]:
# Padding token string and its id
tokenizer.pad_token, tokenizer.pad_token_id

In [None]:
# Unknown-token string and its id
tokenizer.unk_token, tokenizer.unk_token_id

In [None]:
# encode_plus() performs all the preprocessing in one call: it adds the special
# tokens, pads/truncates to a fixed length, builds the attention mask and
# returns the result as tensors.
encoding = tokenizer.encode_plus(
  sample_txt,
  max_length=32,
  add_special_tokens=True, # Add '[CLS]' and '[SEP]'
  return_token_type_ids=False,
  padding='max_length',    # replaces the deprecated pad_to_max_length=True
  truncation=True,         # state truncation to max_length explicitly (avoids the warning)
  return_attention_mask=True,
  return_tensors='pt',  # Return PyTorch tensors
)

encoding.keys()

In [None]:
# Print how many token ids the first encoded sequence holds, then show the ids
print(len(encoding['input_ids'][0]))
encoding['input_ids'][0]

In [None]:
# Print the length of the first sequence's attention mask, then show the
# whole attention_mask entry
print(len(encoding['attention_mask'][0]))
encoding['attention_mask']

In [None]:
# Convert the encoded ids back to token strings to see how the sentence
# was tokenized by the BERT tokenizer
tokenizer.convert_ids_to_tokens(encoding['input_ids'][0])

In [None]:
# Token count of every review, capped at 512 tokens.
# truncation=True makes the cap explicit instead of relying on the tokenizer's
# implicit fallback (which emits a warning for each call).
token_lens = [
  len(tokenizer.encode(review, max_length=512, truncation=True))
  for review in df.review
]

# Density histogram with a KDE overlay; histplot is the replacement for the
# deprecated sns.distplot.
sns.histplot(token_lens, kde=True, stat='density')
plt.xlim([0, 500])
plt.xlabel('Token count');

In [None]:
# Notebook-wide settings.
MAX_LEN = 200      # kept small so the run does not consume much resources
RANDOM_SEED = 42

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')

In [None]:
# Split in two steps: first set 10% of the data aside, then halve that held-out
# part into the validation and test sets. Both steps use the same seed.
df_train, df_holdout = train_test_split(
  df,
  test_size=0.1,
  random_state=RANDOM_SEED,
)
df_val, df_test = train_test_split(
  df_holdout,
  test_size=0.5,
  random_state=RANDOM_SEED,
)


In [None]:
# Shapes of the training, validation and test splits
df_train.shape, df_val.shape, df_test.shape