In [1]:
!pip install transformers
!pip install datasets
!pip install accelerate
!pip install evaluate
!pip install tqdm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m73.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m116.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, get_scheduler, AdamW
from datasets import load_dataset, DatasetDict
import evaluate
import torch
from accelerate import Accelerator
from torch.utils.data import DataLoader
import torch
import tqdm
from tqdm.auto import tqdm

In [3]:
imdb_dataset = load_dataset("imdb")

print(imdb_dataset)

Downloading builder script:   0%|          | 0.00/4.31k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.59k [00:00<?, ?B/s]

Downloading and preparing dataset imdb/plain_text to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0...


Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset imdb downloaded and prepared to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


In [None]:
imdb_dataset['train'][0:3]

{'text': ['There is no relation at all between Fortier and Profiler but the fact that both are police series about violent crimes. Profiler looks crispy, Fortier looks classic. Profiler plots are quite simple. Fortier\'s plot are far more complicated... Fortier looks more like Prime Suspect, if we have to spot similarities... The main character is weak and weirdo, but have "clairvoyance". People like to compare, to judge, to evaluate. How about just enjoying? Funny thing too, people writing Fortier looks American but, on the other hand, arguing they prefer American series (!!!). Maybe it\'s the language, or the spirit, but I think this series is more English than American. By the way, the actors are really good and funny. The acting is not superficial at all...',
  'This movie is a great. The plot is very true to the book which is a classic written by Mark Twain. The movie starts of with a scene where Hank sings a song with a bunch of kids called "when you stub your toe on the moon" It

A good strategy is to clean our dataset. We can see that some HTML tags, in an unappropriate way are used, so we have to treat them as 'special characters'. Also a good idea is to remove any possible legit HTML tags, URLs and some punctuation marks, which in our case are used as escape characters.

In [4]:
import re
import html

In [5]:
#Removing symbols
def remove(example):
  x = example['text']
  #punctuation marks
  x = re.sub('[,\.!?:()"]' , '', x)
  #html tags
  x = html.unescape(x)
  #URls
  x = re.sub('http\S+', '', x)
  #<br ></ br>
  x = re.sub(r'<br\s*/?>', '', x)

  return {"text":x}


imdb_dataset = imdb_dataset.map(remove)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
imdb_dataset['train'][0:3]

{'text': ["I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967 I also heard that at first it was seized by US customs if it ever tried to enter this country therefore being a fan of films considered controversial I really had to see this for myselfThe plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States In between asking politicians and ordinary denizens of Stockholm about their opinions on politics she has sex with her drama teacher classmates and married menWhat kills me about I AM CURIOUS-YELLOW is that 40 years ago this was considered pornographic Really the sex and nudity scenes are few and far between even then it's not shot like some

A good strategy is to also convert all words in lower cases.

In [6]:
#Converting words into lower case

imdb_dataset = imdb_dataset.map(lambda x: {'text':x['text'].lower()})

imdb_dataset['train'][0]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

{'text': "i rented i am curious-yellow from my video store because of all the controversy that surrounded it when it was first released in 1967 i also heard that at first it was seized by us customs if it ever tried to enter this country therefore being a fan of films considered controversial i really had to see this for myselfthe plot is centered around a young swedish drama student named lena who wants to learn everything she can about life in particular she wants to focus her attentions to making some sort of documentary on what the average swede thought about certain political issues such as the vietnam war and race issues in the united states in between asking politicians and ordinary denizens of stockholm about their opinions on politics she has sex with her drama teacher classmates and married menwhat kills me about i am curious-yellow is that 40 years ago this was considered pornographic really the sex and nudity scenes are few and far between even then it's not shot like some 

## Applying tokenization and initiating data collator object

In [7]:
#Initializing tokenizer and pre-trained model
model_checkpoint = 'lvwerra/distilbert-imdb'

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)

#function that applies tokenization
def tokenize_sentences(example):
  return tokenizer(example['text'], truncation = True)

#mapping the function to the imdb_dataset, batching sequences also
final_imdb_dataset = imdb_dataset.map(tokenize_sentences, batched=True)

#Initializing data collator object
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Downloading (…)okenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Not that we tokenized all the sentences and words, it's time to apply some last preprocessing steps. First we remove the column text, because it's not needed anymore. We rename the column label to labels, because in this way, the model will understand and we also change the format of the dataset to return torch tensors instead of lists.

In [8]:
#Removing unnecessary columns
final_imdb_dataset = final_imdb_dataset.remove_columns(['text'])

#Rename column label to labels
final_imdb_dataset = final_imdb_dataset.rename_column('label','labels')

#set the format of dataset to torch tensors
final_imdb_dataset.set_format('torch')

Since our dataset contains a lot of data, it's necessary to store our results in a GPU instead of a CPU.

In [9]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
print(device)

cuda


Now we can instantiate our data loaders, for our test and train data.

In [10]:
train_data_loader = DataLoader(final_imdb_dataset['train'], batch_size=32, shuffle=True, collate_fn = data_collator)
test_data_loader = DataLoader(final_imdb_dataset['test'], batch_size=32, shuffle=True, collate_fn = data_collator)

Using Accelerator, we allow distributed training across several CPUs and GPUs. But first we need to initialize our optimization parameter,which in this case will be Adam.

In [11]:
optimizer = AdamW(model.parameters(), lr = 2e-5)


accelerator = Accelerator()
dl_train, dl_test, model, optimizer = accelerator.prepare(train_data_loader, test_data_loader, model, optimizer)



In [12]:
#Further optimizing hyperparameters, now learning rate scheduler

num_epochs = 3

num_training_steps = num_epochs * len(dl_train)

learning_rate_scheduler = get_scheduler(
    "linear", #linearly increases the learning rate from 0 to the initial learning rate over the training steps
    optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps #the scheduler will regulate learning rate over the training steps
)

In [13]:
#Training Process
#initialize a tqdm progress bar over our training steps to get some sense when training is over

progress_bar = tqdm(range(num_training_steps))

model.train()

for epochs in range(num_epochs):
  for batch in dl_train:
    batch = {k:v.to(device) for k,v in batch.items()}
    outputs = model(**batch)

    #Calculate loss
    loss = outputs.loss

    #Backprop
    accelerator.backward(loss)
    optimizer.step()
    learning_rate_scheduler.step()
    optimizer.zero_grad()
    progress_bar.update(1)

  0%|          | 0/2346 [00:00<?, ?it/s]

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [14]:
#Evaluation process
from datasets import load_metric


metric = load_metric('accuracy')
predictions_list = []
digit_predictions = []
model.eval()

for batch in dl_test:
  batch = {k:v.to(device) for k,v in batch.items()}
  with torch.no_grad():
    outputs = model(**batch)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim = -1)
    metric.add_batch(predictions = predictions, references = batch["labels"])
    predictions_list.append(predictions)

print(metric.compute())


for batch_predictions in predictions_list:
  for batch in batch_predictions:
    digit_predictions.append(batch.item())


#print(digit_predictions) #This is the final list, with the predictions
print(len(digit_predictions))

  metric = load_metric('accuracy')


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

{'accuracy': 0.92932}
25000


## Passing the results to a dataframe in pandas

In [15]:
imdb_dataset.set_format('pandas')

In [16]:
test_df = imdb_dataset['test'][:]

print(test_df.shape)

(25000, 2)


In [19]:
test_df['sentiment'] = digit_predictions

test_df.drop(columns = ['label'], inplace=True)

In [20]:
test_df.head()

Unnamed: 0,text,sentiment
0,i love sci-fi and am willing to put up with a ...,1
1,worth the entertainment value of a rental espe...,1
2,its a totally average film with a few semi-alr...,0
3,star rating ***** saturday night **** friday n...,1
4,first off let me say if you haven't enjoyed a ...,0


In [21]:
test_df['text'][4]

"first off let me say if you haven't enjoyed a van damme movie since bloodsport you probably will not like this movie most of these movies may not have the best plots or best actors but i enjoy these kinds of movies for what they are this movie is much better than any of the movies the other action guys segal and dolph have thought about putting out the past few years van damme is good in the movie the movie is only worth watching to van damme fans it is not as good as wake of death which i highly recommend to anyone of likes van damme or in hell but in my opinion it's worth watching it has the same type of feel to it as nowhere to run good fun stuff"