# Choose the device

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
from torch import cuda

if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Load RoBERTa

In [None]:
# Install the pinned 3.0.2 release of transformers before it is imported below.
!pip install transformers==3.0.2

Collecting transformers==3.0.2
  Downloading transformers-3.0.2-py3-none-any.whl (769 kB)
[?25l[K     |▍                               | 10 kB 27.8 MB/s eta 0:00:01[K     |▉                               | 20 kB 21.3 MB/s eta 0:00:01[K     |█▎                              | 30 kB 16.6 MB/s eta 0:00:01[K     |█▊                              | 40 kB 14.6 MB/s eta 0:00:01[K     |██▏                             | 51 kB 5.5 MB/s eta 0:00:01[K     |██▋                             | 61 kB 5.9 MB/s eta 0:00:01[K     |███                             | 71 kB 5.5 MB/s eta 0:00:01[K     |███▍                            | 81 kB 6.1 MB/s eta 0:00:01[K     |███▉                            | 92 kB 6.1 MB/s eta 0:00:01[K     |████▎                           | 102 kB 5.3 MB/s eta 0:00:01[K     |████▊                           | 112 kB 5.3 MB/s eta 0:00:01[K     |█████▏                          | 122 kB 5.3 MB/s eta 0:00:01[K     |█████▌                          | 133 kB 5.3 MB

In [None]:
# importing libraries for neural network
import torch
from transformers import RobertaModel

In [None]:
class RobertaClass(torch.nn.Module):
    """Pretrained RoBERTa encoder followed by a small classification head.

    The head is ``Linear -> tanh -> Linear`` applied to the hidden state of
    the first token of every sequence.

    Args:
        num_labels: Number of output classes. Defaults to 2, the value that
            was previously hard-coded.
        pretrained_name: Checkpoint identifier handed to
            ``RobertaModel.from_pretrained``. Defaults to ``"roberta-base"``.
    """

    def __init__(self, num_labels=2, pretrained_name="roberta-base"):
        super().__init__()
        self.l1 = RobertaModel.from_pretrained(pretrained_name)
        # Size the head from the loaded checkpoint instead of assuming 768,
        # so other checkpoints with a different width also work.
        hidden_size = self.l1.config.hidden_size
        self.pre_classifier = torch.nn.Linear(hidden_size, hidden_size)
        self.classifier = torch.nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return unnormalised class scores for a batch of encoded sequences.

        Args:
            input_ids: Token ids, forwarded to the encoder.
            attention_mask: Attention mask, forwarded to the encoder.
            token_type_ids: Segment ids, forwarded to the encoder.

        Returns:
            Output of the final linear layer (one score per class); no
            softmax is applied here.
        """
        encoder_outputs = self.l1(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # First encoder output is used as the per-token hidden states.
        hidden_state = encoder_outputs[0]
        # Keep only position 0 of every sequence as the sentence summary.
        pooled = hidden_state[:, 0]
        # Functional tanh avoids constructing a new Tanh module on every call.
        pooled = torch.tanh(self.pre_classifier(pooled))
        return self.classifier(pooled)

In [None]:
# Build the classifier and move its parameters to the selected device;
# the bare name on the last line renders the module summary.
model = RobertaClass().to(device)
model

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/501M [00:00<?, ?B/s]

RobertaClass(
  (l1): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-05, eleme

# Load data

You can download the Sentiment140 dataset [here](https://www.kaggle.com/kazanova/sentiment140).

In [None]:
from google.colab import drive
# Mount your Google Drive at /content/drive so files stored there can be
# read from this runtime.
drive.mount('/content/drive') 

Mounted at /content/drive


In [None]:
import pandas as pd
# Copy the dataset archive from the mounted Drive into the working directory.
# Replace the source path with the location of the archive in your own Drive.
!cp '/content/drive/MyDrive/dataset.zip' dataset.zip

In [None]:
# Extract every member of the downloaded archive into the working directory.
import zipfile

archive = zipfile.ZipFile("dataset.zip", 'r')
try:
    archive.extractall("./")
finally:
    # Release the file handle even if extraction fails.
    archive.close()

In [None]:
# Colab Notebooks/Work/Article/

# Prepare data

In [None]:
# Load only the label and text columns of the raw CSV.
# The file has no header row: reading it with the default header inference
# turns the first tweet into column names and silently drops it from the data.
# Supplying the names explicitly keeps that row and avoids renaming columns
# by their first-row values.
SENTIMENT140_COLUMNS = ["label", "id", "date", "query", "user", "text"]
full_data = pd.read_csv(
    '/content/training.1600000.processed.noemoticon.csv',
    encoding='latin-1',
    header=None,
    names=SENTIMENT140_COLUMNS,
    usecols=["label", "text"],  # the other four columns are not used downstream
).dropna()

In [None]:
NUM_SAMPLES = 30000
# Separate the negative (label 0) and positive (label 4) tweets and keep the
# first NUM_SAMPLES rows of each class.
# .copy() detaches the subsets from full_data so that later column
# assignments modify an independent frame instead of a slice view
# (which triggers pandas' SettingWithCopyWarning).
negative_samples = full_data[full_data["label"] == 0].head(NUM_SAMPLES).copy()
positiv_samples = full_data[full_data["label"] == 4].head(NUM_SAMPLES).copy()

In [None]:
# Replace label 4 with label 1: the classifier has two outputs, so the class
# indices used as targets must be 0 and 1.
# assign() broadcasts the scalar to however many rows are present (a list of
# NUM_SAMPLES ones raises when fewer positive rows exist) and returns a new
# frame rather than writing into a slice.
positiv_samples = positiv_samples.assign(label=1)

In [None]:
# Stack the two class subsets back into a single frame, negatives first.
full_data = pd.concat((negative_samples, positiv_samples), axis=0)

In [None]:
# Display the combined frame to check the remaining columns and labels.
full_data

Unnamed: 0,label,text
0,0,is upset that he can't update his Facebook by ...
1,0,@Kenichan I dived many times for the ball. Man...
2,0,my whole body feels itchy and like its on fire
3,0,"@nationwideclass no, it's not behaving at all...."
4,0,@Kwesidei not the whole crew
...,...,...
829994,1,"Really wants to go and see 17 again, because Z..."
829995,1,@krissa22 Thank you!
829996,1,dreaming of you
829997,1,@TheEllenShow I saw a clip online! good show!


In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so that the split, and the metrics reported on it, can be
# reproduced on a fresh run.
SEED = 42
train_data, test_data = train_test_split(
    full_data,
    test_size=0.3,
    random_state=SEED,
    stratify=full_data["label"],  # keep the class ratio identical in both splits
)
# Renumber rows 0..n-1 so positions line up with the tokenized lists built later.
train_data = train_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

# information about dataset
print("FULL Dataset: {}".format(full_data.shape))
print("TRAIN Dataset: {}".format(train_data.shape))
print("TEST Dataset: {}".format(test_data.shape))



FULL Dataset: (60000, 2)
TRAIN Dataset: (42000, 2)
TEST Dataset: (18000, 2)


In [None]:
from transformers import RobertaTokenizer

In [None]:
# Load the tokenizer that belongs to the 'roberta-base' checkpoint.
# NOTE(review): truncation is requested again on every encode_plus call below;
# whether `truncation` and `do_lower_case` have any effect when passed at load
# time should be confirmed against the transformers 3.0.2 documentation.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', truncation=True, do_lower_case=True)

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

In [None]:
MAX_LEN = 130


def tokenize_texts(texts, max_len=MAX_LEN):
    """Encode each text as a fixed-length input for the model.

    Every text is truncated or padded to exactly ``max_len`` tokens, special
    tokens included, using the notebook-level ``tokenizer``.

    Args:
        texts: Iterable of strings to encode.
        max_len: Target sequence length. Defaults to ``MAX_LEN``.

    Returns:
        A list with one ``encode_plus`` result per input text, in input order.
    """
    return [
        tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=max_len,
            # Replaces the deprecated pad_to_max_length=True.
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
        )
        for text in texts
    ]


# One shared helper instead of two copies of the same comprehension.
train_tokenized_data = tokenize_texts(train_data['text'])
test_tokenized_data = tokenize_texts(test_data['text'])


# Prepare dataset

In [None]:
# Mini-batch sizes for the two data loaders and the optimizer step size.
TRAIN_BATCH_SIZE, TEST_BATCH_SIZE = 32, 32
LEARNING_RATE = 1e-5

In [None]:
from torch.utils.data import Dataset, DataLoader

class SentimentData(Dataset):
    """Dataset pairing raw texts and labels with their pre-computed encodings.

    Args:
        data: Table-like object exposing ``'text'`` and ``'label'`` columns
            (a pandas DataFrame in this notebook).
        inputs_tokenized: Sequence of per-row encodings aligned by position
            with ``data``. Each item must provide ``'input_ids'``,
            ``'attention_mask'`` and ``'token_type_ids'``.

    Raises:
        ValueError: If the number of encodings differs from the number of rows.
    """

    def __init__(self, data, inputs_tokenized):
        self.inputs = inputs_tokenized
        # Materialise the columns as plain lists so __getitem__ indexes by
        # position. Integer indexing on a pandas Series is label-based and
        # only matches the row position when the index is exactly 0..n-1.
        self.text = list(data['text'])
        self.targets = list(data['label'])
        if len(self.inputs) != len(self.text):
            raise ValueError(
                "inputs_tokenized has {} items but data has {} rows".format(
                    len(self.inputs), len(self.text)
                )
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.text)

    def __getitem__(self, index):
        """Return the sample at position ``index`` as a dict.

        The dict holds the whitespace-normalised sentence, the three encoding
        fields as long tensors, and the label as a float tensor.
        """
        # Collapse runs of whitespace (including newlines) to single spaces.
        text = " ".join(str(self.text[index]).split())

        encoding = self.inputs[index]

        return {
            'sentence': text,
            'ids': torch.tensor(encoding['input_ids'], dtype=torch.long),
            'mask': torch.tensor(encoding['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(encoding['token_type_ids'], dtype=torch.long),
            # Kept as float for compatibility with existing consumers of this
            # dataset; the training loop casts the targets to long itself.
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

# Build one dataset per split from the frame and its matching list of encodings.
train_dataset = SentimentData(train_data, train_tokenized_data)
test_dataset = SentimentData(test_data, test_tokenized_data)

In [None]:
# Training batches are reshuffled every epoch.
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True
                }

# Evaluation does not benefit from shuffling; a fixed order keeps the per-batch
# test curves and the list of misclassified sentences comparable between runs.
test_params = {'batch_size': TEST_BATCH_SIZE,
               'shuffle': False
               }

train_loader = DataLoader(train_dataset, **train_params)
test_loader = DataLoader(test_dataset, **test_params)

# Fine-tuning model

In [None]:
# Cross-entropy on the classifier scores, optimised with Adam over every
# parameter of the model (encoder and head alike).
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [None]:
from tqdm import tqdm
import matplotlib.pyplot as plt
from IPython.display import clear_output

# Per-batch history, appended to by train_loop and read by the plotting cell.
# Entries are plain Python floats so they can be plotted directly, even when
# training ran on a GPU.
train_loss = []
test_loss = []

# Fraction of correctly classified samples in each batch.
train_accuracy = []
test_accuracy = []

# Misclassified test sentences from the most recent evaluation pass, each
# stored as [sentence, true_label, predicted_label]. Lives at notebook scope
# so it can be inspected after training has finished.
false_test_answers = []


def _run_phase(phase, description):
  """Run one full pass over the train or test loader.

  Args:
    phase: 'Train' to optimise the model, anything else to evaluate it.
    description: Label shown on the progress bar.

  Returns:
    Tuple ``(mean_loss, accuracy)`` where ``mean_loss`` is the average of the
    per-batch losses and ``accuracy`` is the fraction of correctly classified
    samples. Both are NaN when the loader yields no batches.
  """
  is_train = phase == 'Train'
  if is_train:
    model.train()
    loader, loss_history, accuracy_history = train_loader, train_loss, train_accuracy
  else:
    model.eval()
    loader, loss_history, accuracy_history = test_loader, test_loss, test_accuracy
    # Keep only the mistakes of the latest evaluation pass.
    false_test_answers.clear()

  loss_sum = 0.0
  correct_total = 0
  sample_total = 0
  batch_total = 0

  # Gradients are only needed while training; switching them off during
  # evaluation avoids building the autograd graph.
  with torch.set_grad_enabled(is_train):
    for data in tqdm(loader, desc=description):
      sentence = data['sentence']
      ids = data['ids'].to(device, dtype=torch.long)
      mask = data['mask'].to(device, dtype=torch.long)
      token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
      # The loss function expects integer class indices.
      targets = data['targets'].to(device, dtype=torch.long)

      # Call the module itself rather than .forward so registered hooks run.
      outputs = model(ids, mask, token_type_ids)
      loss = loss_function(outputs, targets)

      if is_train:
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

      predictions = outputs.detach().argmax(dim=1)
      batch_size = targets.size(0)
      batch_correct = (predictions == targets).sum().item()
      batch_loss = loss.item()

      loss_sum += batch_loss
      correct_total += batch_correct
      sample_total += batch_size
      batch_total += 1

      loss_history.append(batch_loss)
      accuracy_history.append(batch_correct / batch_size)

      if not is_train:
        # Record every sentence that was predicted wrong.
        pairs = zip(sentence, targets.tolist(), predictions.tolist())
        for text, expected, predicted in pairs:
          if expected != predicted:
            false_test_answers.append([text, expected, predicted])

  # Average over the number of batches actually seen (not the last index).
  mean_loss = loss_sum / batch_total if batch_total else float('nan')
  accuracy = correct_total / sample_total if sample_total else float('nan')
  return mean_loss, accuracy


def train_loop(epochs):
  """Fine-tune the model for ``epochs`` epochs, evaluating after each one.

  Each epoch runs a training pass over ``train_loader`` followed by an
  evaluation pass over ``test_loader``. Per-batch losses and accuracies are
  appended to the notebook-level history lists, and the mean loss and overall
  accuracy of every pass are printed.

  Args:
    epochs: Number of train + evaluate cycles to run.
  """
  for epoch in range(epochs):
    for phase in ['Train', 'Test']:
      mean_loss, accuracy = _run_phase(phase, f"{phase} {epoch + 1}/{epochs}")
      print(f"{phase} Loss: {mean_loss}")
      print(f"{phase} Accuracy: {accuracy}")

In [None]:
# Number of epochs; train_loop runs a training pass followed by a test pass
# for each one.
EPOCHS = 4
train_loop(EPOCHS)

1313it [27:03,  1.24s/it]


Train Loss: 0.37673285603523254
Train Accuracy: 0.8345


563it [04:19,  2.17it/s]


Test Loss: 0.3121001720428467
Test Accuracy: 0.8678888888888889


1313it [27:02,  1.24s/it]


Train Loss: 0.2885291278362274
Train Accuracy: 0.8811428571428571


563it [04:19,  2.17it/s]


Test Loss: 0.303607314825058
Test Accuracy: 0.8763333333333333


978it [20:08,  1.24s/it]

In [None]:
# Show the test sentences that were predicted wrong
# (expects `false_test_answers` to be defined at notebook scope).
false_test_answers

# Graphics

In [None]:
def plot_history(values, title, ylabel, color):
    """Draw one per-batch metric history as a line chart.

    Args:
        values: Sequence of per-batch values; numbers or 0-dim tensors.
        title: Figure title.
        ylabel: Label of the vertical axis.
        color: Matplotlib colour of the line.
    """
    # float() turns 0-dim tensors (including ones living on the GPU) into
    # host numbers; matplotlib cannot convert CUDA tensors on its own.
    series = [float(value) for value in values]
    fig, ax = plt.subplots()
    ax.plot(series, color=color)
    ax.set(title=title, xlabel="Batch", ylabel=ylabel)
    plt.show()


plot_history(train_loss, "Train Loss", "Loss", 'blue')
plot_history(test_loss, "Test Loss", "Loss", 'orange')
plot_history(train_accuracy, "Train Accuracy", "Accuracy", 'blue')
plot_history(test_accuracy, "Test Accuracy", "Accuracy", 'orange')

<a id='section07'></a>
### Save model

In [None]:
# Directory (with trailing slash) that receives the checkpoint.
save_path = "./"
# NOTE: passing the module object to torch.save pickles the whole model, not
# just its weights.
torch.save(model, f"{save_path}trained_roberta.pt")
for message in ('All files saved', 'Congratulations, you complete this tutorial'):
    print(message)