In [1]:
import pandas as pd
import numpy as np
import os
import gc

import random

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# seed value
torch.manual_seed(555)

from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score

import transformers
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification
from transformers import AdamW

import warnings
warnings.filterwarnings("ignore")


print(torch.__version__)

1.10.0+cu111


In [2]:
#Use the complete path of the CSV file
train_df = pd.read_csv('en_04_17.csv')

In [3]:
train_df

Unnamed: 0.1,Unnamed: 0,pageid,title,category,label,language,text,links
0,0,26700,Science,Science,0,en,Science (from Latin scientia 'knowledge') is ...,"[56661172, 2665685, 6679056]"
1,1,9145213,Outline of science,Science,0,en,The following outline is provided as a topical...,"[61744455, 52412, 69658254]"
2,2,70476313,Monochrome-astrophotography-techniques,Science,0,en,Monochrome photography is one of the earliest ...,"[571760, 14092434, 422994]"
3,3,26997,Scientist,Science,0,en,A scientist is a person who conducts scientifi...,"[56661172, 238212, 19616384]"
4,4,9145213,Outline of science,Science,0,en,The following outline is provided as a topical...,"[61744455, 52412, 69658254]"
...,...,...,...,...,...,...,...,...
6995,6995,65326971,Pop culture fiction,Entertainment,6,en,Pop culture fiction is a genre of fiction wher...,"[246007, 39973509, 39973509]"
6996,6996,24145,Pun,Entertainment,6,en,"A pun, also known as paronomasia, is a form of...","[17585982, 38870062, 2986]"
6997,6997,1311322,Satire boom,Entertainment,6,en,The satire boom was the output of a generation...,"[82767, 167557, 19344654]"
6998,6998,9765056,Satirical ostraca,Entertainment,6,en,Satirical ostraca are a category of ostraca (s...,"[874, 11039790, 604865]"


In [4]:
train_df  = train_df[['text','category']]

In [5]:
train_df['category'].value_counts()

Science          1000
Sports           1000
Economy          1000
Politics         1000
Education        1000
Health           1000
Entertainment    1000
Name: category, dtype: int64

In [6]:
train_df

Unnamed: 0,text,category
0,Science (from Latin scientia 'knowledge') is ...,Science
1,The following outline is provided as a topical...,Science
2,Monochrome photography is one of the earliest ...,Science
3,A scientist is a person who conducts scientifi...,Science
4,The following outline is provided as a topical...,Science
...,...,...
6995,Pop culture fiction is a genre of fiction wher...,Entertainment
6996,"A pun, also known as paronomasia, is a form of...",Entertainment
6997,The satire boom was the output of a generation...,Entertainment
6998,Satirical ostraca are a category of ostraca (s...,Entertainment


In [7]:
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification

# Instantiate the XLMRoBERTa tokenizer
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base', do_lower_case=True)

Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/615 [00:00<?, ?B/s]

In [8]:
model  = XLMRobertaForSequenceClassification.from_pretrained("xlm-roberta-base", num_labels=7)

Downloading:   0%|          | 0.00/1.04G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense

In [9]:
model

XLMRobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (La

In [10]:
# Use labels according to language of the dataset
labels = {'Science':0,
          'Sports':1,
          'Economy':2,
          'Politics':3,
          'Education': 4,
          'Health':5,
          'Entertainment':6
          }
# labels = {'Bilim':0,
#                'Spor':1,
#                'Ekonomi':2,
#                'Siyaset':3,
#                'Eğitim':4,
#                'Sağlık':5,
#                'Eğlence':6}

class Dataset(torch.utils.data.Dataset):

    def __init__(self, df):

        self.labels = [labels[label] for label in df['category']]
        self.texts = [tokenizer(text, 
                               padding='max_length', max_length = 512, truncation=True,
                                return_tensors="pt") for text in df['text']]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

In [11]:
np.random.seed(112)
df_train, df_val, df_test = np.split(train_df.sample(frac=1, random_state=42), 
                                     [int(.8*len(train_df)), int(.9*len(train_df))])

print(len(df_train),len(df_val), len(df_test))

5600 700 700


In [12]:
from torch.optim import AdamW
from tqdm import tqdm

def train(model, train_data, val_data, learning_rate, epochs):

    train, val = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=6, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=6)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    optimizer = AdamW(model.parameters(), lr= learning_rate, eps = 1e-8)

    if use_cuda:

            model = model.to(device)

    for epoch_num in range(epochs):

            total_acc_train = 0
            total_loss_train = 0

            for train_input, train_label in tqdm(train_dataloader):

                train_label = train_label.to(device=device, dtype=torch.int64)
                mask = train_input['attention_mask'].to(device)
                input_id = train_input['input_ids'].squeeze(1).to(device)

                batch_loss, output = model(input_ids = input_id, attention_mask = mask, labels= train_label).values()

                total_loss_train += batch_loss.item()
                
                acc = (output.argmax(dim=1) == train_label).sum().item()
                total_acc_train += acc

                model.zero_grad()
                batch_loss.backward()
                optimizer.step()
            
            total_acc_val = 0
            total_loss_val = 0

            with torch.no_grad():

                for val_input, val_label in val_dataloader:

                    val_label = val_label.to(device=device, dtype=torch.int64)
                    mask = val_input['attention_mask'].to(device)
                    input_id = val_input['input_ids'].squeeze(1).to(device)

                    batch_loss, output = model(input_ids = input_id, attention_mask = mask, labels= val_label).values()

                    total_loss_val += batch_loss.item()
                    
                    acc = (output.argmax(dim=1) == val_label).sum().item()
                    total_acc_val += acc
            
            print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
                | Train Accuracy: {total_acc_train / len(train_data): .3f} \
                | Val Loss: {total_loss_val / len(val_data): .3f} \
                | Val Accuracy: {total_acc_val / len(val_data): .3f}')
            
            

            output_model = f'model_xlm_{epoch_num}.pth'
            with open(output_model,'wb') as f:
                torch.save({
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict()}, f)
                  
EPOCHS = 5
model = XLMRobertaForSequenceClassification.from_pretrained("xlm-roberta-base", num_labels=7)
LR = 1e-6
              
train(model, df_train, df_val, LR, EPOCHS)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense

Epochs: 1 | Train Loss:  0.234                 | Train Accuracy:  0.638                 | Val Loss:  0.127                 | Val Accuracy:  0.850


100%|██████████| 934/934 [10:24<00:00,  1.50it/s]


Epochs: 2 | Train Loss:  0.093                 | Train Accuracy:  0.871                 | Val Loss:  0.080                 | Val Accuracy:  0.873


100%|██████████| 934/934 [10:24<00:00,  1.50it/s]


Epochs: 3 | Train Loss:  0.068                 | Train Accuracy:  0.891                 | Val Loss:  0.073                 | Val Accuracy:  0.874


100%|██████████| 934/934 [10:24<00:00,  1.50it/s]


Epochs: 4 | Train Loss:  0.056                 | Train Accuracy:  0.913                 | Val Loss:  0.071                 | Val Accuracy:  0.873


100%|██████████| 934/934 [10:23<00:00,  1.50it/s]


Epochs: 5 | Train Loss:  0.046                 | Train Accuracy:  0.926                 | Val Loss:  0.069                 | Val Accuracy:  0.880


In [13]:

def evaluate(model, test_data):

    test = Dataset(test_data)

    test_dataloader = torch.utils.data.DataLoader(test, batch_size=6)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:

        model = model.cuda()

    total_acc_test = 0
    with torch.no_grad():

        for test_input, test_label in test_dataloader:

              test_label = test_label.to(device)
              mask = test_input['attention_mask'].to(device)
              input_id = test_input['input_ids'].squeeze(1).to(device)

              loss, output = model(input_ids = input_id, attention_mask = mask, labels= test_label).values()

              acc = (output.argmax(dim=1) == test_label).sum().item()
              total_acc_test += acc
    
    print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')
    
evaluate(model, df_test)

Test Accuracy:  0.880
