In [3]:
import pandas as pd
import numpy as np
from cleantext import clean
import re
from transformers import XLNetTokenizer, XLNetForSequenceClassification, TrainingArguments, Trainer, pipeline
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import datasets 
import evaluate
import random

***PreProcess the data***

In [4]:
data_train = pd.read_csv('./emotions_data/emotion-labels-train.csv') 
data_test = pd.read_csv('./emotions_data/emotion-labels-test.csv')
data_val = pd.read_csv('./emotions_data/emotion-labels-val.csv')
# data should be saved in a folder called 'emotions' which is saved in the same place as your notebook

In [5]:
data_train.head()

Unnamed: 0,text,label
0,Just got back from seeing @GaryDelaney in Burs...,joy
1,Oh dear an evening of absolute hilarity I don'...,joy
2,Been waiting all week for this game ❤️❤️❤️ #ch...,joy
3,"@gardiner_love : Thank you so much, Gloria! Yo...",joy
4,I feel so blessed to work with the family that...,joy


In [6]:
data = pd.concat([data_train, data_test,data_val], ignore_index = True)

In [7]:
data['text_clean'] = data['text'].apply(lambda x: clean(x, no_emoji=True))

In [8]:
data['text_clean'] = data['text_clean'].apply(lambda x: re.sub('@[^s]+', '', x))

In [9]:
data.head(20)

Unnamed: 0,text,label,text_clean
0,Just got back from seeing @GaryDelaney in Burs...,joy,just got back from seeing slem. amazing!! face...
1,Oh dear an evening of absolute hilarity I don'...,joy,oh dear an evening of absolute hilarity i don'...
2,Been waiting all week for this game ❤️❤️❤️ #ch...,joy,been waiting all week for this game #cheer #fr...
3,"@gardiner_love : Thank you so much, Gloria! Yo...",joy,"so much, gloria! you're so sweet, and thoughtf..."
4,I feel so blessed to work with the family that...,joy,i feel so blessed to work with the family that...
5,"Today I reached 1000 subscribers on YT!! , #go...",joy,"today i reached 1000 subscribers on yt!! , #go..."
6,"@Singaholic121 Good morning, love! Happy first...",joy,"@singaholic121 good morning, love! happy first..."
7,#BridgetJonesBaby is the best thing I've seen ...,joy,#bridgetjonesbaby is the best thing i've seen ...
8,Just got back from seeing @GaryDelaney in Burs...,joy,just got back from seeing slem. amazing!! face...
9,@IndyMN I thought the holidays could not get a...,joy,"s could not get any more cheerful, and then i ..."


In [10]:
#data['label'].value_counts().plot(kind = 'bar')

In [11]:
g = data.groupby('label')
data = pd.DataFrame(g.apply(lambda x: x.sample(g.size().min()).reset_index(drop=True)))

  data = pd.DataFrame(g.apply(lambda x: x.sample(g.size().min()).reset_index(drop=True)))


In [12]:
#data['label'].value_counts().plot(kind='bar')

In [13]:
data['label_int'] = LabelEncoder().fit_transform(data['label'])

In [14]:
NUM_LABELS = 4

In [15]:
train_split, test_split = train_test_split(data, train_size = 0.8)
train_split, val_split = train_test_split(train_split, train_size = 0.9)


In [16]:
print(len(train_split))
print(len(test_split))
print(len(val_split))

4414
1227
491


In [17]:
train_df = pd.DataFrame({
    "label": train_split.label_int.values,
    "text": train_split.text_clean.values
})


test_df = pd.DataFrame({
    "label": test_split.label_int.values,
    "text": test_split.text_clean.values
})

In [18]:
train_df = datasets.Dataset.from_dict(train_df)
test_df = datasets.Dataset.from_dict(test_df)

In [19]:
datasets_dict = datasets.DatasetDict({'train': train_df, "test": test_df})

In [20]:
datasets_dict

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 4414
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 1227
    })
})

Embeddings


In [21]:
tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")

In [22]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding = "max_length", max_length = 128, truncation= True)

In [23]:
tokenized_datasets = datasets_dict.map(tokenize_function, batched = True)

Map:   0%|          | 0/4414 [00:00<?, ? examples/s]

Map:   0%|          | 0/1227 [00:00<?, ? examples/s]

In [24]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4414
    })
    test: Dataset({
        features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1227
    })
})

In [25]:
print(tokenized_datasets['train']['text'][0])

st! novelists shouldn't be discouraged by rejection, but they usually are because their work is so personal.


In [26]:
print(tokenized_datasets['train']['input_ids'][0])

[5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 17, 639, 136, 16153, 23, 5859, 26, 46, 39, 21359, 37, 12704, 19, 57, 63, 1044, 41, 149, 58, 154, 27, 102, 739, 9, 4, 3]


In [27]:
tokenizer.decode(5)

'<pad>'

In [28]:
print(tokenized_datasets['train']['token_type_ids'][0])

[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2]


In [29]:
print(tokenized_datasets['train']['attention_mask'][0])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [30]:
small_train_dataset = tokenized_datasets['train'].shuffle(seed=42).select(range(100))
small_test_dataset = tokenized_datasets['test'].shuffle(seed=42).select(range(100))

FINE TUNING

In [31]:
model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased',
                                                      num_labels =NUM_LABELS,
                                                      id2label = {0: 'anger', 1: 'fear', 2: 'joy', 3: 'sadness'},
                                                      use_safetensors=True
)

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [32]:
metric = evaluate.load('accuracy')

In [33]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis = -1)
    return metric.compute(predictions=predictions, references=labels)
            

In [34]:
trainings_args = TrainingArguments(output_dir='test_trainer', eval_strategy='epoch', num_train_epochs=3)


In [40]:
trainer = Trainer(
    model=model, 
    args=trainings_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_test_dataset,
    compute_metrics=compute_metrics)

In [41]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.420229,0.22
2,No log,1.412532,0.21
3,No log,1.400505,0.22


TrainOutput(global_step=39, training_loss=1.4149353809845753, metrics={'train_runtime': 477.6473, 'train_samples_per_second': 0.628, 'train_steps_per_second': 0.082, 'total_flos': 21366375321600.0, 'train_loss': 1.4149353809845753, 'epoch': 3.0})

Evaluate Model

In [42]:
trainer.evaluate()

{'eval_loss': 1.4005045890808105,
 'eval_accuracy': 0.22,
 'eval_runtime': 37.6846,
 'eval_samples_per_second': 2.654,
 'eval_steps_per_second': 0.345,
 'epoch': 3.0}

In [43]:
model.save_pretrained("fine_tuned_model")

In [45]:
fine_tuned_model = XLNetForSequenceClassification.from_pretrained('fine_tuned_model')

In [46]:
clf = pipeline("text-classification", fine_tuned_model, tokenizer=tokenizer)

Device set to use cpu


In [47]:
rand_int = random.randint(0, len(val_split))
print(val_split['text_clean'][rand_int])
answer = clf(val_split['text_clean'][rand_int], top_k = None)
print(answer)

  print(val_split['text_clean'][rand_int])
  answer = clf(val_split['text_clean'][rand_int], top_k = None)


i wonder how a guy can broke his penis while having sex? #serious
[{'label': 'sadness', 'score': 0.30957669019699097}, {'label': 'anger', 'score': 0.2612883150577545}, {'label': 'joy', 'score': 0.21543504297733307}, {'label': 'fear', 'score': 0.21369995176792145}]
