# Imports

In [None]:
!pip install transformers==4.30.2
!pip install simpletransformers

In [None]:
import pandas as pd
import numpy as np
import logging
import torch
import sklearn
from torch import cuda
import gc
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
from simpletransformers.classification import ClassificationModel, ClassificationArgs
from sklearn.model_selection import train_test_split
from google.colab import drive

# Functions

In [None]:
# Mount Google Drive so the dataset CSVs under /content/drive are readable.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
def empty_cache(model=None, optimizer=None):
  """Release unreferenced Python objects and cached CUDA memory.

  Args:
    model: Unused; accepted so the function can be called with a
      ``(model, optimizer)`` callback-style signature.
    optimizer: Unused; see ``model``.

  Returns:
    None.
  """
  # Collect first: tensors kept alive only by reference cycles must be
  # destroyed before their blocks can be handed back by the cache flush.
  gc.collect()
  if cuda.is_available():
    torch.cuda.empty_cache()

In [None]:
def f1_multiclass(labels, preds):
  """Return the macro-averaged F1 score of ``preds`` against ``labels``."""
  macro_f1 = f1_score(y_true=labels, y_pred=preds, average='macro')
  return macro_f1

def precision_multiclass(labels, preds):
  """Return the macro-averaged precision of ``preds`` against ``labels``."""
  macro_precision = precision_score(y_true=labels, y_pred=preds, average='macro')
  return macro_precision

def recall_multiclass(labels, preds):
  """Return the macro-averaged recall of ``preds`` against ``labels``."""
  macro_recall = recall_score(y_true=labels, y_pred=preds, average='macro')
  return macro_recall

In [None]:
# Unified emotion label set, in the fixed order used for training
# (`labels_list`) and for the evaluation report.
label_list = [
    'anger', 'anticipation', 'disgust', 'fear', 'joy',
    'love', 'neutral', 'sadness', 'surprise', 'trust',
]

In [None]:
def split_df(df, test_size=0.20, random_state=42):
  """Split a dataset into stratified train and test frames.

  Args:
    df: DataFrame with a 'text' column and a 'labels' column.
    test_size: Size of the held-out test set, forwarded unchanged to
      ``train_test_split``. Defaults to 0.20.
    random_state: Seed forwarded to ``train_test_split`` so the split is
      reproducible. Defaults to 42.

  Returns:
    Tuple ``(train_df, test_df)``. Each frame has the columns 'labels' and
    'text' (in that order) and keeps the row index of ``df``.
  """
  X_train, X_test, y_train, y_test = train_test_split(
      df['text'], df['labels'],
      stratify=df['labels'],  # keep class proportions equal in both parts
      test_size=test_size,
      random_state=random_state)

  train_df = pd.DataFrame({'labels': y_train, 'text': X_train})
  test_df = pd.DataFrame({'labels': y_test, 'text': X_test})

  return train_df, test_df

In [None]:
def run_transformers(train_df, eval_df, label_encoder, reps=1):
    """Train a fresh DistilRoBERTa classifier ``reps`` times and predict ``eval_df``.

    Args:
        train_df: Training frame passed to ``model.train_model``.
            NOTE(review): labels are presumably already encoded with
            ``label_encoder`` — confirm against the caller.
        eval_df: Frame whose 'text' column is predicted after each training run.
        label_encoder: Fitted encoder; ``classes_`` sets the number of labels
            and ``inverse_transform`` maps predictions back to original labels.
        reps: Number of independent train/predict repetitions.

    Returns:
        List with one array of decoded predictions per repetition.
    """
    use_gpu = cuda.is_available()
    eval_texts = eval_df['text'].tolist()
    all_preds = []

    for _ in range(reps):
        model = ClassificationModel(
            "roberta",
            "distilroberta-base",
            num_labels=len(label_encoder.classes_),
            # `use_cuda` is a constructor argument. Inside the `args` dict it
            # had no effect, so the constructor default (True) made CPU-only
            # runs fail instead of falling back.
            use_cuda=use_gpu,
            args={
                'num_train_epochs': 2,
                'max_seq_length': 256,
                'learning_rate': 1e-5,
                'overwrite_output_dir': True,
                # NOTE(review): 'custom_callback' is not among the documented
                # simpletransformers options — confirm it is ever invoked.
                'custom_callback': empty_cache
            }
        )
        model.train_model(train_df)
        predictions, _ = model.predict(eval_texts)
        all_preds.append(label_encoder.inverse_transform(predictions))

        # Drop this repetition's model before the next one is built so two
        # models never have to fit in (GPU) memory at the same time.
        del model
        gc.collect()
        if use_gpu:
            torch.cuda.empty_cache()

    return all_preds

In [None]:
def build_model(train_df):
  """Train a DistilRoBERTa classifier over ``label_list`` on ``train_df``.

  Args:
    train_df: Training frame passed to ``model.train_model``.
      NOTE(review): presumably needs 'text' and 'labels' columns whose label
      values all appear in ``label_list`` — confirm.

  Returns:
    The trained ``ClassificationModel``.
  """
  model = ClassificationModel(
      "roberta",
      "distilroberta-base",
      num_labels=len(label_list),
      # Fall back to CPU when no GPU is present; the constructor default
      # (use_cuda=True) raises on CPU-only runtimes.
      use_cuda=cuda.is_available(),
      args={'num_train_epochs': 2,
            'labels_list': label_list,
            'max_seq_length': 256,
            'learning_rate': 1e-5,
            'overwrite_output_dir': True,
            # NOTE(review): 'custom_callback' is not among the documented
            # simpletransformers options — confirm it is ever invoked.
            'custom_callback': empty_cache,
            }
    )

  # train the model
  model.train_model(train_df)

  return model

In [None]:
def predict_labels(model, test_df):
  """Predict ``test_df['text']`` and show a confusion matrix and report.

  Args:
    model: Trained classifier whose ``predict(list_of_texts)`` returns a
      ``(predictions, raw_outputs)`` pair.
    test_df: Frame with a 'text' column to predict and a 'labels' column
      holding the true labels.

  Returns:
    The predictions returned by ``model.predict``.
  """
  # test the model with testing dataframe
  predictions, _ = model.predict(test_df['text'].tolist())

  # Fix the matrix axes to `label_list` so rows/columns are named and match
  # the report below even when a dataset lacks some of the labels.
  matrix = confusion_matrix(test_df['labels'], predictions, labels=label_list)
  display(pd.DataFrame(matrix, index=label_list, columns=label_list))

  # zero_division=0 reports 0.0 for labels without samples/predictions and
  # suppresses the repeated undefined-metric warnings.
  print(classification_report(test_df['labels'], predictions,
                              labels=label_list, zero_division=0))

  return predictions

# Load Datasets

In [None]:
# Google Drive folders holding the unified-format dataset CSVs. Change these
# in one place if the project folder moves.
PROJECT_DIR = '/content/drive/My Drive/Emotion Data Analysis Project'
UNIFIED_DIR = PROJECT_DIR + '/data/unified formatted data'
JOHN_UFD_DIR = PROJECT_DIR + '/John/john ufd'

carer_path = UNIFIED_DIR + '/CARER_unified_sample.csv'
covid_worry_path = UNIFIED_DIR + '/covid_worry_unified.csv'
emo_event_path = UNIFIED_DIR + '/emoevent_unified.csv'
isear_path = UNIFIED_DIR + '/enISEAR_unified.csv'
git_tr_path = UNIFIED_DIR + '/github_love_train_unified.csv'
git_tst_path = UNIFIED_DIR + '/github_love_test_unified.csv'
go_emo_tr_path = UNIFIED_DIR + '/go_emotions_train_unified.csv'
go_emo_tst_path = UNIFIED_DIR + '/go_emotions_test_unified.csv'
good_news_path = JOHN_UFD_DIR + '/goodnewseveryone_ufd_single.csv'
stackOV_path = UNIFIED_DIR + '/stackOV_GoldLabels_unified.csv'
tweeteval_tr_path = UNIFIED_DIR + '/tweeteval_train_unified.csv'
tweeteval_tst_path = UNIFIED_DIR + '/tweeteval_test_unified.csv'
uni_joy_path = JOHN_UFD_DIR + '/universal_joy_sample_ufd_single.csv'
wassa_tr_path = UNIFIED_DIR + '/wassa_train_unified.csv'
wassa_tst_path = UNIFIED_DIR + '/wassa_test_unified.csv'

In [None]:
# Load the CARER sample and split it into train/test frames with split_df.
carer = pd.read_csv(carer_path)
carer_tr, carer_tst = split_df(carer)

In [None]:
# Load the Covid Worry data and split it into train/test frames with split_df.
covid_worry = pd.read_csv(covid_worry_path)
covid_tr , covid_tst = split_df(covid_worry)

In [None]:
# Load EmoEvent and apply our own split_df split (rather than a pre-made
# split) to improve on historical performance.
emo_event = pd.read_csv(emo_event_path)
emoev_tr , emoev_tst = split_df(emo_event)

In [None]:
# Load enISEAR and split it into train/test frames with split_df.
isear = pd.read_csv(isear_path)
isear_tr , isear_tst = split_df(isear)

In [None]:
# GitHub Love is read from separate train and test files; no split_df call.
git_tr = pd.read_csv(git_tr_path)
git_tst = pd.read_csv(git_tst_path)

In [None]:
# GoEmotions is read from separate train and test files; only the
# 'labels' and 'text' columns are kept from each.
go_emo_tr = pd.read_csv(go_emo_tr_path)[["labels", "text"]]
go_emo_tst = pd.read_csv(go_emo_tst_path)[["labels", "text"]]

In [None]:
# Load GoodNewsEveryone and split it into train/test frames with split_df.
good_news = pd.read_csv(good_news_path)
gne_tr , gne_tst = split_df(good_news)

In [None]:
# Load the Stack Overflow gold labels and split them with split_df.
stackOV = pd.read_csv(stackOV_path)
stackOV_tr , stackOV_tst = split_df(stackOV)

In [None]:
# TweetEval is read from separate train and test files. Keep only the
# 'labels' and 'text' columns, then drop rows missing either value.
tweeteval_tr = pd.read_csv(tweeteval_tr_path)[["labels", "text"]].dropna()
tweeteval_tst = pd.read_csv(tweeteval_tst_path)[["labels", "text"]].dropna()

In [None]:
# Load the Universal Joy sample and split it into train/test frames.
uni_joy = pd.read_csv(uni_joy_path)
uni_tr, uni_tst = split_df(uni_joy)

In [None]:
# WASSA is read from separate train and test files; no split_df call.
wassa_tr = pd.read_csv(wassa_tr_path)
wassa_tst = pd.read_csv(wassa_tst_path)

# Create Merged Master Training Set

In [None]:
# Tag every training frame with the name of the dataset it came from so the
# origin of each row stays visible after the frames are merged.
for _frame, _origin in (
    (carer_tr, 'carer'),
    (covid_tr, 'covid'),
    (emoev_tr, 'emoevent'),
    (isear_tr, 'enisear'),
    (git_tr, 'github'),
    (gne_tr, 'gne'),
    (go_emo_tr, 'goemotions'),
    (stackOV_tr, 'stackOV'),
    (tweeteval_tr, 'tweeteval'),
    (uni_tr, 'universal joy'),
    (wassa_tr, 'wassa21'),
):
    _frame['source'] = _origin

In [None]:
# Every training frame that goes into the merged master training set.
# The order here is the row order of the merged frame before shuffling.
datasets = [
    carer_tr, covid_tr, emoev_tr, isear_tr,
    git_tr, go_emo_tr, gne_tr, stackOV_tr,
    tweeteval_tr, uni_tr, wassa_tr,
]

In [None]:
# Stack all training frames into one; ignore_index=True renumbers the rows.
master_train = pd.concat(datasets, ignore_index=True)

In [None]:
# Shuffle the rows with a fixed seed so the order is reproducible, then
# preview the merged frame.
master_train = shuffle(master_train, random_state=42)
display(master_train)

Unnamed: 0,labels,text,source
142186,joy,specifically choreographed to be appreciated b...,universal joy
148668,joy,[PHOTO] It's [PERSON] 's first birthday. Break...,universal joy
8839,anger,i have looked around for examples of other peo...,carer
57185,neutral,HASHTAG HASHTAG HASHTAG [1] Billionaires rushe...,emoevent
40787,anger,i had a feeling you would be really bitchy and...,carer
...,...,...,...
119879,sadness,[PERSON] ... always remember your early karaok...,universal joy
103694,anger,Pompeo: Iran is responsible for today’s tanker...,gne
131932,sadness,.......No one should have to go through someth...,universal joy
146867,sadness,[PHOTO] hack for war commander 0FRINDS guys en...,universal joy


In [None]:
# Report the merged set's dimensions and the number of rows per label.
print(master_train.shape)
print(master_train['labels'].value_counts())

(180615, 3)
labels
joy             61938
sadness         29316
anticipation    28114
anger           19501
neutral         15549
fear            10154
surprise         7423
love             6775
disgust          1746
trust              99
Name: count, dtype: int64


# Classification

## Build Model

In [None]:
# Train one classifier on the merged training set; every evaluation cell
# below reuses this model.
master_train_model = build_model(master_train)

## Test Model

### CARER



In [None]:
# Evaluate the merged-data model on the CARER test split.
# NOTE: `predictions` is overwritten by each evaluation cell.
predictions = predict_labels(master_train_model, carer_tst)

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/126 [00:00<?, ?it/s]

array([[1594,    2,   72,    8,    1,    2,   41,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0],
       [  11,    0, 1282,    8,    1,    0,   46,   83],
       [   1,    1,    6, 3925,  260,    0,   10,   29],
       [   0,    0,    1,   61,  972,    0,    3,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0],
       [  60,    0,   45,    8,    3,    0, 3519,    1],
       [   1,    0,   42,    3,    0,    0,    5,  398]])

              precision    recall  f1-score   support

       anger       0.96      0.93      0.94      1720
anticipation       0.00      0.00      0.00         0
     disgust       0.00      0.00      0.00         0
        fear       0.89      0.90      0.89      1431
         joy       0.98      0.93      0.95      4232
        love       0.79      0.94      0.85      1037
     neutral       0.00      0.00      0.00         0
     sadness       0.97      0.97      0.97      3636
    surprise       0.78      0.89      0.83       449
       trust       0.00      0.00      0.00         0

   micro avg       0.93      0.93      0.93     12505
   macro avg       0.54      0.55      0.54     12505
weighted avg       0.94      0.93      0.94     12505



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Covid Worry



In [None]:
# Evaluate the merged-data model on the Covid Worry test split.
predictions = predict_labels(master_train_model, covid_tst)

### Emo-Event

In [None]:
# Evaluate the merged-data model on the EmoEvent test split.
predictions = predict_labels(master_train_model, emoev_tst)

### enISEAR

In [None]:
# Evaluate the merged-data model on the enISEAR test split.
predictions = predict_labels(master_train_model, isear_tst)

### Github Love

In [None]:
# Evaluate the merged-data model on the GitHub Love test file.
predictions = predict_labels(master_train_model, git_tst)

### GoEmotions

In [None]:
# Evaluate the merged-data model on the GoEmotions test file.
predictions = predict_labels(master_train_model, go_emo_tst)

### GoodNewsEveryone

In [None]:
# Evaluate the merged-data model on the GoodNewsEveryone test split.
predictions = predict_labels(master_train_model, gne_tst)

### Stack-OV

In [None]:
# Evaluate the merged-data model on the Stack Overflow test split.
predictions = predict_labels(master_train_model, stackOV_tst)

### Tweeteval

In [None]:
# Evaluate the merged-data model on the TweetEval test file.
predictions = predict_labels(master_train_model, tweeteval_tst)

### Universal Joy

In [None]:
# Evaluate the merged-data model on the Universal Joy test split.
predictions = predict_labels(master_train_model, uni_tst)

### WASSA-21

In [None]:
# Evaluate the merged-data model on the WASSA-21 test file.
predictions = predict_labels(master_train_model, wassa_tst)