# Functions

In [None]:
import pandas as pd
pd.set_option("max_colwidth", 200)
pd.set_option("display.max_columns", None)

import numpy as np
from sklearn import metrics

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="whitegrid")
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and 3.8 removed
# the old aliases, so use whichever name this installation actually provides.
plt.style.use("seaborn-talk" if "seaborn-talk" in plt.style.available else "seaborn-v0_8-talk")


def load_data(file_path, indx = True, indx_col = 0):
  '''Load an Excel (.xlsx) or CSV (.csv) file into a pandas DataFrame.

  Parameters:
  file_path: path to your excel or csv file with data,

  indx: boolean - whether there is index column in your file (usually it is the first column) --> *by default it is set to True

  indx_col: int - if your file has an index column, specify column number here --> *by default it is equal to 0 (first column)

  Returns:
  DataFrame with the file content; the index is taken from column indx_col when indx is truthy.

  Raises:
  ValueError: if file_path ends with neither ".xlsx" nor ".csv" (the extension check is case-insensitive).
  '''
  # Only the suffix check needs a string; the reader receives the path unchanged.
  suffix_source = str(file_path).lower()
  if suffix_source.endswith(".xlsx"):
    reader = pd.read_excel
  elif suffix_source.endswith(".csv"):
    reader = pd.read_csv
  else:
    # Previously this case fell through every branch and failed with UnboundLocalError.
    raise ValueError("Unsupported file type (expected .xlsx or .csv): " + str(file_path))

  if indx:
    return reader(file_path, index_col = indx_col)
  return reader(file_path)

# Emotion mining

In [None]:
def clean_text(df, text_column):
  '''Add a "clean_Text" column holding a normalised copy of df[text_column].

  Each value is converted to str, lower-cased and split on single spaces. Any token
  containing "http" becomes "url"; any token containing "@" that is longer than one
  character becomes "@user". The tokens are re-joined with spaces and the result is
  stripped of leading/trailing whitespace.

  Parameters:
  df: DataFrame to annotate; it is modified in place.

  text_column: name of the column with the raw text.

  Returns:
  The same DataFrame object, now with the "clean_Text" column.
  '''
  new_texts = []
  for text in df[text_column]:
    masked_words = []
    for word in str(text).lower().split(" "):
      if 'http' in word:
        word = "url"
      elif '@' in word and len(word) > 1:
        # a bare "@" is left untouched; anything longer is treated as a mention
        word = "@user"
      masked_words.append(word)
    new_texts.append(" ".join(masked_words).strip())
  df["clean_Text"] = new_texts
  return df

In [None]:
# Load the workbook to annotate; load_data uses the first spreadsheet column as the index.
df = load_data("/content/drive/MyDrive/Colab Notebooks/reddit_IAA/plots_TNE/Polaris/PolarIs1-VaccRed_Emo-Ensmbl.xlsx")
# Alternative input workbooks kept for reference (swap one into the call above to process it):
# "/content/drive/MyDrive/Colab Notebooks/reddit_IAA/plots_TNE/Polaris/PolarIs2-VacRed_Emo-IMA.xlsx"
# /content/drive/MyDrive/Colab Notebooks/reddit_IAA/plots_TNE/Polaris/PolarIs-CChRed_Emo-IMA.xlsx
# /content/drive/MyDrive/Colab Notebooks/reddit_IAA/plots_TNE/Polaris/PolarIs-CChTw_Emo-IMA.xlsx
# "/content/drive/MyDrive/Colab Notebooks/reddit_IAA/plots_TNE/Polaris/PolarIs1-VaccRed_Emo-Ensmbl.xlsx"
print(df.shape)
df.head(2)

(2373, 3)


Unnamed: 0,sentence,original_id,source
0,"Now, double vaccinated dying from COVID are blaming unvaccinated ones.",0,Effective_Ad4588
1,It really is starting to make no sense.,0,Effective_Ad4588


In [None]:
# Hugging Face model ids of the emotion classifiers that are run over the corpus.
# best 5 models w and w/o EMPTY label + new one
models = [
    "suvrobaner/distilbert-base-uncased-finetuned-emotion-en-tweets",
    "stevevee0101/distilbert-base-uncased-finetuned-emotion",
    "Emanuel/bertweet-emotion-base",
    "bhadresh-savani/albert-base-v2-emotion",
    "Emanuel/twitter-emotion-deberta-v3-base",

    "michauhl/distilbert-base-uncased-finetuned-emotion",
    "Zia/distilbert-base-uncased-finetuned-emotion",
    "jkhan447/sentiment-model-sample-5-emotion",
    "bhadresh-savani/distilbert-base-uncased-emotion",
    "bhadresh-savani/distilbert-base-uncased-finetuned-emotion",

    "j-hartmann/emotion-english-distilroberta-base",
]

In [None]:
# Add the clean_Text column (lower-cased, links -> "url", mentions -> "@user");
# this column is what the classifiers receive as input.
df = clean_text(df, 'sentence')
df.tail(2)

Unnamed: 0,sentence,original_id,source,clean_Text
2373,unless I sit on you.,962,twitchspank,unless i sit on you.
2374,you being unvaccinated can increase my chances of catching Covid,962,twitchspank,you being unvaccinated can increase my chances of catching covid


In [None]:
!pip install  transformers[sentencepiece] --q
!pip install emoji==0.6.0 --q

from transformers import pipeline

In [None]:
# Tokenizer warning observed for this model: "Token indices sequence length is longer than
# the specified maximum sequence length for this model (245 > 128). Running this sequence
# through the model will result in indexing errors".
models[2:3]


['Emanuel/bertweet-emotion-base']

In [None]:
# Run every classifier over df['clean_Text'] and store, per model, the predicted
# label in "EMO_<model>" and its score in "CF_<model>".
for mod in models:
  model_path = str(mod)
  try:
    print(mod)
    sentiment_task = pipeline(task = "text-classification", model = model_path, tokenizer = model_path, device=0)
    sequence = df['clean_Text'].tolist()
    if 'Emanuel' in model_path:
      # Character-level cut (first 128 characters, not tokens) applied to the Emanuel/* models.
      # NOTE(review): presumably a workaround for the 128-token limit warning seen with
      # Emanuel/bertweet-emotion-base - confirm it is also wanted for the DeBERTa model.
      sequence = [x[:128] for x in sequence]
    result = sentiment_task(sequence)
    conf = [ x['score'] for x in result ]
    labels = [ x['label'] for x in result ]
    df['EMO_'+str(mod)] = labels
    df['CF_'+str(mod)] = conf
    # Report success only once both columns have actually been written.
    print('success')
    print()
  except Exception as err:
    # A failing model is still skipped, but the reason is now shown, and
    # KeyboardInterrupt/SystemExit are no longer swallowed by a bare except.
    print('except:', type(err).__name__, err)
    print()
    continue

suvrobaner/distilbert-base-uncased-finetuned-emotion-en-tweets


Downloading:   0%|          | 0.00/860 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/333 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

success

stevevee0101/distilbert-base-uncased-finetuned-emotion


Downloading:   0%|          | 0.00/883 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/360 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/125 [00:00<?, ?B/s]

success

Emanuel/bertweet-emotion-base


Downloading:   0%|          | 0.00/1.10k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/540M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/359 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/843k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/17.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

success

bhadresh-savani/albert-base-v2-emotion


Downloading:   0%|          | 0.00/1.10k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/46.8M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/428 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/245 [00:00<?, ?B/s]

success

Emanuel/twitter-emotion-deberta-v3-base


Downloading:   0%|          | 0.00/1.16k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/738M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/392 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/18.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/156 [00:00<?, ?B/s]

  "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"


success

michauhl/distilbert-base-uncased-finetuned-emotion


Downloading:   0%|          | 0.00/860 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/333 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/712k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

success

Zia/distilbert-base-uncased-finetuned-emotion


Downloading:   0%|          | 0.00/883 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/333 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

success

jkhan447/sentiment-model-sample-5-emotion


Downloading:   0%|          | 0.00/995 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/321 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/712k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

success

bhadresh-savani/distilbert-base-uncased-emotion


Downloading:   0%|          | 0.00/768 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/291 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

success

bhadresh-savani/distilbert-base-uncased-finetuned-emotion


Downloading:   0%|          | 0.00/861 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/333 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

success

j-hartmann/emotion-english-distilroberta-base


Downloading:   0%|          | 0.00/1.00k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/329M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/294 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

success



In [None]:
# Spot-check the EMO_*/CF_* columns added by the model loop.
df.head(2)

Unnamed: 0,sentence,original_id,source,clean_Text,EMO_suvrobaner/distilbert-base-uncased-finetuned-emotion-en-tweets,CF_suvrobaner/distilbert-base-uncased-finetuned-emotion-en-tweets,EMO_stevevee0101/distilbert-base-uncased-finetuned-emotion,CF_stevevee0101/distilbert-base-uncased-finetuned-emotion,EMO_Emanuel/bertweet-emotion-base,CF_Emanuel/bertweet-emotion-base,EMO_bhadresh-savani/albert-base-v2-emotion,CF_bhadresh-savani/albert-base-v2-emotion,EMO_Emanuel/twitter-emotion-deberta-v3-base,CF_Emanuel/twitter-emotion-deberta-v3-base,EMO_michauhl/distilbert-base-uncased-finetuned-emotion,CF_michauhl/distilbert-base-uncased-finetuned-emotion,EMO_Zia/distilbert-base-uncased-finetuned-emotion,CF_Zia/distilbert-base-uncased-finetuned-emotion,EMO_jkhan447/sentiment-model-sample-5-emotion,CF_jkhan447/sentiment-model-sample-5-emotion,EMO_bhadresh-savani/distilbert-base-uncased-emotion,CF_bhadresh-savani/distilbert-base-uncased-emotion,EMO_bhadresh-savani/distilbert-base-uncased-finetuned-emotion,CF_bhadresh-savani/distilbert-base-uncased-finetuned-emotion,EMO_j-hartmann/emotion-english-distilroberta-base,CF_j-hartmann/emotion-english-distilroberta-base
0,"Now, double vaccinated dying from COVID are blaming unvaccinated ones.",0,Effective_Ad4588,"now, double vaccinated dying from covid are blaming unvaccinated ones.",anger,0.626292,LABEL_3,0.445884,sadness,0.913226,anger,0.757638,fear,0.308491,anger,0.689712,LABEL_0,0.820676,LABEL_3,0.983253,anger,0.7931,sadness,0.426712,anger,0.48729
1,It really is starting to make no sense.,0,Effective_Ad4588,it really is starting to make no sense.,fear,0.65089,LABEL_4,0.260814,joy,0.592781,anger,0.608296,joy,0.76722,anger,0.339555,LABEL_1,0.654405,LABEL_0,0.776056,sadness,0.450886,joy,0.588468,sadness,0.415946


In [None]:
# The column count grows by two (EMO_ and CF_) for every model that ran successfully.
df.shape

(2373, 26)

In [None]:
# NOTE: this is the same path the data was loaded from, so the input workbook is
# overwritten with the version that includes clean_Text and the model columns.
df.to_excel("/content/drive/MyDrive/Colab Notebooks/reddit_IAA/plots_TNE/Polaris/PolarIs1-VaccRed_Emo-Ensmbl.xlsx")

# Ensemble model - threshold applied

In [None]:
# Workbook used by this section: it is read below and later overwritten with the
# thresholded ensemble columns.
path = "/content/drive/MyDrive/Colab Notebooks/reddit_IAA/plots_TNE/Polaris/PolarIs2-VacRed_Emo-IMA.xlsx"

In [None]:
# Load the workbook that already holds the per-model EMO_*/CF_* predictions.
df_org = load_data(path)
# Other workbooks this section can be pointed at (assign one of them to `path`):
# "/content/drive/MyDrive/Colab Notebooks/reddit_IAA/plots_TNE/Polaris/PolarIs2-VacRed_Emo-IMA.xlsx"
# "/content/drive/MyDrive/Colab Notebooks/reddit_IAA/plots_TNE/Polaris/PolarIs-CChRed_Emo-IMA.xlsx"
# "/content/drive/MyDrive/Colab Notebooks/reddit_IAA/plots_TNE/Polaris/PolarIs-CChTw_Emo-IMA.xlsx"
# "/content/drive/MyDrive/Colab Notebooks/reddit_IAA/plots_TNE/Polaris/PolarIs1-VaccRed_Emo-Ensmbl.xlsx"
print(df_org.shape)
df_org.head(2)

(3641, 32)


Unnamed: 0,index,full_text_id,conversation_id,source,full_text,sentence,EMO_suvrobaner/distilbert-base-uncased-finetuned-emotion-en-tweets,CF_suvrobaner/distilbert-base-uncased-finetuned-emotion-en-tweets,EMO_stevevee0101/distilbert-base-uncased-finetuned-emotion,CF_stevevee0101/distilbert-base-uncased-finetuned-emotion,EMO_bhadresh-savani/albert-base-v2-emotion,CF_bhadresh-savani/albert-base-v2-emotion,EMO_michauhl/distilbert-base-uncased-finetuned-emotion,CF_michauhl/distilbert-base-uncased-finetuned-emotion,EMO_Zia/distilbert-base-uncased-finetuned-emotion,CF_Zia/distilbert-base-uncased-finetuned-emotion,EMO_jkhan447/sentiment-model-sample-5-emotion,CF_jkhan447/sentiment-model-sample-5-emotion,EMO_bhadresh-savani/distilbert-base-uncased-emotion,CF_bhadresh-savani/distilbert-base-uncased-emotion,EMO_bhadresh-savani/distilbert-base-uncased-finetuned-emotion,CF_bhadresh-savani/distilbert-base-uncased-finetuned-emotion,EMO_Emanuel/bertweet-emotion-base,CF_Emanuel/bertweet-emotion-base,EMO_Emanuel/twitter-emotion-deberta-v3-base,CF_Emanuel/twitter-emotion-deberta-v3-base,EMO_j-hartmann/emotion-english-distilroberta-base,CF_j-hartmann/emotion-english-distilroberta-base,clean_Text,label_ensemble,cf_ensemble,label_ensemble2
0.0,0.0,0.0,1.559359e+18,marsons80,@ThePurrrple @MAGAThassell @The_VerminatorD @MattHopkins_au Its not a vaccine.,Its not a vaccine.,fear,0.811617,LABEL_4,0.525668,anger,0.617615,anger,0.401311,LABEL_4,0.901119,LABEL_3,0.983357,anger,0.733883,anger,0.681173,joy,0.441959,joy,0.391673,neutral,0.890355,its not a vaccine.,anger,0.5,EMPTY
1.0,1.0,1.0,1.559359e+18,marsons80,"@ThePurrrple @MAGAThassell @The_VerminatorD @MattHopkins_au Oh and make sure to keep on taking them boosters, can never be to safe.","Oh and make sure to keep on taking them boosters, can never be to safe.",joy,0.98813,LABEL_1,0.986144,joy,0.989787,joy,0.999405,LABEL_1,0.989185,LABEL_1,0.999891,joy,0.997552,joy,0.996328,joy,0.638236,joy,0.981926,neutral,0.663714,"oh and make sure to keep on taking them boosters, can never be to safe.",joy,0.88,joy


## Calculate label

In [None]:
# This workbook already has an "index" column, so no reset_index() is needed here.
#df_org = df_org.reset_index()
# Work on a copy: the thresholding below blanks labels in df, while df_org (the frame
# that is saved at the end of this section) keeps the original predictions.
df = df_org.copy()

In [None]:
# Names of the per-model label columns (every column whose name contains "EMO_").
mdls = df.filter(regex='EMO_').columns.tolist()
mdls

['EMO_suvrobaner/distilbert-base-uncased-finetuned-emotion-en-tweets',
 'EMO_stevevee0101/distilbert-base-uncased-finetuned-emotion',
 'EMO_bhadresh-savani/albert-base-v2-emotion',
 'EMO_michauhl/distilbert-base-uncased-finetuned-emotion',
 'EMO_Zia/distilbert-base-uncased-finetuned-emotion',
 'EMO_jkhan447/sentiment-model-sample-5-emotion',
 'EMO_bhadresh-savani/distilbert-base-uncased-emotion',
 'EMO_bhadresh-savani/distilbert-base-uncased-finetuned-emotion',
 'EMO_Emanuel/bertweet-emotion-base',
 'EMO_Emanuel/twitter-emotion-deberta-v3-base',
 'EMO_j-hartmann/emotion-english-distilroberta-base']

In [None]:
# Number of models that have a label column in this workbook.
len(mdls)

11

In [None]:
# Keep only the models that emit named emotions. Models whose predictions are the generic
# "LABEL_<n>" placeholders cannot take part in the vote, and checking the prefix (rather
# than the literal 'LABEL_0') still catches a model that never predicted class 0 here.
mdls2 = []
for m in mdls:
  print(m)
  #print(df[m].unique(), '\n')
  if not df[m].astype(str).str.startswith('LABEL_').any():
    mdls2.append(m)
mdls2

EMO_suvrobaner/distilbert-base-uncased-finetuned-emotion-en-tweets
EMO_stevevee0101/distilbert-base-uncased-finetuned-emotion
EMO_bhadresh-savani/albert-base-v2-emotion
EMO_michauhl/distilbert-base-uncased-finetuned-emotion
EMO_Zia/distilbert-base-uncased-finetuned-emotion
EMO_jkhan447/sentiment-model-sample-5-emotion
EMO_bhadresh-savani/distilbert-base-uncased-emotion
EMO_bhadresh-savani/distilbert-base-uncased-finetuned-emotion
EMO_Emanuel/bertweet-emotion-base
EMO_Emanuel/twitter-emotion-deberta-v3-base
EMO_j-hartmann/emotion-english-distilroberta-base


['EMO_suvrobaner/distilbert-base-uncased-finetuned-emotion-en-tweets',
 'EMO_bhadresh-savani/albert-base-v2-emotion',
 'EMO_michauhl/distilbert-base-uncased-finetuned-emotion',
 'EMO_bhadresh-savani/distilbert-base-uncased-emotion',
 'EMO_bhadresh-savani/distilbert-base-uncased-finetuned-emotion',
 'EMO_Emanuel/bertweet-emotion-base',
 'EMO_Emanuel/twitter-emotion-deberta-v3-base',
 'EMO_j-hartmann/emotion-english-distilroberta-base']

In [None]:
# From here on, only the models with named emotion labels take part in the vote.
mdls = mdls2
len(mdls)

8

In [None]:
# Matching confidence columns, in the same order as mdls (EMO_<model> -> CF_<model>).
cfs = [str(c).replace('EMO_', 'CF_') for c in mdls]
cfs

['CF_suvrobaner/distilbert-base-uncased-finetuned-emotion-en-tweets',
 'CF_bhadresh-savani/albert-base-v2-emotion',
 'CF_michauhl/distilbert-base-uncased-finetuned-emotion',
 'CF_bhadresh-savani/distilbert-base-uncased-emotion',
 'CF_bhadresh-savani/distilbert-base-uncased-finetuned-emotion',
 'CF_Emanuel/bertweet-emotion-base',
 'CF_Emanuel/twitter-emotion-deberta-v3-base',
 'CF_j-hartmann/emotion-english-distilroberta-base']

In [None]:
# Look at the last rows before low-confidence predictions are masked.
df.tail(3)

Unnamed: 0,index,full_text_id,conversation_id,source,full_text,sentence,EMO_suvrobaner/distilbert-base-uncased-finetuned-emotion-en-tweets,CF_suvrobaner/distilbert-base-uncased-finetuned-emotion-en-tweets,EMO_stevevee0101/distilbert-base-uncased-finetuned-emotion,CF_stevevee0101/distilbert-base-uncased-finetuned-emotion,EMO_bhadresh-savani/albert-base-v2-emotion,CF_bhadresh-savani/albert-base-v2-emotion,EMO_michauhl/distilbert-base-uncased-finetuned-emotion,CF_michauhl/distilbert-base-uncased-finetuned-emotion,EMO_Zia/distilbert-base-uncased-finetuned-emotion,CF_Zia/distilbert-base-uncased-finetuned-emotion,EMO_jkhan447/sentiment-model-sample-5-emotion,CF_jkhan447/sentiment-model-sample-5-emotion,EMO_bhadresh-savani/distilbert-base-uncased-emotion,CF_bhadresh-savani/distilbert-base-uncased-emotion,EMO_bhadresh-savani/distilbert-base-uncased-finetuned-emotion,CF_bhadresh-savani/distilbert-base-uncased-finetuned-emotion,EMO_Emanuel/bertweet-emotion-base,CF_Emanuel/bertweet-emotion-base,EMO_Emanuel/twitter-emotion-deberta-v3-base,CF_Emanuel/twitter-emotion-deberta-v3-base,EMO_j-hartmann/emotion-english-distilroberta-base,CF_j-hartmann/emotion-english-distilroberta-base,clean_Text,label_ensemble,cf_ensemble,label_ensemble2
3638.0,3638.0,1590.0,1.562223e+18,fucklongcovid,"@narstyfakker oh, i meant exactly what you're describing.. going to hospital / urgent care / drugstore full of sick people. not work the additional risk imho but some people need to know if they'r...","oh, i meant exactly what you're describing.. going to hospital /",anger,0.254268,LABEL_3,0.39875,anger,0.533023,joy,0.462682,LABEL_4,0.413208,LABEL_3,0.965538,joy,0.495366,joy,0.377448,sadness,0.556666,joy,0.743066,surprise,0.862832,"oh, i meant exactly what you're describing.. going to hospital /",joy,0.5,EMPTY
3639.0,3639.0,1590.0,1.562223e+18,fucklongcovid,"@narstyfakker oh, i meant exactly what you're describing.. going to hospital / urgent care / drugstore full of sick people. not work the additional risk imho but some people need to know if they'r...",urgent care / drugstore full of sick people.,fear,0.774418,LABEL_3,0.497917,anger,0.759238,fear,0.696654,LABEL_4,0.977165,LABEL_4,0.546234,fear,0.947293,fear,0.883537,fear,0.547622,fear,0.536578,disgust,0.595976,urgent care / drugstore full of sick people.,fear,0.75,fear
3640.0,3640.0,1590.0,1.562223e+18,fucklongcovid,"@narstyfakker oh, i meant exactly what you're describing.. going to hospital / urgent care / drugstore full of sick people. not work the additional risk imho but some people need to know if they'r...","not work the additional risk imho but some people need to know if they're positive so they can inform their employer, etc",joy,0.992235,LABEL_1,0.992871,joy,0.991676,joy,0.999409,LABEL_1,0.997578,LABEL_1,0.999919,joy,0.997865,joy,0.986842,joy,0.991602,joy,0.986612,neutral,0.881734,"not work the additional risk imho but some people need to know if they're positive so they can inform their employer, etc",joy,0.88,joy


In [None]:
# Blank out (NaN) every prediction whose confidence is below 0.5 so that it does not
# count as a vote; mdls and cfs are parallel lists of label/confidence column names.
for emo_col, cf_col in zip(mdls, cfs):
  df[emo_col] = df[emo_col].where(df[cf_col] >= 0.5)

In [None]:
# Predictions with confidence below 0.5 now appear as missing values in the EMO_ columns.
df.head(3)

Unnamed: 0,index,full_text_id,conversation_id,source,full_text,sentence,EMO_suvrobaner/distilbert-base-uncased-finetuned-emotion-en-tweets,CF_suvrobaner/distilbert-base-uncased-finetuned-emotion-en-tweets,EMO_stevevee0101/distilbert-base-uncased-finetuned-emotion,CF_stevevee0101/distilbert-base-uncased-finetuned-emotion,EMO_bhadresh-savani/albert-base-v2-emotion,CF_bhadresh-savani/albert-base-v2-emotion,EMO_michauhl/distilbert-base-uncased-finetuned-emotion,CF_michauhl/distilbert-base-uncased-finetuned-emotion,EMO_Zia/distilbert-base-uncased-finetuned-emotion,CF_Zia/distilbert-base-uncased-finetuned-emotion,EMO_jkhan447/sentiment-model-sample-5-emotion,CF_jkhan447/sentiment-model-sample-5-emotion,EMO_bhadresh-savani/distilbert-base-uncased-emotion,CF_bhadresh-savani/distilbert-base-uncased-emotion,EMO_bhadresh-savani/distilbert-base-uncased-finetuned-emotion,CF_bhadresh-savani/distilbert-base-uncased-finetuned-emotion,EMO_Emanuel/bertweet-emotion-base,CF_Emanuel/bertweet-emotion-base,EMO_Emanuel/twitter-emotion-deberta-v3-base,CF_Emanuel/twitter-emotion-deberta-v3-base,EMO_j-hartmann/emotion-english-distilroberta-base,CF_j-hartmann/emotion-english-distilroberta-base,clean_Text,label_ensemble,cf_ensemble,label_ensemble2
0.0,0.0,0.0,1.559359e+18,marsons80,@ThePurrrple @MAGAThassell @The_VerminatorD @MattHopkins_au Its not a vaccine.,Its not a vaccine.,fear,0.811617,LABEL_4,0.525668,anger,0.617615,,0.401311,LABEL_4,0.901119,LABEL_3,0.983357,anger,0.733883,anger,0.681173,,0.441959,,0.391673,neutral,0.890355,its not a vaccine.,anger,0.5,EMPTY
1.0,1.0,1.0,1.559359e+18,marsons80,"@ThePurrrple @MAGAThassell @The_VerminatorD @MattHopkins_au Oh and make sure to keep on taking them boosters, can never be to safe.","Oh and make sure to keep on taking them boosters, can never be to safe.",joy,0.98813,LABEL_1,0.986144,joy,0.989787,joy,0.999405,LABEL_1,0.989185,LABEL_1,0.999891,joy,0.997552,joy,0.996328,joy,0.638236,joy,0.981926,neutral,0.663714,"oh and make sure to keep on taking them boosters, can never be to safe.",joy,0.88,joy
2.0,2.0,2.0,1.559359e+18,BourgaultGilles,"@DevineSpeak @yates_brit @catfish8888 @MattHopkins_au When you use antivax bullshit has excuse not get vax, you are antivax. 🤷‍♂️","When you use antivax bullshit has excuse not get vax, you are antivax.",anger,0.907409,LABEL_3,0.981543,anger,0.668361,anger,0.997937,LABEL_3,0.995523,LABEL_3,0.999802,anger,0.994301,anger,0.991591,,0.462121,anger,0.985883,anger,0.672423,"when you use antivax bullshit has excuse not get vax, you are antivax.",anger,1.0,anger


In [None]:
# Long format: one row per (sentence, model) vote. The columns are renamed after melting
# because df already has a "label_ensemble" column, which melt() rejects as value_name.
dfm = (
  df.melt(id_vars="index", value_vars=mdls)
    .set_axis(['index', 'model', 'label_ensemble'], axis=1)
)
dfm.head()

Unnamed: 0,index,model,label_ensemble
0,0.0,EMO_suvrobaner/distilbert-base-uncased-finetuned-emotion-en-tweets,fear
1,1.0,EMO_suvrobaner/distilbert-base-uncased-finetuned-emotion-en-tweets,joy
2,2.0,EMO_suvrobaner/distilbert-base-uncased-finetuned-emotion-en-tweets,anger
3,3.0,EMO_suvrobaner/distilbert-base-uncased-finetuned-emotion-en-tweets,joy
4,4.0,EMO_suvrobaner/distilbert-base-uncased-finetuned-emotion-en-tweets,joy


In [None]:
# Votes per label for every sentence (missing labels are not counted), ordered so that the
# first row of each sentence is its most frequent label; ties are ordered by label, descending.
maj_dev = (
  dfm.groupby(['index'], as_index = True)['label_ensemble']
     .value_counts()
     .rename('label_size')
     .reset_index()
     .sort_values(by = ['index', 'label_size', 'label_ensemble'], ascending = [True, False, False])
)
maj_dev.head()

Unnamed: 0,index,label_ensemble,label_size
0,0.0,anger,3
2,0.0,neutral,1
1,0.0,fear,1
3,1.0,joy,7
4,1.0,neutral,1


In [None]:
# Accumulators filled by the next cell: sentence ids, winning labels, agreement scores.
texts, hard, cnt = [], [], []

In [None]:
# Collapse the vote counts into one winning label per sentence.
# maj_dev is sorted by (index asc, label_size desc, label desc), so the first row of each
# group is the majority label and ties go to the alphabetically last label.
# The lists are rebuilt here so re-running the cell cannot append duplicates, and each
# sentence's rows are visited once instead of re-filtering maj_dev for every sentence.
texts, hard, cnt = [], [], []
for t, votes in maj_dev.groupby('index', sort = False):
  texts.append(t)
  hard.append(votes['label_ensemble'].iloc[0])
  # Agreement = winning votes / votes that survived the confidence threshold.
  cnt.append(round(votes['label_size'].iloc[0] / votes['label_size'].sum(), 2))
len(cnt), len(hard)

(3641, 3641)

In [None]:
# One row per sentence: the majority label and the share of valid votes it received.
df_res = pd.DataFrame({'index': texts, 'label_ensemble_threshold': hard, 'cf_ensemble_threshold': cnt})
df_res.head()

Unnamed: 0,index,label_ensemble_threshold,cf_ensemble_threshold
0,0.0,anger,0.6
1,1.0,joy,0.88
2,2.0,anger,1.0
3,3.0,joy,0.5
4,4.0,joy,0.67


In [None]:
# Share (%) of sentences per majority label.
df_res.label_ensemble_threshold.value_counts(normalize=True).round(3)*100

joy         41.4
anger       34.6
sadness     11.6
fear         9.2
surprise     1.4
neutral      1.3
love         0.5
Name: label_ensemble_threshold, dtype: float64

In [None]:
# Summary statistics of the agreement score (cf_ensemble_threshold).
df_res.describe().round(3)

Unnamed: 0,index,cf_ensemble_threshold
count,3641.0,3641.0
mean,1820.0,0.76
std,1051.21,0.184
min,0.0,0.2
25%,910.0,0.62
50%,1820.0,0.8
75%,2730.0,0.88
max,3640.0,1.0


In [None]:
# Fraction of sentences whose winning label has agreement < 0.5 and <= 0.5, respectively;
# the second value is the share that is relabelled 'EMPTY' in the next step.
df_res[df_res.cf_ensemble_threshold < 0.5].shape[0]/df.shape[0], df_res[df_res.cf_ensemble_threshold <= 0.5].shape[0]/df.shape[0]

(0.0634441087613293, 0.15297995056303212)

In [None]:
# Keep the majority label only when strictly more than half of the valid votes agree;
# every other sentence is marked 'EMPTY'.
df_res['label_ensemble_threshold2'] = df_res['label_ensemble_threshold'].where(
  df_res['cf_ensemble_threshold'] > 0.5, 'EMPTY'
)

In [None]:
# Expect one row per sentence and four columns (index, label, agreement, label with EMPTY).
df_res.shape

(3641, 4)

In [None]:
# Share (%) of sentences per final label, including the 'EMPTY' bucket.
df_res.label_ensemble_threshold2.value_counts(normalize=True).round(3)*100

joy         36.4
anger       31.3
EMPTY       15.3
sadness      8.8
fear         6.9
surprise     0.9
love         0.4
neutral      0.1
Name: label_ensemble_threshold2, dtype: float64

In [None]:
# Dry run of the merge to inspect the resulting columns and non-null counts; the result is not kept.
df.merge(df_res, on = 'index', how='left').info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3641 entries, 0 to 3640
Data columns (total 35 columns):
 #   Column                                                              Non-Null Count  Dtype  
---  ------                                                              --------------  -----  
 0   index                                                               3641 non-null   float64
 1   full_text_id                                                        3641 non-null   float64
 2   conversation_id                                                     3641 non-null   float64
 3   source                                                              3641 non-null   object 
 4   full_text                                                           3641 non-null   object 
 5   sentence                                                            3641 non-null   object 
 6   EMO_suvrobaner/distilbert-base-uncased-finetuned-emotion-en-tweets  2513 non-null   object 
 7   CF_suvrobaner/d

## Save

In [None]:
# Sanity check before merging: there should be exactly one ensemble row per input row.
df_res.shape[0] == df_org.shape[0]

True

In [None]:
# Shapes of the ensemble result and of the original frame it will be merged into.
df_res.shape, df_org.shape

((3641, 4), (3641, 32))

In [None]:
# Attach the thresholded ensemble columns to the original (un-thresholded) frame.
# NOTE: re-running this cell merges again and produces suffixed duplicate columns.
df_org = df_org.merge(df_res, on = 'index', how='left')
# Writes back to the same path the data was loaded from, overwriting the input workbook.
df_org.to_excel(str(path))

# Ensemble model without threshold

In [None]:
# Workbook used by this section: it is read below and overwritten at the end with the
# ensemble columns added.
path = "/content/drive/MyDrive/Colab Notebooks/reddit_IAA/plots_TNE/Polaris/PolarIs1-VaccRed_Emo-Ensmbl.xlsx"

In [None]:
# Load the workbook with the per-model EMO_*/CF_* predictions.
df = load_data(path)
# Other workbooks this section can be pointed at (assign one of them to `path`):
# "/content/drive/MyDrive/Colab Notebooks/reddit_IAA/plots_TNE/Polaris/PolarIs2-VacRed_Emo-IMA.xlsx"
# /content/drive/MyDrive/Colab Notebooks/reddit_IAA/plots_TNE/Polaris/PolarIs-CChRed_Emo-IMA.xlsx
# /content/drive/MyDrive/Colab Notebooks/reddit_IAA/plots_TNE/Polaris/PolarIs-CChTw_Emo-IMA.xlsx
# "/content/drive/MyDrive/Colab Notebooks/reddit_IAA/plots_TNE/Polaris/PolarIs1-VaccRed_Emo-Ensmbl.xlsx"
print(df.shape)
df.head(2)

(2373, 26)


Unnamed: 0,sentence,original_id,source,clean_Text,EMO_suvrobaner/distilbert-base-uncased-finetuned-emotion-en-tweets,CF_suvrobaner/distilbert-base-uncased-finetuned-emotion-en-tweets,EMO_stevevee0101/distilbert-base-uncased-finetuned-emotion,CF_stevevee0101/distilbert-base-uncased-finetuned-emotion,EMO_Emanuel/bertweet-emotion-base,CF_Emanuel/bertweet-emotion-base,EMO_bhadresh-savani/albert-base-v2-emotion,CF_bhadresh-savani/albert-base-v2-emotion,EMO_Emanuel/twitter-emotion-deberta-v3-base,CF_Emanuel/twitter-emotion-deberta-v3-base,EMO_michauhl/distilbert-base-uncased-finetuned-emotion,CF_michauhl/distilbert-base-uncased-finetuned-emotion,EMO_Zia/distilbert-base-uncased-finetuned-emotion,CF_Zia/distilbert-base-uncased-finetuned-emotion,EMO_jkhan447/sentiment-model-sample-5-emotion,CF_jkhan447/sentiment-model-sample-5-emotion,EMO_bhadresh-savani/distilbert-base-uncased-emotion,CF_bhadresh-savani/distilbert-base-uncased-emotion,EMO_bhadresh-savani/distilbert-base-uncased-finetuned-emotion,CF_bhadresh-savani/distilbert-base-uncased-finetuned-emotion,EMO_j-hartmann/emotion-english-distilroberta-base,CF_j-hartmann/emotion-english-distilroberta-base
0,"Now, double vaccinated dying from COVID are blaming unvaccinated ones.",0,Effective_Ad4588,"now, double vaccinated dying from covid are blaming unvaccinated ones.",anger,0.626292,LABEL_3,0.445884,sadness,0.913226,anger,0.757638,fear,0.308491,anger,0.689712,LABEL_0,0.820676,LABEL_3,0.983253,anger,0.7931,sadness,0.426712,anger,0.48729
1,It really is starting to make no sense.,0,Effective_Ad4588,it really is starting to make no sense.,fear,0.65089,LABEL_4,0.260814,joy,0.592781,anger,0.608296,joy,0.76722,anger,0.339555,LABEL_1,0.654405,LABEL_0,0.776056,sadness,0.450886,joy,0.588468,sadness,0.415946


## Calculate label

In [None]:
# The voting steps below need the row id as a regular "index" column. Only create it when
# it is missing, so that re-running this cell (or loading a workbook that already has the
# column) does not add a stray "level_0" column.
if 'index' not in df.columns:
  df = df.reset_index()

In [None]:
# Names of the per-model label columns (every column whose name contains "EMO_").
mdls = df.filter(regex='EMO_').columns.tolist()
mdls[:3]

['EMO_suvrobaner/distilbert-base-uncased-finetuned-emotion-en-tweets',
 'EMO_stevevee0101/distilbert-base-uncased-finetuned-emotion',
 'EMO_Emanuel/bertweet-emotion-base']

In [None]:
# Number of models that have a label column in this workbook.
len(mdls)

11

In [None]:
# Keep only the models that emit named emotions. Models whose predictions are the generic
# "LABEL_<n>" placeholders cannot take part in the vote, and checking the prefix (rather
# than the literal 'LABEL_0') still catches a model that never predicted class 0 here.
mdls2 = []
for m in mdls:
  print(m)
  print(df[m].unique(), '\n')
  if not df[m].astype(str).str.startswith('LABEL_').any():
    mdls2.append(m)
mdls2

EMO_suvrobaner/distilbert-base-uncased-finetuned-emotion-en-tweets
['anger' 'fear' 'sadness' 'joy' 'surprise' 'love'] 

EMO_stevevee0101/distilbert-base-uncased-finetuned-emotion
['LABEL_3' 'LABEL_4' 'LABEL_0' 'LABEL_1' 'LABEL_5' 'LABEL_2'] 

EMO_Emanuel/bertweet-emotion-base
['sadness' 'joy' 'anger' 'fear' 'surprise' 'love'] 

EMO_bhadresh-savani/albert-base-v2-emotion
['anger' 'sadness' 'fear' 'joy' 'surprise' 'love'] 

EMO_Emanuel/twitter-emotion-deberta-v3-base
['fear' 'joy' 'sadness' 'anger' 'surprise' 'love'] 

EMO_michauhl/distilbert-base-uncased-finetuned-emotion
['anger' 'fear' 'sadness' 'joy' 'surprise' 'love'] 

EMO_Zia/distilbert-base-uncased-finetuned-emotion
['LABEL_0' 'LABEL_1' 'LABEL_4' 'LABEL_3' 'LABEL_5' 'LABEL_2'] 

EMO_jkhan447/sentiment-model-sample-5-emotion
['LABEL_3' 'LABEL_0' 'LABEL_4' 'LABEL_1' 'LABEL_2' 'LABEL_5'] 

EMO_bhadresh-savani/distilbert-base-uncased-emotion
['anger' 'sadness' 'fear' 'joy' 'surprise' 'love'] 

EMO_bhadresh-savani/distilbert-base-unca

['EMO_suvrobaner/distilbert-base-uncased-finetuned-emotion-en-tweets',
 'EMO_Emanuel/bertweet-emotion-base',
 'EMO_bhadresh-savani/albert-base-v2-emotion',
 'EMO_Emanuel/twitter-emotion-deberta-v3-base',
 'EMO_michauhl/distilbert-base-uncased-finetuned-emotion',
 'EMO_bhadresh-savani/distilbert-base-uncased-emotion',
 'EMO_bhadresh-savani/distilbert-base-uncased-finetuned-emotion',
 'EMO_j-hartmann/emotion-english-distilroberta-base']

In [None]:
# From here on, only the models with named emotion labels take part in the vote.
mdls = mdls2
len(mdls)

8

In [None]:
# Long format: one row per (sentence, model) vote, with explicit column names.
dfm = (
  df.melt(id_vars="index", value_vars=mdls)
    .set_axis(['index', 'model', 'label_ensemble'], axis=1)
)
dfm.head(2)

Unnamed: 0,index,model,label_ensemble
0,0,EMO_suvrobaner/distilbert-base-uncased-finetuned-emotion-en-tweets,anger
1,1,EMO_suvrobaner/distilbert-base-uncased-finetuned-emotion-en-tweets,fear


In [None]:
# Votes per label for every sentence, ordered so that the first row of each sentence is
# its most frequent label; ties are ordered by label, descending.
maj_dev = (
  dfm.groupby(['index'], as_index = True)['label_ensemble']
     .value_counts()
     .rename('label_size')
     .reset_index()
     .sort_values(by = ['index', 'label_size', 'label_ensemble'], ascending = [True, False, False])
)
maj_dev.head()

Unnamed: 0,index,label_ensemble,label_size
0,0,anger,5
1,0,sadness,2
2,0,fear,1
3,1,joy,3
5,1,sadness,2


In [None]:
# Accumulators filled by the next cell: sentence ids, winning labels, agreement scores.
texts, hard, cnt = [], [], []

In [None]:
# Collapse the vote counts into one winning label per sentence.
# maj_dev is sorted by (index asc, label_size desc, label desc), so the first row of each
# group is the majority label and ties go to the alphabetically last label.
# The lists are rebuilt here so re-running the cell cannot append duplicates, and each
# sentence's rows are visited once instead of re-filtering maj_dev for every sentence.
texts, hard, cnt = [], [], []
for t, votes in maj_dev.groupby('index', sort = False):
  texts.append(t)
  hard.append(votes['label_ensemble'].iloc[0])
  # Agreement = winning votes / number of voting models (no votes are discarded here).
  cnt.append(round(votes['label_size'].iloc[0] / len(mdls), 2))
len(cnt), len(hard)

(2373, 2373)

In [None]:
# One row per sentence: the majority label and the share of models that voted for it.
df_res = pd.DataFrame({'index': texts, 'label_ensemble': hard, 'cf_ensemble': cnt})
df_res.head()

Unnamed: 0,index,label_ensemble,cf_ensemble
0,0,anger,0.62
1,1,joy,0.38
2,2,fear,0.62
3,3,sadness,0.88
4,4,sadness,0.88


In [None]:
# Share (%) of sentences per majority label.
df_res.label_ensemble.value_counts(normalize=True).round(3)*100

joy         40.0
anger       33.8
fear        13.3
sadness     11.8
surprise     0.6
love         0.4
Name: label_ensemble, dtype: float64

In [None]:
# Summary statistics of the agreement score (cf_ensemble).
df_res.describe().round(3)

Unnamed: 0,index,cf_ensemble
count,2373.0,2373.0
mean,1187.257,0.688
std,685.596,0.181
min,0.0,0.25
25%,594.0,0.5
50%,1187.0,0.62
75%,1781.0,0.88
max,2374.0,1.0


In [None]:
# Fraction of sentences whose winning label has agreement < 0.5 and <= 0.5, respectively;
# the second value is the share that is relabelled 'EMPTY' in the next step.
df_res[df_res.cf_ensemble < 0.5].shape[0]/df.shape[0], df_res[df_res.cf_ensemble <= 0.5].shape[0]/df.shape[0]

(0.08512431521281079, 0.281078803202697)

In [None]:
# Keep the majority label only when strictly more than half of the models agree;
# every other sentence is marked 'EMPTY'.
df_res['label_ensemble2'] = df_res['label_ensemble'].where(
  df_res['cf_ensemble'] > 0.5, 'EMPTY'
)

In [None]:
# Expect one row per sentence and four columns (index, label, agreement, label with EMPTY).
df_res.shape

(2373, 4)

In [None]:
# Share (%) of sentences per final label, including the 'EMPTY' bucket.
df_res.label_ensemble2.value_counts(normalize=True).round(3)*100

joy         31.4
EMPTY       28.1
anger       25.7
sadness      7.7
fear         6.3
surprise     0.5
love         0.3
Name: label_ensemble2, dtype: float64

In [None]:
# Dry run of the merge to inspect the resulting columns and non-null counts; the result is not kept.
df.merge(df_res, on = 'index').info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2373 entries, 0 to 2372
Data columns (total 30 columns):
 #   Column                                                              Non-Null Count  Dtype  
---  ------                                                              --------------  -----  
 0   index                                                               2373 non-null   int64  
 1   sentence                                                            2373 non-null   object 
 2   original_id                                                         2373 non-null   int64  
 3   source                                                              2373 non-null   object 
 4   clean_Text                                                          2373 non-null   object 
 5   EMO_suvrobaner/distilbert-base-uncased-finetuned-emotion-en-tweets  2373 non-null   object 
 6   CF_suvrobaner/distilbert-base-uncased-finetuned-emotion-en-tweets   2373 non-null   float64
 7   EMO_stevevee010

## Save

In [None]:
# Sanity check before merging: there should be exactly one ensemble row per input row.
df_res.shape[0] == df.shape[0]

True

In [None]:
# Attach the ensemble columns to the working frame (inner join on the "index" column).
# NOTE: re-running this cell merges again and produces suffixed duplicate columns.
df = df.merge(df_res, on = 'index')
# Writes back to the same path the data was loaded from, overwriting the input workbook.
df.to_excel(str(path))