This notebook reads the harmful emotes we extracted from HasanAbi's chat logs and maps each one to its corresponding vector in the emote embedding space (we do not need the entire embedding space, since we are only interested in emotes).

# Reading tsv files into dataframes

In [1]:
import pandas as pd
import numpy as np
import csv  

In [2]:
# Load the emote embedding space. The file has no header row; each row is one
# emote's embedding vector (produced with a Word2Vec model), one column per
# dimension. Quoting is disabled so quote characters are read literally.
embeddings_df = pd.read_csv(
    "emote_embeddings.tsv",
    header=None,
    sep="\t",
    quoting=csv.QUOTE_NONE,
    encoding='utf-8-sig',
)
print(embeddings_df)

            0         1         2         3         4         5         6    \
0      1.286686  0.975437 -0.105877  0.404266 -0.069233  0.262404  0.398642   
1      0.529400  0.667009 -0.286177  0.307736  0.286603 -0.901467 -0.519312   
2      0.743785  1.555274  1.418502  0.394715 -0.421091 -0.952254  0.439094   
3      0.043058  0.305662  0.128614 -0.744834 -0.107481 -0.263151  0.488091   
4      1.025381  0.266222  0.659756  0.703869  0.033678  0.297028 -0.840579   
...         ...       ...       ...       ...       ...       ...       ...   
65261  0.123776 -0.046080 -0.207358  0.306132 -0.025957  0.054415 -0.058345   
65262 -0.078786 -0.208091 -0.120529  0.192288 -0.080695  0.001563  0.107449   
65263  0.035936  0.121559  0.153296  0.225215  0.017556 -0.115271  0.007538   
65264 -0.010854 -0.175074 -0.021993  0.171712  0.091793  0.198943 -0.111441   
65265 -0.039198 -0.101730  0.035865  0.102303  0.016584  0.064804  0.012547   

            7         8         9    ...       490 

In [3]:
# Load the emote labels (columns: word, source). Quoting is disabled so emotes
# containing quote characters are read literally.
#
# keep_default_na=False turns off pandas' default missing-value parsing, so a
# label that happens to be spelled like a missing-value marker (e.g. "NA",
# "null", "nan", "None") stays a string instead of silently becoming NaN and
# dropping out of every later name lookup.
labels_df = pd.read_csv(
    "emote_labels.tsv",
    sep='\t',
    encoding='utf-8-sig',
    quoting=csv.QUOTE_NONE,
    keep_default_na=False,
)
print(labels_df)

               word          source
0               LUL   GLOBAL_TWITCH
1                <3   GLOBAL_TWITCH
2              Clap      GLOBAL_7TV
3                gg  TWITCH_CHANNEL
4                :)   GLOBAL_TWITCH
...             ...             ...
65261  brucegbatman  TWITCH_CHANNEL
65262   matthewbruh  TWITCH_CHANNEL
65263    maxyyagony  TWITCH_CHANNEL
65264     maximumyk  TWITCH_CHANNEL
65265   foggedcorgi  TWITCH_CHANNEL

[65266 rows x 2 columns]


In [4]:
# Show both shapes, labels first: the two frames should have the same number of rows.
print(labels_df.shape, embeddings_df.shape, sep="\n")


(65266, 2)
(65266, 500)


In [5]:
# Count how many emotes come from each source.
per_source_counts = labels_df["source"].value_counts()
print(per_source_counts)

TWITCH_CHANNEL    64851
GLOBAL_TWITCH       297
GLOBAL_BTTV          61
GLOBAL_7TV           44
GLOBAL_FFZ           13
Name: source, dtype: int64


In [6]:
# Show the column dtypes of the labels frame, then of the embeddings frame.
for _frame in (labels_df, embeddings_df):
    print(_frame.dtypes)

word      object
source    object
dtype: object
0      float64
1      float64
2      float64
3      float64
4      float64
        ...   
495    float64
496    float64
497    float64
498    float64
499    float64
Length: 500, dtype: object


In [12]:
# Load the harmful emotes extracted from HasanAbi's channel. The file covers
# both HasanAbi channel emotes and global emotes.
harmful_emotes_df = pd.read_csv(filepath_or_buffer="Harmfulemotes_analysis17.csv")
print(harmful_emotes_df)

    Unnamed: 0.1  Unnamed: 0             Emote  \
0              0          82             Gayge   
1              1          97            Wokege   
2              2          99             Sussy   
3              3         105      pepeMeltdown   
4              4         110            monkaS   
5              5         112          has0head   
6              6         126    DANKHACKERMANS   
7              7         160          PepeSpit   
8              8         168         hasSammie   
9              9         171           PoroSad   
10            10         172      StinkyCheese   
11            11         178            MmmHmm   
12            12         179         WhatChamp   
13            13         180          PUNCHIES   
14            14         181      DIESOFCRINGE   
15            15         198           hasKapp   
16            16         207          hasSilly   
17            17         208             FLUSH   
18            18         215            Tastge   


# Mapping harmful emotes to an embedding vector

In [13]:
# Keep just the emote names; show the Series and confirm its type.
emotes_used = harmful_emotes_df["Emote"]
print(emotes_used, type(emotes_used), sep="\n")

0                Gayge
1               Wokege
2                Sussy
3         pepeMeltdown
4               monkaS
5             has0head
6       DANKHACKERMANS
7             PepeSpit
8            hasSammie
9              PoroSad
10        StinkyCheese
11              MmmHmm
12           WhatChamp
13            PUNCHIES
14        DIESOFCRINGE
15             hasKapp
16            hasSilly
17               FLUSH
18              Tastge
19               KKona
20              RIDING
21        ItalianHands
22          HarleyWink
23          DarkKnight
24    TransgenderPride
25                Grrr
26                  :P
27          dankCrayon
28            peepoBye
Name: Emote, dtype: object
<class 'pandas.core.series.Series'>


In [14]:
# Build a frame holding each harmful emote, its global/channel flags, and the
# text form of its embedding vector ("Does not exist" when the emote has no
# entry in the labels file).
emote_and_embedding_df = harmful_emotes_df[["Emote", "Global emote", "Channel emote"]].copy()

# Row i of labels_df names the vector in row i of embeddings_df; the lookup
# below is meaningless if the two files are not the same length.
if len(labels_df) != len(embeddings_df):
    raise ValueError(
        f"labels ({len(labels_df)} rows) and embeddings ({len(embeddings_df)} rows) are not row-aligned"
    )

# One pass over the labels builds both lookups (label -> row position), so the
# 65k-row label table is not rescanned and re-lowercased for every emote.
# setdefault keeps the FIRST occurrence, matching the previous behaviour of
# taking the first matching row.
exact_position = {}
lowercase_position = {}
for position, word in enumerate(labels_df["word"]):
    if not isinstance(word, str):
        continue  # a label parsed as NaN can never match an emote name
    exact_position.setdefault(word, position)
    lowercase_position.setdefault(word.lower(), position)

embedding_texts = []
for current_emote in emote_and_embedding_df["Emote"]:  # go through each emote
    # Prefer an exact match; fall back to a case-insensitive one because the
    # emote may be stored in lowercase form in the labels file.
    position = exact_position.get(current_emote)
    if position is None and isinstance(current_emote, str):
        position = lowercase_position.get(current_emote.lower())

    if position is None:  # emote was not found in the labels dataframe
        embedding_texts.append("Does not exist")
        continue

    # Retrieve the embedding vector stored at the same row position.
    embedding_vector = embeddings_df.iloc[position].values

    # NOTE(review): np.array_str formats according to NumPy's print options
    # (limited precision, line-wrapped). Kept as-is so the CSV format does not
    # change for downstream readers; confirm the rounding is acceptable.
    embedding_texts.append(np.array_str(embedding_vector))

# Assign the whole column at once: it is created with object dtype (no
# float-NaN placeholder that later receives strings) and is filled by row
# order rather than by a mix of label-based and positional indexing.
emote_and_embedding_df["embedding"] = embedding_texts

print(emote_and_embedding_df)

               Emote  Global emote Channel emote  \
0              Gayge          True         False   
1             Wokege         False      HasanAbi   
2              Sussy         False      HasanAbi   
3       pepeMeltdown         False      HasanAbi   
4             monkaS          True         False   
5           has0head         False      HasanAbi   
6     DANKHACKERMANS         False      HasanAbi   
7           PepeSpit         False      HasanAbi   
8          hasSammie         False      HasanAbi   
9            PoroSad          True         False   
10      StinkyCheese          True         False   
11            MmmHmm         False      HasanAbi   
12         WhatChamp         False      HasanAbi   
13          PUNCHIES         False      HasanAbi   
14      DIESOFCRINGE         False      HasanAbi   
15           hasKapp         False      HasanAbi   
16          hasSilly         False      HasanAbi   
17             FLUSH         False      HasanAbi   
18          

# Write the emote-to-embedding-vector mapping to a new CSV file

In [15]:
# Save the emote -> embedding mapping; the row index is left out of the file.
emote_and_embedding_df.to_csv(
    path_or_buf="harmfulemote_with_embeddingvector.csv",
    index=False,
)