This file contains code that reads in the embedding space and labelled metadata, and extracts only the embeddings for emotes. 
This new emote only embedding space along with the metadata is written to new files called emote_embeddings.tsv and emote_labels.tsv

# Reading tsv files into dataframes

In [1]:
import pandas as pd
import numpy as np
import csv  

In [2]:
#read embedding space into dataframe. Each row corresponds to an emote where every emote is represented as vector of 500 numbers (made using Word2Vec model)
embeddings_df = pd.read_csv("Data\embeddings.tsv", sep = "\t", encoding='utf-8-sig', header = None, quoting= csv.QUOTE_NONE)
print(embeddings_df)

             0         1         2         3         4         5         6    \
0       2.425322  1.059797  2.948307  2.549122 -1.447378 -3.522552  0.309947   
1       1.424481 -0.929496  0.849419 -1.344445 -1.060065 -1.953172 -0.767917   
2       2.917746 -0.125978  2.447257  1.013905 -1.086458  0.725502 -0.158030   
3       0.457494  0.281609  0.095384  0.734075  1.968895  1.396560  1.091783   
4       0.358243  0.419274  3.300781  0.834661 -0.654189 -1.543205  1.313539   
...          ...       ...       ...       ...       ...       ...       ...   
732997  0.002698 -0.074987 -0.112321  0.362402  0.105553  0.014751 -0.144500   
732998 -0.099925  0.053256 -0.119749 -0.155997 -0.066058 -0.022135 -0.058278   
732999  0.013896  0.024456 -0.068355 -0.013302 -0.170996  0.074749 -0.185176   
733000  0.204142 -0.003675 -0.000538  0.072707  0.037361 -0.187628 -0.139005   
733001 -0.029968  0.074007  0.139851  0.094036  0.027837 -0.069320 -0.019845   

             7         8         9    .

In [3]:
#read token labels into dataframe.
labels_df = pd.read_csv("Data\labeled_metadata.tsv", sep = '\t', encoding='utf-8-sig', quoting= csv.QUOTE_NONE)
print(labels_df)

                    word source
0                    the   TEXT
1                      !   TEXT
2                      ?   TEXT
3                      @   TEXT
4                      a   TEXT
...                  ...    ...
732998            senril   TEXT
732999           nobbels   TEXT
733000  pokemntrainerred   TEXT
733001              auot   TEXT
733002            aeirus   TEXT

[733003 rows x 2 columns]


In [4]:
print(labels_df.shape) #733003 rows
print(embeddings_df.shape) #733002 rows


(733003, 2)
(733002, 500)


In [5]:
print(labels_df["source"].value_counts()) #We will extract all rows whose source is not "TEXT"

TEXT              667737
TWITCH_CHANNEL     64851
GLOBAL_TWITCH        297
GLOBAL_BTTV           61
GLOBAL_7TV            44
GLOBAL_FFZ            13
Name: source, dtype: int64


# Extracting rows with emotes from embedding space and labelled metadata

In [10]:
#Extract indices in labelled metadata where source is an emote (not TEXT)
matching_index_list = labels_df.index[labels_df['source'] != "TEXT"].tolist()


In [11]:
print(matching_index_list)
print(len(matching_index_list))

[20, 55, 104, 128, 145, 156, 162, 174, 219, 239, 245, 254, 275, 276, 304, 325, 357, 362, 367, 368, 370, 387, 388, 419, 457, 494, 522, 549, 585, 589, 644, 647, 652, 667, 679, 693, 728, 742, 752, 756, 768, 771, 775, 777, 780, 783, 787, 804, 820, 823, 824, 830, 831, 846, 855, 883, 896, 898, 899, 910, 911, 914, 918, 931, 937, 951, 955, 966, 972, 973, 1000, 1001, 1052, 1055, 1079, 1102, 1106, 1125, 1136, 1137, 1140, 1165, 1170, 1196, 1201, 1203, 1206, 1220, 1245, 1257, 1264, 1279, 1281, 1297, 1313, 1315, 1318, 1345, 1347, 1359, 1363, 1374, 1378, 1395, 1401, 1403, 1404, 1417, 1420, 1463, 1482, 1494, 1495, 1506, 1513, 1514, 1520, 1522, 1524, 1550, 1557, 1560, 1566, 1567, 1580, 1582, 1598, 1600, 1615, 1620, 1621, 1627, 1628, 1637, 1642, 1659, 1662, 1663, 1675, 1685, 1703, 1710, 1764, 1767, 1780, 1785, 1794, 1795, 1807, 1809, 1815, 1828, 1839, 1849, 1858, 1878, 1883, 1909, 1916, 1927, 1943, 1951, 1976, 1980, 1982, 2002, 2006, 2014, 2020, 2041, 2060, 2061, 2078, 2094, 2102, 2136, 2139, 2148, 215

In [14]:
emotes_only_embedding_df = embeddings_df.iloc[matching_index_list]
emotes_only_embedding_df = emotes_only_embedding_df.reset_index(drop = True)
emotes_only_embedding_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,1.286686,0.975437,-0.105877,0.404266,-0.069233,0.262404,0.398642,-0.315449,-1.087698,-1.060235,...,-0.187224,-0.958284,0.234938,0.003340,-0.104484,-0.293105,-0.475162,-0.332355,-1.018994,-1.131760
1,0.529400,0.667009,-0.286177,0.307736,0.286603,-0.901467,-0.519312,-0.045616,0.182374,1.221941,...,-0.205459,0.157584,0.821111,0.474919,-0.379334,0.377733,0.040605,0.962311,-0.105893,-1.231472
2,0.743785,1.555274,1.418502,0.394715,-0.421091,-0.952254,0.439094,-0.824591,-0.556452,-1.017358,...,0.541599,-0.434617,1.614444,0.418718,-0.602230,0.333248,0.285769,-0.152497,-0.302453,-2.602358
3,0.043058,0.305662,0.128614,-0.744834,-0.107481,-0.263151,0.488091,0.013400,-0.440657,-1.324904,...,0.685416,1.168294,0.483025,-0.566916,-0.884935,0.412866,-1.231713,-0.045327,-0.155488,-0.278389
4,1.025381,0.266222,0.659756,0.703869,0.033678,0.297028,-0.840579,0.118151,-1.409984,0.229814,...,0.256568,0.060377,0.313571,0.657656,-0.541019,-0.109496,-0.799253,-0.311534,-1.136861,-1.726974
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65261,0.123776,-0.046080,-0.207358,0.306132,-0.025957,0.054415,-0.058345,0.373204,-0.003007,-0.011925,...,0.116089,0.154081,0.028771,0.059752,0.029517,-0.039812,0.073769,-0.125390,0.106057,-0.059767
65262,-0.078786,-0.208091,-0.120529,0.192288,-0.080695,0.001563,0.107449,-0.003260,-0.066386,0.379041,...,0.076811,0.153004,0.021823,0.113981,0.213592,0.032059,-0.000415,-0.115231,-0.140055,-0.288852
65263,0.035936,0.121559,0.153296,0.225215,0.017556,-0.115271,0.007538,0.010117,-0.134507,0.063472,...,-0.059425,0.197370,0.087605,-0.047777,0.107747,0.026922,0.013917,0.043370,-0.200364,-0.127454
65264,-0.010854,-0.175074,-0.021993,0.171712,0.091793,0.198943,-0.111441,0.093867,0.130741,-0.016562,...,0.090605,0.164713,0.056258,0.120644,0.153712,0.027234,-0.150531,-0.101712,-0.026250,-0.098374


In [15]:
emotes_only_labels_df = labels_df.iloc[matching_index_list]
emotes_only_labels_df = emotes_only_labels_df.reset_index(drop = True)
emotes_only_labels_df

Unnamed: 0,word,source
0,LUL,GLOBAL_TWITCH
1,<3,GLOBAL_TWITCH
2,Clap,GLOBAL_7TV
3,gg,TWITCH_CHANNEL
4,:),GLOBAL_TWITCH
...,...,...
65261,brucegbatman,TWITCH_CHANNEL
65262,matthewbruh,TWITCH_CHANNEL
65263,maxyyagony,TWITCH_CHANNEL
65264,maximumyk,TWITCH_CHANNEL


In [16]:
emotes_only_labels_df["source"].value_counts() #As we can see, this only includes emotes now

TWITCH_CHANNEL    64851
GLOBAL_TWITCH       297
GLOBAL_BTTV          61
GLOBAL_7TV           44
GLOBAL_FFZ           13
Name: source, dtype: int64

# Writing emote only dataframes to tsv files

In [17]:
emotes_only_embedding_df.to_csv("emote_embeddings.tsv", index = False, sep = "\t")
emotes_only_labels_df.to_csv("emote_labels.tsv", index = False, sep = "\t")