In [1]:
import json

import pandas as pd
import numpy as np

In [2]:
df_raw = pd.read_csv('data/goemotions.csv')

In [3]:
emotions_cols = ['admiration',
       'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion',
       'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust',
       'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy',
       'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief',
       'remorse', 'sadness', 'surprise', 'neutral']

In [4]:
emotion_categories = {'admiration': 'Positive', # Proposed simplified classification
                      'amusement':'Positive',
                      'anger':'Negative',
                      'annoyance':'Negative',
                      'approval':'Positive',
                      'caring':'Positive',
                      'confusion':'Neutral',
                      'curiosity':'Positive',
                      'desire':'Positive',
                      'disappointment':'Negative',
                      'disapproval':'Negative',
                      'disgust':'Negative',
                      'embarrassment':'Neutral',
                      'excitement':'Positive',
                      'fear':'Negative',
                      'gratitude':'Positive',
                      'grief':'Negative',
                      'joy':'Positive',
                      'love':'Positive',
                      'nervousness':'Neutral',
                      'optimism':'Positive',
                      'pride':'Positive',
                      'realization':'Neutral',
                      'relief':'Positive',
                      'remorse':'Negative',
                      'sadness':'Negative',
                      'surprise':'Neutral',
                      'neutral':'Neutral'
                       }

In [21]:
emotion_groups = {'admiration': 1, 
                      'amusement':2,
                      'anger':3,
                      'annoyance':4,
                      'approval':5,
                      'caring':6,
                      'confusion':7,
                      'curiosity':8,
                      'desire':9,
                      'disappointment':10,
                      'disapproval':11,
                      'disgust':12,
                      'embarrassment':13,
                      'excitement':14,
                      'fear':15,
                      'gratitude':16,
                      'grief':17,
                      'joy':18,
                      'love':19,
                      'nervousness':20,
                      'optimism':21,
                      'pride':22,
                      'realization':23,
                      'relief':24,
                      'remorse':25,
                      'sadness':26,
                      'surprise':27,
                      'neutral':28
                       }

In [5]:
df = df_raw.set_index(['text', 'id', 'author', 'subreddit', 'link_id', 'parent_id',
       'created_utc', 'rater_id', 'example_very_unclear'])

df = df[df==1].stack().reset_index().drop(0,1).rename(columns = {'level_9': 'sentiment'})
df.head()

Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,sentiment
0,That game hurt.,eew5j0j,Brdd9,nrl,t3_ajis4z,t1_eew18eq,1548381039,1,False,sadness
1,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,t3_abru74,t1_ed2m7g7,1546427744,37,False,neutral
2,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,t3_ahulml,t3_ahulml,1547965054,18,False,love
3,"[NAME] was nowhere near them, he was by the Fa...",eda6yn6,American_Fascist713,starwarsspeculation,t3_ackt2f,t1_eda65q2,1546668601,2,False,neutral
4,Right? Considering it’s such an important docu...,eespn2i,ImperialBoss,TrueReddit,t3_aizyuz,t1_eesoak0,1548280208,61,False,gratitude


In [23]:
df['sentiment_rating'] = df['sentiment'].map(emotion_categories).map({ 'Negative': -1, "Neutral": 0, "Positive": 1})

df.head()

Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,sentiment,sentiment_rating,group
0,That game hurt.,eew5j0j,Brdd9,nrl,t3_ajis4z,t1_eew18eq,1548381039,1,False,sadness,-1,26
1,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,t3_abru74,t1_ed2m7g7,1546427744,37,False,neutral,0,28
2,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,t3_ahulml,t3_ahulml,1547965054,18,False,love,1,19
3,"[NAME] was nowhere near them, he was by the Fa...",eda6yn6,American_Fascist713,starwarsspeculation,t3_ackt2f,t1_eda65q2,1546668601,2,False,neutral,0,28
4,Right? Considering it’s such an important docu...,eespn2i,ImperialBoss,TrueReddit,t3_aizyuz,t1_eesoak0,1548280208,61,False,gratitude,1,16


In [29]:
# Get top sentiment for each subreddit
top_sentiment_per_subreddit = df[df['sentiment'] != 'neutral'].groupby(['subreddit', 'sentiment'], as_index = False).count().sort_values(['subreddit', 'text'], ascending = False).groupby('subreddit', as_index = False).first()[['subreddit', 'sentiment', 'text']].rename(columns = { 'text': 'n', 'subreddit': 'name'})

top_sentiment_per_subreddit['group'] = top_sentiment_per_subreddit['sentiment'].map(emotion_groups)
top_sentiment_per_subreddit['id'] = top_sentiment_per_subreddit['name']

top_sentiment_per_subreddit.head()

Unnamed: 0,name,sentiment,n,group,id
0,2meirl4meirl,sadness,70,26,2meirl4meirl
1,49ers,admiration,50,1,49ers
2,4PanelCringe,amusement,73,2,4PanelCringe
3,90DayFiance,admiration,76,1,90DayFiance
4,90dayfianceuncensored,approval,63,5,90dayfianceuncensored


In [53]:
nodes = top_sentiment_per_subreddit.to_dict('records')

In [41]:
# graph links
links_data = []
for index, row in df.iterrows():
    author = row['author']
    other_posts = df[(df['id'] != row['id']) & (df['author'] == author)]
    other_subreddits = list(set(other_posts['subreddit']))
    for subreddit in other_subreddits:
        links_data.append({ 'source': row['subreddit'], 'target': subreddit, 'count': 1})
    
    clear_output()
    print(index, "/", len(df))


15468 / 249529


KeyboardInterrupt: 

In [39]:
from IPython.display import clear_output

In [57]:
links_df = pd.DataFrame(links_data).groupby(['source', 'target'], as_index = False).count().sort_values(['source', 'count'], ascending = False)
# .groupby(['source']).head(20)[['source', 'target']]
# links_df['value'] = 1
links_df.tail(50)

Unnamed: 0,source,target,count
338,90dayfianceuncensored,tennis,7
339,90dayfianceuncensored,terriblefacebookmemes,7
340,90dayfianceuncensored,texas,7
341,90dayfianceuncensored,thatHappened,7
342,90dayfianceuncensored,tifu,7
343,90dayfianceuncensored,timberwolves,7
344,90dayfianceuncensored,torontoraptors,7
345,90dayfianceuncensored,traaaaaaannnnnnnnnns,7
346,90dayfianceuncensored,trees,7
347,90dayfianceuncensored,truegaming,7


In [52]:
links = links_df.to_dict('records')

In [55]:
arc_diagram_graph = {
    'nodes': nodes,
    'links': links
}

with open('arc-data.json', 'w') as fout:
    json.dump(arc_diagram_graph , fout)

In [62]:
# For Network graph
df_users = df[['author', 'rater_id', 'subreddit', 'sentiment_rating']]
user_nodes = df_users.groupby(['author'], as_index = False).agg({ 'sentiment_rating': 'mean', 'subreddit': 'nunique'}).rename(columns = {'subreddit': 'connections', 'author': 'name'})
user_nodes['type'] = 'user'
user_nodes = user_nodes.to_dict('records')

subreddit_nodes = df_users.groupby(['subreddit'], as_index = False).agg({ 'sentiment_rating': 'mean', 'author': 'nunique'}).rename(columns = {'author': 'connections', 'subreddit': 'name'})
subreddit_nodes['type'] = 'subreddit'
subreddit_nodes = subreddit_nodes.to_dict('records')

with open('nodes.json', 'w') as fout:
    json.dump(user_nodes + subreddit_nodes , fout)

In [68]:
authors_and_subreddits = df_users[['author', "subreddit"]].drop_duplicates(keep = "first")
subreddits = authors_and_subreddits['subreddit'].drop_duplicates(keep = "first")

links = []

for subreddit in subreddits:
    authors = authors_and_subreddits[authors_and_subreddits['subreddit'] == subreddit]['author'].to_list()
    for author in authors:
        links.append({ 'source': subreddit, 'target': author})

with open('links.json', 'w') as fout:
    json.dump(links , fout)

In [67]:
len(links)

51833

In [56]:
len(authors)

49150

In [14]:
df = df_raw.groupby(['id', 'subreddit', 'text'])[emotions_cols].sum().reset_index() # Aggregates by post

In [6]:
# Only keeps emotion(s) with the most votes for each post
"""
df['max_val'] = df[emotions_cols].max(axis=1)
for col in emotions_cols:
    df[col] = df[col] - df['max_val']
del df['max_val']
df_new = (df.melt(['id', 'subreddit'], var_name='emotion').query('value >= 0')
       .groupby(['id', 'subreddit'])['emotion']
       .apply(', '.join)
       .reset_index())
df_grouped = df_new.groupby(['subreddit', 'emotion']).count().reset_index()
"""

"\ndf['max_val'] = df[emotions_cols].max(axis=1)\nfor col in emotions_cols:\n    df[col] = df[col] - df['max_val']\ndel df['max_val']\ndf_new = (df.melt(['id', 'subreddit'], var_name='emotion').query('value >= 0')\n       .groupby(['id', 'subreddit'])['emotion']\n       .apply(', '.join)\n       .reset_index())\ndf_grouped = df_new.groupby(['subreddit', 'emotion']).count().reset_index()\n"

In [15]:
df.groupby(['subreddit']).count().sort_values('id')

Unnamed: 0_level_0,id,text,admiration,amusement,anger,annoyance,approval,caring,confusion,curiosity,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
subreddit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
farcry,25,25,25,25,25,25,25,25,25,25,...,25,25,25,25,25,25,25,25,25,25
shieldbro,32,32,32,32,32,32,32,32,32,32,...,32,32,32,32,32,32,32,32,32,32
Anticonsumption,34,34,34,34,34,34,34,34,34,34,...,34,34,34,34,34,34,34,34,34,34
SSBM,35,35,35,35,35,35,35,35,35,35,...,35,35,35,35,35,35,35,35,35,35
darknet,38,38,38,38,38,38,38,38,38,38,...,38,38,38,38,38,38,38,38,38,38
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
vanderpumprules,221,221,221,221,221,221,221,221,221,221,...,221,221,221,221,221,221,221,221,221,221
AnimalsBeingBros,231,231,231,231,231,231,231,231,231,231,...,231,231,231,231,231,231,231,231,231,231
socialanxiety,232,232,232,232,232,232,232,232,232,232,...,232,232,232,232,232,232,232,232,232,232
cringe,239,239,239,239,239,239,239,239,239,239,...,239,239,239,239,239,239,239,239,239,239


In [20]:
subreddit_df = pd.DataFrame(columns = ['subreddit', 'emotion', 'count'])
for emotion in emotions_cols:
    gp_df = df.groupby(['subreddit'])[emotion].sum()
    d = {'subreddit': gp_df.index, 'emotion': [emotion]*len(gp_df), 'count':gp_df.values}
    subreddit_df = subreddit_df.append(pd.DataFrame(data = d), ignore_index=True)

In [9]:
#hv.extension('bokeh')
#hv.Sankey(subreddit_df.loc[lambda f: f['subreddit'] == 'socialanxiety'])

In [59]:
#subreddit_df.to_csv('subreddit_emotion.csv')

In [39]:
text_df = pd.DataFrame(columns = ['subreddit', 'emotion', 'text'])
for emotion in emotions_cols:
    df_long = df[df['text'].str.split().str.len().ge(4)] # Remove short posts
    df_sorted = df_long.sort_values(emotion, ascending=False)
    gp_df = df_sorted.groupby('subreddit').first()
    d = {'subreddit': gp_df.index, 'emotion': [emotion]*len(gp_df), 'text':gp_df['text']}
    text_df = text_df.append(pd.DataFrame(data = d), ignore_index=True)

In [44]:
full_df = pd.merge(text_df, subreddit_df, how='left', on=['subreddit', 'emotion'])

In [46]:
full_df.columns = ['source', 'target', 'text', 'value']

In [47]:
full_df.to_csv('subreddit_emotion_2.csv')

In [45]:
full_df

Unnamed: 0,subreddit,emotion,text,count
0,2meirl4meirl,admiration,Or embalmed! Green burial is the best burial,47
1,49ers,admiration,Can we please just call out defense next year ...,50
2,4PanelCringe,admiration,Seriously. [NAME] is a hero. I heard he was a ...,52
3,90DayFiance,admiration,maybe. Id praise her for it though. In a world...,76
4,90dayfianceuncensored,admiration,She could and should pursue modeling! It seeme...,43
...,...,...,...,...
13519,worldpolitics,neutral,The original video was debunked. Do you agree ...,156
13520,yesyesyesyesno,neutral,This is like 4 years old from vine,198
13521,youseeingthisshit,neutral,Imagine being upset over who someone loves.,198
13522,youtube,neutral,Naming comes from the heart,95
