# Creating the Reddit Network of Climate Discussion Contributors 

Using the pre-trained Climate Change Sentiment classifier on submissions and comments from Redittors, we model a network of Redittors being pro, neutral or anti to Climate Change.


In [1]:
import os
from pathlib import Path
import time, datetime

import pickle
from collections import Counter

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
tqdm.pandas()

from nltk import word_tokenize, PorterStemmer
from nltk.corpus import stopwords

import networkx as nx

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

import warnings
warnings.simplefilter("ignore", UserWarning)

## 1) Load Reddit data

Initially, the data extracted from Reddit is loaded.

In [2]:
# Data directory
DATA_DIR = Path(os.getcwd()).parent / 'data'
#DATA_DIR = Path('/work3/s194253')

# year
year = 2019

In [3]:
start = time.time()

# load data
comments = pd.read_json(DATA_DIR / f'{year}/comments90k_opinion_{year}.json.bz2')
submissions = pd.read_json(DATA_DIR / f'{year}/submissions_opinion_{year}.json.bz2')

end = time.time()
print(f"Took {end-start} seconds to load dataframe...")

Took 27.186524868011475 seconds to load dataframe...


In [4]:
submissions.shape

(135946, 17)

In [5]:
comments.shape

(238433, 17)

## 2) Link comment authors to submission authors

In [6]:
# create dictionaries
comment_authors = dict(zip(comments.id, comments.author))
parent = dict(zip(comments.id, comments.parent_id))
submission_authors = dict(zip(submissions.id, submissions.author))

In [7]:
def parent_author(comment_id, comment_authors=comment_authors, parent=parent, submission_authors=submission_authors):
    '''Links the comment id to the author of its parent.
    
    input: comment_id
    returns: author'''
    
    parent_id = parent[comment_id]
    
    try: # try to look for the parent_id key
        if parent_id[:3] == 't1_':
            return comment_authors[parent_id[3:]]
        if parent_id[:3] == 't3_':
            return submission_authors[parent_id[3:]]    
    except KeyError: # if parent_id was not extracted in comments or submissions
        return np.nan

In [8]:
comments['parent_author'] = comments['id'].apply(lambda x: parent_author(x))

In [9]:
print(f"Number of IDs that could not be extracted: {comments['parent_author'].isnull().sum()}/{comments['parent_author'].__len__()} = {comments['parent_author'].isnull().sum() / comments['parent_author'].__len__() :.4f}")

Number of IDs that could not be extracted: 387/238433 = 0.0016


In [10]:
# remove NaN parent authors
comments = comments[-comments.parent_author.isnull()].reset_index(drop=True)

# size of data
comments.shape

(238046, 17)

## 3) Filter the Reddit comments and submissions

In [11]:
# join title and selftext to text attribute in submissions
submissions['text'] = submissions.title + " " + submissions.selftext

In [12]:
# Remove NaN values for awarders by setting it to 0 or empty list
submissions['all_awardings'] = submissions['all_awardings'].fillna("").apply(list)
submissions['awarders'] = submissions['awarders'].fillna("").apply(list)
submissions['total_awards_received'] = submissions['total_awards_received'].fillna(0)

comments['all_awardings'] = comments['all_awardings'].fillna("").apply(list)
comments['total_awards_received'] = comments['total_awards_received'].fillna(0)

In [13]:
# filter comments and remove rows with deleted users
filtered_comments = comments

deleted_users_idx = np.logical_or(filtered_comments.author == '[deleted]', filtered_comments.parent_author == '[deleted]')
filtered_comments = filtered_comments[-deleted_users_idx]
print(f"Number of deleted users: {deleted_users_idx.sum()}")

# update index
filtered_comments.reset_index(drop=True, inplace=True)
filtered_comments.shape

Number of deleted users: 8501


(229545, 17)

In [14]:
# filter submissions and remove rows with deleted users
filtered_submissions = submissions

deleted_users_idx = filtered_submissions.author == '[deleted]'
filtered_submissions = filtered_submissions[-deleted_users_idx]
print(f"Number of deleted users: {deleted_users_idx.sum()}")

# update index
filtered_submissions.reset_index(drop=True, inplace=True)
filtered_submissions.shape

Number of deleted users: 100


(135846, 17)

In [15]:
op_dict = {'News': 0,
          'Neutral': 0,
          'Pro': 1,
          'Anti': -1}

filtered_submissions['opinion_score'] = filtered_submissions.opinion.apply(lambda x: op_dict[x])
filtered_comments['opinion_score'] = filtered_comments.opinion.apply(lambda x: op_dict[x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_submissions['opinion_score'] = filtered_submissions.opinion.apply(lambda x: op_dict[x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_comments['opinion_score'] = filtered_comments.opinion.apply(lambda x: op_dict[x])


In [16]:
filtered_submissions.head()

Unnamed: 0,id,author,title,selftext,score,subreddit,num_comments,all_awardings,awarders,total_awards_received,date,text,tokens,processed_text,year,opinion,opinion_probs,opinion_score
0,abdopc,-en-,@washingtonpost: Extreme weather in 2018 was a...,,1,newsbotbot,0,[],[],0.0,2019-01-01,@washingtonpost: Extreme weather in 2018 was a...,"{'washingtonpost': 1, 'extrem': 1, 'weather': ...",washingtonpost extrem weather rage howl signal...,2019,News,"[0.1275265063, 0.12615991340000002, 0.54378136...",0
1,abdn8t,Ama98,Predictions about the next year,"The wall will never get funding, instead Trump...",1,ChapoTrapHouse,15,[],[],0.0,2019-01-01,Predictions about the next year The wall will ...,"{'predict': 1, 'next': 1, 'year': 1, 'wall': 1...",predict next year wall never get fund instead ...,2019,Neutral,"[0.14072116540000001, 0.6952812997000001, 0.04...",0
2,abdjq5,hjbarraza,The Story of 2018 Was Climate Change,,1,climate,0,[],[],0.0,2019-01-01,The Story of 2018 Was Climate Change,"{'stori': 1, 'climat': 1, 'chang': 1}",stori climat chang,2019,Neutral,"[0.14858327670000002, 0.5444978271, 0.12987582...",0
3,abdhf9,EcoInternetNewsfeed,"Galapagos, Evolution &amp; Climate Change: Tra...",,1,climate,0,[],[],0.0,2019-01-01,"Galapagos, Evolution &amp; Climate Change: Tra...","{'galapago': 1, 'evolut': 1, 'amp': 2, 'climat...",galapago evolut amp climat chang travel book r...,2019,Neutral,"[0.0664715475, 0.3567104474, 0.2596014974, 0.3...",0
4,abdh4b,EcoInternetNewsfeed,"Galapagos, Evolution &amp; Climate Change: Tra...",,1,EcoInternet,0,[],[],0.0,2019-01-01,"Galapagos, Evolution &amp; Climate Change: Tra...","{'galapago': 1, 'evolut': 1, 'amp': 2, 'climat...",galapago evolut amp climat chang travel book r...,2019,Neutral,"[0.0664715475, 0.3567104474, 0.2596014974, 0.3...",0


## 4) Handle author metadata 

In [17]:
def get_metadata(df, reddit_type='comment'):
    
    author_df = pd.DataFrame()
    
    groups = df.groupby(by='author')

    author_df['text'] = groups.text.apply(lambda x: list(x))
    author_df['all_awardings'] = groups.all_awardings.apply(lambda x: np.concatenate([*x]))
    author_df['total_awards_received'] = groups.total_awards_received.sum()
    author_df['total_awards_received'] = groups.total_awards_received.sum()
    author_df['score'] = groups.score.sum()
    author_df[f'first_{reddit_type}'] = groups.date.apply(lambda x: x.sort_values(ascending=True).iloc[0].timestamp())
    author_df[f'last_{reddit_type}'] = groups.date.apply(lambda x: x.sort_values(ascending=True).iloc[-1].timestamp())
    author_df[f'num_{reddit_type}s'] = groups.apply(lambda x: x.__len__())
    author_df['opinion_score'] = groups.opinion_score.mean()
    
    if reddit_type == 'comment':
        author_df['controversiality'] = groups.controversiality.sum()
    
    return author_df

In [18]:
#extract metadata
print("Extracting metadata for comments...")
author_comment = get_metadata(filtered_comments, reddit_type='comment')

print("\nExtracting metadata for submissions...")
author_submission = get_metadata(filtered_submissions, reddit_type='submission')

Extracting metadata for comments...

Extracting metadata for submissions...


In [19]:
author_joined = author_comment.join(author_submission, on='author', lsuffix='_c', rsuffix='_s')

In [20]:
author = pd.DataFrame()
print("Running...")
for attr, fill in {'text': 'list', 'all_awardings': 'list', 'total_awards_received': 'num', 'score': 'num', 'opinion_score':'num'}.items():
    for letter in ['s', 'c']:
        
        # reformat rows
        if fill == 'list':
            author_joined[f'{attr}_{letter}'] = author_joined[f'{attr}_{letter}'].fillna("").apply(list)
        elif fill == 'num':
            author_joined[f'{attr}_{letter}'] = author_joined[f'{attr}_{letter}'].fillna(0)
            
    # create combined dataframe
    author[f'{attr}'] = author_joined[f'{attr}_s'] + author_joined[f'{attr}_c']
    if attr == 'opinion_score':
        author[f'{attr}'] /= 2
    
print("Successfully combined dataframe!")

# keep relevant attributes
aoi = ['first_comment', 'last_comment', 
       'first_submission', 'last_submission', 
       'num_comments', 'num_submissions',  
       'controversiality']
author[aoi] = author_joined[aoi]

# modify list of texts to one large string
author['text'] = author['text'].apply(lambda x: ' '.join(str(v) for v in x))

# rename
author = author.rename(columns={'controversiality':'comment_controversiality'})
author.sample(10)

Running...
Successfully combined dataframe!


Unnamed: 0_level_0,text,all_awardings,total_awards_received,score,opinion_score,first_comment,last_comment,first_submission,last_submission,num_comments,num_submissions,comment_controversiality
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Crazypenguin314,"You’re right, but it’s about the message. Nobo...",[],0.0,1.0,0.0,1552608000.0,1552608000.0,,,1,,0.0
killcats,Twice,[],0.0,1.0,0.0,1568074000.0,1568074000.0,,,1,,0.0
theghostofQEII,At this point we have to figure out how to geo...,[],0.0,2.0,-0.25,1568938000.0,1571875000.0,,,2,,0.0
illiberation,I think he's more or less alluding to the idea...,[],0.0,-1.0,-0.5,1572739000.0,1572739000.0,,,1,,0.0
gross_burrito,lmaoo If only it was that bad,[],0.0,1.0,0.0,1566691000.0,1566691000.0,,,1,,0.0
hachiman,She's cut from the same garbage cloth he is. H...,[],0.0,1.0,0.0,1576282000.0,1576282000.0,,,1,,0.0
DGDownUnder,Combating Climate Change The Socialism Way Oh...,[],0.0,179.0,0.25,1568938000.0,1568938000.0,1567728000.0,1568938000.0,1,4.0,0.0
timetobehappy,((Hugs)) vent away.,[],0.0,1.0,0.0,1577232000.0,1577232000.0,,,1,,0.0
acb1971,"Be best, Melania!",[],0.0,1.0,0.0,1576282000.0,1576282000.0,,,1,,0.0
mj1127,Revolution is nice because it covers heartworm...,[],0.0,3.0,-0.25,1557360000.0,1557446000.0,,,2,,0.0


In [21]:
# load stop-words
stop_words = set(stopwords.words('english'))

# add webpages to stopwords
stop_words.add('http') 
stop_words.add('https')

# Preprocess the text 
porter = PorterStemmer()
exclusions = {'RT'}

# define tokenizing function
clean = lambda x: Counter([porter.stem(word_token).lower() for word_token in word_tokenize(x) \
                       if word_token.lower() not in stop_words \
                       and word_token.isalpha() \
                       and word_token not in exclusions])

# apply tokenizing to texts - apply for seeing progress bar WHEN running
tokens = author['text'].apply(lambda text: clean(text))
author['tokens'] = tokens

# join tokens to one string
author['processed_text'] = author['tokens'].apply(lambda x: ' '.join(str(v) for v in x))

In [22]:
author.to_json(DATA_DIR / f'author_opinion_{year}.json.bz2')

## 5) Create ClimateGraph from edgelist

In [23]:
author = pd.read_json(DATA_DIR / f'author_opinion_{year}.json.bz2')

In [24]:
author.sample(5)

Unnamed: 0,text,all_awardings,total_awards_received,score,opinion_score,first_comment,last_comment,first_submission,last_submission,num_comments,num_submissions,comment_controversiality,tokens,processed_text
zombieslayer287,So hard life &gt;&gt;&gt; no life/ not exist a...,[],0,2,-0.25,1575244800,1575244800,,,2,,0,"{'hard': 1, 'life': 1, 'gt': 3, 'exist': 1, 'w...",hard life gt exist wrong selfish idiot yike
ThePittyInTheKitty,http://www.olcv.org/\nCall. Call. Call.,[],0,1,0.0,1561420800,1561420800,,,1,,0,{'call': 3},call
GrownUpTurk,Technically not wrong to have less kids when o...,[],0,1,0.0,1566518400,1566518400,,,1,,0,"{'technic': 1, 'wrong': 1, 'less': 2, 'kid': 3...",technic wrong less kid averag cost brought hom...
KingKooooZ,"Link? So... she got divorced from Ted Turner, ...",[],0,2,0.0,1572048000,1572048000,,,2,,0,"{'link': 1, 'got': 1, 'divorc': 1, 'ted': 1, '...",link got divorc ted turner cocreat captain planet
self_saucing,September 20 :),[],0,1,0.0,1568505600,1568505600,,,1,,0,{'septemb': 1},septemb


In [25]:
# computing the weighted edgelist by counting - using score as a randomly picked attributed to obtain a single pd.Series
weighted_edgelist = filtered_comments.groupby(by=['author', 'parent_author']).count().score
weighted_edgelist = weighted_edgelist.reset_index().rename(columns={'score':'weight'})

In [26]:
weighted_edgelist.sample(5, random_state=42)

Unnamed: 0,author,parent_author,weight
131178,hauska_juoppo,mvea,1
1260,1norcal415,deadfisher,1
137591,joeydsa,MrLongWalk,2
142827,lizzieroarden,hrimfaxi_work,1
63132,Nefertirri,AlwaysHangry12,1


In [27]:
# reformat weighted edgelist to 3-tuples
edgelist = list(zip(weighted_edgelist.author, weighted_edgelist.parent_author, weighted_edgelist.weight))

# construct graph
ClimateGraph = nx.DiGraph()
ClimateGraph.add_weighted_edges_from(edgelist)

In [28]:
# get weight of edge of first link
ClimateGraph.get_edge_data('redwolf177', 'TheNoHeart')

## 6) Add node attributes to ClimateGraph

In [29]:
for redditor in tqdm(author.index):
    meta = {redditor: author.loc[redditor].to_dict()}
    nx.set_node_attributes(ClimateGraph, meta)

  0%|          | 0/95320 [00:00<?, ?it/s]

In [30]:
# clean graph
ClimateGraph.remove_edges_from(nx.selfloop_edges(ClimateGraph))

# remove nodes that do not have metadata
remove_nodes = []
for k, v in ClimateGraph.nodes(data=True):
    try: 
        check = v['opinion_score']
    except KeyError:
        remove_nodes.append(k)

ClimateGraph.remove_nodes_from(remove_nodes)

## 5) Save ClimateGraph

In [31]:
# save graph as json
from networkx.readwrite import json_graph
import json

# specify save location
filename = DATA_DIR / f'ClimateGraph_{year}.json'
data = json_graph.node_link_data(ClimateGraph)

In [32]:
with open(filename, 'w') as fp:
    json.dump(data, fp)