In [70]:
pd.__version__

'1.4.1'

# Creating the Reddit Network of Climate Discussion Contributors 

Using the pre-trained Climate Change Sentiment classifier on submissions and comments from Redittors, we model a network of Redittors being pro, neutral or anti to Climate Change.


In [1]:
import os
from pathlib import Path
import time, datetime

import pickle
from collections import Counter

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
tqdm.pandas()

from nltk import word_tokenize, PorterStemmer
from nltk.corpus import stopwords

import networkx as nx

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

import warnings
warnings.simplefilter("ignore", UserWarning)

## 1) Load Reddit data

Initially, the data extracted from Reddit is loaded.

In [2]:
# Data directory
DATA_DIR = Path(os.getcwd()).parent / 'data'
#DATA_DIR = Path('/work3/s194253')

# year
year = 2020

In [3]:
start = time.time()

# load data
comments = pd.read_json(DATA_DIR / f'{year}/comments90k_opinion_{year}.json.bz2')
submissions = pd.read_json(DATA_DIR / f'{year}/submissions_opinion_{year}.json.bz2')

end = time.time()
print(f"Took {end-start} seconds to load dataframe...")

Took 45.28191637992859 seconds to load dataframe...


In [4]:
submissions.shape

(82046, 17)

In [5]:
comments.shape

(151622, 17)

## 2) Link comment authors to submission authors

In [6]:
# create dictionaries
comment_authors = dict(zip(comments.id, comments.author))
parent = dict(zip(comments.id, comments.parent_id))
submission_authors = dict(zip(submissions.id, submissions.author))

In [7]:
def parent_author(comment_id, comment_authors=comment_authors, parent=parent, submission_authors=submission_authors):
    '''Links the comment id to the author of its parent.
    
    input: comment_id
    returns: author'''
    
    parent_id = parent[comment_id]
    
    try: # try to look for the parent_id key
        if parent_id[:3] == 't1_':
            return comment_authors[parent_id[3:]]
        if parent_id[:3] == 't3_':
            return submission_authors[parent_id[3:]]    
    except KeyError: # if parent_id was not extracted in comments or submissions
        return np.nan

In [8]:
comments['parent_author'] = comments['id'].progress_apply(lambda x: parent_author(x))

  0%|          | 0/151622 [00:00<?, ?it/s]

In [9]:
print(f"Number of IDs that could not be extracted: {comments['parent_author'].isnull().sum()}/{comments['parent_author'].__len__()} = {comments['parent_author'].isnull().sum() / comments['parent_author'].__len__() :.4f}")

Number of IDs that could not be extracted: 960/151622 = 0.0063


In [10]:
# remove NaN parent authors
comments = comments[-comments.parent_author.isnull()].reset_index(drop=True)

# size of data
comments.shape

(150662, 17)

## 3) Filter the Reddit comments and submissions

In [11]:
# join title and selftext to text attribute in submissions
submissions['text'] = submissions.title + " " + submissions.selftext

In [12]:
# Remove NaN values for awarders by setting it to 0 or empty list
submissions['all_awardings'] = submissions['all_awardings'].fillna("").apply(list)
submissions['awarders'] = submissions['awarders'].fillna("").apply(list)
submissions['total_awards_received'] = submissions['total_awards_received'].fillna(0)

comments['all_awardings'] = comments['all_awardings'].fillna("").apply(list)
comments['total_awards_received'] = comments['total_awards_received'].fillna(0)

In [13]:
# filter comments and remove rows with deleted users
filtered_comments = comments

deleted_users_idx = np.logical_or(filtered_comments.author == '[deleted]', filtered_comments.parent_author == '[deleted]')
filtered_comments = filtered_comments[-deleted_users_idx]
print(f"Number of deleted users: {deleted_users_idx.sum()}")

# update index
filtered_comments.reset_index(drop=True, inplace=True)
filtered_comments.shape

Number of deleted users: 9173


(141489, 17)

In [14]:
# filter submissions and remove rows with deleted users
filtered_submissions = submissions

deleted_users_idx = filtered_submissions.author == '[deleted]'
filtered_submissions = filtered_submissions[-deleted_users_idx]
print(f"Number of deleted users: {deleted_users_idx.sum()}")

# update index
filtered_submissions.reset_index(drop=True, inplace=True)
filtered_submissions.shape

Number of deleted users: 659


(81387, 17)

In [15]:
op_dict = {'News': 0,
          'Neutral': 0,
          'Pro': 1,
          'Anti': -1}

filtered_submissions['opinion_score'] = filtered_submissions.opinion.apply(lambda x: op_dict[x])
filtered_comments['opinion_score'] = filtered_comments.opinion.apply(lambda x: op_dict[x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_submissions['opinion_score'] = filtered_submissions.opinion.apply(lambda x: op_dict[x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_comments['opinion_score'] = filtered_comments.opinion.apply(lambda x: op_dict[x])


In [16]:
filtered_submissions.head()

Unnamed: 0,id,author,title,selftext,score,subreddit,num_comments,all_awardings,awarders,total_awards_received,date,text,tokens,processed_text,year,opinion,opinion_probs,opinion_score
0,eicndu,toronto_news,Could algae be a secret weapon in the climate ...,,1,u_toronto_news,0,[],[],0,2020-01-01,Could algae be a secret weapon in the climate ...,"{'could': 1, 'alga': 1, 'secret': 1, 'weapon':...",could alga secret weapon climat chang crisi ma...,2020,News,"[0.2649832925, 0.1244659196, 0.4741758891, 0.1...",0
1,eicmqm,bogbodybutch,Know their Names: Eight Activists Assassinated...,,1,ClimateOffensive,15,[],[],0,2020-01-01,Know their Names: Eight Activists Assassinated...,"{'know': 1, 'name': 1, 'eight': 1, 'activist':...",know name eight activist assassin fight climat...,2020,Pro,"[0.145520545, 0.13338547550000002, 0.320116649...",1
2,eicm79,jackson-on-reddit,The Australian Prime Minister still refuses to...,,1,ABoringDystopia,5,[],[],0,2020-01-01,The Australian Prime Minister still refuses to...,"{'australian': 1, 'prime': 1, 'minist': 1, 'st...",australian prime minist still refus take actio...,2020,Pro,"[0.12853447040000002, 0.08645230100000001, 0.1...",1
3,eichqb,babyyourearichman111,/u/nowyourmad on CMV: Disregarding Economists'...,[removed],1,TalkativePeople,0,[],[],0,2020-01-01,/u/nowyourmad on CMV: Disregarding Economists'...,"{'cmv': 1, 'disregard': 2, 'economist': 1, 'co...",cmv disregard economist consensu thing like fr...,2020,Anti,"[0.3567703754, 0.2436021857, 0.1100517669, 0.2...",-1
4,eicgj2,YeetOnMyKids,Climate change fake,,1,okbuddyretard,0,[],[],0,2020-01-01,Climate change fake,"{'climat': 1, 'chang': 1, 'fake': 1}",climat chang fake,2020,Anti,"[0.4198382285, 0.3151598512, 0.0729579884, 0.1...",-1


## 4) Handle author metadata 

In [17]:
def get_metadata(df, reddit_type='comment'):
    
    author_df = pd.DataFrame()
    
    groups = df.groupby(by='author')

    author_df['text'] = groups.text.progress_apply(lambda x: list(x))
    author_df['all_awardings'] = groups.all_awardings.progress_apply(lambda x: np.concatenate([*x]))
    author_df['total_awards_received'] = groups.total_awards_received.sum()
    author_df['total_awards_received'] = groups.total_awards_received.sum()
    author_df['score'] = groups.score.sum()
    author_df[f'first_{reddit_type}'] = groups.date.progress_apply(lambda x: x.sort_values(ascending=True).iloc[0].timestamp())
    author_df[f'last_{reddit_type}'] = groups.date.progress_apply(lambda x: x.sort_values(ascending=True).iloc[-1].timestamp())
    author_df[f'num_{reddit_type}s'] = groups.progress_apply(lambda x: x.__len__())
    author_df['opinion_score'] = groups.opinion_score.mean()
    
    if reddit_type == 'comment':
        author_df['controversiality'] = groups.controversiality.sum()
    
    return author_df

In [18]:
#extract metadata
print("Extracting metadata for comments...")
author_comment = get_metadata(filtered_comments, reddit_type='comment')

print("\nExtracting metadata for submissions...")
author_submission = get_metadata(filtered_submissions, reddit_type='submission')

Extracting metadata for comments...


  0%|          | 0/64260 [00:00<?, ?it/s]

  0%|          | 0/64260 [00:00<?, ?it/s]

  0%|          | 0/64260 [00:00<?, ?it/s]

  0%|          | 0/64260 [00:00<?, ?it/s]

  0%|          | 0/64260 [00:00<?, ?it/s]


Extracting metadata for submissions...


  0%|          | 0/33624 [00:00<?, ?it/s]

  0%|          | 0/33624 [00:00<?, ?it/s]

  0%|          | 0/33624 [00:00<?, ?it/s]

  0%|          | 0/33624 [00:00<?, ?it/s]

  0%|          | 0/33624 [00:00<?, ?it/s]

In [19]:
author_joined = author_comment.join(author_submission, on='author', lsuffix='_c', rsuffix='_s')

In [20]:
author = pd.DataFrame()
print("Running...")
for attr, fill in {'text': 'list', 'all_awardings': 'list', 'total_awards_received': 'num', 'score': 'num', 'opinion_score':'num'}.items():
    for letter in ['s', 'c']:
        
        # reformat rows
        if fill == 'list':
            author_joined[f'{attr}_{letter}'] = author_joined[f'{attr}_{letter}'].fillna("").apply(list)
        elif fill == 'num':
            author_joined[f'{attr}_{letter}'] = author_joined[f'{attr}_{letter}'].fillna(0)
            
    # create combined dataframe
    author[f'{attr}'] = author_joined[f'{attr}_s'] + author_joined[f'{attr}_c']
    if attr == 'opinion_score':
        author[f'{attr}'] /= 2
    
print("Successfully combined dataframe!")

# keep relevant attributes
aoi = ['first_comment', 'last_comment', 
       'first_submission', 'last_submission', 
       'num_comments', 'num_submissions',  
       'controversiality']
author[aoi] = author_joined[aoi]

# modify list of texts to one large string
author['text'] = author['text'].progress_apply(lambda x: ' '.join(str(v) for v in x))

# rename
author = author.rename(columns={'controversiality':'comment_controversiality'})
author.sample(10)

Running...
Successfully combined dataframe!


  0%|          | 0/64260 [00:00<?, ?it/s]

Unnamed: 0_level_0,text,all_awardings,total_awards_received,score,opinion_score,first_comment,last_comment,first_submission,last_submission,num_comments,num_submissions,comment_controversiality
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
brd3001,Check out the protopic / elidel withdrawal gro...,[],0.0,1.0,0.0,1607818000.0,1607818000.0,,,1,,0.0
lotofmurkamiinthehal,Rent free 100% r/MGTOW in the wild,[],0.0,6.0,0.0,1583798000.0,1583798000.0,,,2,,0.0
sl_1138,We all died inside last night,[],0.0,1.0,0.0,1601424000.0,1601424000.0,,,1,,0.0
WaywardTraveller,Here's the thing: Many parts of Alberta are re...,[],0.0,5.0,0.0,1591661000.0,1591661000.0,,,1,,0.0
fukwhutuheard,one crises at a time please,[],0.0,1.0,0.0,1588896000.0,1588896000.0,,,1,,0.0
blubburtron,It's clearly someone recording security cam fo...,[],0.0,4.0,0.0,1600128000.0,1600128000.0,,,1,,0.0
childish-grambino,Apply this generosity to your employees and th...,[],0.0,1.0,0.0,1581898000.0,1581898000.0,,,1,,0.0
Wakarimasen420,And Biden helped put a whole bunch more people...,[],0.0,13.0,0.0,1605744000.0,1605830000.0,,,4,,0.0
bNyeTheVRGuy,Use bots to launch a mass information campaign...,[],0.0,2.0,0.0,1598573000.0,1598573000.0,1598486000.0,1598486000.0,1,1.0,0.0
spec2re,"Agreed, you can't explain the tides, let alone...",[],0.0,4.0,-0.5,1603325000.0,1603325000.0,,,1,,0.0


In [21]:
# load stop-words
stop_words = set(stopwords.words('english'))

# add webpages to stopwords
stop_words.add('http') 
stop_words.add('https')

# Preprocess the text 
porter = PorterStemmer()
exclusions = {'RT'}

# define tokenizing function
clean = lambda x: Counter([porter.stem(word_token).lower() for word_token in word_tokenize(x) \
                       if word_token.lower() not in stop_words \
                       and word_token.isalpha() \
                       and word_token not in exclusions])

# apply tokenizing to texts - progress_apply for seeing progress bar WHEN running
tokens = author['text'].progress_apply(lambda text: clean(text))
author['tokens'] = tokens

# join tokens to one string
author['processed_text'] = author['tokens'].progress_apply(lambda x: ' '.join(str(v) for v in x))

  0%|          | 0/64260 [00:00<?, ?it/s]

  0%|          | 0/64260 [00:00<?, ?it/s]

In [22]:
author.to_json(DATA_DIR / f'author_opinion_{year}.json.bz2')

## 5) Create ClimateGraph from edgelist

In [23]:
author = pd.read_json(DATA_DIR / f'author_opinion_{year}.json.bz2')

In [24]:
author.sample(5)

Unnamed: 0,text,all_awardings,total_awards_received,score,opinion_score,first_comment,last_comment,first_submission,last_submission,num_comments,num_submissions,comment_controversiality,tokens,processed_text
warwellian,Thank you for your effort! This gives a cleare...,[],0,1,0.0,1579046400,1579046400,,,1,,0,"{'thank': 1, 'effort': 1, 'give': 1, 'clearer'...",thank effort give clearer pictur without nois ...
LordYoshi00,I guess you only see that if you're trying to....,[],0,4,0.0,1578182400,1578182400,,,1,,0,"{'guess': 1, 'see': 1, 'tri': 1, 'mayb': 1, 's...",guess see tri mayb said aborigin brought atten...
banananuhhh,I think it's actually much worse than you say....,[],0,11,-0.272727,1598400000,1605744000,,,11,,0,"{'think': 5, 'actual': 1, 'much': 1, 'wors': 2...",think actual much wors say dem take senat lot ...
Nem48,puppet(s)* It feels good to bash someone else ...,[],0,2,0.0,1605830400,1605830400,,,2,,0,"{'puppet': 2, 'feel': 1, 'good': 1, 'bash': 1,...",puppet feel good bash someon els tho like knew...
internetguy226,This would require 8 million times more coding...,[],0,1,-0.5,1583280000,1583280000,,,1,,0,"{'would': 1, 'requir': 1, 'million': 1, 'time'...",would requir million time code made


In [25]:
# computing the weighted edgelist by counting - using score as a randomly picked attributed to obtain a single pd.Series
weighted_edgelist = filtered_comments.groupby(by=['author', 'parent_author']).count().score
weighted_edgelist = weighted_edgelist.reset_index().rename(columns={'score':'weight'})

In [26]:
weighted_edgelist.sample(5, random_state=42)

Unnamed: 0,author,parent_author,weight
95252,redwolf177,TheNoHeart,1
98475,simstim_addict,PragmatistAntithesis,1
22950,GrievenLeague,Sleaz274,1
108708,zeyore,avogadros_number,1
85251,lonewolf392,Avenflar,2


In [43]:
# reformat weighted edgelist to 3-tuples
edgelist = list(zip(weighted_edgelist.author, weighted_edgelist.parent_author, weighted_edgelist.weight))

# construct graph
ClimateGraph = nx.DiGraph()
ClimateGraph.add_weighted_edges_from(edgelist)

In [44]:
# get weight of edge of first link
ClimateGraph.get_edge_data('redwolf177', 'TheNoHeart')

{'weight': 1}

## 6) Add node attributes to ClimateGraph

In [45]:
for redditor in tqdm(author.index):
    meta = {redditor: author.loc[redditor].to_dict()}
    nx.set_node_attributes(ClimateGraph, meta)

  0%|          | 0/64260 [00:00<?, ?it/s]

In [None]:
# clean graph
ClimateGraph.remove_edges_from(nx.selfloop_edges(ClimateGraph))

# remove nodes that do not have metadata
remove_nodes = []
for k, v in ClimateGraph.nodes(data=True):
    try: 
        check = v['opinion_score']
    except KeyError:
        remove_nodes.append(k)

Climategraph.remove_nodes_from(remove_nodes)

## 5) Save ClimateGraph

In [47]:
# save graph as json
from networkx.readwrite import json_graph
import json

# specify save location
filename = DATA_DIR / f'ClimateGraph_{year}.json'
data = json_graph.node_link_data(ClimateGraph)

In [48]:
json.dump(data, filename)