# Creating the Reddit Network of Climate Discussion Contributors 

Using the pre-trained Climate Change Sentiment classifier on submissions and comments from Redittors, we model a network of Redittors being pro, neutral or anti to Climate Change.


In [1]:
import os
from pathlib import Path
import time, datetime

import pickle
from collections import Counter

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
tqdm.pandas()

from nltk import word_tokenize, PorterStemmer
from nltk.corpus import stopwords

import networkx as nx

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

import warnings
warnings.simplefilter("ignore", UserWarning)

## 1) Load Reddit data

Initially, the data extracted from Reddit is loaded.

In [2]:
# Data directory
DATA_DIR = Path(os.getcwd()).parent / 'data'
#DATA_DIR = Path('/work3/s194253')

# year
year = 2021

In [3]:
start = time.time()

# load data
comments = pd.read_json(DATA_DIR / f'{year}/comments90k_opinion_{year}.json.bz2')
submissions = pd.read_json(DATA_DIR / f'{year}/submissions_opinion_{year}.json.bz2')

end = time.time()
print(f"Took {end-start} seconds to load dataframe...")

Took 40.629838705062866 seconds to load dataframe...


In [4]:
submissions.shape

(109702, 17)

In [5]:
comments.shape

(225763, 17)

## 2) Link comment authors to submission authors

In [6]:
# create dictionaries
comment_authors = dict(zip(comments.id, comments.author))
parent = dict(zip(comments.id, comments.parent_id))
submission_authors = dict(zip(submissions.id, submissions.author))

In [7]:
def parent_author(comment_id, comment_authors=comment_authors, parent=parent, submission_authors=submission_authors):
    '''Links the comment id to the author of its parent.
    
    input: comment_id
    returns: author'''
    
    parent_id = parent[comment_id]
    
    try: # try to look for the parent_id key
        if parent_id[:3] == 't1_':
            return comment_authors[parent_id[3:]]
        if parent_id[:3] == 't3_':
            return submission_authors[parent_id[3:]]    
    except KeyError: # if parent_id was not extracted in comments or submissions
        return np.nan

In [8]:
comments['parent_author'] = comments['id'].apply(lambda x: parent_author(x))

In [9]:
print(f"Number of IDs that could not be extracted: {comments['parent_author'].isnull().sum()}/{comments['parent_author'].__len__()} = {comments['parent_author'].isnull().sum() / comments['parent_author'].__len__() :.4f}")

Number of IDs that could not be extracted: 2357/225763 = 0.0104


In [10]:
# remove NaN parent authors
comments = comments[-comments.parent_author.isnull()].reset_index(drop=True)

# size of data
comments.shape

(223406, 17)

## 3) Filter the Reddit comments and submissions

In [11]:
# join title and selftext to text attribute in submissions
submissions['text'] = submissions.title + " " + submissions.selftext

In [12]:
# Remove NaN values for awarders by setting it to 0 or empty list
submissions['all_awardings'] = submissions['all_awardings'].fillna("").apply(list)
submissions['awarders'] = submissions['awarders'].fillna("").apply(list)
submissions['total_awards_received'] = submissions['total_awards_received'].fillna(0)

comments['all_awardings'] = comments['all_awardings'].fillna("").apply(list)
comments['total_awards_received'] = comments['total_awards_received'].fillna(0)

In [13]:
# filter comments and remove rows with deleted users
filtered_comments = comments

deleted_users_idx = np.logical_or(filtered_comments.author == '[deleted]', filtered_comments.parent_author == '[deleted]')
filtered_comments = filtered_comments[-deleted_users_idx]
print(f"Number of deleted users: {deleted_users_idx.sum()}")

# update index
filtered_comments.reset_index(drop=True, inplace=True)
filtered_comments.shape

Number of deleted users: 20163


(203243, 17)

In [14]:
# filter submissions and remove rows with deleted users
filtered_submissions = submissions

deleted_users_idx = filtered_submissions.author == '[deleted]'
filtered_submissions = filtered_submissions[-deleted_users_idx]
print(f"Number of deleted users: {deleted_users_idx.sum()}")

# update index
filtered_submissions.reset_index(drop=True, inplace=True)
filtered_submissions.shape

Number of deleted users: 657


(109045, 17)

In [15]:
op_dict = {'News': 0,
          'Neutral': 0,
          'Pro': 1,
          'Anti': -1}

filtered_submissions['opinion_score'] = filtered_submissions.opinion.apply(lambda x: op_dict[x])
filtered_comments['opinion_score'] = filtered_comments.opinion.apply(lambda x: op_dict[x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_submissions['opinion_score'] = filtered_submissions.opinion.apply(lambda x: op_dict[x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_comments['opinion_score'] = filtered_comments.opinion.apply(lambda x: op_dict[x])


In [16]:
filtered_submissions.head()

Unnamed: 0,id,author,title,selftext,score,subreddit,num_comments,all_awardings,awarders,total_awards_received,date,text,tokens,processed_text,year,opinion,opinion_probs,opinion_score
0,ko3ttz,austria9000,Am I strange or is this fear of 2021 just not ...,I know there is a pandemic and other shit like...,1,collapse,26,[],[],0,2021-01-01,Am I strange or is this fear of 2021 just not ...,"{'strang': 1, 'fear': 1, 'logic': 1, 'know': 1...",strang fear logic know pandem shit like climat...,2021,Neutral,"[0.2206094134, 0.5277520404, 0.0356988014, 0.2...",0
1,ko3gwm,honolulu_oahu_mod,Progress On Climate Change Is Pathway For Econ...,,1,HawaiiPlantMedicine,1,[],[],0,2021-01-01,Progress On Climate Change Is Pathway For Econ...,"{'progress': 1, 'climat': 1, 'chang': 1, 'path...",progress climat chang pathway econom recoveri ...,2021,Pro,"[0.1505784031, 0.2912976865, 0.238229698700000...",1
2,ko3gk2,honolulu_oahu_mod,Progress On Climate Change Is Pathway For Econ...,,1,Honolulu,0,[],[],0,2021-01-01,Progress On Climate Change Is Pathway For Econ...,"{'progress': 1, 'climat': 1, 'chang': 1, 'path...",progress climat chang pathway econom recoveri ...,2021,Pro,"[0.1505784031, 0.2912976865, 0.238229698700000...",1
3,ko3fww,TeaDelta,I (21F) have bladder problems and I think it's...,Before I start I wanna say that after I saw a ...,1,raisedbynarcissists,2,[],[],0,2021-01-01,I (21F) have bladder problems and I think it's...,"{'bladder': 4, 'problem': 5, 'think': 3, 'due'...",bladder problem think due mother start wan na ...,2021,Neutral,"[0.2643444703, 0.5466458693, 0.0554297532, 0.1...",0
4,ko38lj,lavenderah,Could someone suggest me a book which may be a...,"\nsample 1:\n\nSan Lorenzo, center of the anci...",1,suggestmeabook,1,[],[],0,2021-01-01,Could someone suggest me a book which may be a...,"{'could': 2, 'someon': 1, 'suggest': 3, 'book'...",could someon suggest book may bit amalgam anth...,2021,Neutral,"[0.2643444703, 0.5466458693, 0.0554297532, 0.1...",0


## 4) Handle author metadata 

In [17]:
def get_metadata(df, reddit_type='comment'):
    
    author_df = pd.DataFrame()
    
    groups = df.groupby(by='author')

    author_df['text'] = groups.text.apply(lambda x: list(x))
    author_df['all_awardings'] = groups.all_awardings.apply(lambda x: np.concatenate([*x]))
    author_df['total_awards_received'] = groups.total_awards_received.sum()
    author_df['total_awards_received'] = groups.total_awards_received.sum()
    author_df['score'] = groups.score.sum()
    author_df[f'first_{reddit_type}'] = groups.date.apply(lambda x: x.sort_values(ascending=True).iloc[0].timestamp())
    author_df[f'last_{reddit_type}'] = groups.date.apply(lambda x: x.sort_values(ascending=True).iloc[-1].timestamp())
    author_df[f'num_{reddit_type}s'] = groups.apply(lambda x: x.__len__())
    author_df['opinion_score'] = groups.opinion_score.mean()
    
    if reddit_type == 'comment':
        author_df['controversiality'] = groups.controversiality.sum()
    
    return author_df

In [18]:
#extract metadata
print("Extracting metadata for comments...")
author_comment = get_metadata(filtered_comments, reddit_type='comment')

print("\nExtracting metadata for submissions...")
author_submission = get_metadata(filtered_submissions, reddit_type='submission')

Extracting metadata for comments...

Extracting metadata for submissions...


In [19]:
author_joined = author_comment.join(author_submission, on='author', lsuffix='_c', rsuffix='_s')

In [20]:
author = pd.DataFrame()
print("Running...")
for attr, fill in {'text': 'list', 'all_awardings': 'list', 'total_awards_received': 'num', 'score': 'num', 'opinion_score':'num'}.items():
    for letter in ['s', 'c']:
        
        # reformat rows
        if fill == 'list':
            author_joined[f'{attr}_{letter}'] = author_joined[f'{attr}_{letter}'].fillna("").apply(list)
        elif fill == 'num':
            author_joined[f'{attr}_{letter}'] = author_joined[f'{attr}_{letter}'].fillna(0)
            
    # create combined dataframe
    author[f'{attr}'] = author_joined[f'{attr}_s'] + author_joined[f'{attr}_c']
    if attr == 'opinion_score':
        author[f'{attr}'] /= 2
    
print("Successfully combined dataframe!")

# keep relevant attributes
aoi = ['first_comment', 'last_comment', 
       'first_submission', 'last_submission', 
       'num_comments', 'num_submissions',  
       'controversiality']
author[aoi] = author_joined[aoi]

# modify list of texts to one large string
author['text'] = author['text'].apply(lambda x: ' '.join(str(v) for v in x))

# rename
author = author.rename(columns={'controversiality':'comment_controversiality'})
author.sample(10)

Running...
Successfully combined dataframe!


Unnamed: 0_level_0,text,all_awardings,total_awards_received,score,opinion_score,first_comment,last_comment,first_submission,last_submission,num_comments,num_submissions,comment_controversiality
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
TheGudu,Sooooooooooooo funny!!!! Well stop spreading s...,[],0.0,-13.0,-0.25,1635466000.0,1637885000.0,,,4,,0.0
Ask_Who_Owes_Me_Gold,Consumer preferences are a big factor too. The...,[],0.0,2.0,0.0,1628122000.0,1628122000.0,,,1,,0.0
animalsanddepression,seconding this. I have an insane needle phobia...,[],0.0,1.0,0.0,1628986000.0,1628986000.0,,,1,,0.0
ArtfulArtificer,You ever play the start of FF7 and see what ou...,[],0.0,1.0,0.0,1628467000.0,1628467000.0,,,1,,0.0
THICCTHIGHSNOLIES,"The elites will, they already have their survi...",[],0.0,2.0,0.0,1620778000.0,1620778000.0,,,1,,0.0
Roboticsammy,"Like it's so easy to ""just move to where your ...",[],0.0,3.0,0.166667,1639872000.0,1639872000.0,,,3,,0.0
ThinkImpermanence,The science behind that film has been debunked...,[],0.0,3.0,-0.5,1633392000.0,1633392000.0,,,1,,0.0
Limbolocal,Or aliens,[],0.0,1.0,0.0,1630886000.0,1630886000.0,,,1,,0.0
InFearn0,"""Soylent Gas is people!"" Gaia worlds aren't mo...",[],0.0,18.0,0.0,1621382000.0,1621382000.0,,,2,,0.0
bennzo1238,"Yeah, sure - but surely a v small percentage? ...",[],0.0,93.0,0.0,1631923000.0,1631923000.0,,,3,,0.0


In [21]:
# load stop-words
stop_words = set(stopwords.words('english'))

# add webpages to stopwords
stop_words.add('http') 
stop_words.add('https')

# Preprocess the text 
porter = PorterStemmer()
exclusions = {'RT'}

# define tokenizing function
clean = lambda x: Counter([porter.stem(word_token).lower() for word_token in word_tokenize(x) \
                       if word_token.lower() not in stop_words \
                       and word_token.isalpha() \
                       and word_token not in exclusions])

# apply tokenizing to texts - apply for seeing progress bar WHEN running
tokens = author['text'].apply(lambda text: clean(text))
author['tokens'] = tokens

# join tokens to one string
author['processed_text'] = author['tokens'].apply(lambda x: ' '.join(str(v) for v in x))

In [22]:
author.to_json(DATA_DIR / f'author_opinion_{year}.json.bz2')

## 5) Create ClimateGraph from edgelist

In [23]:
author = pd.read_json(DATA_DIR / f'author_opinion_{year}.json.bz2')

In [24]:
author.sample(5)

Unnamed: 0,text,all_awardings,total_awards_received,score,opinion_score,first_comment,last_comment,first_submission,last_submission,num_comments,num_submissions,comment_controversiality,tokens,processed_text
MeMuzzta,They're a bunch of morons and anyone condoning...,[],0,2,0.0,1636156800,1636243200,,,2,,0,"{'bunch': 2, 'moron': 3, 'anyon': 1, 'condon':...",bunch moron anyon condon behaviour also see en...
Sp0il,Yall really get your panties in a bunch about ...,[],0,13,0.0,1625270400,1625270400,,,1,,0,"{'yall': 1, 'realli': 1, 'get': 1, 'panti': 1,...",yall realli get panti bunch everi slightli mem...
DanCastellaneta,"With context to the US, the Middle East and Au...",[],0,6,-0.5,1626307200,1626307200,,,1,,0,"{'context': 1, 'us': 2, 'middl': 2, 'east': 2,...",context us middl east australia part boil ment...
spoobydoo,If people want to make a collective effort to ...,[],0,1,0.0,1639872000,1639872000,,,1,,0,"{'peopl': 1, 'want': 1, 'make': 1, 'collect': ...",peopl want make collect effort battl climat ch...
Comfortable-Event239,"Karen \nit’s to get it in the news, and it wor...",[],0,2,0.0,1631923200,1631923200,,,1,,0,"{'karen': 1, 'get': 1, 'news': 1, 'work': 1, '...",karen get news work emerg die address littl ho...


In [25]:
# computing the weighted edgelist by counting - using score as a randomly picked attributed to obtain a single pd.Series
weighted_edgelist = filtered_comments.groupby(by=['author', 'parent_author']).count().score
weighted_edgelist = weighted_edgelist.reset_index().rename(columns={'score':'weight'})

In [26]:
weighted_edgelist.sample(5, random_state=42)

Unnamed: 0,author,parent_author,weight
87331,TiredForEternity,lost_castle,1
22987,DeadlyLemming,throwaway12131214121,1
49807,LichPineapple,newnemo,1
62488,Ok_Carrot_5475,Im_Sapphire,1
41926,Infernalism,Godhealer,1


In [27]:
# reformat weighted edgelist to 3-tuples
edgelist = list(zip(weighted_edgelist.author, weighted_edgelist.parent_author, weighted_edgelist.weight))

# construct graph
ClimateGraph = nx.DiGraph()
ClimateGraph.add_weighted_edges_from(edgelist)

In [28]:
# get weight of edge of first link
ClimateGraph.get_edge_data('redwolf177', 'TheNoHeart')

## 6) Add node attributes to ClimateGraph

In [29]:
for redditor in tqdm(author.index):
    meta = {redditor: author.loc[redditor].to_dict()}
    nx.set_node_attributes(ClimateGraph, meta)

  0%|          | 0/95507 [00:00<?, ?it/s]

In [30]:
# clean graph
ClimateGraph.remove_edges_from(nx.selfloop_edges(ClimateGraph))

# remove nodes that do not have metadata
remove_nodes = []
for k, v in ClimateGraph.nodes(data=True):
    try: 
        check = v['opinion_score']
    except KeyError:
        remove_nodes.append(k)

ClimateGraph.remove_nodes_from(remove_nodes)

## 5) Save ClimateGraph

In [31]:
# save graph as json
from networkx.readwrite import json_graph
import json

# specify save location
filename = DATA_DIR / f'ClimateGraph_{year}.json'
data = json_graph.node_link_data(ClimateGraph)

In [32]:
with open(filename, 'w') as fp:
    json.dump(data, fp)