## Initialization

In [None]:
# Root folder; each query gets its own sub-folder below it
save_path_prefix = 'covid_influence/files/'

# Search terms: connect alternatives with '+' or '|'; use [''] to search without keywords
query_keywords = ['covid|corona|quarantine|pandemic']

# Subreddit groups: connect several subreddits with ','; use [''] to search all subreddits
# NOTE(review): 'careerguidence' may be a misspelling of 'careerguidance' -- confirm
query_subreddits = [
    'Gifts,GiftIdeas',
    'personalfinance',
    'jobs,careerguidence,GetEmployed',
    'CasualConversation',
    'depression',
    'books',
    'teenagers',
    'parenting',
    'fitness',
    'gaming',
    'relationships',
]

# Six [start, end] date windows for every month of 2019
# NOTE(review): the last window ends on day 28, so days 29-31 are never queried
day_windows = [('01', '05'), ('06', '10'), ('11', '15'), ('16', '20'), ('21', '25'), ('26', '28')]
query_date_ranges = []
for m in range(1, 13):
    year_month = '2019-' + '{:02d}'.format(m) + '-'
    for first_day, last_day in day_windows:
        query_date_ranges.append([year_month + first_day, year_month + last_day])

# Parameters to plot the wordclouds (one panel per month on an xSub x ySub grid)
name_months = ['Jan.', 'Feb.', 'Mar.', 'Apr.', 'May.', 'June', 'July', 'Aug.', 'Sept.', 'Oct.', 'Nov.', 'Dec.']
plt_cfg = {
    'path_save': 'covid_influence/plots/',
    'size': (50, 25),
    'xSub': 3,
    'ySub': 4,
    'title': list(name_months),
}

# Parameters to fetch submissions
cfg_subm = {
    'field': 'title,selftext,num_comments,author',
    'rm_dupe': 'title',
    'sort': 'num_comments',
    'sort_type': 'desc',
    'query_type': 'submission',
}

# Parameters to fetch comments
cfg_cmt = {
    'field': 'body,author,score',
    'rm_dupe': 'body',
    'sort': 'score',
    'sort_type': 'desc',
    'query_type': 'comment',
}

# Extra stopwords removed on top of the NLTK English list
add_stopwords = [
    'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten',
    'day', 'week', 'month', 'year', 'thing', 'app', 'new', 'old',
    'hundred', 'thousand',
]

In [None]:
import funcs_pushshift
import os
import sys
import pandas as pd
import numpy as np
import pickle
import nltk
# nltk.download('wordnet')
# nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

## Getting submissions and comments from reddit using [Pushshift](https://reddit-api.readthedocs.io/en/latest/#comments-search)

The original code of the function *GetPushshiftData* is from [dylankilkenny/PushShift.py](https://gist.github.com/dylankilkenny/3dbf6123527260165f8c5c3bc3ee331b) and [Rare Loot](https://rareloot.medium.com/using-pushshifts-api-to-extract-reddit-submissions-fb517b286563).

### 1. Assemble queries

In [None]:
# Parallel lists: entry i of query_subm, query_cmt, save_path and fname all
# belong to the same (keyword, subreddit, date window) request.
save_path = []
fname = []
query_main = []  # one '<keyword>_<subreddit>' label per keyword/subreddit pair
query_subm = []
query_cmt = []

# Fixed tail of every request. The config key 'sort' holds the field to sort
# by (sent as `sort_type`) and the config key 'sort_type' holds the direction
# (sent as `sort`). The direction is now taken from the config instead of
# being hard-coded to 'desc', so changing cfg_*['sort_type'] takes effect.
suffix_subm = ('&filter=' + cfg_subm['field'] + '&sort_type=' + cfg_subm['sort']
               + '&sort=' + cfg_subm['sort_type'] + '&size=100')
suffix_cmt = ('&filter=' + cfg_cmt['field'] + '&sort_type=' + cfg_cmt['sort']
              + '&sort=' + cfg_cmt['sort_type'] + '&size=100')

for keyword in query_keywords:
    query_temp = 'q=' + keyword
    for subreddit in query_subreddits:
        query_main.append(keyword + '_' + subreddit)
        for date_range in query_date_ranges:
            # Part shared by the submission and the comment request
            query_base = (query_temp + '&subreddit=' + subreddit
                          + '&after=' + date_range[0] + '&before=' + date_range[1])
            query_subm.append(query_base + suffix_subm)
            query_cmt.append(query_base + suffix_cmt)
            # Results of one keyword/subreddit pair share a folder; the file
            # name carries the date window.
            save_path.append(save_path_prefix + keyword + '_' + subreddit + '/')
            fname.append(date_range[0] + '_' + date_range[1])
            del query_base
    del query_temp
del suffix_subm, suffix_cmt, keyword, subreddit

In [None]:
# Testing code: reload previously saved CSV files instead of querying Pushshift (uncomment to use)
# subms_all = []
# cmts_all = []
# for idx in range(len(fname)):
#     try:
#         subms_all.append(pd.read_csv(save_path[idx]+cfg_subm['query_type']+'_'+fname[idx]+'.csv'))
#     except:
#         subms_all.append([])
#     try:
#         cmts_all.append(pd.read_csv(save_path[idx]+cfg_cmt['query_type']+'_'+fname[idx]+'.csv'))
#     except:
#         cmts_all.append([])

### 2. Collect the submissions and comments from Pushshift server.

In [None]:
# Run every assembled request; subms_all / cmts_all keep the order of `fname`
subms_all = []
cmts_all = []
total = str(len(query_subm))
for idx, (folder, stem) in enumerate(zip(save_path, fname)):
    print('> Processing query: ' + str(idx + 1) + ' / ' + total + '. Save to: ' + folder + stem + '/')

    # Submissions: point the config at this request's folder/file name, then fetch
    cfg_subm.update(path_save=folder, save_suffix=stem)
    subms_all.append(funcs_pushshift.fetch_data(query_subm[idx], cfg_subm))

    # Comments: same target folder and file name suffix
    cfg_cmt.update(path_save=folder, save_suffix=stem)
    cmts_all.append(funcs_pushshift.fetch_data(query_cmt[idx], cfg_cmt))
del query_subm, query_cmt, total

### 3. Assemble text of each month and calculate the number of active redditors

In [None]:
def mid_dividor(n):
    """Return the pair of divisors of ``n`` that lie closest to its square root.

    The result is ``[small, large]`` with ``small <= large`` and
    ``small * large == n``; for a perfect square both entries are the root.
    It is used to lay out ``n`` subplots on a grid that is as square as
    possible.

    Args:
        n: Positive integer to factorise.

    Returns:
        A two-element list ``[small, large]``.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError('n must be a positive integer, got ' + repr(n))

    # The largest divisor not exceeding sqrt(n) and its cofactor are the two
    # middle divisors. This avoids picking the middle element with round(),
    # whose round-half-to-even behaviour selected the wrong divisor for
    # perfect squares such as 4 or 64.
    small = 1
    i = 1
    while i * i <= n:
        if n % i == 0:
            small = i
        i += 1
    return [small, n // small]

In [None]:
# Parameters for plotting
os.makedirs(plt_cfg['path_save'], exist_ok=True)
sub_idx = mid_dividor(len(query_main))  # [rows, columns] of the subplot grid
# squeeze=False keeps `axs` a 2-D array for every grid shape, so each query
# gets its own axes even on a 1 x N grid or with a single subplot.
fig, axs = plt.subplots(sub_idx[0], sub_idx[1], squeeze=False)

# Get column names
subm_cols = cfg_subm['field'].split(',')
cmt_cols = cfg_cmt['field'].split(',')
# Combine the data from the same month for each year and calculate the number of active redditors
# Number of fetched result sets (date windows) that make up one month of one query
num_files_month = int(len(fname) / (len(name_months) * len(query_keywords) * len(query_subreddits)))
subms = dict()  # query label -> list with one submissions DataFrame per month
cmts = dict()   # query label -> list with one comments DataFrame per month
idx_data = 0    # running position in subms_all / cmts_all
for cnt_query, query_name in enumerate(query_main):
    subms[query_name] = []
    cmts[query_name] = []
    num_redditor = []
    num_subm = []
    num_cmt = []
    for idx in range(len(name_months)):
        # Gather the non-empty result sets that belong to this month
        subm_temp = []
        cmt_temp = []
        for ii in range(num_files_month):
            if subms_all[idx_data].shape[0] != 0:
                subm_temp.append(subms_all[idx_data])
            if cmts_all[idx_data].shape[0] != 0:
                cmt_temp.append(cmts_all[idx_data])
            idx_data += 1

        if subm_temp:
            subm_month = pd.concat(subm_temp).reset_index()[subm_cols]
        else:
            subm_month = pd.DataFrame(columns=subm_cols)
        if cmt_temp:
            cmt_month = pd.concat(cmt_temp).reset_index()[cmt_cols]
        else:
            # An empty month must expose the comment columns (not the
            # submission columns) so later cells can rely on them.
            cmt_month = pd.DataFrame(columns=cmt_cols)
        subms[query_name].append(subm_month)
        cmts[query_name].append(cmt_month)

        # Calculate the number of redditors: distinct authors across
        # submissions and comments of the month
        authors = pd.concat([subm_month['author'], cmt_month['author']]).drop_duplicates()
        num_redditor.append(authors.shape[0])
        num_subm.append(len(subm_month))
        num_cmt.append(len(cmt_month))
        del subm_temp, cmt_temp, subm_month, cmt_month, authors

    # Map the running query number onto its (row, column) cell of the grid
    row, col = np.unravel_index(cnt_query, (sub_idx[0], sub_idx[1]))
    ax = axs[row, col]

    num_post = pd.DataFrame({'No. submissions': num_subm, 'No. comments': num_cmt, 'No. redditors': num_redditor},
                            index=name_months)
    ax.plot(num_post['No. submissions'], color='#3d405b', label='No. submissions', linewidth=5)
    ax.plot(num_post['No. comments'], color='#81b29a', label='No. comments', linewidth=5)
    ax.plot(num_post['No. redditors'], color='#e07a5f', label='No. redditors', linewidth=5)
    # Reference line at 100 rows per result set of the month
    # (presumably the request size cap -- confirm against the query settings)
    ax.axhline(y=num_files_month*100, color='r', linestyle='--', alpha=0.3)
    ax.legend(loc='upper left', frameon=False, fontsize=15)
    ax.set_xticklabels(name_months)
    ax.set_xlabel('Months')
    ax.set_ylabel('Number')
    ax.set_title(query_name, fontsize=25)
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    del num_redditor, num_subm, num_cmt, num_post, ax, row, col

# Each panel gets 20 x 6 inches (width x height): the figure width therefore
# scales with the number of columns and the height with the number of rows.
fig.set_size_inches(20*sub_idx[1], 6*sub_idx[0])
if len(query_main) > 1:
    fig.savefig(plt_cfg['path_save']+'stats_'+query_main[0]+'_etc.jpg', bbox_inches='tight')
elif len(query_main) == 1:
    fig.savefig(plt_cfg['path_save']+'stats_'+query_main[0]+'.jpg', bbox_inches='tight')
del subm_cols, cmt_cols, num_files_month, idx_data, cnt_query, query_name, fig, axs, sub_idx

## Process the text for word cloud
Part of the code is adapted from the one originally produced by Zolzaya Luvsandorj ([Medium](https://towardsdatascience.com/introduction-to-nlp-part-1-preprocessing-text-in-python-8f007d44ca96)).  

The following steps are performed in order:
1. Concatenate all text from submission title, content and comments.
2. Tokenize
3. Normalize
4. Remove stopwords
5. Remove words that contain digits or underscores, or that are shorter than three characters.
6. Reverse processed words to a big paragraph of text.

In [None]:
def process_text(text, additional_stopwords=None):
    """Turn raw text into a list of cleaned, lemmatised keywords.

    Steps: tokenise on runs of word characters, lowercase, lemmatise every
    token as a verb, drop stopwords, then drop tokens that contain a digit
    or an underscore or that are shorter than three characters.

    Args:
        text: The text to process.
        additional_stopwords: Optional iterable of extra words to remove on
            top of the NLTK English stopword list.

    Returns:
        The surviving lemmas, in their original order (duplicates kept).
    """
    # Tokenise words while ignoring punctuation
    tokeniser = RegexpTokenizer(r'\w+')
    lemmatiser = WordNetLemmatizer()

    # Build the stopword set once; rebuilding the list for every lemma and
    # scanning it linearly made this step needlessly slow on long texts.
    blocked = set(stopwords.words('english'))
    if additional_stopwords:
        blocked.update(additional_stopwords)

    keywords = []
    for token in tokeniser.tokenize(text):
        # Lowercase and lemmatise
        lemma = lemmatiser.lemmatize(token.lower(), pos='v')
        # Remove stopwords
        if lemma in blocked:
            continue
        # Remove words with numbers, underscore, or words consisting of less than three characters.
        if len(lemma) < 3 or '_' in lemma or any(char.isdigit() for char in lemma):
            continue
        keywords.append(lemma)

    return keywords

In [None]:
from collections import Counter

# word_freq[query] holds one {word: count} dict per month
word_freq = dict()
for query in query_main:
    word_freq[query] = []
    for subm, cmt in zip(subms[query], cmts[query]):
        # Concatenate text: collect every piece first and join with spaces,
        # so the last word of one comment can no longer fuse with the first
        # word of the next one.
        pieces = []
        if not subm.empty:
            for title, selftext in zip(subm['title'], subm['selftext']):
                # Missing values are not strings and are skipped
                if isinstance(title, str):
                    pieces.append(title)
                if isinstance(selftext, str):
                    pieces.append(selftext)
        # Only touch the 'body' column when there are rows; an empty month
        # may come without that column.
        if not cmt.empty:
            pieces.extend(body for body in cmt['body'] if isinstance(body, str))
        txt = ' '.join(pieces)

        # Preprocess words
        keywords = process_text(txt, add_stopwords)

        # Produce text frequency (single pass instead of list.count per word)
        word_freq[query].append(dict(Counter(keywords)))
        del subm, cmt, pieces, keywords, txt

    # Store the monthly frequencies next to the fetched data of this query
    out_dir = save_path_prefix + query + '/'
    os.makedirs(out_dir, exist_ok=True)
    with open(out_dir + 'word_frequency.p', 'wb') as fh:
        pickle.dump(word_freq[query], fh)
    del out_dir

## Produce Word Clouds

In [None]:
# wordclouds[query] holds one WordCloud per month ([] for months without words)
wordclouds = dict()
for query in query_main:
    fig, axs = plt.subplots(plt_cfg['xSub'], plt_cfg['ySub'])
    wordclouds[query] = []
    for cnt, freq in enumerate(word_freq[query]):
        if len(freq) == 0:
            # Keep the list aligned with the months; the panel stays untouched
            wordclouds[query].append([])
            continue

        cloud = WordCloud(width=3000, height=2000, random_state=1, background_color='black',
                          colormap='Set2').generate_from_frequencies(frequencies=freq)
        wordclouds[query].append(cloud)

        # Month number -> (row, column) of the panel grid
        row, col = np.unravel_index(cnt, (plt_cfg['xSub'], plt_cfg['ySub']))
        ax = axs[row, col]
        ax.imshow(cloud)
        ax.set_title(plt_cfg['title'][cnt], fontsize=20)
        # Turning the axis off hides ticks and all spines at once
        ax.axis('off')
        del ax, row, col, cloud

    fig.set_size_inches(plt_cfg['size'][0], plt_cfg['size'][1])
    fig.savefig(plt_cfg['path_save']+query+'.jpg', bbox_inches='tight')
    # NOTE(review): this stores the whole `wordclouds` dict (all queries
    # processed so far) in every query folder; `wordclouds[query]` may have
    # been intended -- confirm before changing the file format.
    with open(save_path_prefix+query+'/wordclouds.p', 'wb') as fh:
        pickle.dump(wordclouds, fh)
del fig, axs