# Additional analysis of COVID's influence on people's daily life

1. Produce a single word cloud using all words collected from January to December, for each of 2020 and 2019.
2. Compare the word clouds for the same keyword and subreddit between 2019 and 2020.

In [None]:
# Root directory of this analysis; the other cells of this notebook read
# pickles from '<prefix>files/' and write figures to '<prefix>plots/'.
save_path_prefix = 'covid_influence/'
# '|'-separated keyword alternatives. The string is also used verbatim as
# part of the data directory names, so do not reformat it.
keyword = 'covid|corona|quarantine|pandemic'
# Each entry is one subreddit or a comma-separated group of subreddits that
# is analysed as a single unit.
# NOTE(review): 'careerguidence' may be a misspelling of the subreddit name;
# it is kept as-is because the string is part of the on-disk paths -- confirm.
subreddits = [
    'Gifts,GiftIdeas', 'personalfinance', 'jobs,careerguidence,GetEmployed',
    'CasualConversation', 'depression', 'books', 'teenagers', 'parenting',
    'fitness', 'gaming', 'relationships',
]
# Presumably the text fields of submissions / comments; they are not used in
# the cells visible here -- TODO confirm against the data-collection code.
subm_field = ['selftext', 'title']
cmt_field = 'body'

# Manually picked stopwords, shared by every subreddit. The search keywords
# themselves come first so they never dominate the word clouds.
temp_stopword = keyword.split('|') + [
    'day', 'days', 'week', 'weeks', 'month', 'months', 'year', 'years', 'hours',
    'could', 'would', 'should',
    'https', 'www', 'com', 'redditor', 'reddit', 'people', 'way', 'thought', 'thoughts',
    'get', 'make', 'give', 'take', 'think', 'use', 'want', 'like', 'know',
    'say', 'look', 'come', 'ask', 'feel', 'see', 'thank', 'thanks', 'seem', 'sure', 'able', 'unable',
    'many', 'much', 'some', 'someone', 'something', 'sth', 'thing', 'things', 'anyone', 'guy',
    'every', 'might', 'may', 'maybe', 'probably', 'also', 'already', 'always', 'really',
    'first', 'last', 'lot', 'with', 'without', 'well', 'best', 'good', 'better',
    'include', 'etc', 'else', 'since', 'back', 'however', 'even', 'ago', 'pretty',
    'actual', 'actually', 'another', 'due', 'likely', 'kind', 'anymore',
    'still', 'often', 'question', 'help', 'ever', 'post', 'please', 'around',
    'coronavirus', 'lockdown', 'lockdowns', 'outbreak', 'epidemic', 'virus',
    'time', 'webp', 'start',
]

# Topic words that are trivially frequent in a given subreddit (group).
# Entries for subreddits that are not currently in `subreddits` are kept so
# the list above can be changed without touching this table.
extra_stopwords = {
    'Gifts,GiftIdeas': ['gift'],
    'personalfinance': ['finance'],
    'creditcards': ['credit', 'card'],
    'realestate': ['house', 'home'],
    'smallbusiness': ['small', 'business'],
    'jobs,careerguidence,GetEmployed': ['job', 'jobs', 'work'],
    'books': ['read', 'book'],
    'gaming': ['game'],
    'relationships': ['relationship'],
    'china': ['chinese'],
    'india': ['indian'],
    'australia': ['australian', 'australians'],
}

# stopwords[subreddit] = shared stopwords + the lower-cased subreddit name(s)
# + the subreddit-specific topic words (if any).
stopwords = dict()
for subreddit in subreddits:
    stopwords[subreddit] = (
        temp_stopword
        + subreddit.lower().split(',')
        + extra_stopwords.get(subreddit, [])
    )

In [None]:
import os
import sys
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
from wordcloud import WordCloud

In [None]:
def combine_word_freq(pickle_path):
    """Merge the word-frequency dicts stored in a pickle into one dict.

    Args:
        pickle_path: Path to a pickle file containing an iterable of
            mappings ``{word: count}``.

    Returns:
        A plain ``dict`` mapping each word to the sum of its counts over all
        mappings in the file (empty if the file holds no mappings).

    Note:
        ``pickle.load`` can execute arbitrary code; only load files produced
        by this project, never untrusted input.
    """
    # Context manager so the file handle is closed even if unpickling fails.
    with open(pickle_path, 'rb') as fh:
        freq_all = pickle.load(fh)

    comb_freq = dict()
    for freq in freq_all:
        for key, count in freq.items():
            if key in comb_freq:
                comb_freq[key] += count
            else:
                comb_freq[key] = count
    return comb_freq

In [None]:
# Per-subreddit word counts loaded from the keyword-prefixed directory
# (word_freq_20), from the directory with an empty keyword prefix
# (word_freq_19), and the words whose count is higher in the former
# (word_freq_diff).
word_freq_20 = {}
word_freq_19 = {}
word_freq_diff = {}
for subreddit in subreddits:
    files_root = save_path_prefix + 'files/'
    word_freq_20[subreddit] = combine_word_freq(
        files_root + keyword + '_' + subreddit + '/word_frequency.p')
    word_freq_19[subreddit] = combine_word_freq(
        files_root + '_' + subreddit + '/word_frequency.p')
    counts_20 = word_freq_20[subreddit]
    counts_19 = word_freq_19[subreddit]

    # Compare the word frequency in 2020 to that in 2019: keep words that are
    # new in 2020 (full count) or that strictly increased (the increase).
    gained = {}
    for word, n20 in counts_20.items():
        if word not in counts_19:
            gained[word] = n20
            continue
        increase = n20 - counts_19[word]
        if increase > 0:
            gained[word] = increase
    word_freq_diff[subreddit] = gained

    # Drop the stopwords from all three tables (missing words are ignored).
    for stopword in stopwords[subreddit]:
        for table in (counts_19, counts_20, gained):
            table.pop(stopword, None)

In [None]:
def mid_dividor(n):
    """Return the two middle divisors of ``n`` as ``[small, large]``.

    The result is the factor pair of ``n`` whose members are closest to each
    other, with ``small <= large`` and ``small * large == n``. For a perfect
    square both entries are its square root (e.g. ``9 -> [3, 3]``); for a
    prime the pair is ``[1, n]``.

    Args:
        n: Positive integer.

    Returns:
        A two-element list ``[small, large]``.

    Raises:
        ValueError: If ``n`` is smaller than 1 (no divisors to choose from).
    """
    if n < 1:
        raise ValueError('mid_dividor() requires a positive integer, got %r' % (n,))

    # The largest divisor not exceeding sqrt(n) is the lower middle divisor,
    # so only candidates with i*i <= n need to be tried. This also avoids
    # picking the middle index with round(), whose round-half-to-even rule
    # selected the wrong divisor for squares such as 9 or 25.
    small = 1
    i = 1
    while i * i <= n:
        if n % i == 0:
            small = i
        i += 1
    return [small, n // small]

In [None]:
# Grid shape [rows, cols] with rows * cols == number of subreddits.
sub_idx = mid_dividor(len(subreddits))
# idx selects the frequency table to draw: 0 -> 2019, 1 -> 2020, 2 -> diff.
# Only the diff figure is produced here; use range(3) to render all three.
for idx in range(2, 3):
    if idx == 0:
        word_freq = word_freq_19
        fsave = save_path_prefix+'plots/wordcloud_queries_19.jpg'
    elif idx == 1:
        word_freq = word_freq_20
        fsave = save_path_prefix+'plots/wordcloud_queries_20.jpg'
    else:
        word_freq = word_freq_diff
        fsave = save_path_prefix+'plots/wordcloud_queries_diff.jpg'

    # squeeze=False keeps `axs` two-dimensional even for a single row or
    # column (e.g. the 1 x 11 grid for 11 subreddits); with the default
    # squeeze=True a 1-D array is returned and 2-D indexing fails.
    fig, axs = plt.subplots(sub_idx[0], sub_idx[1], squeeze=False)
    for cnt, subreddit in enumerate(subreddits):
        wc = WordCloud(width = 3000, height = 2000, background_color='black', colormap='Set2').generate_from_frequencies(frequencies=word_freq[subreddit])
        # Fill the grid in row-major order.
        ax = axs.flat[cnt]
        ax.imshow(wc)
        ax.set_title(subreddit, fontsize=20)
        ax.spines['right'].set_visible(0)
        ax.spines['top'].set_visible(0)
        ax.axis('off')
        del wc, ax

    # NOTE(review): width scales with the row count and height with the
    # column count; this looks swapped but is kept unchanged -- confirm.
    fig.set_size_inches(17*sub_idx[0], 5*sub_idx[1])
    plt.savefig(fsave, bbox_inches='tight')
    del word_freq, fsave, fig, axs, cnt