In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sb
import pandas as pd
import praw
import numpy as np
from collections import Counter
import os
import nltk
style.use('ggplot')

In [2]:
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet 
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import pos_tag
from wordcloud import WordCloud

In [3]:
# Reddit API client shared by the scraping functions below.
# Credentials are read from the environment instead of being stored in the
# notebook; a missing variable raises KeyError naming the variable.
# NOTE(review): the credentials that used to be hard-coded here should be
# treated as leaked and rotated.
r = praw.Reddit(
    client_id=os.environ['REDDIT_CLIENT_ID'],
    client_secret=os.environ['REDDIT_CLIENT_SECRET'],
    user_agent=os.environ.get('REDDIT_USER_AGENT', 'SCRIPT_NAME'),
    username=os.environ['REDDIT_USERNAME'],
    password=os.environ['REDDIT_PASSWORD'],
)

Version 7.0.0 of praw is outdated. Version 7.1.0 was released Tuesday June 23, 2020.


## Functions

### Subreddit distribution functions

In [4]:
def get_mods(subreddit):
    """Return the moderator names of `subreddit`, skipping known bot accounts.

    `subreddit` must expose a `moderator()` call that yields objects with a
    `name` attribute. The result is a list of names in the order yielded.
    """
    known_bots = ('AutoModerator', 'modlog_research_bot', 'PoliticsModeratorBot', 'rGameMods')
    human_mods = []
    for moderator in subreddit.moderator():
        if moderator.name in known_bots:
            continue
        human_mods.append(moderator.name)
    return human_mods

In [5]:
def get_posts(mods, sub, sub_name):
    """Download each moderator's newest submissions and store them as CSV.

    For every name in `mods`, up to 500 of the redditor's newest submissions
    are fetched through the module-level client `r`; the subreddit name and
    title of each one are written to `data/{sub_name}/posts/{mod}.csv`.
    Afterwards the per-subreddit distribution is aggregated for all mods.

    `sub` is accepted for signature compatibility but is not used here.
    """
    for moderator in mods:
        newest = r.redditor(moderator).submissions.new(limit=500)
        rows = [
            {'subreddit': item.subreddit.display_name, 'title': item.title}
            for item in newest
        ]
        pd.DataFrame(rows).to_csv(f'data/{sub_name}/posts/{moderator}.csv', index=False)
    distribution_aggregator(mods, sub_name, 'posts')

In [6]:
def get_comments(mods, sub, sub_name):
    """Download each moderator's newest comments and store them as CSV.

    For every name in `mods`, up to 500 of the redditor's newest comments are
    fetched through the module-level client `r`; the subreddit name and body
    of each one are written to `data/{sub_name}/comments/{mod}.csv`.
    Afterwards the per-subreddit distribution is aggregated for all mods.

    `sub` is accepted for signature compatibility but is not used here.
    """
    for moderator in mods:
        newest = r.redditor(moderator).comments.new(limit=500)
        rows = [
            {'subreddit': item.subreddit.display_name, 'body': item.body}
            for item in newest
        ]
        pd.DataFrame(rows).to_csv(f'data/{sub_name}/comments/{moderator}.csv', index=False)
    distribution_aggregator(mods, sub_name, 'comments')

In [7]:
def distribution_aggregator(mods, sub_name, submission_type):
    """Build a moderator x subreddit count table and save it as an Excel file.

    Reads `data/{sub_name}/{submission_type}/{mod}.csv` for every name in
    `mods`, counts how often each value of the `subreddit` column occurs, and
    writes one row per moderator (missing combinations filled with 0) to
    `data/{sub_name}/{submission_type}/{submission_type}_aggreagated.xlsx`.

    A CSV that pandas reports as empty contributes an all-zero row.
    """
    per_mod_counts = []
    for mod in mods:
        try:
            data = pd.read_csv(f'data/{sub_name}/{submission_type}/{mod}.csv')
        except pd.errors.EmptyDataError:
            # No rows were scraped for this moderator: keep the row, no counts.
            per_mod_counts.append({})
            continue
        per_mod_counts.append(dict(Counter(data['subreddit'])))

    # The frame is built without a dtype (the `np.int` alias no longer exists
    # in current NumPy) and cast to int only after the NaN gaps are filled.
    distribution = pd.DataFrame(per_mod_counts, index=mods).fillna(0).astype(int)
    # The output file name (including its spelling) is kept as-is so existing
    # consumers of the file keep working.
    distribution.to_excel(f'data/{sub_name}/{submission_type}/{submission_type}_aggreagated.xlsx')

### Language Processing Functions

In [8]:
def column_aggregator(mods, sub_name, submission_type, col_name):
    """Collect the values of one CSV column across all moderators.

    Reads `data/{sub_name}/{submission_type}/{mod}.csv` for every name in
    `mods` and returns the `col_name` values of all files as one flat list of
    strings, in file order.

    Files that pandas reports as empty (a moderator with nothing scraped) are
    skipped, and missing cells are dropped, so every returned element is a
    string that can be tokenized.
    """
    all_values = []
    for mod in mods:
        try:
            data = pd.read_csv(f'data/{sub_name}/{submission_type}/{mod}.csv')
        except pd.errors.EmptyDataError:
            # Same tolerance as distribution_aggregator: nothing to add.
            continue
        # read_csv turns blank / "NA"-like cells into NaN floats; drop them and
        # normalise the rest to str.
        all_values.extend(data[col_name].dropna().astype(str).tolist())
    return all_values

In [9]:
def word_freq_counter(normalized_arrays, sub_name, submission_type, top_n=140):
    """Count token frequencies and save the most frequent ones to Excel.

    Parameters
    ----------
    normalized_arrays : iterable of token lists to count.
    sub_name, submission_type : used to build the output path
        `data/{sub_name}/{submission_type}/{file_name}_wordfreq.xlsx`, where
        `file_name` is 'posts_title' for posts and `submission_type` otherwise.
    top_n : how many of the most frequent words to keep (default 140).

    Returns
    -------
    list of (word, count) tuples, most frequent first; ties keep the order in
    which the words were first seen.
    """
    word_freq = Counter()
    for token_array in normalized_arrays:
        # In-place update: avoids rebuilding the whole counter for every array.
        word_freq.update(token_array)
    most_common = word_freq.most_common(top_n)
    df_word_freq = pd.DataFrame(most_common, columns=["word", "count"]).set_index('word')
    file_name = 'posts_title' if submission_type == 'posts' else submission_type
    df_word_freq.to_excel(f'data/{sub_name}/{submission_type}/{file_name}_wordfreq.xlsx')
    return most_common

In [10]:
def pos_tagger(nltk_tag):
    """Translate an NLTK POS tag into the matching `wordnet` POS constant.

    Tags starting with 'J', 'V', 'N' or 'R' map to `wordnet.ADJ`,
    `wordnet.VERB`, `wordnet.NOUN` and `wordnet.ADV` respectively; any other
    tag yields None.
    """
    prefix_to_constant = (('J', 'ADJ'), ('V', 'VERB'), ('N', 'NOUN'), ('R', 'ADV'))
    for prefix, constant_name in prefix_to_constant:
        if nltk_tag.startswith(prefix):
            return getattr(wordnet, constant_name)
    return None

In [11]:
def lemmatization(tokens):
    """Lemmatize `tokens` using their part-of-speech tags.

    Each token is POS-tagged with `pos_tag`, the tag is mapped through
    `pos_tagger`, and tokens with a mapped tag are passed to
    `WordNetLemmatizer.lemmatize(word, tag)`. Tokens whose tag maps to None
    are returned unchanged. The output list has one entry per input token.
    """
    tagged_tokens = pos_tag(tokens)
    wordnet_tags = [pos_tagger(nltk_tag) for _, nltk_tag in tagged_tokens]
    lemmatizer = WordNetLemmatizer()
    return [
        word if wn_tag is None else lemmatizer.lemmatize(word, wn_tag)
        for (word, _), wn_tag in zip(tagged_tokens, wordnet_tags)
    ]

In [12]:
def tokenizer(lines):
    """Split every string in `lines` into word tokens.

    Tokens are the matches of the pattern below: runs of word characters that
    contain no digits, delimited by word boundaries. Returns one token list
    per input line.
    """
    word_pattern = r'\b[^\d\W]+\b'
    splitter = RegexpTokenizer(word_pattern)
    return list(map(splitter.tokenize, lines))

In [13]:
def normalizer(tokens_arrays):
    """Lowercase, filter and lemmatize every token list in `tokens_arrays`.

    For each list, tokens are kept only if their length is between 3 and 11
    characters (inclusive); the survivors are lowercased, English stop words
    are removed, and the rest is passed through `lemmatization`.

    Returns a list with one normalized token list per input list.
    """
    stop_words = set(stopwords.words('english'))
    normalized = []
    for tokens_array in tokens_arrays:
        # Length window is checked on the raw token, then lowercased once.
        lowercase = (w.lower() for w in tokens_array if 2 < len(w) < 12)
        without_stopwords = [w for w in lowercase if w not in stop_words]
        normalized.append(lemmatization(without_stopwords))
    return normalized

In [14]:
def generate_word_cloud(counter, sub_name, submission_type):
    """Render a word cloud from `counter` and save it as a PNG.

    `counter` is passed to `WordCloud.generate_from_frequencies`. The image is
    drawn on a new 10x10 figure titled with the subreddit name and the kind
    of text ('posts titles' for posts, 'comments' otherwise), saved to
    `data/{sub_name}/{submission_type}/word_cloud.png`, and the figure is
    closed afterwards.
    """
    cloud_builder = WordCloud(
        width=1024, height=1024, background_color='white', min_font_size=10
    )
    cloud_image = cloud_builder.generate_from_frequencies(counter)

    if submission_type == 'posts':
        text_kind = 'posts titles'
    else:
        text_kind = 'comments'
    output_path = f'data/{sub_name}/{submission_type}/word_cloud.png'

    plt.figure(figsize=(10, 10), facecolor=None)
    plt.imshow(cloud_image)
    plt.title(f'{sub_name} - {text_kind} of moderators - word cloud', fontsize=18)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.savefig(output_path, dpi=100)
    plt.close()

In [15]:
def folder_creation(sub_name):
    """Ensure the output folders for `sub_name` exist.

    Creates `data/{sub_name}/posts` and `data/{sub_name}/comments`, including
    any missing parent directories (such as `data/` itself). Directories that
    already exist are left untouched, so the call is safe to repeat and also
    repairs a partially created layout.
    """
    for submission_type in ('posts', 'comments'):
        os.makedirs(f'data/{sub_name}/{submission_type}', exist_ok=True)

## Main procedures

In [16]:
# Alternative subreddit list: ['politics', 'games', 'askdocs']
sub_names = ['gameofthrones']

for sub_name in sub_names:

    # Make sure the output folders for this subreddit exist.
    folder_creation(sub_name)

    # Resolve the subreddit and its (non-bot) moderators.
    sub = r.subreddit(sub_name)
    mods = get_mods(sub)

    # Scrape and aggregate the moderators' posts and comments.
    get_posts(mods, sub, sub_name)
    get_comments(mods, sub, sub_name)

    # Word frequencies: (submission type, CSV column holding the text).
    submissions = [('posts', 'title'), ('comments', 'body')]
    for submission in submissions:
        submission_type, col_name = submission
        all_values = column_aggregator(mods, sub_name, submission_type, col_name)
        tokenized = tokenizer(all_values)
        normalized = normalizer(tokenized)
        word_freq = word_freq_counter(normalized, sub_name, submission_type)

        # Render the word cloud from the (word, count) pairs.
        generate_word_cloud(dict(word_freq), sub_name, submission_type)