# Topic modeling on Reddit comments

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context('notebook')
import os
import glob

In [2]:
def concat_submissions_comments(path_submissions, path_comments, filename_comments):
    """Load all submission and comment CSV part-files into two DataFrames.

    Parameters
    ----------
    path_submissions : str
        Directory containing the submission ``part-*`` CSV files.
    path_comments : str
        Directory containing the comment folders.
    filename_comments : str
        Glob pattern, relative to ``path_comments``, matching the comment
        folders. Every matched folder is searched for ``part-*`` CSV files.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(dfs, dfc)``: ``dfs`` holds the submissions whose ``num_comments``
        is not 0, ``dfc`` holds every comment row.

    Raises
    ------
    ValueError
        If no submission part-file or no comment part-file is found.
    """
    # glob returns paths in arbitrary order; sorting keeps the row order of
    # the concatenated frames stable from one run to the next.
    submission_files = sorted(glob.glob(os.path.join(path_submissions, 'part-*')))
    if not submission_files:
        raise ValueError(
            "No submission part-files found in {!r}".format(path_submissions))

    dfs = pd.concat([pd.read_csv(f) for f in submission_files])
    # Submissions without any comment contribute nothing to the analysis.
    dfs = dfs[dfs.num_comments != 0]

    comment_folders = sorted(glob.glob(os.path.join(path_comments, filename_comments)))
    comment_files = [
        part_file
        for folder in comment_folders
        for part_file in sorted(glob.glob(os.path.join(folder, 'part-*')))
    ]
    if not comment_files:
        raise ValueError(
            "No comment part-files found for pattern {!r} in {!r}".format(
                filename_comments, path_comments))

    dfc = pd.concat([pd.read_csv(f) for f in comment_files])

    return dfs, dfc

In [3]:
def concat_comments_per_submission(x):
    """Join all comment bodies of one submission into a single string.

    Intended for use with ``groupby(...).apply``.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows of one group; must provide a ``body`` column.

    Returns
    -------
    pandas.Series
        A one-element Series whose ``comments`` entry is the space-separated
        concatenation of the bodies.
    """
    # Missing bodies (NaN/None, e.g. empty CSV fields) are skipped; joining
    # them directly would raise a TypeError.
    bodies = (str(body) for body in x['body'] if pd.notnull(body))
    return pd.Series({'comments': ' '.join(bodies)})

In [31]:
import nltk
import re
from gensim import corpora, models
from nltk.corpus import stopwords
from collections import defaultdict

def topic_modeling(text, num_topics, lower_freq=10, upper_freq=1000,
                   min_length=30, random_state=None):
    """Pre-process raw documents and fit an LDA topic model on them.

    Parameters
    ----------
    text : iterable of str
        Raw documents.
    num_topics : int
        Number of topics requested from the LDA model.
    lower_freq : int, optional
        Tokens must occur strictly more than this many times in the whole
        collection to be kept.
    upper_freq : int, optional
        Tokens must occur strictly fewer than this many times in the whole
        collection to be kept.
    min_length : int, optional
        Documents with fewer remaining tokens than this are dropped.
    random_state : optional
        Forwarded to ``gensim.models.LdaModel``; set it to obtain
        reproducible topics.

    Returns
    -------
    tuple
        ``(model, corpus, dictionary, long_texts)``: the fitted LDA model,
        the bag-of-words corpus, the gensim dictionary and the token lists
        of the documents that survived filtering.

    Raises
    ------
    ValueError
        If no document is left after pre-processing.
    """
    #################### Pre-processing ##################
    # Removing numbers
    digits = re.compile(r'\d+')
    documents = [digits.sub('', doc) for doc in text]

    # Tokenization
    tokenized_docs = [nltk.word_tokenize(doc) for doc in documents]

    # Stop words removal: English stop words plus punctuation tokens and a
    # few frequent German/French words. The comparison is case-insensitive,
    # but the tokens themselves keep their original case.
    extra_stop = set(("''", ",",":","...",".",";","``","\'","la","en","le","et","ist",
                      "das","nicht","ich","zu","du","es","von","mit","auch","let","man",
                      "für", "den", "auf", "ein", "dass", "les", "que","un","pas"))
    stops = set(stopwords.words('english')).union(extra_stop)
    filtered_docs = [[word for word in doc if str(word).lower() not in stops]
                     for doc in tokenized_docs]

    # Remove very frequent and very infrequent words (counted over the whole
    # collection, case-sensitively) as well as single-character tokens.
    frequency = defaultdict(int)
    for doc in filtered_docs:
        for token in doc:
            frequency[token] += 1
    texts = [[token for token in doc
              if lower_freq < frequency[token] < upper_freq and len(token) != 1]
             for doc in filtered_docs]

    # Removing documents shorter than the minimum length
    long_texts = [doc for doc in texts if len(doc) >= min_length]
    if not long_texts:
        raise ValueError(
            "No document is left after pre-processing; relax lower_freq, "
            "upper_freq or min_length")
    ########################################################

    ############## Topic modeling using LDA #################
    # Dictionary of all the words
    dictionary = corpora.Dictionary(long_texts)

    # Vector representation of each document
    corpus = [dictionary.doc2bow(doc) for doc in long_texts]

    # At this stage, dictionary contains the list of all words, each word with a unique integer id.
    # 'corpus' contains for each document a bag of words representation (the number of occurrences of each word).

    model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary,
                            random_state=random_state)
    ########################################################

    return model, corpus, dictionary, long_texts

## Switzerland

In [6]:
# Where the Swiss submission part-files and the comment folders live.
path_comments = '../data'
path_submissions = "../data/reddit_swiss_submissions_fullcorpus"
filename_comments = "reddit_swiss_comments_*"

# dfs_swiss: submissions having comments, dfc_swiss: all comments.
dfs_swiss, dfc_swiss = concat_submissions_comments(
    path_submissions=path_submissions,
    path_comments=path_comments,
    filename_comments=filename_comments,
)

In [10]:
# Attach every comment to its submission, then collapse the comments of each
# submission into one space-separated string.
dfsc_swiss = dfs_swiss.merge(dfc_swiss, how='inner', left_on='name', right_on='link_id')
cps_swiss = dfsc_swiss.groupby(by='name_x').apply(concat_comments_per_submission)
cps_swiss = cps_swiss.reset_index()

# Pair each submission title with its aggregated comments.
dfs_name = dfs_swiss[['name', 'title']]
df_aggcomments_swiss = dfs_name.merge(cps_swiss, how='inner', left_on='name', right_on='name_x')
del df_aggcomments_swiss['name_x']

# Vectorised concatenation instead of a row-wise apply: faster, and a missing
# title becomes an empty string instead of raising a TypeError.
df_aggcomments_swiss['title_comments'] = (
    df_aggcomments_swiss['title'].fillna('').astype(str)
    + ' '
    + df_aggcomments_swiss['comments']
)

In [11]:
# One document per submission: its title followed by all of its comments.
text_swiss = list(df_aggcomments_swiss['title_comments'])

In [32]:
# Fit the LDA model on the Swiss documents and show its topics.
num_topics = 5
model_swiss, corpus, dictionary, long_texts = topic_modeling(text_swiss, num_topics)
model_swiss.print_topics()

[(0,
  '0.002*"beautiful" + 0.002*"bank" + 0.002*"food" + 0.002*"hiking" + 0.002*"recommend" + 0.002*"area" + 0.002*"worth" + 0.002*"account" + 0.002*"post" + 0.002*"hike"'),
 (1,
  '0.002*"Schweiz" + 0.002*"army" + 0.002*"aber" + 0.002*"des" + 0.002*"oder" + 0.002*"im" + 0.002*"als" + 0.002*"eine" + 0.002*"sind" + 0.002*"nur"'),
 (2,
  '0.003*"change" + 0.003*"gold" + 0.002*"initiative" + 0.002*"government" + 0.002*"political" + 0.002*"believe" + 0.002*"start" + 0.002*"means" + 0.002*"read" + 0.002*"taxes"'),
 (3,
  '0.003*"TV" + 0.002*"gun" + 0.002*"guns" + 0.002*"name" + 0.002*"love" + 0.002*"put" + 0.002*"watch" + 0.002*"Billag" + 0.002*"almost" + 0.002*"learn"'),
 (4,
  '0.002*"Lausanne" + 0.002*"Basel" + 0.002*"friends" + 0.002*"company" + 0.002*"months" + 0.002*"water" + 0.002*"found" + 0.002*"CERN" + 0.002*"open" + 0.002*"visit"')]

## UK

In [None]:
# Where the UK submission part-files and the comment folders live.
path_comments = '../data'
path_submissions = "../data/reddit_uk_submissions_fullcorpus"
filename_comments = "reddit_uk_comments_*"

# dfs_uk: submissions having comments, dfc_uk: all comments.
dfs_uk, dfc_uk = concat_submissions_comments(
    path_submissions=path_submissions,
    path_comments=path_comments,
    filename_comments=filename_comments,
)

In [None]:
# Attach every comment to its submission, then collapse the comments of each
# submission into one space-separated string.
dfsc_uk = dfs_uk.merge(dfc_uk, how='inner', left_on='name', right_on='link_id')
cps_uk = dfsc_uk.groupby(by='name_x').apply(concat_comments_per_submission)
cps_uk = cps_uk.reset_index()

# Pair each submission title with its aggregated comments.
dfs_name = dfs_uk[['name', 'title']]
df_aggcomments_uk = dfs_name.merge(cps_uk, how='inner', left_on='name', right_on='name_x')
del df_aggcomments_uk['name_x']

# Vectorised concatenation instead of a row-wise apply: faster, and a missing
# title becomes an empty string instead of raising a TypeError.
df_aggcomments_uk['title_comments'] = (
    df_aggcomments_uk['title'].fillna('').astype(str)
    + ' '
    + df_aggcomments_uk['comments']
)
df_aggcomments_uk

In [None]:
# One document per submission: its title followed by all of its comments.
text_uk = list(df_aggcomments_uk['title_comments'])

In [None]:
# Fit the LDA model on the UK documents and show its topics.
# The model is bound to its own name so that the Swiss model held in
# `model_swiss` is not overwritten by the UK result.
num_topics = 5
model_uk, corpus, dictionary, long_texts = topic_modeling(text_uk, num_topics)
model_uk.print_topics()

## EU

In [None]:
# NOTE(review): these paths point at the UK corpus although the results are
# bound to *_eu names and later filtered by country subreddit lists; this
# looks like a copy-paste leftover. Confirm the intended EU data location.
path_comments = '../data'
path_submissions = "../data/reddit_uk_submissions_fullcorpus"
filename_comments = "reddit_uk_comments_*"

# dfs_eu: submissions having comments, dfc_eu: all comments.
dfs_eu, dfc_eu = concat_submissions_comments(
    path_submissions=path_submissions,
    path_comments=path_comments,
    filename_comments=filename_comments,
)

In [None]:
# Subreddit names grouped by country; used to split the EU data per country.

# Germany
germany_subreddit_list = [
    'germany', 'de', 'German', 'GermanPractice', 'GermanFacts',
    'GermanConversation', 'SCHLAND', 'germanyusa', 'DEjobs', 'bundesliga',
    'GermanyPics', 'germusic', 'de_punk', 'germanrap', 'NDH',
]

# France
france_subreddit_list = [
    'blagues', 'cinemacinema', 'europe', 'france', 'FrancePics',
    'frenchelectro', 'Frenchhistory', 'guessthefrenchmovie', 'Ligue1',
    'Livres', 'musiquefrancaise', 'paris', 'pedale', 'philosophie',
    'Politique', 'rance', 'ScienceFr', 'SocialFrance',
]

# Italy
italy_subreddit_list = [
    'Calcio', 'ITAGLIA', 'Italianhistory', 'ITALIANMUSIC', 'italy',
    'ItalyPhotos', 'Libri', 'Abruzzo', 'Apulia', 'bari',
    'Basilicata', 'bologna', 'Calabria', 'Campania', 'Catania',
    'emilia_romagna', 'firenze', 'friuli', 'Genova', 'Italia',
    'lazio', 'Liguria', 'lombardia', 'Lombardy', 'marche', 'messina',
    'milano', 'Modena', 'molise', 'Naples_Italy', 'napoli',
    'padova', 'Palermo', 'Perugia', 'Piedmont', 'piemonte', 'Pisa',
    'puglia', 'roma', 'rome', 'romesocialclub', 'Sardegna',
    'Sardinia', 'Sicilia', 'sicily', 'Siracusa', 'torino', 'Toscana',
    'trentino_alto_adige', 'trentod', 'Trieste',
    'tuscany', 'Umbria', 'valle_daosta', 'Veneto', 'Venezia',
]

# Spain
spain_subreddit_list = [
    'Barcelona', 'EPANA', 'es', 'europe', 'futbol', 'Granada', 'LaLiga',
    'Madrid', 'spain', 'Andalucia', 'SpanishHistory',
]

In [None]:
# Keep only the submissions posted in one of the German subreddits.
dfs_germany = dfs_eu.loc[dfs_eu['subreddit'].isin(germany_subreddit_list)]