In [25]:
import matplotlib.pyplot as plt
import gensim
import numpy as np
import pandas as pd
import spacy
import pickle 

from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel
from gensim.models.wrappers import LdaMallet
from gensim.corpora import Dictionary, MmCorpus
import pyLDAvis.gensim

import os, re, operator, warnings
warnings.filterwarnings('ignore')  # Let's not pay heed to them right now
%matplotlib inline

In [26]:
from gensim.models.phrases import Phrases, Phraser

## Creating the comments DataFrame

In [3]:
# df_clean.csv is stored as a pickle despite its extension, hence pickle.load rather than pd.read_csv.
with open('./../w2v/df_clean.csv', 'rb') as pickled_frame:
    df_clean = pickle.load(pickled_frame)

In [None]:
# Build one combined comments frame: read each subreddit's comment CSV, tag every
# row with the subreddit name, and concatenate all of them once at the end.
sdf = pd.read_csv('./../subreddits.csv')
subreddits = sdf.values.tolist()

comment_frames = []
for s in subreddits:
    # Each row of subreddits.csv arrives as a list; slicing its string form strips
    # the characters wrapped around the subreddit name.
    # NOTE(review): the fixed 5-character offsets depend on the CSV's exact layout — confirm.
    sub = str(s)[5:-5]

    temp_df = pd.read_csv(f'./../data/reddit/cm/{sub}_comments.csv')
    temp_df['subreddit'] = sub
    comment_frames.append(temp_df)
    print(f'added {sub}')

# A single concat avoids re-copying the accumulated frame on every iteration and
# does not rely on DataFrame.append, which was removed in pandas 2.0.
df = pd.concat(comment_frames, ignore_index=True)

In [6]:
# Drop incomplete and duplicate rows, then attach metadata from the raw comments
# frame; pandas aligns the assigned columns on the row index.
df_clean = (
    df_clean
    .dropna()
    .drop_duplicates()
    .assign(**{
        'subreddit': df['subreddit'],
        'Date': df['Publish Date'],
        'Parent id': df['Parent id'],
    })
)

In [7]:
# str.contains interprets its pattern as a regular expression by default, so "gt..."
# matches "gt" followed by any three characters.
# NOTE(review): if the literal text "gt..." was meant, this needs regex=False — confirm.
has_gt_pattern = df_clean['clean'].str.contains("gt...")
df_clean = df_clean[~has_gt_pattern]

In [8]:
# Persist the filtered frame as a pickle; the .csv file name is what the reload cell below expects.
df_clean.to_pickle("df_clean_topic_model.csv")

## Creating topic models by community

In [3]:
# Reload the frame saved by the data-preparation section; the file is a pickle, not real CSV.
with open("df_clean_topic_model.csv", "rb") as pickled_frame:
    df_clean = pickle.load(pickled_frame)

In [4]:
# Subreddit membership of each community analysed below.
MR_subreddits = [
    'LadyMRAs',
    'FeMRADebates',
    'Masculism',
    'MensRants',
    'FeMRA',
    'MRActivism',
    'MensRightsLaw',
    'MRRef',
]  # removed againstmansrights

Incel_subreddits = [
    'askanincel',
    'BlackPillScience',
    'IncelsWithoutHate',
    'Braincels',
]

MGTOW_subreddits = ['MGTOW']

RedPill_subreddits = [
    'RedPillParenting',
    'TRPOffTopic',
    'GEOTRP',
    'thankTRP',
    'redpillbooks',
    'becomeaman',
    'RedPillWomen',
    'TheBluePill',
    'asktrp',
    'TheRedPill',
]  # removed exredpill

In [5]:
def topic_modeling(t_subreddits, sub, random_state=None):
    """Build the phrase-merged corpus for one community and fit an HDP model on it.

    Reads the notebook-level ``df_clean`` frame, keeps the rows whose ``subreddit``
    is in ``t_subreddits``, joins the ``clean`` text of all rows sharing a
    ``Parent id`` into one document, merges frequent bigrams, and writes the
    sentences, dictionary, corpus and HDP model under ``./data`` with ``sub`` as
    the file-name prefix. Progress and the coherence of the first ten HDP topics
    are printed.

    Parameters
    ----------
    t_subreddits : list of str
        Subreddit names that make up the community.
    sub : str
        Short label used in output file names and progress messages.
    random_state : int or None, optional
        Seed forwarded to ``HdpModel``. The default ``None`` keeps the previous
        unseeded behaviour; pass an int for repeatable runs.

    Returns
    -------
    None
    """
    # Create the output directory up front so none of the writes below can fail
    # on a missing folder after the expensive steps have already run.
    os.makedirs("./data", exist_ok=True)

    community_rows = df_clean.loc[df_clean['subreddit'].isin(t_subreddits)]
    # One document per 'Parent id' value: concatenate the cleaned text of its rows.
    documents = community_rows.groupby('Parent id')['clean'].agg(' '.join)

    sub_sent = [row.split() for row in documents]
    sub_phrases = Phrases(sub_sent, min_count=30)
    sub_bigram = Phraser(sub_phrases)
    sub_sentences = sub_bigram[sub_sent]

    with open(f"./data/{sub}_sentences.txt", "wb") as fp:  # Pickling
        pickle.dump(sub_sentences, fp)
    print(f'{sub}_sentences.txt created')

    # A second Phrases model used to be trained here and then discarded; it had no
    # effect on any output, so that pass over the corpus has been dropped.

    dictionary = Dictionary(sub_sentences)
    dictionary.save(f"./data/{sub}_hdp_dictionary.dict")

    print(f"{sub} Dictionary saved as {sub}_hdp_dictionary.dict")
    corpus = [dictionary.doc2bow(text) for text in sub_sentences]
    MmCorpus.serialize(f'./data/{sub}_hdp_corpus.mm', corpus)
    print(f'Corpus saved as {sub} hdp_corpus.mm')

    hdpmodel = HdpModel(corpus=corpus, id2word=dictionary, random_state=random_state)

    hdpmodel.save(f'./data/{sub}_hdp_model_spacy.gensim')
    print(f'{sub} hdp model created')

    hdptopics = [[word for word, prob in topic]
                 for topicid, topic in hdpmodel.show_topics(formatted=False)]

    # Coherence is measured on the first ten topics returned by show_topics().
    hdp_coherence = CoherenceModel(topics=hdptopics[:10], texts=sub_sentences,
                                   dictionary=dictionary, window_size=10).get_coherence()

    print(f"The topic coherence is {hdp_coherence}")

In [6]:
# Fit the whole-period HDP model for every community, in the same order as before.
for member_subreddits, community_label in (
    (MR_subreddits, 'MR'),
    (Incel_subreddits, 'Incel'),
    (MGTOW_subreddits, 'MGTOW'),
    (RedPill_subreddits, 'RP'),
):
    topic_modeling(member_subreddits, community_label)

MR_sentences.txt created
MR Dictionary saved as MR_hdp_dictionary.dict
Corpus saved as MR hdp_corpus.mm
MR hdp model created
The topic coherence is 0.1925351518078818
Incel_sentences.txt created
Incel Dictionary saved as Incel_hdp_dictionary.dict
Corpus saved as Incel hdp_corpus.mm
Incel hdp model created
The topic coherence is 0.17516453392302694
MGTOW_sentences.txt created
MGTOW Dictionary saved as MGTOW_hdp_dictionary.dict
Corpus saved as MGTOW hdp_corpus.mm
MGTOW hdp model created
The topic coherence is 0.19287052034094088
RP_sentences.txt created
RP Dictionary saved as RP_hdp_dictionary.dict
Corpus saved as RP hdp_corpus.mm
RP hdp model created
The topic coherence is 0.19249689416454174


In [16]:
def time_topic_creation(model_df, community='reddit', years=(2014, 2015, 2016, 2017, 2018),
                        train_hdp=False, random_state=None):
    """Write per-year sentences, dictionary and corpus files for one community.

    For every entry of ``years`` the rows of ``model_df`` whose ``year`` column
    equals it are grouped by ``Parent id``, their ``clean`` text is joined into one
    document per group, frequent bigrams are merged, and the resulting sentences,
    dictionary and bag-of-words corpus are saved under ``./data`` with the
    community label and year in the file name.

    Parameters
    ----------
    model_df : pandas.DataFrame
        Frame with at least the ``year``, ``Parent id`` and ``clean`` columns.
    community : str, optional
        Label used in output file names and progress messages.
    years : iterable of int, optional
        Years to process, one set of files per year.
    train_hdp : bool, optional
        When true, also fit, save and score an HDP model for each year. This is
        the step that used to be disabled by wrapping it in a string literal; the
        default ``False`` keeps it disabled.
    random_state : int or None, optional
        Seed forwarded to ``HdpModel`` when ``train_hdp`` is true.

    Returns
    -------
    None
    """
    # Create the output directory up front so the writes below cannot fail on a
    # missing folder.
    os.makedirs("./data", exist_ok=True)

    for year in years:
        # One document per 'Parent id' value among this year's rows.
        year_rows = model_df.loc[model_df['year'] == year]
        documents = year_rows.groupby('Parent id')['clean'].agg(' '.join)

        sub_sent = [row.split() for row in documents]
        sub_phrases = Phrases(sub_sent, min_count=30)
        sub_bigram = Phraser(sub_phrases)
        sub_sentences = sub_bigram[sub_sent]

        with open(f"./data/{community}_sentences_{year}.txt", "wb") as fp:  # Pickling
            pickle.dump(sub_sentences, fp)
        print(f'{community}_sentences_{year}.txt created')

        dictionary = Dictionary(sub_sentences)
        dictionary.save(f"./data/{community}_hdp_dictionary_{year}.dict")

        print(f"{community} Dictionary saved as {community}_hdp_dictionary_{year}.dict")
        corpus = [dictionary.doc2bow(text) for text in sub_sentences]
        MmCorpus.serialize(f'./data/{community}_hdp_corpus_{year}.mm', corpus)
        print(f'Corpus saved as {community}_hdp_corpus_{year}.mm')

        if train_hdp:
            hdpmodel = HdpModel(corpus=corpus, id2word=dictionary, random_state=random_state)

            hdpmodel.save(f'./data/{community}_hdp_model_spacy_{year}.gensim')
            print(f'{community} hdp model created as {community}_hdp_model_spacy_{year}.gensim')

            hdptopics = [[word for word, prob in topic]
                         for topicid, topic in hdpmodel.show_topics(formatted=False)]

            # Coherence is measured on the first ten topics returned by show_topics().
            hdp_coherence = CoherenceModel(topics=hdptopics[:10], texts=sub_sentences,
                                           dictionary=dictionary, window_size=10).get_coherence()

            print(f"The topic coherence is {hdp_coherence}")

In [55]:
# Years covered by the per-year corpora (2014 through 2018 inclusive).
years = list(range(2014, 2019))

In [7]:
# Derive an integer 'year' column from 'Date' and then discard 'Date'.
parsed_dates = pd.to_datetime(df_clean['Date'])
raw_year = parsed_dates.dt.year
# Years earlier than 2015, and rows whose year is missing, are all bucketed as 2014.
bucketed_year = raw_year.mask(raw_year < 2015).fillna(2014)
df_clean = (
    df_clean
    .assign(year=bucketed_year.astype(int))
    .drop(columns='Date')
)

In [13]:
# One frame per community, selected by subreddit membership.
MR_df, MGTOW_df, RedPill_df = (
    df_clean[df_clean['subreddit'].isin(member_subreddits)]
    for member_subreddits in (MR_subreddits, MGTOW_subreddits, RedPill_subreddits)
)

In [15]:
# Write the per-year artefacts for each community, in the same order as before.
for community_df, community_label in (
    (MR_df, "MR"),
    (MGTOW_df, "MGTOW"),
    (RedPill_df, "RP"),
):
    time_topic_creation(community_df, community=community_label)

MR_sentences_2014.txt created
MR Dictionary saved as MR_hdp_dictionary_2014.dict
Corpus saved as MR_hdp_corpus_2014.mm
MR hdp model created as MR_hdp_model_spacy_2014.gensim
The topic coherence is 0.21977563264363437
MR_sentences_2015.txt created
MR Dictionary saved as MR_hdp_dictionary_2015.dict
Corpus saved as MR_hdp_corpus_2015.mm
MR hdp model created as MR_hdp_model_spacy_2015.gensim
The topic coherence is 0.2669650269251206
MR_sentences_2016.txt created
MR Dictionary saved as MR_hdp_dictionary_2016.dict
Corpus saved as MR_hdp_corpus_2016.mm
MR hdp model created as MR_hdp_model_spacy_2016.gensim
The topic coherence is 0.20654154822571505
MR_sentences_2017.txt created
MR Dictionary saved as MR_hdp_dictionary_2017.dict
Corpus saved as MR_hdp_corpus_2017.mm
MR hdp model created as MR_hdp_model_spacy_2017.gensim
The topic coherence is 0.19934594743999443
MR_sentences_2018.txt created
MR Dictionary saved as MR_hdp_dictionary_2018.dict
Corpus saved as MR_hdp_corpus_2018.mm
MR hdp model c

KeyboardInterrupt: 

In [17]:
# Per-year artefacts for the RedPill community on their own: the recorded output of
# the previous cell ends in a KeyboardInterrupt during the MR run, before RP was reached.
time_topic_creation(RedPill_df, community="RP")

RP_sentences_2014.txt created
RP Dictionary saved as RP_hdp_dictionary_2014.dict
Corpus saved as RP_hdp_corpus_2014.mm
RP_sentences_2015.txt created
RP Dictionary saved as RP_hdp_dictionary_2015.dict
Corpus saved as RP_hdp_corpus_2015.mm
RP_sentences_2016.txt created
RP Dictionary saved as RP_hdp_dictionary_2016.dict
Corpus saved as RP_hdp_corpus_2016.mm
RP_sentences_2017.txt created
RP Dictionary saved as RP_hdp_dictionary_2017.dict
Corpus saved as RP_hdp_corpus_2017.mm
RP_sentences_2018.txt created
RP Dictionary saved as RP_hdp_dictionary_2018.dict
Corpus saved as RP_hdp_corpus_2018.mm


## LDA

In [21]:
def LDA_time_model_creation(sub, years=(2014, 2015, 2016, 2017, 2018), num_topics=15,
                            random_state=None):
    """Fit, save and score one LDA model per year for a community.

    For every entry of ``years`` the pickled sentences, dictionary and corpus
    stored under ``./data`` for ``sub`` and that year are loaded, an ``LdaModel``
    is trained on the corpus, saved under ``./lda``, and its topic coherence is
    printed.

    Parameters
    ----------
    sub : str
        Community label used in the input and output file names.
    years : iterable of int, optional
        Years to process.
    num_topics : int, optional
        Number of LDA topics to fit; defaults to the previously hard-coded 15.
    random_state : int or None, optional
        Seed forwarded to ``LdaModel``. ``None`` keeps the previous unseeded
        behaviour; pass an int for repeatable runs.

    Returns
    -------
    None
    """
    # Make sure the output directory exists before any model is trained, so a
    # missing folder cannot throw away a finished fit at save time.
    os.makedirs("./lda", exist_ok=True)

    for year in years:
        # pickle.load can execute code embedded in the file; only load sentence
        # files produced by this notebook.
        with open(f"./data/{sub}_sentences_{year}.txt", "rb") as fp:   # Unpickling
            sub_sentences = pickle.load(fp)

        dictionary = Dictionary.load(f'./data/{sub}_hdp_dictionary_{year}.dict')
        corpus = MmCorpus(f'./data/{sub}_hdp_corpus_{year}.mm')

        ldamodel = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary,
                            random_state=random_state)
        ldamodel.save(f'./lda/{sub}_lda_model_{year}.gensim')
        print(f'{sub} lda model created as {sub}_lda_model_{year}.gensim')

        # show_topics() returns at most 10 topics unless told otherwise, so the
        # coherence below covers 10 of the fitted topics rather than all of them.
        # NOTE(review): pass num_topics=-1 to show_topics if every topic should be scored.
        ldatopics = [[word for word, prob in topic]
                     for topicid, topic in ldamodel.show_topics(formatted=False)]

        lda_coherence = CoherenceModel(topics=ldatopics[:10], texts=sub_sentences,
                                       dictionary=dictionary, window_size=10).get_coherence()

        print(f"The topic coherence is {lda_coherence}")
        print()

In [22]:
# Per-year LDA models for each community that has per-year corpora.
for community_label in ('MR', 'MGTOW', 'RP'):
    LDA_time_model_creation(community_label)

MR lda model created as MR_lda_model_2014.gensim
The topic coherence is 0.4303349630739578

MR lda model created as MR_lda_model_2015.gensim
The topic coherence is 0.39077168604890067

MR lda model created as MR_lda_model_2016.gensim
The topic coherence is 0.35329469574491223

MR lda model created as MR_lda_model_2017.gensim
The topic coherence is 0.42222909316824825

MR lda model created as MR_lda_model_2018.gensim
The topic coherence is 0.39659447502140033

MGTOW lda model created as MGTOW_lda_model_2014.gensim
The topic coherence is 0.3123680152310265

MGTOW lda model created as MGTOW_lda_model_2015.gensim
The topic coherence is 0.36693680812053986

MGTOW lda model created as MGTOW_lda_model_2016.gensim
The topic coherence is 0.38146385739503885

MGTOW lda model created as MGTOW_lda_model_2017.gensim
The topic coherence is 0.3828453513299753

MGTOW lda model created as MGTOW_lda_model_2018.gensim
The topic coherence is 0.3951469472043319

RP lda model created as RP_lda_model_2014.ge

In [30]:
def LDA_model_creation(sub, num_topics=15, random_state=None):
    """Fit, save and score a whole-period LDA model for one community.

    Loads the pickled sentences, dictionary and corpus stored under ``./data``
    for ``sub``, trains an ``LdaModel`` on the corpus, saves it under ``./lda``
    and prints its topic coherence.

    Parameters
    ----------
    sub : str
        Community label used in the input and output file names.
    num_topics : int, optional
        Number of LDA topics to fit; defaults to the previously hard-coded 15.
    random_state : int or None, optional
        Seed forwarded to ``LdaModel``. ``None`` keeps the previous unseeded
        behaviour; pass an int for repeatable runs.

    Returns
    -------
    None
    """
    # Make sure the output directory exists before training, so a missing folder
    # cannot throw away a finished fit at save time.
    os.makedirs("./lda", exist_ok=True)

    # pickle.load can execute code embedded in the file; only load sentence files
    # produced by this notebook.
    with open(f"./data/{sub}_sentences.txt", "rb") as fp:   # Unpickling
        sub_sentences = pickle.load(fp)

    dictionary = Dictionary.load(f'./data/{sub}_hdp_dictionary.dict')
    corpus = MmCorpus(f'./data/{sub}_hdp_corpus.mm')

    ldamodel = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary,
                        random_state=random_state)
    ldamodel.save(f'./lda/{sub}_lda_model.gensim')
    print(f'{sub} lda model created as {sub}_lda_model.gensim')

    # show_topics() returns at most 10 topics unless told otherwise, so the
    # coherence below covers 10 of the fitted topics rather than all of them.
    # NOTE(review): pass num_topics=-1 to show_topics if every topic should be scored.
    ldatopics = [[word for word, prob in topic]
                 for topicid, topic in ldamodel.show_topics(formatted=False)]

    lda_coherence = CoherenceModel(topics=ldatopics[:10], texts=sub_sentences,
                                   dictionary=dictionary, window_size=10).get_coherence()

    print(f"The topic coherence is {lda_coherence}")
    print()

In [31]:
# Whole-period LDA models for three of the communities.
for community_label in ('MR', 'MGTOW', 'RP'):
    LDA_model_creation(community_label)

MR lda model created as MR_lda_model.gensim
The topic coherence is 0.44454499861981916

MGTOW lda model created as MGTOW_lda_model.gensim
The topic coherence is 0.4208849264427322

RP lda model created as RP_lda_model.gensim
The topic coherence is 0.38998915297146425



In [32]:
# Whole-period LDA model for the Incel community, built from the ./data artefacts
# that topic_modeling wrote under the 'Incel' label.
LDA_model_creation('Incel')

Incel lda model created as Incel_lda_model.gensim
The topic coherence is 0.4343119597329652



In [56]:
from gensim.models.wrappers.dtmmodel import DtmModel
# Whole-period RedPill corpus and dictionary, as written by topic_modeling under the 'RP' label.
RP_corpus = MmCorpus('./data/RP_hdp_corpus.mm')
RP_dict = Dictionary.load('./data/RP_hdp_dictionary.dict')

# DtmModel needs the number of documents in each consecutive time slice; take
# those counts from the per-year corpora.
RP_time_seq = [len(MmCorpus(f'./data/RP_hdp_corpus_{year}.mm')) for year in years]

# DtmModel rejects time slices whose sum differs from the corpus length, so any
# documents not accounted for by the yearly corpora go into one trailing slice.
# A zero remainder adds nothing (an empty slice is meaningless) and a negative
# one means the yearly corpora cannot describe this corpus at all.
RP_unassigned_docs = len(RP_corpus) - sum(RP_time_seq)
if RP_unassigned_docs < 0:
    raise ValueError(
        f"yearly corpora hold {sum(RP_time_seq)} documents but RP_hdp_corpus.mm has only "
        f"{len(RP_corpus)}; the time slices cannot describe this corpus"
    )
if RP_unassigned_docs > 0:
    RP_time_seq.append(RP_unassigned_docs)

# NOTE(review): time slices are applied to documents in corpus order, but
# RP_hdp_corpus.mm was written from a groupby on 'Parent id', not sorted by year —
# confirm the document order really matches these slices before trusting the model.

In [58]:
# Path handed to DtmModel as its first argument when the dynamic topic model is fitted.
dtm_path = "./data/dtm/dtm-linux64"

In [59]:
# Fit a 10-topic dynamic topic model on the RedPill corpus using the time slices computed above.
RP_model = DtmModel(
    dtm_path,
    corpus=RP_corpus,
    time_slices=RP_time_seq,
    num_topics=10,
    id2word=RP_dict,
    initialize_lda=True,
)

In [60]:
# Store the fitted dynamic topic model in ./lda, next to the LDA models.
RP_model.save("./lda/RP_dtm.gensim")