In [14]:
import matplotlib.pyplot as plt
import gensim
import numpy as np
import pandas as pd
import spacy
import pickle 

from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel
from gensim.models.wrappers import LdaMallet
from gensim.corpora import Dictionary, MmCorpus
import pyLDAvis.gensim

import os, re, operator, warnings
warnings.filterwarnings('ignore')  # Let's not pay heed to them right now
%matplotlib inline

In [15]:
from gensim.models.phrases import Phrases, Phraser

In [16]:
from sqlitedict import SqliteDict

In [39]:
from datetime import datetime

In [58]:
# Open the cleaned PUA store read-only (flag="r"); the cells below read each
# record's "text" and "timestamp" fields from it.
pua_clean = SqliteDict("./../w2v/PUA clean.sqlite", tablename="value", flag="r")

In [32]:
# Print every record held in the store. The loop variable is what gets printed;
# the earlier `print(values)` referenced a name that was never defined.
for record in pua_clean.values():
    print(record)

## Creating the topic-model inputs (sentences, dictionary, corpus) by community

In [61]:
def topic_modeling(sqlite_dict, sub):
    """Build and persist the topic-modelling inputs for one whole community.

    Records whose ``"text"`` is ``None`` are dropped, the remaining ones are
    ordered by ``"timestamp"``, split on whitespace and passed through a bigram
    ``Phraser`` (``min_count=3``). Three files are written under ``./data/``
    (the directory is not created here):

    * ``{sub}_sentences.txt``       -- pickle of the phrase-merged sentences
      (binary pickle data despite the ``.txt`` extension),
    * ``{sub}_hdp_dictionary.dict`` -- gensim ``Dictionary`` built from them,
    * ``{sub}_hdp_corpus.mm``       -- bag-of-words corpus serialised with
      ``MmCorpus.serialize``.

    Parameters
    ----------
    sqlite_dict : mapping-like
        Store whose ``.values()`` yields records indexable by ``"text"`` and
        ``"timestamp"``.
    sub : str
        Community name, used as the prefix of every output file.

    Returns
    -------
    None
        The function only writes files and prints progress messages.
    """
    # Filter first, then sort once. Sorting after every append (as before) gives
    # the same final order but repeats the work for each record.
    records = [value for value in sqlite_dict.values() if value["text"] is not None]
    records.sort(key=lambda record: record['timestamp'])

    sub_sent = [record["text"].split() for record in records]
    sub_phrases = Phrases(sub_sent, min_count=3)
    sub_bigram = Phraser(sub_phrases)
    # Indexing the Phraser with the full list applies the bigram merge to every sentence.
    sub_sentences = sub_bigram[sub_sent]

    with open(f"./data/{sub}_sentences.txt", "wb") as fp:  # Pickling
        pickle.dump(sub_sentences, fp)
    print(f'{sub}_sentences.txt created')

    dictionary = Dictionary(sub_sentences)
    dictionary.save(f"./data/{sub}_hdp_dictionary.dict")
    print(f"{sub} Dictionary saved as {sub}_hdp_dictionary.dict")

    corpus = [dictionary.doc2bow(text) for text in sub_sentences]
    MmCorpus.serialize(f'./data/{sub}_hdp_corpus.mm', corpus)
    # Report the real file name (underscore after the community name).
    print(f'Corpus saved as {sub}_hdp_corpus.mm')

In [62]:
# Write the whole-community sentences, dictionary and corpus files for PUA under ./data/.
topic_modeling(pua_clean, 'PUA')

PUA_sentences.txt created
PUA Dictionary saved as PUA_hdp_dictionary.dict
Corpus saved as PUA hdp_corpus.mm


In [None]:
# NOTE(review): this cell has no execution count and uses `itertools`, `db1` and
# `args`, none of which are imported or defined anywhere in the visible notebook.
# It looks like a leftover from a script; confirm whether it can be removed,
# because it will raise NameError on Restart & Run All.
to_request = [(k, v["text"]) for k, v in itertools.islice(db1.items(), args.init, args.end)]

In [51]:
def time_topic_creation(model_sqlite, community):
    """Build and persist per-year topic-modelling inputs for one community.

    Every record is assigned to the calendar year of its ``"timestamp"``
    (integer-divided by 1000 before conversion, so presumably stored in
    milliseconds -- confirm against the data source). The conversion uses
    ``datetime.fromtimestamp`` without a timezone, i.e. the machine's local
    time, so records near a year boundary can land in a different year on a
    differently configured machine.

    For each year, in order of first appearance in the store, the texts that
    are not ``None`` are split on whitespace, passed through a bigram
    ``Phraser`` (``min_count=30``) and written under ``./data/`` as:

    * ``{community}_sentences_{year}.txt``       -- pickle of the sentences
      (binary pickle data despite the ``.txt`` extension),
    * ``{community}_hdp_dictionary_{year}.dict`` -- gensim ``Dictionary``,
    * ``{community}_hdp_corpus_{year}.mm``       -- bag-of-words corpus.

    Parameters
    ----------
    model_sqlite : mapping-like
        Store whose ``.values()`` yields records indexable by ``"text"`` and
        ``"timestamp"``.
    community : str
        Community name, used as the prefix of every output file.

    Returns
    -------
    None
        The function only writes files and prints progress messages.
    """
    sub = community

    # One pass over the store: remember the years in order of first appearance
    # and bucket the tokenised texts by year. The previous version re-read the
    # whole store once per year. A year is registered even when all of its
    # records have no text, exactly as before.
    years = []
    sentences_by_year = {}
    for value in model_sqlite.values():
        year = datetime.fromtimestamp(value["timestamp"] // 1000).year
        if year not in sentences_by_year:
            sentences_by_year[year] = []
            years.append(year)
        if value["text"] is not None:
            sentences_by_year[year].append(value["text"].split())

    print(years)
    for year in years:
        sub_sent = sentences_by_year[year]

        sub_phrases = Phrases(sub_sent, min_count=30)
        sub_bigram = Phraser(sub_phrases)
        # Indexing the Phraser with the full list applies the bigram merge to every sentence.
        sub_sentences = sub_bigram[sub_sent]

        with open(f"./data/{sub}_sentences_{year}.txt", "wb") as fp:  # Pickling
            pickle.dump(sub_sentences, fp)
        print(f'{sub}_sentences_{year}.txt created')

        dictionary = Dictionary(sub_sentences)
        dictionary.save(f"./data/{sub}_hdp_dictionary_{year}.dict")
        print(f"{sub} Dictionary saved as {sub}_hdp_dictionary_{year}.dict")

        corpus = [dictionary.doc2bow(text) for text in sub_sentences]
        MmCorpus.serialize(f'./data/{sub}_hdp_corpus_{year}.mm', corpus)
        print(f'Corpus saved as {sub}_hdp_corpus_{year}.mm')

In [52]:
# Write one sentences/dictionary/corpus triple per calendar year found in the PUA store.
time_topic_creation(pua_clean, "PUA")

oi
[2019, 2018, 2017, 2016]
PUA_sentences_2019.txt created
PUA Dictionary saved as PUA_hdp_dictionary_2019.dict
Corpus saved as PUA_hdp_corpus_2019.mm
PUA_sentences_2018.txt created
PUA Dictionary saved as PUA_hdp_dictionary_2018.dict
Corpus saved as PUA_hdp_corpus_2018.mm
PUA_sentences_2017.txt created
PUA Dictionary saved as PUA_hdp_dictionary_2017.dict
Corpus saved as PUA_hdp_corpus_2017.mm
PUA_sentences_2016.txt created
PUA Dictionary saved as PUA_hdp_dictionary_2016.dict
Corpus saved as PUA_hdp_corpus_2016.mm


## LDA

In [53]:
def LDA_time_model_creation(sub, years=(2018, 2019)):
    """Train, save and score one 15-topic LDA model per year for a community.

    For every year the function loads the pickled sentences, dictionary and
    corpus from ``./data/`` (the same file names that ``time_topic_creation``
    writes), fits an ``LdaModel``, saves it to
    ``./lda/{sub}_lda_model_{year}.gensim`` and prints a coherence score.

    Parameters
    ----------
    sub : str
        Community name used as the file-name prefix.
    years : iterable of int, optional
        Years to process. The default is an immutable tuple rather than a list
        so the default object cannot be mutated between calls; any list passed
        by a caller is still accepted.

    Returns
    -------
    None
        Results are written to disk and printed.
    """
    for year in years:
        # pickle.load runs arbitrary code from the file: only point this at
        # sentence files produced by this notebook.
        with open(f"./data/{sub}_sentences_{year}.txt", "rb") as fp:   # Unpickling
            sub_sentences = pickle.load(fp)

        dictionary = Dictionary.load(f'./data/{sub}_hdp_dictionary_{year}.dict')
        corpus = MmCorpus(f'./data/{sub}_hdp_corpus_{year}.mm')

        # No random_state is passed, so repeated runs may give different topics and scores.
        ldamodel = LdaModel(corpus=corpus, num_topics=15, id2word=dictionary)
        ldamodel.save(f'./lda/{sub}_lda_model_{year}.gensim')
        print(f'{sub} lda model created as {sub}_lda_model_{year}.gensim')

        # Keep only the words of each topic, dropping ids and probabilities.
        # NOTE(review): show_topics() is called with its default topic count, so
        # coherence is computed over at most 10 of the 15 topics -- confirm this is intended.
        ldatopics = [[word for word, prob in topic] for topicid, topic in ldamodel.show_topics(formatted=False)]

        lda_coherence = CoherenceModel(topics=ldatopics[:10], texts=sub_sentences,
                                       dictionary=dictionary, window_size=10).get_coherence()

        print(f"The topic coherence is {lda_coherence}")
        print()

In [54]:
# Train the per-year LDA models using the function's default years (2018 and 2019).
LDA_time_model_creation('PUA')

PUA lda model created as PUA_lda_model_2018.gensim
The topic coherence is 0.3018479142328555

PUA lda model created as PUA_lda_model_2019.gensim
The topic coherence is 0.2615656517145621



In [55]:
def LDA_model_creation(sub):
    """Train, save and score a single 15-topic LDA model for community ``sub``.

    Reads the whole-community sentences pickle, dictionary and corpus from
    ``./data/``, fits an ``LdaModel``, saves it to
    ``./lda/{sub}_lda_model.gensim`` and prints the coherence of the topics
    returned by ``show_topics``. Returns ``None``.
    """
    # Tokenised sentences, needed as reference texts for the coherence score.
    with open(f"./data/{sub}_sentences.txt", "rb") as sentences_file:
        texts = pickle.load(sentences_file)

    id2word = Dictionary.load(f'./data/{sub}_hdp_dictionary.dict')
    bow_corpus = MmCorpus(f'./data/{sub}_hdp_corpus.mm')

    lda = LdaModel(corpus=bow_corpus, num_topics=15, id2word=id2word)
    lda.save(f'./lda/{sub}_lda_model.gensim')
    print(f'{sub} lda model created as {sub}_lda_model.gensim')

    # Reduce every (topic id, [(word, probability), ...]) entry to its word list.
    topic_words = []
    for _topic_id, weighted_words in lda.show_topics(formatted=False):
        topic_words.append([word for word, _probability in weighted_words])

    coherence_model = CoherenceModel(
        topics=topic_words[:10],
        texts=texts,
        dictionary=id2word,
        window_size=10,
    )
    coherence = coherence_model.get_coherence()

    print(f"The topic coherence is {coherence}")
    print()

In [56]:
# Train one LDA model on the whole-community PUA corpus.
LDA_model_creation('PUA')

PUA lda model created as PUA_lda_model.gensim
The topic coherence is 0.3553723479661005



In [71]:
# Years used to build the DTM time slices below, listed in chronological order.
years = [2016, 2017, 2018, 2019]

In [72]:
from gensim.models.wrappers.dtmmodel import DtmModel
# Whole-community corpus and dictionary, as written by topic_modeling.
PUA_corpus = MmCorpus('./data/PUA_hdp_corpus.mm')
PUA_dict =  Dictionary.load('./data/PUA_hdp_dictionary.dict')
# Time slices for the DTM: the number of documents in each per-year corpus,
# in the order given by `years`.
# NOTE(review): the slices only line up with PUA_corpus if that corpus is in
# chronological order (topic_modeling sorts by timestamp before writing it) and
# if both sets of files come from the same data -- confirm before trusting the model.
PUA_time_seq = []
for year in years:
    c = MmCorpus(f'./data/PUA_hdp_corpus_{year}.mm')
    PUA_time_seq.append(len(c))

In [73]:
# Documents per time slice, in the same order as `years`.
PUA_time_seq

[13, 529, 4578, 3149]

In [74]:
# Path handed to DtmModel below as its first argument (a Linux 64-bit DTM
# build, going by the file name).
dtm_path = "./../../_PushshiftReddit/topic_model/data/dtm/dtm-linux64"

In [75]:
# Fit a 10-topic dynamic topic model over the whole PUA corpus, split into the
# consecutive slices given by PUA_time_seq; DtmModel is gensim's wrapper around
# the external program located at dtm_path.
PUA_model = DtmModel(dtm_path, PUA_corpus, PUA_time_seq, num_topics=10,
                 id2word=PUA_dict, initialize_lda=True, alpha=0.1)

In [76]:
# Persist the fitted DTM wrapper object in ./lda/, alongside the saved LDA models.
PUA_model.save("./lda/PUA_dtm.gensim")