In [35]:
# Source: https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/#1introduction
import calendar
import en_core_web_sm
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel, TfidfModel
from gensim.utils import simple_preprocess
import glob
import json
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
import numpy as np
import os
import pyLDAvis
import pyLDAvis.gensim
import pandas as pd
import regex
import seaborn as sns
import spacy
from collections import Counter
from datetime import date, datetime
import math
from nltk.corpus import stopwords
from tqdm import tqdm, tqdm_notebook
import warnings
from wordcloud import WordCloud

# Use seaborn's "whitegrid" theme for any plots drawn later in the notebook.
sns.set_theme(style="whitegrid")
# Load the packaged small English spaCy pipeline into a module-level `nlp`.
# NOTE(review): lemmatization() loads its own pipeline into a local `nlp`,
# so this global looks unused in the visible cells - confirm before removing.
nlp = en_core_web_sm.load()
# Silence DeprecationWarning from here on. The filter is installed after the
# spaCy load above, so it only affects code that runs after this line.
warnings.filterwarnings("ignore", category=DeprecationWarning)



In [36]:
# Raise pandas' display limits (columns, rows, cell width, sequence items)
pd.set_option("display.max_columns", 2000)
pd.set_option("display.max_rows", 2000)
pd.set_option("display.max_colwidth", 10000)
pd.set_option("display.max_seq_items", 2000)

# Load the three CSV files from the local data/ directory into dataframes.
# Only the first read passes index_col=False (never use a column as the index).
df_news_true = pd.read_csv('data/DataSet_Misinfo_TRUE.csv', index_col=False)
df_news_fake = pd.read_csv('data/DataSet_Misinfo_FAKE.csv')
df_news_rpsub = pd.read_csv('data/EXTRA_RussianPropagandaSubset.csv')

In [37]:
# Show the 'text' value of the row labelled 0 as a quick sanity check
df_news_true.loc[0, 'text']

'Donald Trump just couldn t wish all Americans a Happy New Year and leave it at that. Instead, he had to give a shout out to his enemies, haters and  the very dishonest fake news media.  The former reality show star had just one job to do and he couldn t do it. As our Country rapidly grows stronger and smarter, I want to wish all of my friends, supporters, enemies, haters, and even the very dishonest Fake News Media, a Happy and Healthy New Year,  President Angry Pants tweeted.  2018 will be a great year for America! As our Country rapidly grows stronger and smarter, I want to wish all of my friends, supporters, enemies, haters, and even the very dishonest Fake News Media, a Happy and Healthy New Year. 2018 will be a great year for America!  Donald J. Trump (@realDonaldTrump) December 31, 2017Trump s tweet went down about as welll as you d expect.What kind of president sends a New Year s greeting like this despicable, petty, infantile gibberish? Only Trump! His lack of decency won t ev

In [18]:
# def load_data(file):
#     with open (file, "r", encoding="utf-8") as f:
#         data = json.load(f) 
#     return (data)

# def write_data(file, data):
#     with open (file, "w", encoding="utf-8") as f:
#         json.dump(data, f, indent=4)

# # stopwords = stopwords.words("english")

# # df_new=pd.read_csv('data/df_survay_cont_new.csv')

# def lemmatization(texts, allowed_postags=["NOUN", "ADJ", "VERB", "ADV"]):
#     nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
#     texts_out = []
#     for text in tqdm_notebook(texts):
#         doc = nlp(text)
#         new_text = []
#         for token in doc:
#             if token.pos_ in allowed_postags:
#                 new_text.append(token.lemma_)
#         final = " ".join(new_text)
#         texts_out.append(final)
#     return (texts_out)


# lemmatized_texts = lemmatization(df_news_true[df_news_true['text'].isna()==False]['text'])
# print (lemmatized_texts[0:2])

# def gen_words(texts):
#     final = []
#     for text in texts:
#         new = gensim.utils.simple_preprocess(text, deacc=True)
#         final.append(new)
#     return (final)

# data_words = gen_words(lemmatized_texts)

# print (data_words[0:2])



  0%|          | 0/34946 [00:00<?, ?it/s]





In [38]:
# Function to load data from a JSON file
def load_data(file):
    """Read a UTF-8 encoded JSON file and return the decoded Python object.

    Parameters
    ----------
    file : path-like
        Location of the JSON document to read.

    Returns
    -------
    object
        Whatever the JSON document decodes to (dict, list, str, ...).
    """
    with open(file, mode="r", encoding="utf-8") as handle:
        return json.loads(handle.read())


# Function to write data to a JSON file
def write_data(file, data):
    """Serialize ``data`` as JSON (4-space indent) into a UTF-8 text file.

    Parameters
    ----------
    file : path-like
        Destination path; the file is opened in "w" mode, so existing
        content is replaced.
    data : object
        Any JSON-serializable object.

    Returns
    -------
    None
    """
    with open(file, mode="w", encoding="utf-8") as handle:
        json.dump(data, handle, indent=4)


# Function for lemmatizing text
def lemmatization(texts, allowed_postags=["NOUN", "ADJ", "VERB", "ADV"]):
    """Lemmatize documents, keeping only tokens with an allowed POS tag.

    Parameters
    ----------
    texts : iterable of str
        Documents to process (e.g. a list or a pandas Series of strings).
    allowed_postags : sequence of str, optional
        Values of ``token.pos_`` to keep. The default list is only read,
        never mutated, so sharing it between calls is safe.

    Returns
    -------
    list of str
        One string per input document, made of the lemmas of the kept
        tokens joined by single spaces (empty string if nothing is kept).
    """
    # Function-scope import: `tqdm.tqdm_notebook` is a deprecated wrapper
    # around this class and emits a TqdmDeprecationWarning on every call.
    from tqdm.notebook import tqdm as notebook_tqdm

    # Parser and NER are disabled; only `pos_` and `lemma_` are read below.
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

    # nlp.pipe yields a generator, so hand the progress bar an explicit total
    # when the input is sized (generators have no len()).
    try:
        total = len(texts)
    except TypeError:
        total = None

    texts_out = []
    # nlp.pipe processes the documents in batches, which is faster than
    # calling nlp(text) once per document and yields the same Doc objects.
    for doc in notebook_tqdm(nlp.pipe(texts), total=total):
        kept_lemmas = [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        texts_out.append(" ".join(kept_lemmas))
    return texts_out


# Lemmatize every non-missing 'text' value of df_news_true
lemmatized_texts = lemmatization(df_news_true['text'].dropna())
print(lemmatized_texts[:2])

# Function for generating words from text
def gen_words(texts):
    """Tokenize each document with gensim's ``simple_preprocess``.

    Parameters
    ----------
    texts : iterable of str
        Documents to tokenize.

    Returns
    -------
    list of list of str
        One token list per document, produced with ``deacc=True``.
    """
    return [gensim.utils.simple_preprocess(document, deacc=True) for document in texts]


# Turn each lemmatized document into a list of tokens
data_words = gen_words(lemmatized_texts)

print(data_words[:2])



  0%|          | 0/34946 [00:00<?, ?it/s]

['head conservative republican faction vote month huge expansion national debt pay tax cut call fiscal conservative urge budget restraint keep sharp pivot way speak face nation draw hard line federal spending lawmaker brace battle When return holiday lawmaker begin try pass federal budget fight likely link other issue such immigration policy even congressional election campaign approach seek keep control want big budget increase military spending also want proportional increase non - defense discretionary spending program support education scientific research infrastructure public health environmental protection administration already willing say be go increase non - defense discretionary spending about percent meadow chairman small influential say program now say ’ enough need give government pay raise percent fiscal conservative see where rationale eventually run other people money say meadow vote late party debt finance tax overhaul expect balloon federal budget deficit add year nat

In [39]:
# Fit phrase detectors: first on the raw token lists, then on the
# bigram-transformed token lists so that longer phrases can be found.
bigram_phrases = gensim.models.phrases.Phrases(
    data_words,
    min_count=5,
    threshold=100,
)
trigram_phrases = gensim.models.phrases.Phrases(
    bigram_phrases[data_words],
    threshold=100,
)

# Wrap each fitted detector in a Phraser, which is what the helpers below apply
bigram, trigram = (
    gensim.models.phrases.Phraser(phrase_model)
    for phrase_model in (bigram_phrases, trigram_phrases)
)

# Function to make bigrams
def make_bigrams(texts):
    """Apply the module-level ``bigram`` phraser to every document.

    Parameters
    ----------
    texts : iterable of list of str
        Tokenized documents.

    Returns
    -------
    list
        ``bigram[doc]`` for each document, in input order.
    """
    phrased_docs = []
    for tokens in texts:
        phrased_docs.append(bigram[tokens])
    return phrased_docs

# Function to make trigrams
def make_trigrams(texts):
    """Apply the ``bigram`` phraser, then the ``trigram`` phraser, to every document.

    Both phrasers are read from module-level names.

    Parameters
    ----------
    texts : iterable of list of str
        Tokenized documents.

    Returns
    -------
    list
        ``trigram[bigram[doc]]`` for each document, in input order.
    """
    phrased_docs = []
    for tokens in texts:
        with_bigrams = bigram[tokens]
        phrased_docs.append(trigram[with_bigrams])
    return phrased_docs

# Run the bigram phraser over the tokenized documents
data_bigrams = make_bigrams(data_words)

# Feed the bigram output through make_trigrams.
# NOTE(review): make_trigrams applies `bigram` again before `trigram`, so the
# bigram phraser runs twice on each document - confirm this is intended.
data_bigrams_trigrams = make_trigrams(data_bigrams)

print(data_bigrams_trigrams[0:2])


[['head', 'conservative', 'republican', 'faction', 'vote', 'month', 'huge', 'expansion', 'national', 'debt', 'pay', 'tax', 'cut', 'call', 'fiscal', 'conservative', 'urge', 'budget', 'restraint', 'keep', 'sharp', 'pivot', 'way', 'speak', 'face', 'nation', 'draw', 'hard', 'line', 'federal', 'spending', 'lawmaker', 'brace', 'battle', 'when', 'return', 'holiday', 'lawmaker', 'begin', 'try', 'pass', 'federal', 'budget', 'fight', 'likely', 'link', 'other', 'issue', 'such', 'immigration', 'policy', 'even', 'congressional', 'election', 'campaign', 'approach', 'seek', 'keep', 'control', 'want', 'big', 'budget', 'increase', 'military', 'spending', 'also', 'want', 'proportional', 'increase', 'non', 'defense', 'discretionary_spending', 'program', 'support', 'education', 'scientific_research', 'infrastructure', 'public', 'health', 'environmental', 'protection', 'administration', 'already', 'willing', 'say', 'be', 'go', 'increase', 'non', 'defense', 'discretionary_spending', 'about', 'percent', 'mea

In [24]:
# id2word = corpora.Dictionary(data_bigrams_trigrams)

# texts = data_bigrams_trigrams

# corpus = [id2word.doc2bow(text) for text in texts]
# # print (corpus[0][0:20])

# tfidf = TfidfModel(corpus, id2word=id2word)

# low_value = 0.03
# words  = []
# words_missing_in_tfidf = []
# for i in tqdm(range(0, len(corpus))):
#     bow = corpus[i]
#     low_value_words = [] #reinitialize to be safe. You can skip this.
#     tfidf_ids = [id for id, value in tfidf[bow]]
#     bow_ids = [id for id, value in bow]
#     low_value_words = [id for id, value in tfidf[bow] if value < low_value]
#     drops = low_value_words+words_missing_in_tfidf
#     for item in drops:
#         words.append(id2word[item])
#     words_missing_in_tfidf = [id for id in bow_ids if id not in tfidf_ids] # The words with tf-idf socre 0 will be missing

#     new_bow = [b for b in bow if b[0] not in low_value_words and b[0] not in words_missing_in_tfidf]
#     corpus[i] = new_bow

# lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus[:-1],
#                                            id2word=id2word,
#                                            num_topics=10,
#                                            random_state=100,
#                                            update_every=1,
#                                            chunksize=100,
#                                            passes=10,
#                                            alpha="auto")

# test_doc = corpus[-1]

# vector = lda_model[test_doc]
# print (vector)

# def Sort(sub_li):
#     sub_li.sort(key = lambda x: x[1])
#     sub_li.reverse()
#     return (sub_li)
# new_vector = Sort(vector)
# print (new_vector)

# lda_model.save("test_model.model")

# new_model = gensim.models.ldamodel.LdaModel.load("test_model.model")


# test_doc = corpus[-1]

# vector = new_model[test_doc]
# print (vector)

# def Sort(sub_li):
#     sub_li.sort(key = lambda x: x[1])
#     sub_li.reverse()
#     return (sub_li)
# new_vector = Sort(vector)
# print (new_vector)

100%|██████████| 34946/34946 [00:46<00:00, 754.32it/s] 


[(0, 0.19158031), (1, 0.1581502), (2, 0.010289048), (3, 0.038689), (4, 0.030338267), (5, 0.029255997), (6, 0.07623609), (7, 0.08968172), (8, 0.042706594), (9, 0.33307278)]
[(9, 0.33307278), (0, 0.19158031), (1, 0.1581502), (7, 0.08968172), (6, 0.07623609), (8, 0.042706594), (3, 0.038689), (4, 0.030338267), (5, 0.029255997), (2, 0.010289048)]
[(0, 0.19158033), (1, 0.15815122), (2, 0.010287557), (3, 0.038689002), (4, 0.030338377), (5, 0.029256074), (6, 0.07623606), (7, 0.089681976), (8, 0.04270665), (9, 0.33307278)]
[(9, 0.33307278), (0, 0.19158033), (1, 0.15815122), (7, 0.089681976), (6, 0.07623606), (8, 0.04270665), (3, 0.038689002), (4, 0.030338377), (5, 0.029256074), (2, 0.010287557)]


In [40]:
# The phrase-merged token lists are the documents for everything below
texts = data_bigrams_trigrams

# Build the token <-> integer id mapping from those documents
id2word = corpora.Dictionary(texts)

# Convert every document to its bag-of-words form
corpus = list(map(id2word.doc2bow, texts))

# Fit a tf-idf model on the bag-of-words corpus
tfidf = TfidfModel(corpus, id2word=id2word)

# Filter every bag-of-words document in place, removing
#   * terms whose tf-idf weight is below `low_value`, and
#   * terms that are in the document but absent from its tf-idf output.
# `words` collects the surface form of every removed term, document by document.
low_value = 0.03
words = []
words_missing_in_tfidf = []
for i, bow in enumerate(tqdm(corpus)):
    # Transform the document once and reuse the result; the original
    # evaluated tfidf[bow] twice per document.
    doc_tfidf = tfidf[bow]
    # A set makes the membership test below O(1) instead of scanning a list.
    tfidf_ids = {term_id for term_id, _ in doc_tfidf}
    bow_ids = [term_id for term_id, _ in bow]
    low_value_words = [term_id for term_id, weight in doc_tfidf if weight < low_value]
    # Compute the missing ids for THIS document before building `drops`.
    # Previously `drops` was assembled first and therefore picked up the
    # previous document's missing ids, so `words` recorded the wrong terms.
    words_missing_in_tfidf = [term_id for term_id in bow_ids if term_id not in tfidf_ids]
    drops = low_value_words + words_missing_in_tfidf
    words.extend(id2word[term_id] for term_id in drops)
    drop_ids = set(drops)
    new_bow = [entry for entry in bow if entry[0] not in drop_ids]
    corpus[i] = new_bow

# Fit a 10-topic LDA model on every document except the last one,
# which is kept aside as a test document.
lda_model = gensim.models.LdaModel(
    corpus=corpus[:-1],
    id2word=id2word,
    num_topics=10,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha="auto",
)

# Infer the topic distribution of the held-out last document
test_doc = corpus[-1]
vector = lda_model[test_doc]
print(vector)

# Sort the topic distribution in descending order
def sort_topic_distribution(sub_li):
    """Sort ``sub_li`` in place by each item's second element, largest first.

    Parameters
    ----------
    sub_li : list
        Items indexable at position 1, e.g. ``(topic_id, probability)`` pairs.

    Returns
    -------
    list
        The same list object, reordered. Items with equal keys end up in the
        reverse of their original relative order (stable ascending sort
        followed by a full reversal).
    """
    ascending = sorted(sub_li, key=lambda pair: pair[1])
    # Slice assignment keeps the caller's list object and mutates it in place.
    sub_li[:] = ascending[::-1]
    return sub_li

# Rank the test document's topics from most to least probable
new_vector = sort_topic_distribution(vector)
print(new_vector)

# Persist the trained model to disk, then read it back into a second object
lda_model.save("test_model.model")
new_model = gensim.models.LdaModel.load("test_model.model")

# Run the same held-out document through the reloaded model
test_doc = corpus[-1]
vector = new_model[test_doc]
print(vector)

# Rank the reloaded model's topics the same way for comparison
new_vector = sort_topic_distribution(vector)
print(new_vector)

100%|██████████| 34946/34946 [01:03<00:00, 547.81it/s] 


[(0, 0.19158031), (1, 0.1581502), (2, 0.010289048), (3, 0.038689), (4, 0.030338267), (5, 0.029255997), (6, 0.07623609), (7, 0.08968172), (8, 0.042706594), (9, 0.33307278)]
[(9, 0.33307278), (0, 0.19158031), (1, 0.1581502), (7, 0.08968172), (6, 0.07623609), (8, 0.042706594), (3, 0.038689), (4, 0.030338267), (5, 0.029255997), (2, 0.010289048)]
[(0, 0.19158033), (1, 0.15815122), (2, 0.010287557), (3, 0.038689002), (4, 0.030338377), (5, 0.029256074), (6, 0.07623606), (7, 0.089681976), (8, 0.04270665), (9, 0.33307278)]
[(9, 0.33307278), (0, 0.19158033), (1, 0.15815122), (7, 0.089681976), (6, 0.07623606), (8, 0.04270665), (3, 0.038689002), (4, 0.030338377), (5, 0.029256074), (2, 0.010287557)]


In [41]:
# Turn on inline rendering of pyLDAvis output in the notebook
pyLDAvis.enable_notebook()

# Build the visualization data from the trained model, the bag-of-words corpus
# and the dictionary.
# NOTE(review): the full `corpus` is passed here although the model was fitted
# on corpus[:-1] - confirm that including the held-out document is intended.
vis = pyLDAvis.gensim.prepare(
    lda_model,
    corpus,
    id2word,
    mds="mmds",
    R=15,
)


  default_term_info = default_term_info.sort_values(


In [42]:
# Export the prepared visualization as an HTML file in the working directory
pyLDAvis.save_html(
    vis,
    'hackaton_topic_modeling_true_news.html',
)

  from imp import reload
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  from imp import reload
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  from imp import reload
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  from imp import reload
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  from imp import reload
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVe