In [96]:
import nltk
import numpy as np
import pandas as pd
import json
import glob

#Gensim
import gensim
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim import corpora

#Spacy
import spacy
from nltk.corpus import stopwords


#Visualisation
import pyLDAvis
import pyLDAvis.gensim

In [97]:
# Make sure the NLTK "stopwords" corpus is available locally; it is read by the stopwords.words(...) call in a later cell.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alexa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [98]:
#Create 2 dataframes with fake and real covid news
fake_news_claims_csv_files = ['../data/NewsFakeCovid-19.csv', '../data/NewsFakeCovid-19_5.csv', '../data/NewsFakeCovid-19_7.csv']
real_news_claims_csv_files = ['../data/NewsRealCOVID-19.csv', '../data/NewsRealCovid-19_5.csv', '../data/NewsRealCovid-19_7.csv']

# Read every CSV once and concatenate in a single call instead of growing a
# frame inside a loop (which re-copies all previous rows on each iteration).
# ignore_index=True gives each combined frame a unique 0..N-1 index rather than
# repeating the per-file row labels.
fake_news_dataframe = pd.concat(
    [pd.read_csv(csv_path) for csv_path in fake_news_claims_csv_files],
    ignore_index=True,
)
real_news_dataframe = pd.concat(
    [pd.read_csv(csv_path) for csv_path in real_news_claims_csv_files],
    ignore_index=True,
)

print(fake_news_dataframe.dtypes, '\n')
print(real_news_dataframe.dtypes)

Unnamed: 0         int64
type              object
fact_check_url    object
archieve          object
news_url          object
news_url2         object
news_url3         object
news_url4         object
news_url5         object
title             object
newstitle         object
content           object
abstract          object
publish_date      object
meta_keywords     object
dtype: object 

Unnamed: 0         int64
type              object
fact_check_url    object
news_url          object
title             object
newstitle         object
content           object
abstract          object
publish_date      object
meta_keywords     object
dtype: object


In [99]:
#Prepare the data
# This cell rebinds the name `stopwords` (imported above as the NLTK corpus
# reader) to a plain list. Reading the reader through `nltk.corpus` keeps the
# cell re-runnable: `stopwords.words(...)` would raise AttributeError on a
# second run because `stopwords` would already be a list.
stopwords = nltk.corpus.stopwords.words("english")
print(stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [100]:
#Lemmatize the words in the dataframes. Lemmatize each dataframe into a new text_output_fake and text_output_real lemmatized versions
# Cache for the default spaCy pipeline so that repeated lemmatize() calls do not reload it.
_DEFAULT_NLP_CACHE = {}


def _get_default_nlp():
    """Return the shared "en_core_web_sm" pipeline (parser and NER disabled), loading it on first use."""
    if "nlp" not in _DEFAULT_NLP_CACHE:
        _DEFAULT_NLP_CACHE["nlp"] = spacy.load("en_core_web_sm", disable=["parser", "ner"])
    return _DEFAULT_NLP_CACHE["nlp"]


def lemmatize(texts, allowed_postags=None, nlp=None):
    """Lemmatize each text, keeping only tokens with an allowed part of speech.

    Parameters
    ----------
    texts : iterable of str
        Documents to process.
    allowed_postags : iterable of str, optional
        Coarse POS tags (``token.pos_``) to keep. Defaults to
        ``["NOUN", "ADJ", "VERB", "ADV"]``.
    nlp : optional
        Pipeline object exposing ``pipe(texts)`` that yields iterables of
        tokens with ``pos_`` and ``lemma_`` attributes. When omitted, the
        cached "en_core_web_sm" pipeline is used.

    Returns
    -------
    list of str
        One string per input text: the lemmas of the kept tokens joined by a
        single space, in token order.
    """
    if allowed_postags is None:
        allowed_postags = ["NOUN", "ADJ", "VERB", "ADV"]
    allowed = frozenset(allowed_postags)
    if nlp is None:
        nlp = _get_default_nlp()
    # nlp.pipe processes the texts as a stream/batches instead of one call per text.
    return [
        " ".join(token.lemma_ for token in doc if token.pos_ in allowed)
        for doc in nlp.pipe(texts)
    ]

# Columns whose text is merged (in this order) into one document per article.
TEXT_COLUMNS = ('title', 'content', 'abstract', 'meta_keywords')


def combine_text_columns(news_dataframe, columns=TEXT_COLUMNS):
    """Join the given columns row-wise into one space-separated string per row.

    Each column after the first is appended with ``Series.str.cat(..., sep=' ',
    na_rep='')``, so missing values contribute an empty string.
    """
    combined = news_dataframe[columns[0]]
    for column in columns[1:]:
        combined = combined.str.cat(news_dataframe[column], sep=' ', na_rep='')
    return combined


fake_news_text_in = combine_text_columns(fake_news_dataframe)
real_news_text_in = combine_text_columns(real_news_dataframe)

fake_news_text_out = lemmatize(fake_news_text_in)
real_news_text_out = lemmatize(real_news_text_in)

# Show only a small sample; printing every lemmatized article floods the notebook output.
print(fake_news_text_out[:3])
print(real_news_text_out[:3])





In [101]:
def gen_words(texts):
    """Tokenize every text with gensim's ``simple_preprocess`` (``deacc=True``).

    Returns a list containing one token list per input text, in input order.
    """
    return [gensim.utils.simple_preprocess(document, deacc=True) for document in texts]

# Tokenize the lemmatized fake and real corpora with gen_words.
fake_news_words, real_news_words = [
    gen_words(lemmatized_texts)
    for lemmatized_texts in (fake_news_text_out, real_news_text_out)
]

print(fake_news_words)




In [102]:
#Bigrams and Trigrams
#1. Fake news words
bigram_phrases_fake = gensim.models.Phrases(fake_news_words, min_count=5, threshold=50)
trigram_phrases_fake = gensim.models.Phrases(bigram_phrases_fake[fake_news_words], threshold=50)

# `bigram` / `trigram` keep their original meaning (the fake-news phrasers) and
# remain the defaults of make_bigrams / make_trigrams.
bigram = gensim.models.phrases.Phraser(bigram_phrases_fake)
trigram = gensim.models.phrases.Phraser(trigram_phrases_fake)

#2. Real news words
bigram_phrases_real = gensim.models.Phrases(real_news_words, min_count=5, threshold=50)
trigram_phrases_real = gensim.models.Phrases(bigram_phrases_real[real_news_words], threshold=50)

# Phrasers for the real-news corpus. Previously the real-news phrase models were
# trained but never used, so real news was transformed with the fake-news phrasers.
bigram_real = gensim.models.phrases.Phraser(bigram_phrases_real)
trigram_real = gensim.models.phrases.Phraser(trigram_phrases_real)


def make_bigrams(texts, bigram_model=None):
    """Apply a bigram phraser to every tokenized document in ``texts``.

    ``bigram_model`` defaults to the module-level ``bigram`` (fake-news phraser).
    """
    if bigram_model is None:
        bigram_model = bigram
    return [bigram_model[doc] for doc in texts]


def make_trigrams(texts, bigram_model=None, trigram_model=None):
    """Apply the bigram phraser and then the trigram phraser to every document.

    ``texts`` should be the plain tokenized documents: the bigram step is
    performed here. The models default to the module-level ``bigram`` and
    ``trigram`` (fake-news phrasers).
    """
    if bigram_model is None:
        bigram_model = bigram
    if trigram_model is None:
        trigram_model = trigram
    return [trigram_model[bigram_model[doc]] for doc in texts]


# make_trigrams already runs the bigram step, so it is fed the raw token lists
# (feeding it the bigram output would apply the bigram phraser twice).
data_bigrams_fake = make_bigrams(fake_news_words)
data_bigrams_trigrams_fake = make_trigrams(fake_news_words)

data_bigrams_real = make_bigrams(real_news_words, bigram_real)
data_bigrams_trigrams_real = make_trigrams(real_news_words, bigram_real, trigram_real)

# Preview only the first documents instead of dumping the whole corpus.
print(data_bigrams_trigrams_fake[:3])



In [103]:
#TF-IDF Removal
from gensim.models import TfidfModel


def remove_low_tfidf_terms(corpus, tfidf_model, id2word, low_value=0.05):
    """Drop low-information terms from every bag-of-words document.

    A term is removed from a document when its TF-IDF weight in that document
    is below ``low_value`` or when the model returns no weight for it at all.

    Parameters
    ----------
    corpus : list of list of (int, int)
        Bag-of-words documents as ``(token_id, count)`` pairs. Not modified.
    tfidf_model
        Object for which ``tfidf_model[bow]`` yields ``(token_id, weight)`` pairs.
    id2word
        Mapping from token id to token string (indexable by id).
    low_value : float
        Weight threshold below which a term is dropped.

    Returns
    -------
    (filtered_corpus, dropped_words)
        ``filtered_corpus`` has one filtered document per input document;
        ``dropped_words`` lists the token string of every removed
        (document, term) occurrence, in document order.
    """
    filtered_corpus = []
    dropped_words = []
    for bow in corpus:
        # Transform the document once and index the weights by token id.
        weights = dict(tfidf_model[bow])
        kept_terms = []
        for token_id, count in bow:
            weight = weights.get(token_id)
            if weight is None or weight < low_value:
                # The dropped term is recorded against the document it was
                # removed from (the old loop used the previous document's
                # missing ids here).
                dropped_words.append(id2word[token_id])
            else:
                kept_terms.append((token_id, count))
        filtered_corpus.append(kept_terms)
    return filtered_corpus, dropped_words


#1. Fake TF-IDF Removal
texts_fake = data_bigrams_trigrams_fake
id2word_fake = corpora.Dictionary(texts_fake)
corpus_fake_unfiltered = [id2word_fake.doc2bow(text) for text in texts_fake]

tfidf_fake = TfidfModel(corpus_fake_unfiltered, id2word=id2word_fake)

low_value_fake = 0.05
corpus_fake, words_fake = remove_low_tfidf_terms(
    corpus_fake_unfiltered, tfidf_fake, id2word_fake, low_value_fake
)

#2. Real TF-IDF Removal
texts_real = data_bigrams_trigrams_real
id2word_real = corpora.Dictionary(texts_real)
corpus_real_unfiltered = [id2word_real.doc2bow(text) for text in texts_real]

tfidf_real = TfidfModel(corpus_real_unfiltered, id2word=id2word_real)

low_value_real = 0.05
corpus_real, words_real = remove_low_tfidf_terms(
    corpus_real_unfiltered, tfidf_real, id2word_real, low_value_real
)

In [104]:
#id2word_fake = corpora.Dictionary(fake_news_words)
# id2word_real = corpora.Dictionary(real_news_words)
#
# words = []
# words.extend(fake_news_words)
# words.extend(real_news_words)

# id2word = corpora.Dictionary(words)

#corpus_fake = []
# corpus_real = []
# corpus = []

#for text in fake_news_words:
    #new = id2word_fake.doc2bow(text)
    #corpus_fake.append(new)

# for text in real_news_words:
#     new = id2word_real.doc2bow(text)
#     corpus_real.append(new)
#
# for text in words:
#     new = id2word.doc2bow(text)
#     corpus.append(new)
#
#
# print(corpus)

In [None]:
#Create the LDA model
# Settings shared by the fake-news and real-news models; only the corpus and
# dictionary differ between the two.
lda_settings = dict(
    num_topics=5,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=30,
    alpha="auto",
)

lda_model_fake = gensim.models.ldamodel.LdaModel(
    corpus=corpus_fake, id2word=id2word_fake, **lda_settings
)

lda_model_real = gensim.models.ldamodel.LdaModel(
    corpus=corpus_real, id2word=id2word_real, **lda_settings
)

In [None]:
#Visualize the data
pyLDAvis.enable_notebook()

# Same layout options for both visualisations.
prepare_options = {"mds": "mmds", "R": 30}

vis_fake = pyLDAvis.gensim.prepare(
    lda_model_fake, corpus_fake, id2word_fake, **prepare_options
)
vis_real = pyLDAvis.gensim.prepare(
    lda_model_real, corpus_real, id2word_real, **prepare_options
)

In [None]:
# Display the prepared pyLDAvis visualisation for the fake-news LDA model.
vis_fake

In [None]:
# Display the prepared pyLDAvis visualisation for the real-news LDA model.
vis_real