# Imports and setup

Adapted from Ola's Lab 4 work.

In [2]:
import numpy as np
import pandas as pd
import nltk
import gensim
from nltk.stem import SnowballStemmer
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing \
import STOPWORDS as stopwords
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Single English Snowball stemmer instance, created once and reused by stem() below.
stemmer = SnowballStemmer("english")

# Loading data 

In [3]:
# Read the fake-news and real-news CSV files (in that order) into two DataFrames.
df_fake, df_real = (
	pd.read_csv(csv_path, sep=',') for csv_path in ('Fake.csv', 'True.csv')
)

# Show the fake-news frame as this cell's output.
df_fake

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"
...,...,...,...,...
23476,McPain: John McCain Furious That Iran Treated ...,21st Century Wire says As 21WIRE reported earl...,Middle-east,"January 16, 2016"
23477,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,21st Century Wire says It s a familiar theme. ...,Middle-east,"January 16, 2016"
23478,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,Patrick Henningsen 21st Century WireRemember ...,Middle-east,"January 15, 2016"
23479,How to Blow $700 Million: Al Jazeera America F...,21st Century Wire says Al Jazeera America will...,Middle-east,"January 14, 2016"


# Preprocessing 

In [4]:
# Preprocessing functions
def stem(text):
	"""Return the stem of ``text`` as produced by the module-level ``stemmer``."""
	stemmed = stemmer.stem(text)
	return stemmed

def preprocess(text):
	"""Tokenise ``text`` and return the stems of its non-stopword tokens.

	Tokens come from gensim's ``simple_preprocess`` called with ``min_len=4``.
	A token is dropped if it appears in gensim's ``STOPWORDS`` set; the check
	is made on the raw token, before stemming.

	Returns:
		list of stemmed tokens, in the order they were produced.
	"""
	tokens = gensim.utils.simple_preprocess(text, min_len=4)
	return [stem(word) for word in tokens if word not in stopwords]

def run_processing_on(name, data, no_below=10, no_above=0.5, keep_n=10000):
	"""Preprocess a collection of documents and build its bag-of-words corpus.

	Each document is passed through ``preprocess``, a gensim ``Dictionary`` is
	built from the results and pruned with ``filter_extremes``, and every
	document is then converted to a bag-of-words vector. Progress and sizes
	are printed along the way.

	Args:
		name: Label used only in the progress message.
		data: Iterable of document strings. It is iterated directly, so it
			need not support positional ``data[i]`` indexing (a generator or
			a pandas Series with a non-default index works too).
		no_below: Passed to ``Dictionary.filter_extremes`` (default 10).
		no_above: Passed to ``Dictionary.filter_extremes`` (default 0.5).
		keep_n: Passed to ``Dictionary.filter_extremes`` (default 10000).

	Returns:
		Tuple ``(bow_corpus, id2word, corpus)``. ``bow_corpus`` and ``corpus``
		are the same list object; ``id2word`` is the filtered dictionary.
	"""
	print("Preprocessing dataset:", name, "...")

	# Iterate the documents themselves rather than range(len(data)): indexing
	# by position fails on generators and does label lookup on a pandas Series.
	processed_docs = [preprocess(doc) for doc in data]

	print("Total documents:", len(processed_docs))

	dictionary = gensim.corpora.Dictionary(processed_docs)
	print("Dictionary size:", len(dictionary))

	# Prune the vocabulary in place using the caller-supplied thresholds.
	dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n)
	print("Dictionary after filtering extremes:", len(dictionary))

	bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

	print("Done!")

	# The corpus is returned in both the first and third slots because
	# existing callers unpack three values.
	return (bow_corpus, dictionary, bow_corpus)

In [5]:
# Build dictionaries and bag-of-words corpora from the article titles only.
real_bow_corpus, real_id2word, real_corpus = run_processing_on(
	"Real news",
	df_real['title'].tolist(),
)
fake_bow_corpus, fake_id2word, fake_corpus = run_processing_on(
	"Fake news",
	df_fake['title'].tolist(),
)

Preprocessing dataset: Real news ...
Total documents: 21417
Dictionary size: 8119
Dictionary after filtering extremes: 2078
Done!
Preprocessing dataset: Fake news ...
Total documents: 23481
Dictionary size: 9085
Dictionary after filtering extremes: 2856
Done!


# Models and topic analysis
## Real news

Using an LDA model with 4 topics to find topics in the real news dataset. Only the article titles are modelled, not the full article text.

In [6]:
# Fit a 4-topic LDA model on the real-news bag-of-words corpus.
real_lda_model = gensim.models.ldamodel.LdaModel(
	# Data
	corpus=real_corpus,
	id2word=real_id2word,
	# Model shape
	num_topics=4,
	alpha='symmetric',
	per_word_topics=True,
	# Training schedule
	passes=10,
	iterations=100,
	chunksize=1000,
	update_every=1,
	# Fixed seed so repeated runs give the same topics
	random_state=100,
)

### Visualisation

In [8]:
# Prepare the pyLDAvis data for the real-news model, then render it as the cell output.
visualisation_real = gensimvis.prepare(
	topic_model=real_lda_model,
	corpus=real_corpus,
	dictionary=real_id2word,
)
pyLDAvis.display(visualisation_real)

  default_term_info = default_term_info.sort_values(


## Fake news

Using an LDA model with 5 topics to find topics in the fake news dataset. Only the article titles are modelled, not the full article text.

In [7]:
# Fit a 5-topic LDA model on the fake-news bag-of-words corpus.
fake_lda_model = gensim.models.ldamodel.LdaModel(
	# Data
	corpus=fake_corpus,
	id2word=fake_id2word,
	# Model shape
	num_topics=5,
	alpha='symmetric',
	per_word_topics=True,
	# Training schedule
	passes=10,
	iterations=100,
	chunksize=1000,
	update_every=1,
	# Fixed seed so repeated runs give the same topics
	random_state=100,
)

### Visualisation

In [8]:
# Prepare the pyLDAvis data for the fake-news model, then render it as the cell output.
visualisation_fake = gensimvis.prepare(
	topic_model=fake_lda_model,
	corpus=fake_corpus,
	dictionary=fake_id2word,
)
pyLDAvis.display(visualisation_fake)

  default_term_info = default_term_info.sort_values(


## Analysing all the data together

In [9]:
# Concatenate the fake titles followed by the real titles and preprocess them as one dataset.
all_titles = df_fake['title'].tolist() + df_real['title'].tolist()
combined_bow_corpus, combined_id2word, combined_corpus = run_processing_on(
	"All news",
	all_titles,
)

Preprocessing dataset: All news ...
Total documents: 44898
Dictionary size: 12255
Dictionary after filtering extremes: 3915
Done!


In [12]:
# Fit a 3-topic LDA model on the combined (fake + real) bag-of-words corpus.
combined_lda_model = gensim.models.ldamodel.LdaModel(
	# Data
	corpus=combined_corpus,
	id2word=combined_id2word,
	# Model shape
	num_topics=3,
	alpha='symmetric',
	per_word_topics=True,
	# Training schedule
	passes=10,
	iterations=100,
	chunksize=1000,
	update_every=1,
	# Fixed seed so repeated runs give the same topics
	random_state=100,
)

In [13]:
# Prepare the pyLDAvis data for the combined model, then render it as the cell output.
visualisation_all = gensimvis.prepare(
	topic_model=combined_lda_model,
	corpus=combined_corpus,
	dictionary=combined_id2word,
)
pyLDAvis.display(visualisation_all)

  default_term_info = default_term_info.sort_values(
