<a href="https://colab.research.google.com/github/angelina-tsuboi/Fake_News_Detector_CNN/blob/main/Fake_News_Detector_CNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Loading in Some Data**

In [12]:
import io
import os
import pickle
import zipfile
from collections import Counter

import keras
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
import tensorflow as tf
from bs4 import BeautifulSoup as bs
from keras.layers import Activation, MaxPooling2D, Dropout, Flatten, Reshape
from keras.layers import Dense, Conv2D
from keras.models import Sequential
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn import model_selection
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer


# Download class resources and unpack them into the working directory.
RESOURCES_URL = "https://www.dropbox.com/s/2pj07qip0ei09xt/inspirit_fake_news_resources.zip?dl=1"

# A timeout keeps the cell from hanging forever on a stalled connection.
r = requests.get(RESOURCES_URL, timeout=60)
# Fail loudly on an HTTP error instead of handing an error page to ZipFile.
r.raise_for_status()

# The context manager closes the archive once everything is extracted.
with zipfile.ZipFile(io.BytesIO(r.content)) as z:
    z.extractall()

basepath = '.'

Initializing Dataset

In [None]:
# Cached artifacts are read from and written to the same directory as the raw
# pickles, so the existence checks have to look in that directory as well.
DATA_DIR = 'fakenewsdata'
FINAL_DATA_PATH = os.path.join(DATA_DIR, 'final_data.pkl')
TEXT_DATA_PATH = os.path.join(DATA_DIR, 'text_data.pkl')

# Substrings that mark a short page as an HTTP error page rather than an article.
HTTP_ERROR_CODES = ('401', '403', '404', '500', '502', '503')


def _prune_bad_samples(sequences, labels, texts, min_tokens=15, error_page_tokens=30):
    """Filter out samples that are too short or look like short HTTP error pages.

    A sample is dropped when its token sequence has fewer than ``min_tokens``
    entries, or when it has fewer than ``error_page_tokens`` entries and its
    text contains one of ``HTTP_ERROR_CODES``.

    Args:
        sequences: list of token-id sequences, one per sample.
        labels: list of labels aligned with ``sequences``.
        texts: list of source texts aligned with ``sequences``.
        min_tokens: sequences shorter than this are always dropped.
        error_page_tokens: sequences shorter than this are dropped when the
            text mentions an HTTP error code.

    Returns:
        A ``(kept_sequences, kept_labels)`` pair of new lists in original order.
    """
    kept_sequences, kept_labels = [], []
    for sequence, label, content in zip(sequences, labels, texts):
        if len(sequence) < min_tokens:
            continue
        if len(sequence) < error_page_tokens and any(code in content for code in HTTP_ERROR_CODES):
            continue
        kept_sequences.append(sequence)
        kept_labels.append(label)
    return kept_sequences, kept_labels


def _pages_to_text(pages):
    """Strip the HTML (second field of every page tuple) down to plain text."""
    return [bs(page[1], 'html.parser').get_text() for page in pages]


if not os.path.exists(FINAL_DATA_PATH):
    print('no saved data was found; generating from scratch...')
    print('loading data')
    # NOTE(review): the saved output of this cell shows a FileNotFoundError
    # right after the message above -- confirm that the downloaded archive
    # really extracts its pickles into DATA_DIR.
    # SECURITY: pickle.load can execute arbitrary code; only load trusted files.
    # structure of each item: url, html, (1 if fake else 0)
    with open(os.path.join(DATA_DIR, 'train_val_data.pkl'), 'rb') as f:
        train_data, val_data = pickle.load(f)
    with open(os.path.join(DATA_DIR, 'test_data.pkl'), 'rb') as f:
        test_data = pickle.load(f)

    print('making Tokenizer')
    tokenizer = Tokenizer(
        num_words=12_000,  # TUNABLE
        filters='!"#$%&()*+,-./…‘’“”—–:;<=>?@[\\]^_`{|}~\t\n©®™',
        lower=True,
        split=" "
    )

    # These pages cause the HTML parser to hang. The pops run one after
    # another, so every index refers to the list as already shortened by the
    # previous pops.
    for hanging_idx in (232, 301, 620, 1362, 1656, 1738):
        train_data.pop(hanging_idx)

    if not os.path.exists(TEXT_DATA_PATH):
        print('no saved text found; converting HTML to text')
        train_texts = _pages_to_text(train_data)
        valid_texts = _pages_to_text(val_data)
        test_texts = _pages_to_text(test_data)

        with open(TEXT_DATA_PATH, 'wb') as f:
            pickle.dump((train_texts, valid_texts, test_texts), f)
    else:
        print('using preconverted text')
        with open(TEXT_DATA_PATH, 'rb') as f:
            train_texts, valid_texts, test_texts = pickle.load(f)

    print('fitting Tokenizer')
    # The vocabulary is fitted on the training texts only.
    tokenizer.fit_on_texts(train_texts)
    total_words = len(tokenizer.word_index)

    print('generating sequences and labels from data/text from earlier')
    X_train = tokenizer.texts_to_sequences(train_texts)
    X_valid = tokenizer.texts_to_sequences(valid_texts)
    X_test = tokenizer.texts_to_sequences(test_texts)
    y_train = [page[2] for page in train_data]
    y_valid = [page[2] for page in val_data]
    y_test = [page[2] for page in test_data]

    print('pruning bad data')
    X_train, y_train = _prune_bad_samples(X_train, y_train, train_texts)
    X_valid, y_valid = _prune_bad_samples(X_valid, y_valid, valid_texts)
    X_test, y_test = _prune_bad_samples(X_test, y_test, test_texts)
    # The raw texts are no longer needed once the sequences are filtered.
    del train_texts, valid_texts, test_texts

    word_idx = tokenizer.word_index
    del val_data, tokenizer, train_data
    with open(FINAL_DATA_PATH, 'wb') as f:
        pickle.dump((X_train, y_train, X_valid, y_valid, X_test, y_test, total_words, word_idx), f)
else:
    print('using saved data')
    # SECURITY: pickle.load can execute arbitrary code; only load trusted files.
    with open(FINAL_DATA_PATH, 'rb') as f:
        X_train, y_train, X_valid, y_valid, X_test, y_test, total_words, word_idx = pickle.load(f)
    # X_test / y_test are kept: the evaluation cell at the end needs them.


no saved data was found; generating from scratch...
loading data


FileNotFoundError: ignored

Creating a CNN

In [None]:
def CNNClassifier(num_epochs=2, layers=1, dropout=0.15):
    """Create a scikit-learn style wrapper around a small Keras CNN.

    Args:
        num_epochs: number of training epochs handed to the wrapper.
        layers: number of extra 32-filter, same-padded 3x3 convolutions
            stacked directly after the input reshape.
        dropout: dropout rate applied after each pooling stage and after the
            512-unit dense layer.

    Returns:
        A ``KerasClassifier`` (batch size 10, ``verbose=2``) whose build
        function constructs and compiles the network.

    Note:
        The first layer reshapes every sample to ``(32, 32, 3)``, so each
        input sample must hold exactly 32 * 32 * 3 = 3072 values.
    """
    def create_model():
        """Build and compile the two-class softmax CNN."""
        model = Sequential()
        model.add(Reshape((32, 32, 3)))

        # Configurable number of same-padded convolutions before the first block.
        for _ in range(layers):
            model.add(Conv2D(32, (3, 3), padding='same'))
            model.add(Activation('relu'))

        # First block: unpadded 32-filter convolution, pooling, dropout.
        model.add(Conv2D(32, (3, 3)))
        model.add(Activation('relu'))
        model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(Dropout(dropout))

        # Second block: two 64-filter convolutions, pooling, dropout.
        model.add(Conv2D(64, (3, 3), padding='same'))
        model.add(Activation('relu'))
        model.add(Conv2D(64, (3, 3)))
        model.add(Activation('relu'))
        model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(Dropout(dropout))

        # Classification head: dense layer followed by a 2-way softmax.
        model.add(Flatten())
        model.add(Dense(512))
        model.add(Activation('relu'))
        model.add(Dropout(dropout))
        model.add(Dense(2))
        model.add(Activation('softmax'))

        # `learning_rate` replaces the deprecated `lr` keyword.
        # NOTE(review): `decay` is kept from the original; confirm the
        # installed Keras optimizer still accepts it.
        opt = keras.optimizers.RMSprop(learning_rate=0.0001, decay=1e-6)

        model.compile(loss='categorical_crossentropy',
                      optimizer=opt,
                      metrics=['accuracy'])
        return model

    return KerasClassifier(build_fn=create_model, epochs=num_epochs, batch_size=10, verbose=2)

*Plotting Model Performance*


In [13]:
def plot_acc(history, ax = None, xlabel = 'Epoch #'):
    """Plot training and validation accuracy per epoch and show the figure.

    Args:
        history: object whose ``.history`` attribute is a dict containing
            equally long ``'accuracy'`` and ``'val_accuracy'`` lists.
        ax: matplotlib axes to draw on; a new figure is created when omitted.
        xlabel: label for the x axis.

    The plot also marks the 0.5 chance level and the epoch with the highest
    validation accuracy (the earliest one when several epochs tie).
    """
    # Work on a copy so the caller's history.history dict is left untouched
    # (no extra 'epoch' key is injected into it).
    metrics = pd.DataFrame.from_dict(dict(history.history))
    metrics['epoch'] = list(range(len(metrics['val_accuracy'])))

    # idxmax returns the first row holding the maximum, so ties are
    # resolved deterministically.
    best_epoch = metrics.loc[metrics['val_accuracy'].idxmax(), 'epoch']

    if ax is None:
        f, ax = plt.subplots(1, 1)
    sns.lineplot(x = 'epoch', y = 'val_accuracy', data = metrics, label = 'Validation', ax = ax)
    sns.lineplot(x = 'epoch', y = 'accuracy', data = metrics, label = 'Training', ax = ax)
    ax.axhline(0.5, linestyle = '--', color = 'red', label = 'Chance')
    ax.axvline(x = best_epoch, linestyle = '--', color = 'green', label = 'Best Epoch')
    ax.legend(loc = 1)
    ax.set_ylim([0.4, 1])

    ax.set_xlabel(xlabel)
    ax.set_ylabel('Accuracy (Fraction)')

    plt.show()

Initializing and Using CNN Model

In [None]:
# The CNN starts with Reshape((32, 32, 3)), so every sample has to be padded
# or truncated to exactly 32 * 32 * 3 values.
SEQUENCE_LENGTH = 32 * 32 * 3

# Pad the raw (ragged) token lists directly; float32 keeps the dtype the
# original tensor conversion asked for.
inputs_train = pad_sequences(X_train, maxlen=SEQUENCE_LENGTH, dtype='float32')
inputs_test = pad_sequences(X_test, maxlen=SEQUENCE_LENGTH, dtype='float32')

cnn = CNNClassifier(5, 2, 0.5)
cnn.fit(inputs_train, np.asarray(y_train))
# Predict and score on the padded test inputs, not on the raw sequences.
preds = cnn.predict(inputs_test)
print(cnn.score(inputs_test, np.asarray(y_test)))

NameError: ignored