# CAPSTONE

**Identifying Fake News**

**Project Overview**

The goal of this project is to create a fake news identification system using natural language processing (NLP). My data set consists of over 40 thousand articles obtained from Kaggle. A variety of machine learning models have been used to generate the best results.

**Business Case**

Over the course of the past decade, the amount of fake news being shared on social media has seen a dramatic increase. According to NewsGuard, "in 2019, 8 percent of engagement with the 100 top-performing news sources on social media was dubious. In 2020, that number more than doubled to 17 percent" ([Vox](https://www.vox.com/policy-and-politics/2020/12/22/22195488/fake-news-social-media-2020)). In 2021, the Daily Wire, the outlet founded by right-wing commentator Ben Shapiro, was the most popular news platform on Facebook.

From casting doubts on the results of the 2020 presidential election to spreading misinformation about the COVID vaccine, fake news is causing real damage to our society.

Because the main source of fake news is social media sites, such as Facebook and Twitter, I set out to produce a fake news identification system that can be used by social media companies to filter out misinformation. An alternative use of this system could be a web app with which users can verify the authenticity of a story themselves.

## Import libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer,\
HashingVectorizer
import string
import nltk
nltk.download('punkt')
from nltk.corpus import stopwords, wordnet
from nltk import word_tokenize
from nltk.probability import FreqDist
from nltk import pos_tag
from nltk.tokenize import regexp_tokenize, word_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, classification_report, plot_confusion_matrix, plot_roc_curve
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbPipeline
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
import spacy
import re
from sklearn.dummy import DummyClassifier
from wordcloud import WordCloud, STOPWORDS 
from PIL import Image
import requests
from os import path
import pickle
from nltk.util import ngrams
import nltk, re, string, collections

## Import CSVs and Data Cleaning

**Spacy Preprocessing Functions**

In [None]:
# Use spaCy's built-in English stop-word collection for token filtering
from spacy.lang.en.stop_words import STOP_WORDS
stop_words = STOP_WORDS

In [None]:
# Characters stripped from every article before tokenization:
# ASCII punctuation plus the digits 0-9
punctuation = [*string.punctuation, *string.digits]
# Translation table built once so each article is cleaned in a single pass
# instead of testing every character against a list
_strip_table = str.maketrans('', '', ''.join(punctuation))
nlp = spacy.load('en_core_web_sm')


def normalize(text):
    """Return `text` as a space-separated string of lower-cased lemmas.

    Steps: delete punctuation and digits, run the spaCy pipeline, drop
    tokens tagged as pronouns, lower-case and strip each lemma, then drop
    stop words and empty tokens.

    Parameters
    ----------
    text : str
        Raw article text.

    Returns
    -------
    str
        The remaining lemmas joined by single spaces.
    """
    cleaned = text.translate(_strip_table)
    lemmas = (
        tok.lemma_.lower().strip()
        for tok in nlp(cleaned)
        if tok.pos_ != 'PRON'
    )
    # `lemma` is empty when a token was pure whitespace; skipping those
    # avoids runs of spaces in the joined result
    return ' '.join(
        lemma for lemma in lemmas if lemma and lemma not in stop_words
    )

**Real News**

In [None]:
# Import real news articles
df_true = pd.read_csv('True.csv')
df_true.head()

In [None]:
# Create target column
df_true = df_true.assign(status = 'Real')
df_true.head()

In [None]:
# Remove city names and news sources from 'text' column
df_true['text']=df_true['text'].str.extract(r'(?<=\-\s)(.*)')

In [None]:
df_true.head()

In [None]:
df_true.shape

In [None]:
# Check for missing data
df_true.isnull().sum()

In [None]:
# Drop articles missing text
df_true.dropna(inplace=True)

In [None]:
df_true.isnull().sum()

In [None]:
# Apply tokenization function to the real articles. Create a new column for the processed articles
df_true['processed_articles'] = df_true['text'].map(lambda x: normalize(x))

In [None]:
list(df_true.processed_articles)

In [None]:
# Save the new processed data frame
pickle_out = open('pickled_files/df_spacy_true.pickle',"wb")
pickle.dump(df_true, pickle_out)
pickle_out.close()

In [None]:
infile = open('pickled_files/df_spacy_true.pickle','rb')
df_true = pickle.load(infile)
infile.close()

In [None]:
true_text = df_true.processed_articles

In [None]:
df_true.head()

In [None]:
str(true_text)

In [None]:
# Create a list of all of the tokenized words
true_list = []
for x in true_text:
    true_list.append(x)

In [None]:
true_list

In [None]:
len(str(true_list))

In [None]:
# Create and generate a word cloud image.
# The articles are joined with spaces; str(true_list) would feed the list's
# repr (brackets, quotes, commas, escape sequences) to the word cloud.
wordcloud = WordCloud().generate(' '.join(true_list))

# Display the generated image:
plt.figure(figsize = (15, 15))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# Create a new list of stopwords
stopwords = ['``', 's', 'c', "'", ",", "t", "l", 'j', '...', ":", '0', '1', '2', 'couldn', 'wouldn', 'isn', 'aren', 'shouldn', 'don', 'doesn', 'didn']
# Create a new tokenized list to use in the frequency distribution tables
true_tokenized_word = nltk.word_tokenize(str(true_list))

In [None]:
# Iterate through new tokenized list and remove additional stopwords.
true_tokenized_word = [word for word in true_tokenized_word if word not in stopwords]

In [None]:
# Create frequency distribution for the words in the real articles
true_fdist=nltk.FreqDist(true_tokenized_word)

In [None]:
print(true_fdist.most_common(20))

In [None]:
# Plot frequency distribution for the 20 most common words
true_fdist.plot(20)
plt.show()

In [None]:
# Get a list of all the bi-grams
true_bigrams = ngrams(true_tokenized_word, 2)

In [None]:
# List 20 most common bigrams
true_bigrams_freq = collections.Counter(true_bigrams)
true_bigrams_freq.most_common(20)

In [None]:
true_bigram_fdist=nltk.FreqDist(true_bigrams_freq)

In [None]:
true_bigram_fdist.plot(20)
plt.show()

In [None]:
# Get a list of all the tri-grams
true_trigrams = ngrams(true_tokenized_word, 3)

In [None]:
# List 20 most common tri-grams
true_trigrams_freq = collections.Counter(true_trigrams)
true_trigrams_freq.most_common(20)

In [None]:
true_trigram_fdist=nltk.FreqDist(true_trigrams_freq)

In [None]:
true_trigram_fdist.plot(20)
plt.show()

**Fake News**

In [None]:
# Import Fake news articles
df_fake = pd.read_csv('Fake.csv')
df_fake.head()

In [None]:
# Create target column
df_fake = df_fake.assign(status = 'Fake')
df_fake.head()

In [None]:
df_fake.shape

In [None]:
# Check for missing data
df_fake.isnull().sum()

In [None]:
# Apply tokenization function to the real articles. Create a new column for the processed articles
df_fake['processed_articles'] = df_fake['text'].map(lambda x: normalize(x))

In [None]:
list(df_fake.processed_articles)

In [None]:
# Save the new processed data frame
pickle_out = open('pickled_files/df_spacy_fake.pickle',"wb")
pickle.dump(df_fake, pickle_out)
pickle_out.close()

In [None]:
infile = open('pickled_files/df_spacy_fake.pickle','rb')
df_fake = pickle.load(infile)
infile.close()

In [None]:
fake_text = df_fake.processed_articles

In [None]:
# Create a list of all of the tokenized words
fake_list = []
for x in fake_text:
    fake_list.append(x)

In [None]:
len(str(fake_list))

In [None]:
# Create and generate a word cloud image.
# The articles are joined with spaces; str(fake_list) would feed the list's
# repr (brackets, quotes, commas, escape sequences) to the word cloud.
wordcloud = WordCloud().generate(' '.join(fake_list))

# Display the generated image:
plt.figure(figsize = (15, 15))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# Create a new tokenized list to use in the frequency distribution tables
fake_tokenized_word = nltk.word_tokenize(str(fake_list))

In [None]:
# Iterate through new tokenized list and remove additional stopwords.
fake_tokenized_word = [word for word in fake_tokenized_word if word not in stopwords]

In [None]:
# Create frequency distribution for the words in the real articles
fake_fdist = nltk.FreqDist(fake_tokenized_word)

In [None]:
print(fake_fdist.most_common(20))

In [None]:
# Plot frequency distribution for the 20 most common words
fake_fdist.plot(20)
plt.show()

In [None]:
# Get a list of all the bi-grams
fake_bigrams = ngrams(fake_tokenized_word, 2)

In [None]:
# List 20 most common bi-grams
fake_bigrams_freq = collections.Counter(fake_bigrams)
fake_bigrams_freq.most_common(20)

In [None]:
fake_ngram_fdist=nltk.FreqDist(fake_bigrams_freq)

In [None]:
fake_ngram_fdist.plot(20)
plt.show()

In [None]:
# Get a list of all the tri-grams
fake_trigrams = ngrams(fake_tokenized_word, 3)

In [None]:
# List 20 most common tri-grams
fake_trigrams_freq = collections.Counter(fake_trigrams)
fake_trigrams_freq.most_common(20)

In [None]:
fake_trigram_fdist=nltk.FreqDist(fake_trigrams_freq)

In [None]:
fake_trigram_fdist.plot(20)
plt.show()

**Join data frames**

In [None]:
# Concatenate the two dataframes
frames = [df_true, df_fake]
df=pd.concat(frames)

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
df.shape

In [None]:
# Shuffle rows
df = df.sample(frac = 1)

In [None]:
df.head()

In [None]:
df.info()

In [None]:
# Check for missing data
df.isnull().sum()

## Train test split

In [None]:
# Features are the cleaned article text; labels are the 'status' column
X = df['processed_articles']
y = df['status']

# Hold out 25% of the rows for testing; the seed fixes the split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

## Modeling

### Dummy Classifier

In [None]:
# Baseline: a classifier that always predicts the most frequent class.
# It is fit on the training split and scored on the held-out split so the
# number is comparable with the models below.
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X_train, y_train)
dummy_clf.score(X_test, y_test)

In [None]:
# Helper used for every model: text report plus confusion-matrix plot
def evaluate_model(model, X, y):
    """Print a classification report and plot a confusion matrix.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    X : features to predict on
    y : true labels for ``X``
    """
    predictions = model.predict(X)
    report = classification_report(y, predictions)
    print(report)

    # NOTE(review): plot_confusion_matrix appears to have been removed from
    # recent scikit-learn releases in favour of ConfusionMatrixDisplay —
    # confirm against the installed version.
    plot_confusion_matrix(model, X, y, xticks_rotation='vertical', cmap='Blues')

In [None]:
# Report the baseline on the held-out split, as is done for the other models
evaluate_model(dummy_clf, X_test, y_test)

### Logistic Regression

**Simple Logistic Regression Pipeline** 

In [None]:
# Create a simple logistic regression pipeline including model and vectorizer 
lr_pipeline = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer()),
    ('lr', LogisticRegression())
])

In [None]:
# Fit the pipeline to the training data
lr_pipeline.fit(X_train, y_train)

In [None]:
pd.DataFrame(lr_pipeline.predict(X_test)).value_counts()

In [None]:
evaluate_model(lr_pipeline, X_train, y_train)

In [None]:
evaluate_model(lr_pipeline, X_test, y_test)

**Logistic Regression Pipeline with hyperparameters tuned**

In [None]:
# Create a logistic regression pipeline including a tuned model and vectorizer
lr_pipeline2 = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer(min_df=10, max_df=.9, ngram_range = (1, 2))),
    ('lr', LogisticRegression(solver='saga', 
                              max_iter = 500, 
                              C=1, 
                              penalty='l2', 
                              verbose=1, 
                              n_jobs=-1,
                              random_state=42))
])

In [None]:
# Fit the model to the training data
lr_pipeline2.fit(X_train, y_train)

In [None]:
pd.DataFrame(lr_pipeline2.predict(X_test)).value_counts()

In [None]:
evaluate_model(lr_pipeline2, X_train, y_train)

In [None]:
evaluate_model(lr_pipeline2, X_test, y_test)

**Logistic Regression Pipeline with Gridsearch**

In [None]:
# Create a logistic regression pipeline including a tuned model and vectorizer
lr_pipeline_3 = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer()),
    ('lr', LogisticRegression(class_weight='balanced', solver='saga', random_state=42))
])

In [None]:
# Define gridsearch parameters
lr_param_grid = {
    'lr__C': [0.001, 0.01, 0.1, 1, 10],
    'lr__penalty': ['l1', 'l2'],
    'lr__max_iter': [500, 1000],
    'vectorizer__max_df': np.linspace(.7,1,5),
    'vectorizer__min_df': [10],
    'vectorizer__ngram_range': [(1, 2)]
}

In [None]:
# Combine logistic regression pipeline with gridsearch
lr_grid = GridSearchCV(lr_pipeline_3, 
                       lr_param_grid,
                       cv=5,
                       scoring='f1_weighted',
                       verbose=1,
                       n_jobs=-1)

In [None]:
# Fit pipeline with gridsearch to the training data
lr_grid.fit(X_train, y_train)

In [None]:
# Display best parameters
lr_grid.best_params_

In [None]:
evaluate_model(lr_grid, X_train, y_train)

In [None]:
evaluate_model(lr_grid, X_test, y_test)

In [None]:
lr_grid_model = lr_grid.best_estimator_.fit(X_train, y_train)

In [None]:
# Save the model
pickle_out = open('pickled_files/lr_grid_model.pickle',"wb")
pickle.dump(lr_grid_model, pickle_out)
pickle_out.close()

In [None]:
infile = open('pickled_files/lr_grid_model.pickle','rb')
lr_grid_model = pickle.load(infile)
infile.close()

### Naive Bayes

**Naive Bayes Pipeline**

In [None]:
# Create a tuned multinomial naive bayes pipeline including a model and vectorizer
nb_pipeline = imbPipeline(steps=[
    ('vectorizer', TfidfVectorizer(min_df=10, max_df=0.9)),
    ('nb', MultinomialNB())
])

In [None]:
# Fit the pipeline to the training data
nb_pipeline.fit(X_train, y_train)

In [None]:
evaluate_model(nb_pipeline, X_train, y_train)

In [None]:
evaluate_model(nb_pipeline, X_test, y_test)

**Naive Bayes Pipeline with Gridsearch**

In [None]:
# Create a  multinomial naive bayes pipeline including a model and vectorizer
nb_pipeline_2 = imbPipeline(steps=[
    ('vectorizer', TfidfVectorizer()),
    ('nb', MultinomialNB())
])

In [None]:
# Define gridsearch parameters
nb_param_grid = {
    'nb__alpha': np.linspace(0, 4, 10),
    'vectorizer__max_df': np.linspace(.8,1,5),
    'vectorizer__min_df': [10],
    'vectorizer__norm': ['l1', 'l2']
    }

In [None]:
# Combine Naive Bayes pipeline with gridsearch
nb_grid = GridSearchCV(nb_pipeline_2, 
                       nb_param_grid,
                       cv=5,
                       scoring='f1_weighted',
                       verbose=1,
                       n_jobs=-1)

In [None]:
# Fit pipeline with gridsearch 
nb_grid.fit(X_train, y_train)

In [None]:
# Display best parameters
nb_grid.best_params_

In [None]:
evaluate_model(nb_grid, X_train, y_train)

In [None]:
evaluate_model(nb_grid, X_test, y_test)

In [None]:
nb_grid_model = nb_grid.best_estimator_.fit(X_train, y_train)

In [None]:
# Save the model
pickle_out = open('pickled_files/nb_grid_model.pickle',"wb")
pickle.dump(nb_grid_model, pickle_out)
pickle_out.close()

In [None]:
infile = open('pickled_files/nb_grid_model.pickle','rb')
nb_grid_model = pickle.load(infile)
infile.close()

### Random Forest

**Simple Random Forest Pipeline**

In [None]:
# Create a tuned random forest classifier pipeline including a model and vectorizer 
rf_pipeline = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer(min_df=10, max_df=0.9)),
    ('rf', RandomForestClassifier(n_estimators=10)),
])

In [None]:
# Fit the pipeline to the training data
rf_pipeline.fit(X_train, y_train)

In [None]:
pd.DataFrame(rf_pipeline.predict(X_test)).value_counts()

In [None]:
# Display f1-score for the testing data
f1_score(y_test, rf_pipeline.predict(X_test), average = 'weighted')

In [None]:
evaluate_model(rf_pipeline, X_train, y_train)

In [None]:
evaluate_model(rf_pipeline, X_test, y_test)

In [None]:
rf_model = rf_pipeline.fit(X_train, y_train)

In [None]:
# Save the model
pickle_out = open('pickled_files/rf_model.pickle',"wb")
pickle.dump(rf_model, pickle_out)
pickle_out.close()

In [None]:
infile = open('pickled_files/rf_model.pickle','rb')
rf_model = pickle.load(infile)
infile.close()

### Voting Classifier

In [None]:
# instantiate the three models already used with the best hyper parameters from grid search where applicable
lr=LogisticRegression(C=10, max_iter=500, penalty='l2')
nb=MultinomialNB(alpha=0.0)
rf=RandomForestClassifier(n_estimators=10)

In [None]:
# Create a voting classifier pipeline including the three models listed above and a vecotrizer
vc_pipeline = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer(min_df=10,max_df=.8)),
    ('vc', VotingClassifier(
                estimators=[('LogisticRegression', lr), ('randomforest', rf), ('NaiveBayes', nb)], 
                voting='hard')),
    
])

In [None]:
# fit the model to the training data
vc_pipeline.fit(X_train, y_train)

In [None]:
# Display the f-1 score for the testing data
f1_score(y_test, vc_pipeline.predict(X_test),average = 'weighted')

In [None]:
evaluate_model(vc_pipeline, X_train, y_train)

In [None]:
evaluate_model(vc_pipeline, X_test, y_test)

In [None]:
vc_pipe_model = vc_pipeline.fit(X_train, y_train)

In [None]:
# Save the model
pickle_out = open('pickled_files/vc_pipe_model.pickle',"wb")
pickle.dump(vc_pipe_model, pickle_out)
pickle_out.close()

In [None]:
infile = open('pickled_files/vc_pipe_model.pickle','rb')
vc_pipe_model = pickle.load(infile)
infile.close()

**Fin**