<a href="https://colab.research.google.com/github/Anna-Desorcy/FakeNewsDetection/blob/main/FakeNewsDetector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fake News Detector



In [None]:
#unzipping fake news data
!unzip "News _dataset.zip"

Archive:  News _dataset.zip
  inflating: Fake.csv                
  inflating: True.csv                


In [None]:
#Loading in the data
import pandas as pd

# Read the fake-news articles into a DataFrame
df = pd.read_csv("Fake.csv")

# Preview the first five rows to see which columns are available
print(df.head(n=5))

                                               title  \
0   Donald Trump Sends Out Embarrassing New Year’...   
1   Drunk Bragging Trump Staffer Started Russian ...   
2   Sheriff David Clarke Becomes An Internet Joke...   
3   Trump Is So Obsessed He Even Has Obama’s Name...   
4   Pope Francis Just Called Out Donald Trump Dur...   

                                                text subject  \
0  Donald Trump just couldn t wish all Americans ...    News   
1  House Intelligence Committee Chairman Devin Nu...    News   
2  On Friday, it was revealed that former Milwauk...    News   
3  On Christmas day, Donald Trump announced that ...    News   
4  Pope Francis used his annual Christmas Day mes...    News   

                date  
0  December 31, 2017  
1  December 31, 2017  
2  December 30, 2017  
3  December 29, 2017  
4  December 25, 2017  


In [None]:
def clean_article(text):
  """Normalize one article into a list of lowercase word tokens.

  The text is lowercased, a fixed set of punctuation characters and all
  ASCII digits are deleted, the result is split on whitespace, and every
  token containing "www" or "http" is dropped as a link.

  Args:
    text: The raw article body. Anything that is not a string (for example
      the NaN that pandas produces for an empty CSV cell, or None) is
      treated as an empty article.

  Returns:
    A list of cleaned word strings; empty when there is nothing to keep.
  """
  #non-string input has no words to extract
  if not isinstance(text, str):
    return []

  #remove punctuation and digits in a single pass over the text
  strip_table = str.maketrans('', '', '.,!"\'?:/@()[]_*0123456789-#;')

  #split into words (split() with no argument also discards outer whitespace)
  words = text.lower().translate(strip_table).split()

  #remove links; '.', ':' and '/' are already gone, so a URL is one fused token
  return [ w for w in words if "www" not in w and "http" not in w ]

In [None]:
#Turning Fake and Real news into bag of words
import pandas as pd

#Only the 'text' column is used from each file
df_fake = pd.read_csv("Fake.csv")['text']
df_real = pd.read_csv("True.csv")['text']

#Number of leading articles from each file that contribute to the vocabulary
n_vocab_articles = 1000

#word_dict maps each cleaned word to its total number of occurrences across
#the first n_vocab_articles fake articles and the first n_vocab_articles real ones
word_dict = {}

#Get Vocab Words for Fake, then for Real
for articles in (df_fake, df_real):
  for text in articles.iloc[:n_vocab_articles]:
    for word in clean_article(text):
      #a word seen for the first time gets a count of 1, not 0
      word_dict[word] = word_dict.get(word, 0) + 1

vocab = list(word_dict)
print("Vocabulary Length Before Min/Max Removal:", len(vocab))

#Keep only words that occur more than min_thresh times and at most max_thresh
#times; rarer and more frequent words are removed from the vocabulary
min_thresh = 100
max_thresh = 1000
word_dict = {
  word: count
  for word, count in word_dict.items()
  if min_thresh < count <= max_thresh
}

vocab = list(word_dict)
print("Vocabulary Length After Min/Max Removal:", len(vocab))


Vocabulary Length Before Min/Max Removal: 36871
Vocabulary Length After Min/Max Removal: 885


In [None]:
#Writing out BOW for each article

#create empty article dictionary: every vocabulary word mapped to a zero count
article_dict = dict.fromkeys(word_dict, 0)

#only keep the first 1000 articles of each class for quicker computation
max_articles = 1000

#(articles, label) pairs: label 1 marks fake articles, label 0 marks real ones
labelled_articles = ((df_fake, 1), (df_real, 0))

#'with' closes the file even if processing an article raises; the explicit
#encoding keeps the file readable by pd.read_csv regardless of platform default
with open('news_data.csv', 'w', encoding='utf-8') as fout:
  #write the vocab out as the header line, plus target_label for the label column
  fout.write(','.join(vocab) + ',target_label\n')

  for articles, label in labelled_articles:
    for text in articles.iloc[:max_articles]:
      #fresh zeroed counts for each article; article_dict itself stays all-zero
      counts = dict.fromkeys(article_dict, 0)
      for word in clean_article(text):
        if word in counts:  #words outside the vocabulary are ignored
          counts[word] += 1

      #one CSV row: the counts in vocabulary order followed by the label
      fout.write(','.join(str(c) for c in counts.values()) + f',{label}\n')

In [None]:
#Create and display a Decision Tree:
import pandas as pd
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier

df_train = pd.read_csv('news_data.csv')
#get feature names: every column except the last one (target_label)
list_of_features = df_train.keys()[:-1]
#get X and y (use .values so the model is fit on a plain array and you don't
#get all those feature-name warnings when predicting on plain lists)
X = df_train[list_of_features].values
y = df_train['target_label']

#train DT (fixed random_state so the same tree is built on every run)
dtree = DecisionTreeClassifier(random_state=0)
dtree = dtree.fit(X, y)

#print DT
#feature_names is passed as a plain list of strings; a pandas Index is not
#accepted by every scikit-learn release. The trailing ';' hides the long
#list of annotation objects that plot_tree returns.
tree.plot_tree(dtree, feature_names=list(list_of_features));


In [None]:
#Test our decision tree on more data from Fake.csv and True.csv

#rows 1000-9999 of each file (news_data.csv was built from the first 1000 rows)
df_fake = pd.read_csv("Fake.csv")['text'].iloc[1000:10000]
df_real = pd.read_csv("True.csv")['text'].iloc[1000:10000]

#test samples 1000-10000 from Fake and Real and get accuracies for both
#each entry: (name shown in the report, articles, label the model should predict)
for name, articles, expected in (("Fake", df_fake, 1), ("Real", df_real, 0)):
  rows = []
  for text in articles:
    #start every article from zero counts; article_dict (created when the
    #training CSV was written) supplies the vocabulary and its column order
    counts = dict.fromkeys(article_dict, 0)
    for word in clean_article(text):
      if word in counts:  #skip words that are not in the vocabulary
        counts[word] += 1
    rows.append(list(counts.values()))

  #nothing to score: avoid dividing by zero below
  if not rows:
    print(f'{name} Data Test Accuracy:  n/a (no articles)')
    continue

  #classify all articles in one call instead of one predict call per article
  predictions = dtree.predict(rows)
  correct = int((predictions == expected).sum())
  total = len(rows)
  print(f'{name} Data Test Accuracy:  {round((correct / total) * 100, 2)}%')

Fake Data Test Accuracy:  93.91%
Real Data Test Accuracy:  98.27%


In [None]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier

#Reload the bag-of-words table from news_data.csv
df_train = pd.read_csv('news_data.csv')

#All columns except the last are features; target_label holds the class
list_of_features = df_train.columns[:-1]
y = df_train['target_label']
X = df_train[list_of_features].values

#Train KNN on the 3 nearest neighbours (fit returns the fitted classifier)
KNN = KNeighborsClassifier(n_neighbors = 3).fit(X, y)


In [None]:
#Test the KNN classifier on more data from Fake.csv and True.csv

#rows 1000-9999 of each file (news_data.csv was built from the first 1000 rows)
df_fake = pd.read_csv("Fake.csv")['text'].iloc[1000:10000]
df_real = pd.read_csv("True.csv")['text'].iloc[1000:10000]

#each entry: (name shown in the report, articles, label the model should predict)
for name, articles, expected in (("Fake", df_fake, 1), ("Real", df_real, 0)):
  rows = []
  for text in articles:
    #start every article from zero counts; article_dict (created when the
    #training CSV was written) supplies the vocabulary and its column order
    counts = dict.fromkeys(article_dict, 0)
    for word in clean_article(text):
      if word in counts:  #skip words that are not in the vocabulary
        counts[word] += 1
    rows.append(list(counts.values()))

  #nothing to score: avoid dividing by zero below
  if not rows:
    print(f'{name} Data Test Accuracy:  n/a (no articles)')
    continue

  #classify all articles in one call instead of one predict call per article
  predictions = KNN.predict(rows)
  correct = int((predictions == expected).sum())
  total = len(rows)
  print(f'{name} Data Test Accuracy:  {round((correct / total) * 100, 2)}%')

Fake Data Test Accuracy:  16.62%
Real Data Test Accuracy:  95.63%
