# Fake News Project
The goal of this project is to create a fake news prediction system. Fake news is a major problem that can have serious negative effects on how people understand the world around them. You will work with a dataset containing real and fake news in order to train a simple and a more advanced classifier to solve this problem. This project covers the full Data Science pipeline, from data processing, to modelling, to visualization and interpretation.
## Part 1 Data Processing

### Task 1

In [1]:
import pandas as pd 
# Load the sample corpus; `df` keeps the raw rows, `dfcpy` is an independent
# working copy restricted to rows that actually have article text.
df = pd.read_csv("news_sample.csv")
dfcpy = df.dropna(subset=['content']).copy()

In [2]:
import pandas as pd 
import re
import nltk
# nltk.download('punkt')
# nltk.download('stopwords')
from nltk.tokenize.regexp import RegexpTokenizer
from nltk.stem import PorterStemmer
from nltk.lm import Vocabulary
from nltk.probability import FreqDist
from cleantext import clean
import concurrent.futures
import threading


def clean_text(text):
  """Normalise one raw article body before tokenisation.

  First, anything shaped like ``<Capitalised word> <day>, <year>`` (for
  example ``March 5, 2018`` or ``Jan. 5, 2018``) is collapsed into the
  placeholder ``<DATE>``.  The result is then handed to ``cleantext.clean``
  with the options listed below: lower-casing, placeholders for URLs,
  e-mail addresses, numbers and currency symbols, punctuation removal and
  line-break removal.

  Args:
    text: the raw article text (a string; it is passed directly to
      ``re.sub``).

  Returns:
    The value returned by ``clean`` for the date-substituted text.
  """
  # Dates are replaced *before* calling clean() so that their digits are not
  # turned into number placeholders.
  # The word part is a capital followed by ASCII letters only: the range
  # ``A-z`` would also admit ``[ \ ] ^ _`` and the backtick, which sit
  # between 'Z' and 'a' in ASCII.  ``\.?`` allows only the full stop of an
  # abbreviated month, whereas a bare ``.?`` would allow any character.
  dated = re.sub(r'([A-Z][A-Za-z]+\.?) ([0-9]{1,2}), ([0-9]{4})', '<DATE>', text)
  return clean(dated,
    lower=True,
    no_urls=True, replace_with_url="<URL>",
    no_emails=True, replace_with_email="<EMAIL>",
    no_numbers=True, replace_with_number= r"<NUM>",
    no_currency_symbols=True, replace_with_currency_symbol="<CUR>",
    no_punct=True, replace_with_punct="",
    no_line_breaks=True
  )

def rmv_stopwords(tokens):
  """Return the tokens that are not in NLTK's English stopword list.

  The relative order of the surviving tokens is unchanged.
  """
  english_stopwords = set(nltk.corpus.stopwords.words('english'))
  kept = []
  for token in tokens:
    if token in english_stopwords:
      continue
    kept.append(token)
  return kept

def stem_tokens(tokens):
  """Return a list holding the Porter stem of every token, in input order."""
  porter = PorterStemmer()
  return list(map(porter.stem, tokens))

# build a vocabulary (token -> number of occurrences) from an iterable of token lists
def build_vocabulary(df_tokens):
  """Count every token found in `df_tokens`.

  Args:
    df_tokens: an iterable whose items are themselves iterables of tokens
      (here: the ``tokenized`` column, one token list per article).

  Returns:
    A ``FreqDist`` over all tokens of all items.
  """
  # Walk the nested structure once, feeding the tokens straight to FreqDist.
  return FreqDist(token for token_list in df_tokens for token in token_list)

In [3]:
# df = pd.read_csv(file)
# Start again from the raw frame, but without rows that have no article text:
# clean_text() passes its argument straight to re.sub(), which rejects NaN.
dfcpy = df.dropna(subset=['content']).copy()

dfcpy.content = dfcpy.content.apply(clean_text)

# A token is either a <placeholder> tag or a run of word characters.
tokenizer = RegexpTokenizer(r'<[\w]+>|[\w]+')
dfcpy["tokenized"] = dfcpy.content.apply(tokenizer.tokenize)

# Vocabulary size = number of distinct tokens in the frequency distribution.
vocab = build_vocabulary(dfcpy.tokenized)
vocab_size = vocab.B()
print("After cleaning:")
print(f"vocabulary size: {vocab_size}\n")

dfcpy.tokenized = dfcpy.tokenized.apply(rmv_stopwords)
vocab = build_vocabulary(dfcpy.tokenized)
# reduction of the vocabulary size, as an absolute number of distinct tokens
reduction = vocab_size - vocab.B()
vocab_size = vocab.B()
print("After removing stopwords:")
print(f"vocabulary size: {vocab_size}")
print(f"reduction rate of the vocabulary size: {reduction} words\n")

dfcpy.tokenized = dfcpy.tokenized.apply(stem_tokens)
vocab = build_vocabulary(dfcpy.tokenized)
# same measure again, now relative to the stopword-free vocabulary
reduction = vocab_size - vocab.B()
vocab_size = vocab.B()
print("After stemming:")
print(f"vocabulary size: {vocab_size}")
print(f"reduction rate of the vocabulary size: {reduction} words\n")

# TODO: turn the tokenized column into a string with a whitespace separator

After cleaning:
vocabulary size: 16577

After removing stopwords:
vocabulary size: 16445
reduction rate of the vocabulary size: 132 words

After stemming:
vocabulary size: 11031
reduction rate of the vocabulary size: 5414 words




1. counting the number of URLs in the content
2. counting the number of dates in the content
3. counting the number of numeric values in the content
4. determining the 100 most frequent words that appear in the content
5. plot the frequency of the 10000 most frequent words (any interesting patterns?)
6. run the analysis in point 4 and 5 both before and after removing stopwords and applying stemming: do you see any difference?


In [None]:
import pandas as pd 
# import dask.dataframe as dd

# df = pd.read_csv("995,000_rows.csv", usecols=['id','content', 'type', 'url', 'title', 'authors', 'domain'], engine='c', dtype = str)
# dfcpy = df.copy()
# dfcpy = dfcpy.dropna(subset=['id'])
# dfcpy = dfcpy.dropna(subset=['content'])
# dfcpy = dfcpy.dropna(subset=['type'])
# ddf = dd.from_pandas(dfcpy, npartitions=10) # find your own number of partitions

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# plot the frequency of the top n words
def plot_freq(fq, top_n):
  """Plot the `top_n` most frequent words of `fq`, words along the x axis.

  The counts are drawn twice on the same figure: as a red line
  (``sns.lineplot``) and as bars (``sns.barplot``).

  Args:
    fq: a frequency distribution providing ``most_common(n)``, such as the
      ``FreqDist`` returned by ``build_vocabulary``.
    top_n: how many of the most frequent words to show.  It also drives the
      figure width (0.1 inch per word).

  Returns:
    None; the figure is displayed with ``plt.show()``.
  """
  common_words = fq.most_common(top_n)
  # convert the list of (word, count) tuples to a dictionary
  all_freq = dict(common_words)
  # create a plot
  # the plot must be less than 2^16 pixels in each direction
  plt.figure(figsize = (top_n*0.1, 5))
  plt.xticks(rotation = 90,fontsize = 5)
  # one y tick every 300 occurrences, up to the highest count
  plt.yticks(range(0, max(all_freq.values())+1, 300))
  sns.lineplot(x = list(all_freq.keys()), y = list(all_freq.values()), color = 'red')
  sns.barplot(x = list(all_freq.keys()), y = list(all_freq.values()))
  plt.title(f'Top {top_n} most common words')
  plt.xlabel('Words')
  plt.ylabel('Frequency')
  plt.grid(axis = 'y')
  plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# horizontal variant: plot the frequency of the top n words, one bar per word
def plot_freq1(fq, top_n):
  """Show the `top_n` most frequent words of `fq` as a horizontal bar chart.

  Words run down the y axis and their counts along the x axis, on a
  5 x 16 inch figure.  Returns None after ``plt.show()``.
  """
  ranking = dict(fq.most_common(top_n))
  words = list(ranking.keys())
  counts = list(ranking.values())

  plt.figure(figsize=(5, 16))
  plt.xticks(rotation=90,)
  sns.barplot(x = counts, y = words)
  plt.title(f'Top {top_n} most common words')
  plt.xlabel('Frequency')
  plt.ylabel('Words')
  plt.show()
  return None

In [None]:
# ddf_update = ddf.content.apply(clean_text).compute()
# Replace every article body by its cleaned form (see clean_text).
dfcpy["content"] = dfcpy["content"].apply(clean_text)

In [None]:
# Split each cleaned body into tokens: a token is either a <placeholder> tag
# or a run of word characters.
tokenizer = RegexpTokenizer(r'<[\w]+>|[\w]+')
dfcpy["tokenized"] = dfcpy["content"].apply(tokenizer.tokenize)

In [None]:
# Drop English stopwords from every token list.
dfcpy["tokenized"] = dfcpy["tokenized"].apply(rmv_stopwords)

In [None]:
# Reduce every remaining token to its Porter stem.
dfcpy["tokenized"] = dfcpy["tokenized"].apply(stem_tokens)

In [None]:
# Token frequencies over the whole processed corpus.
vocab = build_vocabulary(dfcpy["tokenized"])

In [None]:
# Flatten each token list into one space-separated string and persist the frame.
dfcpy["tokenized"] = dfcpy["tokenized"].apply(' '.join)
dfcpy.to_csv('cleaned_news_sample.csv', index=False)

# Occurrences of the placeholder tokens; a token that never occurs counts as 0.
url_freq, date_freq, num_freq = (
  vocab.get(placeholder, 0) for placeholder in ("<url>", "<date>", "<num>")
)
print(f"Number of URLs in the content: {url_freq}")
print(f"Number of dates in the content: {date_freq}")
print(f"Number of numerics in the content: {num_freq}")
plot_freq(vocab, 100)


# Task 4

In [34]:
from sklearn.model_selection import train_test_split

df = pd.read_csv('cleaned_news_sample.csv')
dfcpy = df.copy()

# The tokenized column is kept as one whitespace-joined string per article:
# the TfidfVectorizer used for the simple model takes raw strings, so it is
# not split back into a list of words.
# An article left without any token was written as an empty CSV field and is
# read back as NaN, which the vectorizer rejects; restore the empty string.
dfcpy['tokenized'] = dfcpy['tokenized'].fillna('')

X = dfcpy['tokenized']
y = dfcpy['type']

train_ratio = 0.80
validation_ratio = 0.10
test_ratio = 0.10

# First cut: 80 % training vs. 20 % hold-out.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 1 - train_ratio, random_state = 42)
# Second cut: divide the hold-out between validation and test according to
# their ratios (half each here).
x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, test_size= test_ratio / (test_ratio + validation_ratio), random_state = 42)

print("Training Set:")
print(x_train.head())

print("\nValidation Set:")
print(x_val.head())

print("\nTest Set:")
print(x_test.head())




Training Set:
79     past <num> month other repeatedli warn isi pos...
161    trend global connect enterpris market <num> an...
112    <num> principl govern success peopl headlin bi...
109    who poster talk headlin bitcoin blockchain sea...
101    son god goe forth war reader think stori fact ...
Name: tokenized, dtype: object

Validation Set:
154    fed seiz <cur> <num> million bitcoin alleg sil...
153    obama lawless presid us histori titl bush head...
119    easili understand differ day christ day lord e...
104    play hitler card headlin bitcoin blockchain se...
30     greg hunter big bank big troubl syrianorth kor...
Name: tokenized, dtype: object

Test Set:
137    hubbl captur incred rare imag explod star read...
214    disturb sugar daddysugar babi relationship don...
148    worldwid laser technolog market analysi report...
82     interview princ harri septemb former presid ba...
73     richmond fed suffer biggest <num> month drop s...
Name: tokenized, dtype: object


# Part 2: A simple model

In [38]:
import pandas as pd
# from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
import sklearn.metrics as metrics

# df = pd.read_csv('cleaned_news_sample.csv')
# dfcpy = df.copy()

# Binary target used by the simple model: 1 for the types treated as fake,
# 0 for the others listed here.  Types absent from the mapping become NaN
# and are left out.
label_map = {'fake': 1, 'conspiracy': 1, 'junksci': 1, 'clickbait': 0, 'political': 0, 'reliable': 0}

# give each row a 0/1 column depending on whether the type counts as fake
dfcpy['fake'] = dfcpy['type'].map(label_map)
dfcpy = dfcpy.dropna(subset=['fake']).astype({'fake': int})


def binary_subset(texts, types):
  """Return (texts, 0/1 labels) restricted to rows whose type is in label_map.

  `texts` and `types` must share the same index, as the pairs produced by
  train_test_split do.  Missing texts are replaced by '' because the
  vectorizer does not accept NaN documents.
  """
  labels = types.map(label_map).dropna().astype(int)
  return texts.loc[labels.index].fillna(''), labels


# The splits were made on the multi-class `type` column; convert each of them
# to the binary target so that training and the binary metrics below agree.
x_train_bin, y_train_bin = binary_subset(x_train, y_train)
x_val_bin, y_val_bin = binary_subset(x_val, y_val)
x_test_bin, y_test_bin = binary_subset(x_test, y_test)

# Fit the TF-IDF vocabulary on the training texts only, then reuse it.
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(x_train_bin)
X_test_vectorized = vectorizer.transform(x_test_bin)
x_val_vectorized = vectorizer.transform(x_val_bin)

model = LogisticRegression()
model.fit(X_train_vectorized, y_train_bin)
y_pred = model.predict(x_val_vectorized)

# score the model on the validation split; label 1 ("fake") is the positive class
accuracy_score = metrics.accuracy_score(y_val_bin, y_pred)
recall_score = metrics.recall_score(y_val_bin, y_pred, average='binary', pos_label = 1)
precision_score = metrics.precision_score(y_val_bin, y_pred, average='binary', pos_label = 1)
f1_score = metrics.f1_score(y_val_bin, y_pred, average='binary', pos_label = 1)

print(f"Accuracy: {accuracy_score}")
print(f"Recall: {recall_score}")
print(f"Precision: {precision_score}")
print(f"F1: {f1_score}")


  (0, 8233)	0.018634608315950867
  (0, 3838)	0.04953985673130612
  (0, 8331)	0.05350026514226691
  (0, 8503)	0.03880908542344456
  (0, 4178)	0.028078314115583005
  (0, 7800)	0.05350026514226691
  (0, 2998)	0.044550332216270805
  (0, 3555)	0.02399805672888513
  (0, 4993)	0.05350026514226691
  (0, 5895)	0.04953985673130612
  (0, 9200)	0.044550332216270805
  (0, 7019)	0.10700053028453382
  (0, 8329)	0.07761817084688911
  (0, 1570)	0.05350026514226691
  (0, 6911)	0.03521730826924733
  (0, 3631)	0.033819560908409256
  (0, 6522)	0.04126381643643715
  (0, 9041)	0.033819560908409256
  (0, 1467)	0.039959539348465405
  (0, 8511)	0.05350026514226691
  (0, 3909)	0.033819560908409256
  (0, 5697)	0.016210496666516805
  (0, 4698)	0.02964447201004746
  (0, 2772)	0.037546315433279076
  (0, 5641)	0.0310096064224693
  :	:
  (174, 6670)	0.0838684910594962
  (174, 2849)	0.07855811120528691
  (174, 4214)	0.06959456210324742
  (174, 4713)	0.0704465373491803
  (174, 8742)	0.2535976921468615
  (174, 5418)	0.04