# Fake News Project
The goal of this project is to create a fake news prediction system. Fake news is a major problem that can have serious negative effects on how people understand the world around them. You will work with a dataset containing real and fake news in order to train a simple and a more advanced classifier to solve this problem. This project covers the full Data Science pipeline, from data processing, to modelling, to visualization and interpretation.
## Part 1 Data Processing

### Task 1

In [56]:
import pandas as pd
import re
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.stem import PorterStemmer
from nltk.lm import Vocabulary
from functools import reduce
from cleantext import clean

# Long-form English dates such as "March 5, 2018" or "Jan. 12, 1999":
# a capitalised word, an optional abbreviation dot, a 1-2 digit day, a comma,
# and a 4-digit year. The first word is NOT checked against real month names,
# so e.g. "Chapter 5, 2018" is also treated as a date.
DATE_PATTERN = re.compile(r'[A-Z][A-Za-z]+\.? [0-9]{1,2}, [0-9]{4}')


def clean_text(text):
  """Normalise raw article text before tokenisation.

  Dates matching DATE_PATTERN are replaced with ``<DATE>`` first, so that the
  day and year are not later turned into separate number placeholders. The
  result is then passed to ``cleantext.clean`` with these options enabled:
  lower-casing, URL / e-mail / number / currency-symbol replacement by the
  placeholders given below, punctuation removal, and line-break removal.

  Because ``lower=True`` is set, the placeholders show up lower-cased in the
  cleaned text (the notebook output contains ``<num>``, ``<date>``, ``<url>``).

  Args:
    text: the raw article text (a string).

  Returns:
    The cleaned string returned by ``cleantext.clean``.
  """
  # The character class is [A-Za-z] (not [A-z], which also admits "[\]^_`"),
  # and the abbreviation dot is escaped so it cannot match arbitrary characters.
  dated = DATE_PATTERN.sub('<DATE>', text)
  return clean(
    dated,
    lower=True,
    no_urls=True, replace_with_url="<URL>",
    no_emails=True, replace_with_email="<EMAIL>",
    no_numbers=True, replace_with_number="<NUM>",
    no_currency_symbols=True, replace_with_currency_symbol="<CUR>",
    no_punct=True, replace_with_punct="",
    no_line_breaks=True,
  )
def tokenize(text):
  """Split *text* into a list of tokens using NLTK's ``word_tokenize``."""
  return nltk.word_tokenize(text)
# Stop-word sets that have already been loaded, keyed by language name.
# Filled lazily so the corpus is read once rather than on every call.
_STOP_WORDS_CACHE = {}


def rmv_stopwords(tokens):
  """Return *tokens* with NLTK's English stop words filtered out.

  The stop-word set is loaded from ``nltk.corpus.stopwords`` on the first call
  and reused afterwards; this function is meant to be applied row by row, and
  rebuilding the set for every row is wasted work.

  Matching is an exact string comparison, so a token only counts as a stop
  word if it is spelled (and cased) exactly like the corpus entry.

  Args:
    tokens: an iterable of token strings.

  Returns:
    A new list with the remaining tokens in their original order.
  """
  stop_words = _STOP_WORDS_CACHE.get('english')
  if stop_words is None:
    # Only cached after a successful load, so a failed lookup is retried
    # on the next call.
    stop_words = frozenset(nltk.corpus.stopwords.words('english'))
    _STOP_WORDS_CACHE['english'] = stop_words
  return [word for word in tokens if word not in stop_words]

def stem_tokens(tokens):
  """Return a list with the Porter stem of each token, in the same order."""
  porter = PorterStemmer()
  return list(map(porter.stem, tokens))

# build a vocabulary from a dataframe with list of tokens
# def build_vocabulary(df_tokens):
#   print(type(df_tokens))
#   tokens = " ".join(df_tokens)
#   return tokens 


[nltk_data] Downloading package punkt to /home/katikistan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/katikistan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [57]:
# Load the sample data and work on a copy so the frame as read stays untouched.
df = pd.read_csv('news_sample.csv')
clean_df = df.copy()

# Clean every article body in place, then tokenise it into a new column.
clean_df["content"] = clean_df["content"].apply(clean_text)
clean_df["tokenized"] = clean_df["content"].apply(tokenize)

# Steps that are currently switched off:
# clean_df["tokenized"] = clean_df["content"].apply(process_text)
# clean_df["tokenized"] = clean_df["tokenized"].apply(rmv_stopwords)
# clean_df["tokenized"] = clean_df["tokenized"].apply(stem_tokens)

# Print the token list of the row labelled 1 as a quick sanity check.
print(clean_df.loc[1, "tokenized"])
  


['awakening', 'of', '<', 'num', '>', 'strands', 'of', 'dna', 'reconnecting', 'with', 'you', 'movie', 'of', 'readers', 'think', 'this', 'story', 'is', 'fact', 'add', 'your', 'two', 'cents', 'headline', 'bitcoin', 'blockchain', 'searches', 'exceed', 'trump', 'blockchain', 'stocks', 'are', 'next', '<', 'date', '>', 'zurichtimesnet', 'as', 'miles', 'johnston', 'was', 'giving', 'update', 'it', 'was', 'another', 'case', 'of', 'strange', 'synchronicities', 'of', 'goodness', 'hidden', 'inside', 'of', 'tests', 'and', 'trials', 'like', 'a', 'follow', 'the', 'whiterabbit', 'down', 'the', 'rabbit', 'hole', 'type', 'of', 'exercise', 'in', 'researching', 'the', '<', 'num', '>', 'strands', 'of', 'dna', 'we', 'came', 'across', 'some', 'articles', 'one', 'in', 'particular', 'was', 'as', 'a', 'strange', 'synchronicity', 'written', 'exactly', '<', 'num', '>', 'year', 'ago', 'on', 'the', 'same', 'topic', '<', 'url', '>', '<', 'url', '>', 'what', 'are', 'the', '<', 'num', '>', 'strands', 'of', 'our', 'dna'