In [78]:
#Import required packages
import pandas as pd
import numpy as np
import pickle
import re


import nltk
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from gensim import corpora, models, similarities, matutils
# Surface gensim's progress messages by configuring the root logger at INFO level

import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s : %(levelname)s : %(message)s')

In [79]:
# Load the credible-news corpus (article text) from its pickle file
# NOTE: pickle.load can execute arbitrary code -- only open pickle files from a trusted source.
with open('./data/credible_news_df.pickle', 'rb') as corpus_handle:
    credible_news_df = pickle.load(corpus_handle)

In [80]:
# Load the not-credible ("fake") news articles from CSV into a DataFrame
not_credible_news_df = pd.read_csv(filepath_or_buffer='./data/Not_Credible/fake.csv')

### Approach to preprocessing

1- Remove capitalization and punctuation.

2- Remove overfit words/phrases: source names, format-specific words (e.g., one source listed the day of the week in the first line of every article), and phrases contained in every article (usually a header/footer).

3- Remove short words (words less than 3 characters long).

4- Remove stop words.

5- Convert numbers into words, or remove them.

6- Expand abbreviations.

7- Canonicalize the text.

# Clean Real News Text

In [81]:
# Text preprocessing steps - remove digit-bearing tokens, capital letters and ASCII punctuation
# from article text and title
import re
import string


def alphanumeric(x):
    """Replace every run of word characters that contains a digit with a single space."""
    # raw string: `\w` and `\d` must reach the regex engine, not be parsed as string escapes
    return re.sub(r'\w*\d\w*', ' ', x)


def punc_lower(x):
    """Lowercase ``x`` and replace each character of ``string.punctuation`` with a space."""
    return re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())


credible_news_df['text'] = credible_news_df.text.map(alphanumeric).map(punc_lower)
credible_news_df['title'] = credible_news_df.title.map(alphanumeric).map(punc_lower)

In [82]:
# Remove any remaining character that is neither a word character nor whitespace
# (e.g. dashes and curly quotes that are not part of string.punctuation).
# Each one becomes a space rather than an empty string, so words that were separated only
# by such a character are not fused together; surplus spaces are collapsed in the next step.
credible_news_df['text'] = credible_news_df['text'].apply(lambda x: re.sub(r'[^\w\s]', ' ', x))
credible_news_df['title'] = credible_news_df['title'].apply(lambda x: re.sub(r'[^\w\s]', ' ', x))

In [83]:
# Collapse every run of whitespace into a single space and trim both ends
credible_news_df['text'] = credible_news_df['text'].map(lambda raw: ' '.join(raw.split()))
credible_news_df['title'] = credible_news_df['title'].map(lambda raw: ' '.join(raw.split()))

In [84]:
# Keep only words of three or more characters (raw string so `\w` is a regex escape,
# not an invalid string escape)
credible_news_df['text'] = credible_news_df['text'].str.findall(r'\w{3,}').str.join(' ')

In [85]:
# Keep only words of three or more characters (raw string so `\w` is a regex escape,
# not an invalid string escape)
credible_news_df['title'] = credible_news_df['title'].str.findall(r'\w{3,}').str.join(' ')

In [86]:
# Attach the classification target:
#   1 -> article is not real
#   0 -> article is real
credible_news_df = credible_news_df.assign(Not_Real_or_Real=0)

In [87]:
# Persist the cleaned credible-news frame to disk
with open('./data/credible_news_df_cleaned.pickle', 'wb') as out_handle:
    pickle.dump(credible_news_df, out_handle)

# Clean Fake News Text

In [88]:
# Keep the rows labelled 0 through 2500 of the fake-news data (`.loc` slices include the end
# label). Per the original note this is meant to be about 1.5x the size of the real-news set
# -- TODO confirm against len(credible_news_df).
# `.copy()` detaches the subset from the full frame: the following cells mutate it in place
# and add columns, which on a bare slice triggers SettingWithCopyWarning.
not_credible_news_df = not_credible_news_df.loc[0:2500].copy()

In [89]:
# Discard every row that has a missing value in any column, then rebuild a fresh
# 0..n-1 index; the previous index labels are kept as a new 'index' column.
not_credible_news_df = not_credible_news_df.dropna().reset_index()

In [90]:
# Text preprocessing steps - remove digit-bearing tokens, capital letters and ASCII punctuation
# from article text and title
import re
import string


def alphanumeric(x):
    """Replace every run of word characters that contains a digit with a single space."""
    # raw string: `\w` and `\d` must reach the regex engine, not be parsed as string escapes
    return re.sub(r'\w*\d\w*', ' ', x)


def punc_lower(x):
    """Lowercase ``x`` and replace each character of ``string.punctuation`` with a space."""
    return re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())


not_credible_news_df['text'] = not_credible_news_df.text.map(alphanumeric).map(punc_lower)
not_credible_news_df['title'] = not_credible_news_df.title.map(alphanumeric).map(punc_lower)

In [91]:
# Remove any remaining character that is neither a word character nor whitespace
# (e.g. dashes and curly quotes that are not part of string.punctuation).
# Each one becomes a space rather than an empty string, so words that were separated only
# by such a character are not fused together; surplus spaces are collapsed in the next step.
not_credible_news_df['text'] = not_credible_news_df['text'].apply(lambda x: re.sub(r'[^\w\s]', ' ', x))
not_credible_news_df['title'] = not_credible_news_df['title'].apply(lambda x: re.sub(r'[^\w\s]', ' ', x))

In [92]:
# Collapse every run of whitespace into a single space and trim both ends
not_credible_news_df['text'] = not_credible_news_df['text'].map(lambda raw: ' '.join(raw.split()))
not_credible_news_df['title'] = not_credible_news_df['title'].map(lambda raw: ' '.join(raw.split()))

In [93]:
# Keep only words of three or more characters (raw string so `\w` is a regex escape,
# not an invalid string escape)
not_credible_news_df['text'] = not_credible_news_df['text'].str.findall(r'\w{3,}').str.join(' ')

In [94]:
# Keep only words of three or more characters (raw string so `\w` is a regex escape,
# not an invalid string escape)
not_credible_news_df['title'] = not_credible_news_df['title'].str.findall(r'\w{3,}').str.join(' ')

In [95]:
# Cast the row labels to strings and rename the 'index' column to 'label'
not_credible_news_df = not_credible_news_df.rename(index=str, columns={"index": "label"})

In [96]:
# Overwrite the 'label' column with 0 for every row.
# Bracket assignment is used because attribute assignment (`df.label = 0`) only updates an
# existing column; if 'label' were missing it would silently set a plain Python attribute
# instead of creating the column.
not_credible_news_df['label'] = 0

In [97]:
# Attach the classification target:
#   1 -> article is not real
#   0 -> article is real
not_credible_news_df = not_credible_news_df.assign(Not_Real_or_Real=1)

In [98]:
# Persist the cleaned not-credible-news frame to disk
with open('./data/not_credible_news_df_cleaned.pickle', 'wb') as out_handle:
    pickle.dump(not_credible_news_df, out_handle)

# Combine Reduced News Dataframes

In [99]:
# Stack the credible and not-credible article frames into one, renumbering rows from 0
news_data_frame = pd.concat(objs=[credible_news_df, not_credible_news_df], ignore_index=True)

In [100]:
# Renumber the rows 0..n-1 and discard the previous index labels
news_data_frame = news_data_frame.reset_index(drop=True, inplace=False)

In [101]:
# Preview the first three rows of the combined frame
news_data_frame.head(n=3)

Unnamed: 0,Not_Real_or_Real,label,text,title
0,0,0,the ban united states companies selling parts ...,huawei blacklisted starting friday
1,0,0,washington hatice cengiz the fiancee murdered ...,world has done nothing khashoggi fiancee gives...
2,0,0,least six civilians including women and childr...,saudi uae coalition carries out deadly air rai...


In [102]:
# Preview the last three rows of the combined frame
news_data_frame.tail(n=3)

Unnamed: 0,Not_Real_or_Real,label,text,title
4423,1,0,dear president how many jobs will building wal...,dear president how many jobs will building wal...
4424,1,0,why the trump reflation trade for chumps david...,why the trump reflation trade for chumps
4425,1,0,chart the day the trump rally all dow mind the...,chart the day the trump rally all dowmind the ...


In [103]:
# Persist the combined real + fake frame to disk
with open('./data/news_fake_real_df_reduced.pickle', 'wb') as out_handle:
    pickle.dump(news_data_frame, out_handle)

### Tokenization
Tokenization is the process of splitting the given text into smaller pieces called tokens.

In [104]:
# nltk.download('all')

In [105]:
# Build the English stop-word lookup set used later to filter tokens
from nltk.corpus import stopwords
stop_words = {word for word in stopwords.words('english')}

In [106]:
# Split each article body into a list of word tokens
from nltk.tokenize import word_tokenize
news_data_frame['tokenized_text'] = news_data_frame['text'].map(word_tokenize)

In [107]:
# Split each article title into a list of word tokens
news_data_frame['tokenized_title'] = news_data_frame['title'].map(word_tokenize)

In [108]:
# Drop every token that appears in the stop-word set, for bodies and titles alike
news_data_frame['tokenized_text'] = news_data_frame['tokenized_text'].map(
    lambda tokens: [token for token in tokens if token not in stop_words]
)
news_data_frame['tokenized_title'] = news_data_frame['tokenized_title'].map(
    lambda tokens: [token for token in tokens if token not in stop_words]
)

In [109]:
# # #Save Updated Data Frame
# with open('./data/news_fake_real_df_reduced_token.pickle', 'wb') as file:
#       pickle.dump(news_data_frame, file)

### TF-IDF Vectorization
- TF-IDF (term frequency–inverse document frequency) gives the relative importance of a term within a corpus (text data).

In [110]:
#tfid_vectorization function
def tfid_vectorization(df, column_to_vectorize=None, vectorized_name=None):
    """Add a column of per-document TF-IDF vectors to ``df``.

    A ``TfidfVectorizer`` with default settings is fitted on ``df[column_to_vectorize]``
    and the vector of each document is stored in ``df[vectorized_name]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the documents; it is modified in place.
    column_to_vectorize : str
        Name of the column containing the text of each document.
    vectorized_name : str
        Name of the column that receives the TF-IDF vectors.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``vectorized_name`` added (or overwritten).
    """
    # list of text documents
    article = df[column_to_vectorize]

    # learn the vocabulary / idf weights and encode the whole corpus in a single pass
    # (one transform call per document would repeat the same setup for every row)
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(article)

    # store row i of the corpus matrix alongside document i; the i:i+1 slice keeps every
    # entry two-dimensional, with shape (1, vocabulary size)
    n_documents = tfidf_matrix.shape[0]
    row_vectors = np.empty(n_documents, dtype=object)
    for row in range(n_documents):
        row_vectors[row] = tfidf_matrix[row:row + 1]
    df[vectorized_name] = row_vectors

    print('Tfid Vectorization Completed \n')

    return df

In [111]:
# Add TF-IDF vectors for the article bodies (the function returns the frame it was given)
news_data_frame = tfid_vectorization(
    df=news_data_frame,
    column_to_vectorize='text',
    vectorized_name='tfid_vec_text',
)

Tfid Vectorization Completed 



In [112]:
# Add TF-IDF vectors for the article titles (the function returns the frame it was given)
news_data_frame = tfid_vectorization(
    df=news_data_frame,
    column_to_vectorize='title',
    vectorized_name='tfid_vec_title',
)

Tfid Vectorization Completed 



### Lexicon Normalization
- Lexicon normalization converts all the different forms of a word into a single normalized form. As a feature-engineering step for text, it maps a high-dimensional feature space (N different features) to a low-dimensional one (1 feature), which is desirable for any ML model.

### Stemming using NLTK
- Stemming is a process of linguistic normalization that reduces words to their root form by chopping off derivational affixes. For example, 'connection', 'connected', and 'connecting' all reduce to the common stem 'connect'.

In [113]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Create the Porter stemmer (and a WordNet lemmatizer instance) once, up front
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Reduce every article-body token to its Porter stem
news_data_frame['stemmed_text'] = news_data_frame['tokenized_text'].map(
    lambda tokens: [stemmer.stem(token) for token in tokens]
)

# Reduce every title token to its Porter stem
news_data_frame['stemmed_title'] = news_data_frame['tokenized_title'].map(
    lambda tokens: [stemmer.stem(token) for token in tokens]
)

### Lemmatization using NLTK

In [114]:
from nltk.stem import WordNetLemmatizer

# One shared WordNet lemmatizer for bodies and titles
lemmatizer = WordNetLemmatizer()

# Map every article-body token to its lemma (lemmatize is called with its default settings)
news_data_frame['lemmatized_text'] = news_data_frame['tokenized_text'].map(
    lambda tokens: [lemmatizer.lemmatize(token) for token in tokens]
)

# Map every title token to its lemma
news_data_frame['lemmatized_title'] = news_data_frame['tokenized_title'].map(
    lambda tokens: [lemmatizer.lemmatize(token) for token in tokens]
)

### POS and Chunking Text
- Helps overcome a weakness of the bag-of-words model, which fails to capture the structure of sentences and therefore sometimes misses a word's appropriate meaning.

In [115]:
#POS
# Each document's tokens are tagged in a single `nltk.pos_tag` call so the tagger can use the
# neighbouring tokens as context; tagging one word per call gives it no context and repeats
# the call overhead for every token.
# Every (token, tag) pair is wrapped in its own one-element list, giving
# [[(token, tag)], [(token, tag)], ...] per document.

#POS Text
news_data_frame['pos_text'] = news_data_frame['tokenized_text'].apply(
    lambda tokens: [[tagged] for tagged in nltk.pos_tag(tokens)]
)

#POS Titles
news_data_frame['pos_title'] = news_data_frame['tokenized_title'].apply(
    lambda tokens: [[tagged] for tagged in nltk.pos_tag(tokens)]
)

In [116]:
# The 'label' column is not needed downstream; remove it
news_data_frame = news_data_frame.drop(columns=['label'])

In [117]:
# Persist the fully preprocessed frame to disk
with open('./data/news_data_frame_reduced_preprocessed.pickle', 'wb') as out_handle:
    pickle.dump(news_data_frame, out_handle)