In [None]:
''' 
    /*----------------------------- IMPORT_LIBRARIES -------------
'''
import re
import string
import pickle
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder     
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier          
from sklearn.model_selection import train_test_split
from prettytable import PrettyTable   

import warnings
# Silence every warning for the rest of the session. Note that this also hides
# deprecation notices raised by the libraries imported above.
warnings.filterwarnings("ignore")

In [None]:
# Mount Google Drive into the Colab runtime at '/content/drive'; the cells
# below read their CSV files from, and write their artefacts to, that location.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
'''
    /*-------------------- LOAD_TRAINING_DATA ----------------
    | Function  : pd.read_csv()
    | Purpose   : Read the training dataset from a CSV file
    | Arguments :
    |       filepath : Full path of the CSV file on the mounted Drive
    | Return    :
    |       training_data : Dataset in DataFrame format
    *---------------------------------------------------------*/
'''

# Read the data in CSV format. read_csv() already returns a DataFrame, so no
# further conversion is needed.
training_data = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Fake_News/News-Training.csv')

# Kept so that later cells see the same global pandas display settings
# (no limit on the number of displayed rows/columns).
pd.set_option("display.max_rows", None, "display.max_columns", None)

# Preview the training data. Only the first rows are printed: dumping the whole
# frame floods the cell output and gets truncated by the notebook front end.
print("Training Data:")
print("=============\n")
print(training_data.head(10))
print("\nShape (rows, columns):", training_data.shape)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
20    Posted on October 29, 2016 by Dr. Eowyn \nEarl...  FAKE
21      Carol Adl in News // 0 Comments \nHundreds o...  FAKE
22    Wednesday 9 November 2016 by Lucas Wilde Ooh F...  FAKE
23    Donate The American Way: Socialism for the Ric...  FAKE
24    With Election Day only three days away, tensio...  FAKE
25    0 comments \nRetired assistant FBI director Ja...  FAKE
26    Photo of the day: Miss Russia at the internati...  FAKE
27    Leaked Email: ‚ÄòIf She Wins, Hillary Will Own...  FAKE
28    in: Corporate Takeover , Economy & Business , ...  FAKE
29    Comments \nA private group of technical expert...  FAKE
30    AHEAD of his much anticipated title fight agai...  FAKE
31    October 31, 2016 - Fort Russ -  Ruslan Ostashk...  FAKE
32    by MICHAEL TENNANT \nSeen any walnuts in your ...  FAKE
33    Home / Be The Change / Government Corruption /...  FAKE
34    Reinventing Democracy in America Starts by Vot...  FAKE
35   

In [None]:
'''
    /*-------------------- LOAD_TESTING_DATA ----------------
    | Function  : pd.read_csv()
    | Purpose   : Read the testing dataset from a CSV file
    | Arguments :
    |       filepath : Full path of the CSV file on the mounted Drive
    | Return    :
    |       testing_data : Dataset in DataFrame format
    *---------------------------------------------------------*/
'''

# Read the data in CSV format. read_csv() already returns a DataFrame, so no
# further conversion is needed.
testing_data = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Fake_News/News-Testing.csv')

# Kept so that later cells see the same global pandas display settings
# (no limit on the number of displayed rows/columns).
pd.set_option("display.max_rows", None, "display.max_columns", None)

# Preview the testing data. Only the first rows are printed: dumping the whole
# frame floods the cell output.
print("Testing Data:")
print("============\n")
print(testing_data.head(10))
print("\nShape (rows, columns):", testing_data.shape)

Testing Data:

                                                   text label
0     Daniel Greenfield, a Shillman Journalism Fello...  FAKE
1     Google Pinterest Digg Linkedin Reddit Stumbleu...  FAKE
2     ‚Äî Kaydee King (@KaydeeKing) November 9, 2016...  FAKE
3       \nI‚Äôm not an immigrant, but my grandparent...  FAKE
4     Share This Baylee Luciani (left), Screenshot o...  FAKE
5     Click Here To Learn More About Alexandra's Per...  FAKE
6     October 31, 2016 at 4:52 am \nPretty factual e...  FAKE
7     Shocking! Michele Obama & Hillary Caught Glamo...  FAKE
8     0 \nHillary Clinton has barely just lost the p...  FAKE
9     November 13, 2016 By 21wire Leave a Comment \n...  FAKE
10    Mises.org November 1, 2016 Inferno is a great ...  FAKE
11    Syrian War Report ‚Äì October 31, 2016: Al-Nus...  FAKE
12    Randy Maugans & Jeffrey Sewell | Metabiology f...  FAKE
13    In a previous article , I discussed stretching...  FAKE
14    Britain and EU After Brexit ( 31 ) 0 13 0 0 Br...

In [None]:
'''
    /*------------------------- DATA_UNDERSTANDING -----------------------
    |    Reports, for the Training Data:
    |    • the attribute (column) names
    |    • the total number of instances (rows)
    *--------------------------------------------------------------------*/
'''
# Heading and underline in a single call, followed by the column index
print("\n\nAttributes Names in Training Data:", "==================================\n", sep="\n")
print(training_data.columns)

# Row count is the first component of the frame's shape
print("\n\nTotal Number of Instances in Training Data:", training_data.shape[0])
print("==============================================\n")




Attributes Names in Training Data:

Index(['text', 'label'], dtype='object')


Total Number of Instances in Training Data: 5020



In [None]:
'''
    /*------------------------ PRE-PROCESSING-TRAINING-DATA -------------
    | Function  : Series.str.strip(), Series.str.lower() & Series.str.replace()
    | Purpose   : Perform following preprocessing on the 'text' column:
    |              • Trim leading / trailing whitespace
    |              • Lower case
    |              • Replace every run of non-word characters
    |                (punctuation, whitespace) with a single space
    | Arguments :
    |       text: Text to be pre-processed
    | Return    :
    |       text: Pre-processed text
    *-----------------------------------------------------------------*/
'''
# Apply the preprocessing to the input text of the training data in one
# vectorised pass. Collapsing whitespace separately is unnecessary because
# whitespace is itself matched by \W+.
training_data['text'] = (
    training_data['text']
    .str.strip()
    .str.lower()
    .str.replace(r'\W+', ' ', regex=True)
)

# NOTE: this binds a second name to the same DataFrame object (no copy is
# made), so training_data['text'] now holds the pre-processed text as well.
preprocessed_training_data = training_data

# Save the pre-processed training data into a CSV file
preprocessed_training_data.to_csv(r'preprocessed-training-data.csv', index = False, header=True)

# Preview only: printing the whole frame floods the cell output.
print("\nTraining Data After Pre-processing:")
print("====================================\n")
print(preprocessed_training_data.head(10))
print("\nShape (rows, columns):", preprocessed_training_data.shape)


Training Data After Pre-processing:

                                                   text label
0     hispanic crowd boos marco rubio off stage rafa...  FAKE
1                  posted nov 6th 2016 by madjez madjez  FAKE
2     written by daniel mcadams we were told that we...  FAKE
3     elections 2016 top democrats have repeatedly w...  FAKE
4     google pinterest digg linkedin reddit stumbleu...  FAKE
5      äπ ä arnaldo rodgers is a trained and educate...  FAKE
6     by sarah jones on fri oct 28th 2016 at 9 04 pm...  FAKE
7     the benedict arnold of the republican party pa...  FAKE
8     by jon rappoport äî jon rappoport äôs blog oct...  FAKE
9     duterte calls us admin äòmonkeys äô for haltin...  FAKE
10    us keen to keep south china sea nations buying...  FAKE
11    bias bashers vladimir putin condemns europe fo...  FAKE
12    is this a new escalation black chemtrails repo...  FAKE
13    breaking trump beating äúfederal investigation...  FAKE
14    print ed äì videos are sta

In [None]:
'''
    /*------------------------- DATA_UNDERSTANDING -----------------------
    |    Reports, for the Testing Data:
    |    • the attribute (column) names
    |    • the total number of instances (rows)
    *--------------------------------------------------------------------*/
'''
# Heading and underline in a single call, followed by the column index
print("\n\nAttributes Names in Testing Data:", "=================================\n", sep="\n")
print(testing_data.columns)

# Row count is the first component of the frame's shape
print("\n\nTotal Number of Instances in Testing Data:", testing_data.shape[0])
print("=============================================\n")




Attributes Names in Testing Data:

Index(['text', 'label'], dtype='object')


Total Number of Instances in Testing Data: 1258



In [None]:
'''
    /*------------------------ PRE-PROCESSING-TESTING-DATA -------------
    | Function  : Series.str.strip(), Series.str.lower() & Series.str.replace()
    | Purpose   : Perform following preprocessing on the 'text' column:
    |              • Trim leading / trailing whitespace
    |              • Lower case
    |              • Replace every run of non-word characters
    |                (punctuation, whitespace) with a single space
    | Arguments :
    |       text: Text to be pre-processed
    | Return    :
    |       text: Pre-processed text
    *-----------------------------------------------------------------*/
'''
# Apply the preprocessing to the input text of the testing data in one
# vectorised pass. Collapsing whitespace separately is unnecessary because
# whitespace is itself matched by \W+.
testing_data['text'] = (
    testing_data['text']
    .str.strip()
    .str.lower()
    .str.replace(r'\W+', ' ', regex=True)
)

# NOTE: this binds a second name to the same DataFrame object (no copy is
# made), so testing_data['text'] now holds the pre-processed text as well.
preprocessed_testing_data = testing_data

# Save the pre-processed testing data into a CSV file
preprocessed_testing_data.to_csv(r'preprocessed-testing-data.csv', index = False, header=True)

# Preview only: printing the whole frame floods the cell output.
print("\nTesting Data After Pre-processing:")
print("==================================\n")
print(preprocessed_testing_data.head(10))
print("\nShape (rows, columns):", preprocessed_testing_data.shape)


Testing Data After Pre-processing:

                                                   text label
0     daniel greenfield a shillman journalism fellow...  FAKE
1     google pinterest digg linkedin reddit stumbleu...  FAKE
2      äî kaydee king kaydeeking november 9 2016 the...  FAKE
3     i äôm not an immigrant but my grandparents are...  FAKE
4     share this baylee luciani left screenshot of w...  FAKE
5     click here to learn more about alexandra s per...  FAKE
6     october 31 2016 at 4 52 am pretty factual exce...  FAKE
7     shocking michele obama hillary caught glamoriz...  FAKE
8     0 hillary clinton has barely just lost the pre...  FAKE
9     november 13 2016 by 21wire leave a comment epi...  FAKE
10    mises org november 1 2016 inferno is a great t...  FAKE
11    syrian war report äì october 31 2016 al nusra ...  FAKE
12    randy maugans jeffrey sewell metabiology face ...  FAKE
13    in a previous article i discussed stretching ä...  FAKE
14    britain and eu after brexit

In [None]:
'''
    /*----------------------------- FEATURE_EXTRACTION ----------------
    | Function  : CountVectorizer()
    | Purpose   : Transform Input (Text) into Numerical Representation
    |             (counts of character n-grams)
    | Arguments :
    |       Text: Input Text
    | Return    :
    |   Features: Numerical Representation
    *-----------------------------------------------------------------*/
'''

# Initialize the Count Vectorizer.
# `token_pattern` is deliberately not passed: scikit-learn only uses it when
# analyzer == 'word', so with analyzer = 'char' it had no effect.
count_vectorizer = CountVectorizer(
        stop_words = None,       # No stop-word list
        lowercase = True,        # Convert text to lower case before tokenizing
        analyzer = 'char',       # Features are character n-grams
        ngram_range = (3,10),    # Character n-grams of length 3 up to 10
        max_features = 1000)     # Keep only the 1000 most frequent n-grams
train_text = preprocessed_training_data['text']

# Fit the Count Vectorizer on the input text of the training data.
# fit() returns the vectorizer itself; the assignment also keeps the cell
# from echoing the estimator's repr as its output.
count_vectorizer = count_vectorizer.fit(train_text)


In [None]:
'''
    /*----------------- SAVE_THE_TRAINED_COUNT_VECTORIZER -------------------
    | Function  : pickle.dump()
    | Purpose   : Save the Trained Vectorizer on your Hard Disk
    | Arguments :
    |    Model   : Fitted CountVectorizer object
    | Return    :
    |    File    : Trained Vectorizer will be Saved on Hard Disk
    *-----------------------------------------------------------------------*/
'''

# Save the trained Count Vectorizer in a pkl file. The `with` block guarantees
# the file handle is flushed and closed even if pickling raises.
# NOTE(review): the file name says 'word_unigram' although the vectorizer is
# configured with analyzer = 'char'; the path is left unchanged on purpose.
with open('/content/drive/My Drive/Colab Notebooks/Fake_News/vectorizer_word_unigram.pkl', 'wb') as vectorizer_file:
    pickle.dump(count_vectorizer, vectorizer_file)


In [None]:
# Transform the input text of the training data using the trained Count Vectorizer.
# toarray() yields a plain ndarray; todense() would return the legacy np.matrix type.
train_feature_vectors = count_vectorizer.transform(train_text).toarray()

# Get the names of the features (feature set). get_feature_names() was removed
# in scikit-learn 1.2, so prefer get_feature_names_out() when it is available.
if hasattr(count_vectorizer, "get_feature_names_out"):
    training_feature_names = count_vectorizer.get_feature_names_out()
else:
    training_feature_names = count_vectorizer.get_feature_names()

# Create a DataFrame of input features
input_training_features = pd.DataFrame(train_feature_vectors, columns = training_feature_names)

# Create a DataFrame of output label
training_data_output = pd.DataFrame(preprocessed_training_data["label"])

# Combine the input features of the training data and the output label (index-aligned join)
training_data_features = input_training_features.join(training_data_output)

# Save the features of the training data into a CSV file
training_data_features.to_csv(r'/content/drive/My Drive/Colab Notebooks/Fake_News/training-data-features.csv', index = False, header=True)

# Display a preview of the document-feature matrix of the training data.
# Printing the complete matrix exceeds the notebook's output data-rate limit,
# so the display limits are bounded locally for this print only.
print("\nDocument Features Matrix of Training Data :")
print("============================================\n")
with pd.option_context("display.max_rows", 10, "display.max_columns", 20):
    print(training_data_features.head(10))
print("\nShape (rows, columns):", training_data_features.shape)


Document Features Matrix of Training Data :



IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
# Input of testing data
test_text = preprocessed_testing_data['text']

# Transform the input text of the testing data using the trained Count Vectorizer.
# toarray() yields a plain ndarray; todense() would return the legacy np.matrix type.
test_feature_vectors = count_vectorizer.transform(test_text).toarray()

# Get the names of the features (feature set). get_feature_names() was removed
# in scikit-learn 1.2, so prefer get_feature_names_out() when it is available.
if hasattr(count_vectorizer, "get_feature_names_out"):
    testing_feature_names = count_vectorizer.get_feature_names_out()
else:
    testing_feature_names = count_vectorizer.get_feature_names()

# Create a DataFrame of input features
input_testing_features = pd.DataFrame(test_feature_vectors, columns = testing_feature_names)

# Create a DataFrame of output label
testing_data_output = pd.DataFrame(preprocessed_testing_data["label"])

# Combine the input features of the testing data and the output label (index-aligned join)
testing_data_features = input_testing_features.join(testing_data_output)

# Save the features of the testing data into a CSV file
testing_data_features.to_csv(r'/content/drive/My Drive/Colab Notebooks/Fake_News/testing-data-features.csv', index = False, header=True)

# Display a preview of the document-feature matrix of the testing data.
# Printing the complete matrix exceeds the notebook's output data-rate limit,
# so the display limits are bounded locally for this print only.
print("\nDocument Features Matrix of Testing Data :")
print("==========================================\n")
with pd.option_context("display.max_rows", 10, "display.max_columns", 20):
    print(testing_data_features.head(10))
print("\nShape (rows, columns):", testing_data_features.shape)


Document Features Matrix of Testing Data :



IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

