# Faux Filter


# Importing Libraries

In [2]:
import re
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
nltk.download('stopwords')
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from keras_preprocessing.text import one_hot  # Converting to one-hot repr.
from keras_preprocessing.sequence import pad_sequences

[nltk_data] Error loading stopwords: <urlopen error [WinError 10060] A
[nltk_data]     connection attempt failed because the connected party
[nltk_data]     did not properly respond after a period of time, or
[nltk_data]     established connection failed because connected host
[nltk_data]     has failed to respond>


# Reading data from csv

In [3]:
# Reading data from csv
train = pd.read_csv("train.csv")
test  = pd.read_csv("test.csv")
train.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [4]:
test.head()

Unnamed: 0,id,title,author,text
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...


In [5]:
# Displaying rows and columns in dataset
print("There are {} number of rows and {} number of columns for training.".format(train.shape[0],train.shape[1]))
print("There are {} number of rows and {} number of columns for testing.".format(test.shape[0],test.shape[1]))

There are 20800 number of rows and 5 number of columns for training.
There are 5200 number of rows and 4 number of columns for testing.


# Checking Null Values

In [6]:
# Checking the null values in training data.
train.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [7]:
# Checking the null values in testing data.
test.isnull().sum()

id          0
title     122
author    503
text        7
dtype: int64

# Handling nan values in dataset using empty spaces

In [8]:
def handle_nan(train_data,test_data):
    '''Input: Data to the function containing Nan values.
       Output : Cleaned data containing no Nan values.
       Function: Cleaning Nan values.
     '''
    train = train_data.fillna(" ")
    test  = test_data.fillna(" ")
    return train,test

train,test = handle_nan(train,test)


In [9]:
# Creating a variable "merged" by merging columns "title" and "author"
train["merged"] = train["title"]+" "+train["author"]
test["merged"]  = test["title"]+" "+test["author"]

In [10]:
# Seperating Independent and dependent features
X = train.drop(columns=['label'],axis=1)
y = train['label']

In [11]:
# Creating One-Hot Representations
messages = X.copy()
messages.reset_index(inplace=True)
messages_test = test.copy()
messages_test.reset_index(inplace=True)


# Data Pre-processing
**1. Firstly, we will process and use stopwords.**
**2. Next, to avoid false predictions or ambiguity with upper and lowercase, we will convert them to lowercase.**
**3. Next, all the sentences are tokenized into words.**
**4. We will use stemming to the tokenized words for quick preprocessing.**
**5. Next, words are joined together and stored in the corpus.**

In [12]:
# Performing data preprocessing on column 'title'
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
def perform_preprocess(data):
    '''Input: Data to be processed
       Output: Preprocessed data
    '''
    corpus = []
    for i in range(0,len(data)):
        review = re.sub('[^a-zA-Z]',' ',data['merged'][i])
        review = review.lower()
        review = review.split()
        review = [ps.stem(word) for word in review if word not in stopwords.words('english')]
        review = ' '.join(review)
        corpus.append(review)
    return corpus
    
train_corpus = perform_preprocess(messages)
test_corpus  = perform_preprocess(messages_test)
train_corpus[1]

'flynn hillari clinton big woman campu breitbart daniel j flynn'

In [13]:
test_corpus[1]

'russian warship readi strike terrorist near aleppo'

**Below code converts the pre-processed words to one-hot vectors in the range of vocabulary size=5000. This is done to obtain numerical feature matrix**

In [14]:
vocab_size = 5000
one_hot_train = [one_hot(word,vocab_size) for word in train_corpus]
one_hot_test  = [one_hot(word,vocab_size) for word in test_corpus]

In [15]:
one_hot_test[1]

[13, 2490, 4053, 3019, 1484, 1580, 2077]

**Below code creates an embedding layer which applies "pre" padding to the one-hot encoded features with sentence length = 20. Padding is applied so that the length of every sequence in the dataset should be same.**

In [16]:

# Embedding Representation
sent_length = 20
embedd_docs_train = pad_sequences(one_hot_train,padding='pre',maxlen=sent_length)
embedd_docs_test  = pad_sequences(one_hot_test,padding='pre',maxlen=sent_length)
print(embedd_docs_train)

[[   0    0    0 ...  464 3314 3273]
 [   0    0    0 ... 2336 2316 2209]
 [   0    0    0 ... 3417 2409 3837]
 ...
 [   0    0    0 ... 2828 2370 3231]
 [   0    0    0 ... 1523 4923 3199]
 [   0    0    0 ... 3704   53 2577]]


In [17]:
print(embedd_docs_test)

[[   0    0    0 ... 3141   53 4114]
 [   0    0    0 ... 1484 1580 2077]
 [   0    0    0 ... 3165 4231  882]
 ...
 [   0    0    0 ... 3141 4900 2798]
 [   0    0    0 ...   13  416 2269]
 [   0    0    0 ... 3141 2964 4385]]


In [18]:
# Converting Embedding repr. to array
x_final = np.array(embedd_docs_train)
y_final = np.array(y)
x_test_final = np.array(embedd_docs_test)

In [19]:
# Dimensions of prev. array repr.
x_final.shape,y_final.shape,x_test_final.shape

((20800, 20), (20800,), (5200, 20))

**Data Split into Test and Train.**

In [20]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_final, y_final, test_size=0.1, random_state=42, stratify = y_final)
X_train, x_valid, Y_train, y_valid = train_test_split(x_train, y_train, test_size=0.1, random_state=42, stratify = y_train)
x_test_final = x_test_final

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

# create CountVectorizer object
cv = CountVectorizer(max_features=5000)

# fit_transform on train data
x_train_cv = cv.fit_transform(train_corpus).toarray()

# transform on test data
x_test_cv = cv.transform(test_corpus).toarray()


# Creating Models
**In this phase, several models are created and evaluated against various metrics shown using classification report.**

**1. Logistic Regresssion**

**Logistic Regression: A linear model that uses a logistic function to model the probability of a binary response variable. It is commonly used for binary classification problems, but can also be extended to multi-class classification problems.**

**The logistic regression model predicts the probability of a binary response variable (y) given a set of input features (x) using the logistic function:
p(y=1|x) = 1 / (1 + exp(-z))
where z = b0 + b1x1 + b2x2 + ... + bn*xn is the linear combination of the input features and their corresponding weights (b0, b1, b2, ..., bn).
The weights are estimated from the training data using maximum likelihood estimation or other optimization techniques.**

In [22]:
# Create a Logistic Regression model with a maximum of 900 iterations
model_1 = LogisticRegression(max_iter=900)

# Train the model on the training data
model_1.fit(X_train, Y_train)

# Use the trained model to make predictions on the test data
pred_1 = model_1.predict(x_test)

# Generate a classification report to evaluate the performance of the model
cr1 = classification_report(y_test, pred_1)

# Print the classification report to the console
print(cr1)

              precision    recall  f1-score   support

           0       0.71      0.75      0.73      1039
           1       0.73      0.69      0.71      1041

    accuracy                           0.72      2080
   macro avg       0.72      0.72      0.72      2080
weighted avg       0.72      0.72      0.72      2080



**2. Naive Bayes**
**A probabilistic model that uses Bayes' theorem to predict the probability of a class given a set of features. It is commonly used for text classification problems, where the features are the frequencies of words in a document.**

**The Naive Bayes model predicts the probability of a binary response variable (y) given a set of input features (x) using Bayes' theorem:
p(y=1|x) = p(x|y=1) * p(y=1) / p(x)
where p(x|y=1) is the likelihood of the input features given the positive class, p(y=1) is the prior probability of the positive class, and p(x) is the marginal probability of the input features.**

In [23]:
# Create a Multinomial Naive Bayes model
model_2 = MultinomialNB()

# Train the model on the training data
model_2.fit(X_train, Y_train)

# Use the trained model to make predictions on the test data
pred_2 = model_2.predict(x_test)

# Generate a classification report to evaluate the performance of the model
cr2 = classification_report(y_test, pred_2)

# Print the classification report to the console
print(cr2)

              precision    recall  f1-score   support

           0       0.73      0.66      0.69      1039
           1       0.69      0.76      0.72      1041

    accuracy                           0.71      2080
   macro avg       0.71      0.71      0.71      2080
weighted avg       0.71      0.71      0.71      2080



**3. Decision Trees**
**A decision tree is a flowchart-like structure in which each internal node represents a "test" on an attribute, each branch represents the outcome of the test, and each leaf node represents a class label. It recursively splits the data into subsets based on the values of the features, and assigns a class label to each leaf node.**

In [24]:
# Create a Decision Tree Classifier model
model_3 = DecisionTreeClassifier()

# Train the model on the training data
model_3.fit(X_train, Y_train)

# Use the trained model to make predictions on the test data
pred_3 = model_3.predict(x_test)

# Generate a classification report to evaluate the performance of the model
cr3 = classification_report(y_test, pred_3)

# Print the classification report to the console
print(cr3)

              precision    recall  f1-score   support

           0       0.93      0.95      0.94      1039
           1       0.95      0.93      0.94      1041

    accuracy                           0.94      2080
   macro avg       0.94      0.94      0.94      2080
weighted avg       0.94      0.94      0.94      2080



**4. Random Forest**
**An ensemble model(An ensemble model is a machine learning model that combines multiple individual models to improve the accuracy and robustness of the predictions.) that combines multiple decision trees to improve the accuracy and robustness of the predictions. It randomly selects a subset of features and data points for each tree, and aggregates the predictions of all the trees to make the final prediction.An ensemble model that combines multiple decision trees to improve the accuracy and robustness of the predictions. It randomly selects a subset of features and data points for each tree, and aggregates the predictions of all the trees to make the final prediction.**

**The prediction of a random forest model can be represented by the following formula:
y = mode(y1, y2, ..., yn)
where y1, y2, ..., yn are the predictions of the individual trees, and mode() is the function that returns the most common prediction.**

In [25]:
# Create a Random Forest Classifier model
model_4 = RandomForestClassifier()

# Train the model on the training data
model_4.fit(X_train, Y_train)

# Use the trained model to make predictions on the test data
pred_4 = model_4.predict(x_test)

# Generate a classification report to evaluate the performance of the model
cr4 = classification_report(y_test, pred_4)

# Print the classification report to the console
print(cr4)

              precision    recall  f1-score   support

           0       0.97      0.86      0.91      1039
           1       0.87      0.97      0.92      1041

    accuracy                           0.92      2080
   macro avg       0.92      0.92      0.92      2080
weighted avg       0.92      0.92      0.92      2080



**5. XGBOOST**
**An optimized implementation of gradient boosting(Gradient Boosting is a machine learning technique that combines multiple weak models to create a strong predictive model) that uses a combination of tree-based models and linear models to improve the accuracy and speed of the predictions. It uses a gradient descent algorithm to iteratively improve the predictions by minimizing a loss function.**

**The prediction of an XGBoost model can be represented by the following formula:
y = sum(wi * yi)
where wi is the weight of the i-th tree, and yi is the prediction of the i-th tree. The weights and predictions are determined by the gradient descent algorithm, which minimizes a loss function that measures the difference between the predicted and actual values**

In [26]:
# Create an XGBoost Classifier model
model_5 = XGBClassifier()

# Train the model on the training data
model_5.fit(X_train, Y_train)

# Use the trained model to make predictions on the test data
pred_5 = model_5.predict(x_test)

# Generate a classification report to evaluate the performance of the model
cr5 = classification_report(y_test, pred_5)

# Print the classification report to the console
print(cr5)

              precision    recall  f1-score   support

           0       1.00      0.98      0.99      1039
           1       0.98      1.00      0.99      1041

    accuracy                           0.99      2080
   macro avg       0.99      0.99      0.99      2080
weighted avg       0.99      0.99      0.99      2080



# Evaluation of Models

**Tabulating the results of various implemented models.**

**Saving Model**


In [29]:
import pickle
pickle.dump(model_5,open('model.pkl','wb'))