In [1]:
import joblib
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, load_npz
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
# Load the two feature matrices stored in SciPy's sparse .npz format:
# one for the held-out test set and one for the manual-testing articles.
xv_test = load_npz('testing_data.npz')
manual_v = load_npz('manual_testing_data.npz')

In [3]:
# Sanity check: both matrices should have the same number of columns,
# and that number should match the feature count the models report further down.
print("Testing data dimensions:", xv_test.shape)
print("Manual testing data dimensions:", manual_v.shape)

Testing data dimensions: (12108, 150541)
Manual testing data dimensions: (200, 150541)


In [71]:
# Load the labels and the raw text frames that accompany the sparse matrices.
# NOTE(review): `testing` is not referenced by any other cell visible in this
# notebook — confirm whether it is still needed.
y_test = pd.read_csv('y_test.csv')
manual = pd.read_csv('manual_testing.csv')
training = pd.read_csv('training_data.csv')
testing = pd.read_csv('testing_data.csv')

In [5]:
# Remove the 'Unnamed: 0' column that read_csv produced (presumably a saved
# index — TODO confirm). errors='ignore' makes the cell idempotent: re-running
# it without re-reading the CSVs no longer raises KeyError.
y_test = y_test.drop(columns='Unnamed: 0', errors='ignore')
manual = manual.drop(columns='Unnamed: 0', errors='ignore')
training = training.drop(columns='Unnamed: 0', errors='ignore')

In [6]:
# Shapes after dropping the index column; y_test's row count should equal
# the number of rows in xv_test.
y_test.shape, manual.shape, training.shape

((12108, 1), (200, 2), (48428, 1))

# FINAL CHECK FOR DUPLICATED TEXTS BETWEEN THE MANUAL AND TRAINING DATA

In [46]:
print(type(manual['text'][0]))
print(type(training['text'][0]))

# Index every training text by its row position(s) once, so that each manual
# text is checked with a dictionary lookup instead of rescanning all training
# rows (the nested loop did len(manual) * len(training) Series lookups).
# Positions equal index labels here because both frames keep read_csv's
# default RangeIndex.
training_rows = {}
for j, text in enumerate(training['text'].tolist()):
    if pd.notna(text):  # missing values never compare equal, so never match
        training_rows.setdefault(text, []).append(j)

# `same` collects the training row numbers whose text also appears in `manual`;
# pairs are reported in the same (manual row, training row) order as before.
same = []
for i, text in enumerate(manual['text'].tolist()):
    for j in training_rows.get(text, ()):
        print(i, '   ', j)
        same.append(j)

<class 'str'>
<class 'str'>


# SUPPORT VECTOR MACHINE

In [8]:
# Load the fitted support-vector classifier from disk.
# NOTE: joblib.load unpickles arbitrary objects — only load model files that
# come from a trusted source.
support = joblib.load('Support_Vector.joblib')

In [9]:
# Mean accuracy of the classifier on the held-out test set.
support.score(xv_test, y_test)

0.9247604889329369

In [10]:
# Predicted labels for every test row; reused by the report and the
# confusion matrix in the following cells.
pred_Support = support.predict(xv_test)

In [11]:
# Per-class precision, recall and F1 for the support-vector predictions.
print(classification_report(y_test, pred_Support))

              precision    recall  f1-score   support

           0       0.90      0.95      0.92      5726
           1       0.95      0.90      0.93      6382

    accuracy                           0.92     12108
   macro avg       0.92      0.93      0.92     12108
weighted avg       0.93      0.92      0.92     12108



In [12]:
# Rows are the true classes (0, 1) and columns the predicted classes.
# `confusion_matrix` comes from the notebook's top import cell.
print(confusion_matrix(y_test, pred_Support))

[[5424  302]
 [ 609 5773]]


# LOGISTIC REGRESSION

In [13]:
# Load the fitted logistic-regression model (trusted joblib file assumed).
logistics = joblib.load('Logistics_Regression.joblib')

In [14]:
# coef_ holds one column per input feature, so its second dimension is the
# number of features the model expects; it should match xv_test's column count.
num_features = logistics.coef_.shape[1]
print("Number of features in the model:", num_features)

Number of features in the model: 150541


In [15]:
# Mean accuracy of the logistic-regression model on the held-out test set.
logistics.score(xv_test, y_test)

0.9174925668979187

In [16]:
# Predicted labels for every test row, reused by the two cells below.
pred_Logistics = logistics.predict(xv_test)

In [17]:
# Per-class precision, recall and F1 for the logistic-regression predictions.
print(classification_report(y_test, pred_Logistics))

              precision    recall  f1-score   support

           0       0.89      0.94      0.92      5726
           1       0.94      0.90      0.92      6382

    accuracy                           0.92     12108
   macro avg       0.92      0.92      0.92     12108
weighted avg       0.92      0.92      0.92     12108



In [18]:
# Rows are the true classes (0, 1) and columns the predicted classes.
print(confusion_matrix(y_test, pred_Logistics))

[[5389  337]
 [ 662 5720]]


# DECISION TREE CLASSIFIER

In [19]:
# Load the fitted decision-tree classifier (trusted joblib file assumed).
decision = joblib.load('DecisionTree_Classifier.joblib')

In [20]:
# The fitted tree records how many input features it was built on; this
# should match xv_test's column count. Rebinds `num_features` from the
# logistic-regression section.
num_features = decision.tree_.n_features
print("Number of features in the model:", num_features)

Number of features in the model: 150541


In [21]:
# Mean accuracy of the decision tree on the held-out test set.
decision.score(xv_test, y_test)

0.8866864882722167

In [22]:
# Predicted labels for every test row, reused by the two cells below.
pred_Decision = decision.predict(xv_test)

In [23]:
# Per-class precision, recall and F1 for the decision-tree predictions.
print(classification_report(y_test, pred_Decision))

              precision    recall  f1-score   support

           0       0.88      0.89      0.88      5726
           1       0.90      0.89      0.89      6382

    accuracy                           0.89     12108
   macro avg       0.89      0.89      0.89     12108
weighted avg       0.89      0.89      0.89     12108



In [24]:
# Rows are the true classes (0, 1) and columns the predicted classes.
print(confusion_matrix(y_test, pred_Decision))

[[5071  655]
 [ 717 5665]]


# GRADIENT BOOSTING CLASSIFIER

In [25]:
# Load the fitted gradient-boosting classifier (trusted joblib file assumed).
gradient = joblib.load('GradientBoosting_Classifier.joblib')

In [26]:
# Mean accuracy of the gradient-boosting model on the held-out test set.
gradient.score(xv_test, y_test)

0.9073339940535183

In [27]:
# Predicted labels for every test row, reused by the two cells below.
pred_Gradient = gradient.predict(xv_test)

In [28]:
# Per-class precision, recall and F1 for the gradient-boosting predictions.
print(classification_report(y_test, pred_Gradient))

              precision    recall  f1-score   support

           0       0.85      0.97      0.91      5726
           1       0.97      0.85      0.91      6382

    accuracy                           0.91     12108
   macro avg       0.91      0.91      0.91     12108
weighted avg       0.92      0.91      0.91     12108



In [30]:
# Rows are the true classes (0, 1) and columns the predicted classes.
print(confusion_matrix(y_test, pred_Gradient))

[[5575  151]
 [ 971 5411]]


# RANDOM FOREST CLASSIFIER

In [31]:
# Load the fitted random-forest classifier (trusted joblib file assumed).
# NOTE(review): the name `random` would shadow the standard-library module if
# that were ever imported in this notebook.
random = joblib.load('RandomForest_Classifier.joblib')

In [32]:
# Mean accuracy of the random forest on the held-out test set.
random.score(xv_test, y_test)

0.8946151304922365

In [33]:
# Predicted labels for every test row, reused by the two cells below.
pred_Random = random.predict(xv_test)

In [34]:
# Per-class precision, recall and F1 for the random-forest predictions.
print(classification_report(y_test, pred_Random))

              precision    recall  f1-score   support

           0       0.86      0.93      0.89      5726
           1       0.93      0.86      0.90      6382

    accuracy                           0.89     12108
   macro avg       0.90      0.90      0.89     12108
weighted avg       0.90      0.89      0.89     12108



In [35]:
# Rows are the true classes (0, 1) and columns the predicted classes.
print(confusion_matrix(y_test, pred_Random))

[[5341  385]
 [ 891 5491]]


# MANUAL TESTING

In [36]:
# Convert the sparse manual-testing matrix to a dense NumPy array so that
# manual_testing() can pick a single row and reshape it.
# NOTE(review): with 200 x 150541 entries this dense copy is large in memory.
v_array = manual_v.toarray()

In [37]:
def result(num):
    """Translate a predicted class label into a human-readable verdict.

    Parameters
    ----------
    num : int or one-element array
        Predicted label, 0 or 1. Objects exposing ``.item()`` (NumPy scalars
        and one-element arrays, as returned by ``predict`` for one sample)
        are unwrapped to a plain Python value first.

    Returns
    -------
    str
        'Fake news' for label 0, 'Not Fake news' for label 1.

    Raises
    ------
    ValueError
        If the label is neither 0 nor 1. Previously this case fell through
        and returned None, which only failed later during string
        concatenation.
    """
    label = num.item() if hasattr(num, 'item') else num
    if label == 0:
        return 'Fake news'
    if label == 1:
        return 'Not Fake news'
    raise ValueError('Unknown class label: {!r}'.format(label))
    
def manual_testing(news):
    """Print every loaded model's verdict for one article from `manual`.

    The article is located by exact text match in ``manual['text']``; the row
    at the same position in ``v_array`` is used as the feature vector.
    (Assumes ``v_array`` rows are in the same order as ``manual`` rows —
    TODO confirm against the cell that produced manual_testing_data.npz.)

    Parameters
    ----------
    news : str
        Article text; must equal one of the entries of ``manual['text']``.

    Raises
    ------
    ValueError
        If `news` does not occur in ``manual['text']``. Previously this case
        failed with an UnboundLocalError on `req_index`.
    """
    # Positional index of the first matching text.
    texts = manual['text'].tolist()
    try:
        req_index = texts.index(news)
    except ValueError:
        raise ValueError("news text not found in manual['text']") from None

    # Build the single-sample 2-D input once instead of once per model.
    features = v_array[req_index].reshape(1, -1)

    models = (
        ('SVC', support),
        ('LR', logistics),
        ('DT', decision),
        ('GBC', gradient),
        ('RFC', random),
    )
    # Run every model before printing anything, so a failing model does not
    # leave a partially printed report.
    predictions = [(label, model.predict(features)) for label, model in models]
    for label, pred in predictions:
        print('\n' + label + ' Prediction: ' + result(pred[0]))

In [42]:
# Disable pandas' row/column truncation so whole frames are rendered.
# These are global options: they stay in effect for every later cell.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [43]:
# Render the manual-testing frame so a row can be chosen for manual_testing().
display(manual)

Unnamed: 0,text,class
0,cnn hammered trump for retweeting white suprem...,0
1,donald trump s disgraced national security adv...,0
2,donald trump went on a whining rant about the ...,0
3,david daleiden the mastermind behind the debun...,0
4,the koch brothers arranged their meeting of we...,0
5,at a campaign rally in cedar rapids iowa repub...,0
6,it was just revealed today that federal agents...,0
7,thank yousacott i m honored truly,0
8,if there is one reason to vote for hillary cli...,0
9,queen of the south s jon ecker opens up about ...,0


In [80]:
# Take the article stored under index label 154 and print each model's verdict.
news = manual['text'][154]
manual_testing(news)


SVC Prediction: Not Fake news

LR Prediction: Not Fake news

DT Prediction: Not Fake news

GBC Prediction: Not Fake news

RFC Prediction: Not Fake news
