In [1]:
# Importing the Libraries
import pandas as pd
import numpy as np

In [2]:
# Read the labelled news articles from the CSV file and preview the first rows.
df = pd.read_csv(filepath_or_buffer='Fake_Real_Data.csv')
df.head(n=5)

Unnamed: 0,Text,label
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake
1,U.S. conservative leader optimistic of common ...,Real
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real
3,Court Forces Ohio To Allow Millions Of Illega...,Fake
4,Democrats say Trump agrees to work on immigrat...,Real


In [3]:
# Size of the loaded frame as (rows, columns).
df.shape

(9900, 2)

In [4]:
# Count the rows per class to check whether the two labels are balanced.
df.loc[:, 'label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
Fake,5000
Real,4900


In [5]:
# Converting the label column into numbers
# Fake--> 0 & Real--> 1

df['label_num'] = df['label'].map({"Fake": 0, "Real": 1})

# Series.map leaves NaN for every value that is missing from the mapping,
# so fail fast here instead of passing NaN targets on to the classifiers.
assert df['label_num'].notna().all(), "Found labels other than 'Fake'/'Real'"

In [6]:
# Preview the frame to confirm the label_num column was added.
df.head()

Unnamed: 0,Text,label,label_num
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake,0
1,U.S. conservative leader optimistic of common ...,Real,1
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real,1
3,Court Forces Ohio To Allow Millions Of Illega...,Fake,0
4,Democrats say Trump agrees to work on immigrat...,Real,1


In [7]:
# Download the large English spaCy model (en_core_web_lg) that is loaded with spacy.load afterwards.
# NOTE(review): `!python` runs whichever `python` the shell resolves, which may not be the
# kernel's interpreter -- confirm, or call it through `sys.executable` instead.
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [8]:
import spacy

# Load the installed en_core_web_lg pipeline; its word vectors are used to embed the texts.
nlp = spacy.load(name="en_core_web_lg")

In [9]:
# Converting the "Text" column into word vectors
#
# nlp.pipe streams the texts through the pipeline in batches, which is the
# approach spaCy recommends over calling nlp(text) once per row when there
# are many documents. Results come back in input order, so the i-th vector
# belongs to the i-th row of df.
df['text_vector'] = [doc.vector for doc in nlp.pipe(df['Text'])]

In [10]:
# Preview the frame again, now with the text_vector column holding one vector per row.
df.head()

Unnamed: 0,Text,label,label_num,text_vector
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake,0,"[-0.6759837, 1.4263071, -2.318466, -0.451093, ..."
1,U.S. conservative leader optimistic of common ...,Real,1,"[-1.8355803, 1.3101058, -2.4919677, 1.0268308,..."
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real,1,"[-1.9851209, 0.14389805, -2.4221718, 0.9133005..."
3,Court Forces Ohio To Allow Millions Of Illega...,Fake,0,"[-2.7812982, -0.16120885, -1.609772, 1.3624227..."
4,Democrats say Trump agrees to work on immigrat...,Real,1,"[-2.2010763, 0.9961637, -2.4088492, 1.128273, ..."


In [11]:
# The document vectors are the model features; label_num is the target.

# Hold out 20% of the rows for testing. The fixed random_state keeps the split repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df['text_vector'].to_numpy(),
    df['label_num'],
    test_size=0.2,
    random_state=2024,
)

In [12]:
# Display the training features: a 1-D object array that holds one vector per sample.
X_train

array([array([-1.9370583e+00,  6.7307764e-01, -2.0342209e+00,  8.3849376e-01,
               5.0582399e+00,  6.9561619e-01,  8.4494680e-01,  3.1886001e+00,
              -1.8559305e-01, -2.3077946e+00,  5.7106333e+00,  1.9615663e+00,
              -3.8906546e+00,  7.0745426e-01, -5.5861807e-01,  3.0465801e+00,
               1.6254791e+00,  4.5530903e-01, -1.6434262e+00, -1.4830869e+00,
               7.8545010e-01, -1.4669340e+00, -8.5639030e-01,  6.9189775e-01,
               2.5764078e-01, -1.3267909e+00, -1.5089076e+00, -1.2453447e-02,
              -1.1435866e+00,  5.0170070e-01,  7.5545192e-01,  3.2688296e-01,
              -6.4489061e-01, -1.7220320e+00, -2.5479710e+00, -6.4732337e-01,
              -3.2688227e-01,  3.5910955e-01,  3.3932900e-01, -5.5153232e-02,
               2.2551973e-01, -3.7040213e-01, -5.0905091e-01,  5.5882496e-01,
              -2.0343032e+00,  1.2633475e+00,  1.5275000e+00, -1.9012048e+00,
              -9.3877620e-01,  1.8726635e+00, -1.9624627e+00,  2

In [13]:
# Number of samples in the training split (still 1-D: one array object per sample).
X_train.shape

(7920,)

In [14]:
# Number of samples in the test split.
X_test.shape

(1980,)

In [15]:
# X_train and X_test are 1-D arrays of per-sample vectors, but the models
# expect a 2-D (samples x features) array, so stack the vectors row by row.

X_train_2D, X_test_2D = (np.stack(split, axis=0) for split in (X_train, X_test))

In [16]:
# Display the stacked training matrix (one row per sample).
X_train_2D

array([[-1.9370583 ,  0.67307764, -2.034221  , ..., -1.2719777 ,
        -1.9884472 ,  1.0053477 ],
       [-2.191668  , -0.16486427, -1.9093462 , ..., -1.6499615 ,
        -1.3815571 ,  0.8758069 ],
       [-2.2016706 , -0.5199784 , -0.58734244, ..., -1.4630992 ,
        -1.0164541 ,  0.69745815],
       ...,
       [-1.0993794 ,  0.9693734 , -2.3240595 , ..., -0.08093151,
        -3.0894012 ,  1.0929768 ],
       [-1.478615  ,  1.0917808 , -2.3930342 , ..., -0.74500334,
        -2.4764638 ,  0.7318762 ],
       [-2.40196   ,  0.70360106, -1.8457925 , ..., -1.8062043 ,
        -1.492288  ,  0.580932  ]], dtype=float32)

### 1. Multinomial Naive Bayes


In [17]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with its default additive smoothing (alpha=1.0).
clf = MultinomialNB(alpha=1.0)

ValueError: Negative values in data passed to MultinomialNB (input X).

Negative values are not allowed by the `MultinomialNB` model, and the word vectors contain negative components.  
So, we will rescale every feature to the [0, 1] range with `MinMaxScaler`.

In [18]:
from sklearn.preprocessing import MinMaxScaler

# Learn each feature's min/max from the training split only, then apply the
# same scaling to both splits so no test-set statistics leak into training.
# NOTE(review): because the range comes from the training data alone, scaled
# test values can fall slightly outside [0, 1] -- confirm this is acceptable.
scaler = MinMaxScaler().fit(X_train_2D)
scaled_train_embed = scaler.transform(X_train_2D)
scaled_test_embed = scaler.transform(X_test_2D)

In [19]:
# Spot-check the first scaled training vector.
scaled_train_embed[0]

array([0.43124777, 0.62599504, 0.37933758, 0.52273   , 0.75556266,
       0.6311421 , 0.66044563, 0.6021845 , 0.4618413 , 0.25878775,
       0.79355174, 0.6196411 , 0.27693236, 0.64328474, 0.5215674 ,
       0.75318265, 0.64241624, 0.5599154 , 0.37396502, 0.43297857,
       0.60305595, 0.44517162, 0.50843537, 0.59226954, 0.50082505,
       0.42717656, 0.6033423 , 0.73005146, 0.25130296, 0.5587761 ,
       0.62676144, 0.52355695, 0.4309271 , 0.3851936 , 0.44727403,
       0.7123683 , 0.485351  , 0.61564004, 0.4440249 , 0.47728845,
       0.46885407, 0.45696718, 0.47422776, 0.48335946, 0.3693884 ,
       0.6788415 , 0.63925606, 0.46944422, 0.2621642 , 0.73801863,
       0.34011972, 0.82235026, 0.44846603, 0.12288582, 0.45530498,
       0.39528066, 0.5763065 , 0.7318836 , 0.5601536 , 0.53663605,
       0.54056746, 0.41285458, 0.70590127, 0.37993103, 0.52647436,
       0.7738293 , 0.3940401 , 0.18290228, 0.6270076 , 0.50819993,
       0.5291767 , 0.37401348, 0.16617864, 0.45246646, 0.35622

In [20]:
# Spot-check the first scaled test vector.
scaled_test_embed[0]

array([0.5279307 , 0.58394384, 0.47402996, 0.52783185, 0.78483725,
       0.52211434, 0.36299446, 0.73587155, 0.47346926, 0.39059612,
       0.7317545 , 0.5405766 , 0.35255224, 0.45216686, 0.6796993 ,
       0.65603346, 0.5213078 , 0.45598686, 0.5041536 , 0.56142616,
       0.6630881 , 0.48266503, 0.50627214, 0.47397962, 0.4335602 ,
       0.39436656, 0.53856015, 0.5745352 , 0.4795882 , 0.5684869 ,
       0.72609675, 0.50246793, 0.38121945, 0.5207111 , 0.38612026,
       0.5276381 , 0.31606483, 0.7287639 , 0.52892077, 0.5945253 ,
       0.5210516 , 0.5585401 , 0.6150821 , 0.34103686, 0.48232728,
       0.52953017, 0.4584595 , 0.46909937, 0.32045433, 0.77802944,
       0.47337133, 0.6522676 , 0.5846262 , 0.23821503, 0.55794024,
       0.37973744, 0.63049877, 0.6588937 , 0.5647625 , 0.43598956,
       0.43636918, 0.47907227, 0.6399738 , 0.37754956, 0.59271026,
       0.7672075 , 0.36077732, 0.39400563, 0.59554327, 0.6092857 ,
       0.5393958 , 0.4104044 , 0.3117422 , 0.5871827 , 0.45922

In [21]:
# Fit the classifier on the scaled training embeddings and their labels.
clf.fit(X=scaled_train_embed, y=y_train)

In [22]:
# Predict labels for the scaled test embeddings.
y_pred = clf.predict(X=scaled_test_embed)

In [23]:
from sklearn.metrics import classification_report

# Print per-class precision, recall and F1 for the test-set predictions.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.95      0.94      0.94      1003
           1       0.94      0.95      0.94       977

    accuracy                           0.94      1980
   macro avg       0.94      0.94      0.94      1980
weighted avg       0.94      0.94      0.94      1980



### 2. KNN

In [24]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest neighbours with 5 neighbours and Euclidean distance.
# This rebinds `clf`, replacing the classifier that name referred to before.
clf = KNeighborsClassifier(
    n_neighbors=5,
    metric='euclidean',
)

clf.fit(X=scaled_train_embed, y=y_train)

In [25]:
# Predict the test labels with the fitted classifier and print the per-class metrics.
y_pred = clf.predict(X=scaled_test_embed)

print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1003
           1       0.99      0.99      0.99       977

    accuracy                           0.99      1980
   macro avg       0.99      0.99      0.99      1980
weighted avg       0.99      0.99      0.99      1980

