In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
df = pd.read_csv('Fake_Real_Data.csv')

In [None]:
df.shape

(9900, 2)

In [None]:
df.head()

Unnamed: 0,Text,label
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake
1,U.S. conservative leader optimistic of common ...,Real
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real
3,Court Forces Ohio To Allow Millions Of Illega...,Fake
4,Democrats say Trump agrees to work on immigrat...,Real


In [None]:
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
Fake,5000
Real,4900


In [None]:
df['label_num'] = df['label'].map({"Fake": 1, "Real": 0})

In [None]:
df.head()

Unnamed: 0,Text,label,label_num
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake,1
1,U.S. conservative leader optimistic of common ...,Real,0
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real,0
3,Court Forces Ohio To Allow Millions Of Illega...,Fake,1
4,Democrats say Trump agrees to work on immigrat...,Real,0


## Modeling without Pre-processing text data

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['Text'], df['label_num'], test_size=0.2, random_state=10)

In [None]:
X_train.shape

(7920,)

In [None]:
X_test.shape


(1980,)

### KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline

In [None]:
pipe = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range=(1,3))),
    ('knn', KNeighborsClassifier(n_neighbors=10, metric='euclidean'))
])

In [None]:
pipe.fit(X_train, y_train)

In [None]:
y_pred = pipe.predict(X_test)

In [None]:
print("Accuracy: ", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy:  0.6828282828282828
              precision    recall  f1-score   support

           0       0.60      0.99      0.75       935
           1       0.98      0.41      0.58      1045

    accuracy                           0.68      1980
   macro avg       0.79      0.70      0.66      1980
weighted avg       0.80      0.68      0.66      1980



### KNN with `cosine` metric

In [None]:
pipe = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range=(1,3))),
    ('knn', KNeighborsClassifier(n_neighbors=10, metric='cosine'))
])

pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)

In [None]:
print("Accuracy: ", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy:  0.7196969696969697
              precision    recall  f1-score   support

           0       0.63      1.00      0.77       935
           1       1.00      0.47      0.64      1045

    accuracy                           0.72      1980
   macro avg       0.81      0.73      0.70      1980
weighted avg       0.82      0.72      0.70      1980



### Random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
pipe = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range=(3,3))),
    ('knn', RandomForestClassifier(n_estimators=10))
])

pipe.fit(X_train, y_train)


In [None]:
y_pred = pipe.predict(X_test)

In [None]:
print("Accuracy: ", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy:  0.9712121212121212
              precision    recall  f1-score   support

           0       0.96      0.98      0.97       935
           1       0.99      0.96      0.97      1045

    accuracy                           0.97      1980
   macro avg       0.97      0.97      0.97      1980
weighted avg       0.97      0.97      0.97      1980



### Multinomial Naive byes

In [None]:
from sklearn.naive_bayes import MultinomialNB

pipe = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range=(1,2))),
    ('knn', MultinomialNB(alpha=0.75))
])

pipe.fit(X_train, y_train)

In [None]:
y_pred = pipe.predict(X_test)

In [None]:
print("Accuracy: ", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy:  0.9838383838383838
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       935
           1       0.98      0.99      0.98      1045

    accuracy                           0.98      1980
   macro avg       0.98      0.98      0.98      1980
weighted avg       0.98      0.98      0.98      1980



## With text preprocessing

In [None]:
import spacy

nlp = spacy.load('en_core_web_sm')

def preprocess(text):
  doc = nlp(text)
  filtered_token = []
  for token in doc:
    if token.is_stop or token.is_punct:
      continue
    filtered_token.append(token.lemma_)

  return " ".join(filtered_token)


In [None]:
df.columns

Index(['Text', 'label', 'label_num'], dtype='object')

In [None]:
df['preprocessed_text'] = df['Text'].apply(preprocess)

In [None]:
df.head()

Unnamed: 0,Text,label,label_num,preprocessed_text
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake,1,Trump Surrogate BRUTALLY Stabs Pathetic vide...
1,U.S. conservative leader optimistic of common ...,Real,0,U.S. conservative leader optimistic common gro...
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real,0,trump propose U.S. tax overhaul stir concern d...
3,Court Forces Ohio To Allow Millions Of Illega...,Fake,1,Court Forces Ohio allow million illegally pu...
4,Democrats say Trump agrees to work on immigrat...,Real,0,Democrats Trump agree work immigration bill wa...


In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['preprocessed_text'], df['label_num'], test_size=0.2, random_state=2022, stratify=df.label_num)

### Random forest with only trigram


In [None]:
clf = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range=(3,3))),
    ('rf', RandomForestClassifier(n_estimators=10))
])

In [None]:
clf.fit(X_train, y_train)

In [None]:
y_pred = clf.predict(X_test)

In [None]:
print("Accuracy: ",accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy:  0.9308080808080809
              precision    recall  f1-score   support

           0       0.95      0.91      0.93       980
           1       0.91      0.95      0.93      1000

    accuracy                           0.93      1980
   macro avg       0.93      0.93      0.93      1980
weighted avg       0.93      0.93      0.93      1980



### Random forest with unigram, Bigram, and trigrams.

In [None]:
clf = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range=(1,3))),
    ('rf', RandomForestClassifier(n_estimators=10))
])

In [None]:
clf.fit(X_train, y_train)

In [None]:
y_pred = clf.predict(X_test)

In [None]:
print("Accuracy: ",accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy:  0.9676767676767677
              precision    recall  f1-score   support

           0       0.95      0.99      0.97       980
           1       0.99      0.95      0.97      1000

    accuracy                           0.97      1980
   macro avg       0.97      0.97      0.97      1980
weighted avg       0.97      0.97      0.97      1980



In [None]:
# confusion matrix
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_pred))

[[967  13]
 [ 51 949]]
