# Importing necessary libraries and Classes

In [21]:
# importing pandas for csv data processing
import numpy as np
import pandas as pd
# importing necessary libraries for vectorization
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier
from sklearn.ensemble import VotingClassifier, GradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier 
from sklearn.neighbors import KNeighborsClassifier

# Data Preprocessing

In [2]:
df = pd.read_csv('/content/news_dataset.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3729 entries, 0 to 3728
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   3729 non-null   object
 1   text    3721 non-null   object
dtypes: object(2)
memory usage: 58.4+ KB


In [4]:
df.fillna(" ", inplace=True)

In [5]:
df.head()

Unnamed: 0,label,text
0,REAL,Payal has accused filmmaker Anurag Kashyap of ...
1,FAKE,A four-minute-long video of a woman criticisin...
2,FAKE,"Republic Poll, a fake Twitter account imitatin..."
3,REAL,"Delhi teen finds place on UN green list, turns..."
4,REAL,Delhi: A high-level meeting underway at reside...


In [6]:
df['label'] = (df['label'] == 'REAL')*1

In [8]:
df.head()

Unnamed: 0,label,text
0,1,Payal has accused filmmaker Anurag Kashyap of ...
1,0,A four-minute-long video of a woman criticisin...
2,0,"Republic Poll, a fake Twitter account imitatin..."
3,1,"Delhi teen finds place on UN green list, turns..."
4,1,Delhi: A high-level meeting underway at reside...


In [9]:
X, Y = df['text'], df['label']

In [10]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(X)

In [30]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

# Training and Computing Scores

In [31]:
logreg = LogisticRegression()

In [32]:
logreg.fit(X_train, Y_train)

In [33]:
logreg.score(X_test, Y_test)

0.9959785522788204

In [34]:
logreg.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [35]:
classification_report(Y_test, logreg.predict(X_test))

'              precision    recall  f1-score   support\n\n           0       1.00      0.99      1.00       374\n           1       0.99      1.00      1.00       372\n\n    accuracy                           1.00       746\n   macro avg       1.00      1.00      1.00       746\nweighted avg       1.00      1.00      1.00       746\n'

In [36]:
confusion_matrix(Y_test, logreg.predict(X_test))

array([[371,   3],
       [  0, 372]])

In [37]:
BNB = BernoulliNB()

In [38]:
BNB.fit(X_train, Y_train)

In [39]:
BNB.score(X_test, Y_test)

0.9638069705093834

In [40]:
svc = SVC()

In [41]:
svc.fit(X_train, Y_train)

In [42]:
svc.score(X_test, Y_test)

0.9812332439678284

In [43]:
tree = DecisionTreeClassifier()

In [44]:
tree.fit(X_train, Y_train)

In [45]:
tree.score(X_test, Y_test)

0.985254691689008

In [46]:
forest = RandomForestClassifier()

In [47]:
forest.fit(X_train, Y_train)

In [48]:
forest.score(X_test, Y_test)

0.9906166219839142

In [52]:
bag = BaggingClassifier()
# vote = VotingClassifier()
hist = HistGradientBoostingClassifier()
grad = GradientBoostingClassifier()
ada = AdaBoostClassifier()

In [53]:
bag.fit(X_train, Y_train)

In [54]:
bag.score(X_test, Y_test)

0.9865951742627346

In [55]:
hist.fit(X_train.toarray(), Y_train)

In [56]:
hist.score(X_test.toarray(), Y_test)

0.9946380697050938

In [57]:
grad.fit(X_train, Y_train)

In [58]:
grad.score(X_test, Y_test)

0.9932975871313673

In [59]:
ada.fit(X_train, Y_train)

In [60]:
ada.score(X_test, Y_test)

0.9906166219839142

In [61]:
knn = KNeighborsClassifier(n_neighbors=20)

In [62]:
knn.fit(X_train, Y_train)

In [63]:
knn.score(X_test, Y_test)

0.9450402144772118

# Computing **F1 Scores**

In [64]:
f1_score(Y_test, logreg.predict(X_test))

0.9959839357429718

In [65]:
f1_score(Y_test, grad.predict(X_test))

0.9933065595716198

In [66]:
f1_score(Y_test, knn.predict(X_test))

0.9464052287581699

In [67]:
f1_score(Y_test, ada.predict(X_test))

0.9906291834002676

In [68]:
f1_score(Y_test, bag.predict(X_test))

0.9864130434782609

In [69]:
f1_score(Y_test, tree.predict(X_test))

0.9851551956815116

In [70]:
f1_score(Y_test, forest.predict(X_test))

0.9905020352781547

In [71]:
f1_score(Y_test, svc.predict(X_test))

0.9815303430079156

In [72]:
f1_score(Y_test, BNB.predict(X_test))

0.9630642954856362

In [73]:
f1_score(Y_test, hist.predict(X_test.toarray()))

0.9946236559139785

# Exporting the models and the Vectorizer

In [74]:
import pickle as pkl

In [97]:
with open('countVectorizer.sav', 'wb') as f:
    pkl.dump(vectorizer, f)

In [81]:
with open('logreg.sav', 'wb') as f:
    pkl.dump(logreg, f)

In [82]:
with open('BNB.sav', 'wb') as f:
    pkl.dump(BNB, f)

In [83]:
with open('svc.sav', 'wb') as f:
    pkl.dump(svc, f)

In [84]:
with open('tree.sav', 'wb') as f:
    pkl.dump(tree, f)

In [85]:
with open('forest.sav', 'wb') as f:
    pkl.dump(forest, f)

In [86]:
with open('bag.sav', 'wb') as f:
    pkl.dump(bag, f)

In [87]:
with open('hist.sav', 'wb') as f:
    pkl.dump(hist, f)

In [88]:
with open('grad.sav', 'wb') as f:
    pkl.dump(grad, f)

In [89]:
with open('ada.sav', 'wb') as f:
    pkl.dump(ada, f)

In [90]:
with open('knn.sav', 'wb') as f:
    pkl.dump(knn, f)

# Testing the exported models

In [94]:
with open('hist.sav', 'rb') as f:
    hist_loaded = pkl.load(f)

In [96]:
hist_loaded.predict(X_test[0].toarray())

array([0])

In [114]:
hist_loaded.predict(X_test[0].toarray())[0] == list(Y_test.items())[0][1]

True

In [98]:
with open('countVectorizer.sav', 'rb') as f:
    vectrizer = pkl.load(f)

In [105]:
vectrizer.get_feature_names_out()

array(['00', '000', '000_', ..., '𝕾𝖍𝖆𝖒𝖊𝖑𝖊𝖘𝖘', '𝕿𝖍𝖊', '𝖀𝖓'], dtype=object)