In [160]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
import matplotlib.pyplot as plt
from sklearn.naive_bayes import BernoulliNB  # Using BernoulliNB to avoid negative values issue


In [39]:
# Load the labelled news articles from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer='news.csv')

In [40]:
# Preview the first five rows of the freshly loaded frame.
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [41]:
# Remove the leftover 'Unnamed: 0' column.
# Reassigning the result (instead of inplace=True) together with
# errors='ignore' keeps this cell idempotent: running it a second time,
# after the column is already gone, is a no-op rather than a KeyError.
data = data.drop(columns='Unnamed: 0', errors='ignore')

In [42]:
# Show the first five rows again to confirm the column was removed.
data.iloc[:5]

Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [43]:
# Display the target column (the class label of every article).
data.loc[:, 'label']

0       FAKE
1       FAKE
2       REAL
3       FAKE
4       REAL
        ... 
6330    REAL
6331    FAKE
6332    FAKE
6333    REAL
6334    REAL
Name: label, Length: 6335, dtype: object

In [44]:
# Split into train and test sets: 30% of the rows are held out for testing,
# and the fixed random_state makes the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data['text'],
    data['label'],
    test_size=0.3,
    random_state=10,
)

In [45]:
# Bag-of-words features: lowercase the text and drop English stop words.
count_vector = CountVectorizer(stop_words='english', lowercase=True)

# The vocabulary is fitted on the training texts only ...
count_train = count_vector.fit_transform(X_train)
# ... and the test texts are encoded with that same fitted vocabulary.
count_test = count_vector.transform(X_test)

In [46]:
# Print the sparse training matrix; each output line is a
# (row, column) coordinate followed by the stored count.
# NOTE(review): this dumps the stored entries rather than a summary --
# consider showing count_train.shape (or the bare repr) instead.
print(count_train)

  (0, 17147)	1
  (0, 34559)	1
  (0, 46657)	1
  (0, 17181)	1
  (0, 25244)	1
  (0, 11275)	1
  (0, 11270)	1
  (0, 56626)	1
  (0, 20270)	1
  (0, 52444)	2
  (0, 5182)	1
  (0, 22567)	1
  (0, 42956)	1
  (0, 33720)	1
  (0, 56084)	1
  (0, 17392)	1
  (0, 40714)	2
  (0, 11977)	1
  (0, 57104)	1
  (0, 55783)	1
  (0, 19123)	1
  (0, 52946)	1
  (0, 49131)	1
  (0, 31979)	2
  (0, 38399)	8
  :	:
  (4433, 45416)	2
  (4433, 34782)	1
  (4433, 36095)	1
  (4433, 56634)	2
  (4433, 30140)	1
  (4433, 36590)	2
  (4433, 31077)	2
  (4433, 45888)	1
  (4433, 28650)	1
  (4433, 12741)	1
  (4433, 43307)	2
  (4433, 46638)	1
  (4433, 52328)	2
  (4433, 50685)	1
  (4433, 4027)	1
  (4433, 5845)	1
  (4433, 30317)	1
  (4433, 49075)	1
  (4433, 52152)	1
  (4433, 27336)	1
  (4433, 6438)	1
  (4433, 36091)	1
  (4433, 23918)	1
  (4433, 2453)	1
  (4433, 43702)	1


In [47]:
# Model creation: a multinomial Naive Bayes classifier fitted on the
# count features of the training split.
clf = MultinomialNB()
clf.fit(X=count_train, y=y_train)


In [48]:
# Sanity check: the feature matrix must have one row per training label.
print(f"Shape of count_train: {count_train.shape}")
print(f"Length of y_train: {len(y_train)}")


Shape of count_train: (4434, 58721)
Length of y_train: 4434


In [49]:
# Predict a label for every document in the held-out test matrix.
y_pred = clf.predict(X=count_test)

In [50]:
# Accuracy score of the count-based model on the test split.
acc_score = accuracy_score(y_true=y_test, y_pred=y_pred)
acc_score

0.8884797475013151

In [51]:
# Confusion matrix with an explicit class order: rows are the true labels,
# columns the predicted labels, both ordered FAKE then REAL.
cm = confusion_matrix(
    y_true=y_test,
    y_pred=y_pred,
    labels=['FAKE', 'REAL'],
)
cm

array([[778, 142],
       [ 70, 911]], dtype=int64)

In [52]:
# Classification report (per-class precision, recall, F1 and support).
clf_report = classification_report(y_true=y_test, y_pred=y_pred)
print(clf_report)

              precision    recall  f1-score   support

        FAKE       0.92      0.85      0.88       920
        REAL       0.87      0.93      0.90       981

    accuracy                           0.89      1901
   macro avg       0.89      0.89      0.89      1901
weighted avg       0.89      0.89      0.89      1901



In [53]:
# Display the held-out test texts. This cell only shows the series;
# it does not run any validation itself.
X_test

5781    The first nine months of 2013 have convinced u...
2699    Heseltine strangled dog as part of Thatcher ca...
3336    Former Democratic presidential candidate Jim W...
867     David M. Perry is an associate professor of hi...
3490    Getty - Kevin Mazur The Wildfire is an opinion...
                              ...                        
1888    By wmw_admin on October 29, 2016 Morgan Chalfa...
5171    WASHINGTON — The future of same-sex marriage a...
313     REPORT: Megyn Trashes Trump, Newt… Then Murdoc...
5821    Open Thread (NOT U.S. Election) 2016-38 \nNews...
4523    The warming of the oceans due to climate chang...
Name: text, Length: 1901, dtype: object

In [63]:
# Show the summary repr of the sparse test feature matrix.
count_test

<1901x58721 sparse matrix of type '<class 'numpy.int64'>'
	with 488803 stored elements in Compressed Sparse Row format>

In [56]:
# Rebuild the count features with the same settings as the earlier
# CountVectorizer cell (this repeats that cell's fit on the same split).
count_vector = CountVectorizer(
    lowercase=True,
    stop_words='english',
)
count_train = count_vector.fit_transform(X_train)  # fit on training texts only
count_test = count_vector.transform(X_test)        # reuse the fitted vocabulary

In [60]:
# Spot check: predict the label of the first row of the training matrix.
clf.predict(count_train[[0], :])

array(['REAL'], dtype='<U4')

In [62]:
# Spot check: predict the label of the row at position 5 of the test matrix.
clf.predict(count_test[[5], :])

array(['REAL'], dtype='<U4')

# TF-IDF Vectorizer

In [90]:
# Split into train and test sets, using the same arguments as the
# earlier split (30% test, random_state=10).
X_train, X_test, y_train, y_test = train_test_split(
    data['text'],
    data['label'],
    test_size=0.3,
    random_state=10,
)

In [91]:
# TF-IDF features. The vectorizer is built with its default arguments, so
# unlike the CountVectorizer cell no stop_words='english' is passed here.
tfidf_vectorizer = TfidfVectorizer()

# Fit on the training texts, then encode the test texts with the fitted vectorizer.
tfid_train = tfidf_vectorizer.fit_transform(X_train)
tfid_test = tfidf_vectorizer.transform(X_test)


In [92]:
# Model creation: a fresh multinomial Naive Bayes classifier fitted on the
# TF-IDF features. Note that this rebinds `clf` to the new model.
clf = MultinomialNB()
clf.fit(X=tfid_train, y=y_train)


In [93]:
# Predict a label for every document in the TF-IDF test matrix.
y_pred = clf.predict(X=tfid_test)

In [94]:
# Evaluation metrics: accuracy of the TF-IDF model on the test split.
acc_score = accuracy_score(y_true=y_test, y_pred=y_pred)
acc_score

0.8200946870068385

In [95]:
# Confusion matrix for the TF-IDF model, class order FAKE then REAL.
cf_matrix = confusion_matrix(
    y_true=y_test,
    y_pred=y_pred,
    labels=['FAKE', 'REAL'],
)
cf_matrix

array([[592, 328],
       [ 14, 967]], dtype=int64)

In [96]:
# Per-class precision, recall and F1 for the TF-IDF model.
clf_report = classification_report(y_true=y_test, y_pred=y_pred)
print(clf_report)

              precision    recall  f1-score   support

        FAKE       0.98      0.64      0.78       920
        REAL       0.75      0.99      0.85       981

    accuracy                           0.82      1901
   macro avg       0.86      0.81      0.81      1901
weighted avg       0.86      0.82      0.81      1901



In [97]:
# Look up the test document whose index label is 5781.
X_test.loc[[5781]]

5781    The first nine months of 2013 have convinced u...
Name: text, dtype: object

In [107]:
# (rows, columns) of the TF-IDF training matrix.
tfid_train.shape

(4434, 59029)

In [104]:
# Spot check: predict the label of the row at position 1 of the TF-IDF test matrix.
clf.predict(tfid_test[[1], :])

array(['FAKE'], dtype='<U4')

In [100]:
# (rows, columns) of the count-based training matrix, for comparison
# with the TF-IDF matrix shape.
count_train.shape

(4434, 58721)

In [101]:
# Shape of the training text series.
X_train.shape

(4434,)

In [111]:
# Spot check: predict the label of the row at position 15 of the TF-IDF training matrix.
clf.predict(tfid_train[[15], :])

array(['FAKE'], dtype='<U4')

In [119]:
# Feature names (vocabulary terms) exposed by the fitted TF-IDF vectorizer.
feature_names = tfidf_vectorizer.get_feature_names_out()


In [120]:
# Display the feature-name array currently bound to `feature_names`.
feature_names

array(['00', '000', '0000', ..., 'من', 'هذا', 'والمرضى'], dtype=object)

In [123]:
# Rebind `feature_names` to the feature names of the fitted CountVectorizer
# (this replaces the TF-IDF names assigned to the same variable earlier).
feature_names = count_vector.get_feature_names_out()


In [124]:

# Display the feature-name array currently bound to `feature_names`.
feature_names

array(['00', '000', '0000', ..., 'من', 'هذا', 'والمرضى'], dtype=object)

# Hashing Vectorizer

In [164]:
# Split into train and test sets, using the same arguments as the
# earlier splits (30% test, random_state=10).
X_train, X_test, y_train, y_test = train_test_split(
    data['text'],
    data['label'],
    test_size=0.3,
    random_state=10,
)

In [165]:
# Hash the text into 10000 feature columns; alternate_sign=False is set
# so that the resulting feature values are not negative.
vectorizer = HashingVectorizer(
    alternate_sign=False,
    n_features=10000,
)

In [166]:
# Encode the train and test texts with the one shared vectorizer instance
# so both matrices use the same feature columns.
hash_Xtrain = vectorizer.fit_transform(X_train)  # training texts
hash_Xtest = vectorizer.transform(X_test)        # held-out texts

In [167]:
# Model creation: Bernoulli Naive Bayes fitted on the hashed training
# features. It is kept under its own name, separate from `clf`.
clf_hash = BernoulliNB()
clf_hash.fit(X=hash_Xtrain, y=y_train)

In [168]:
# Predict a label for every document in the hashed test matrix.
y_pred = clf_hash.predict(X=hash_Xtest)

In [169]:
# Evaluation metrics: accuracy of the hashing-based model on the test split.
acc_score = accuracy_score(y_true=y_test, y_pred=y_pred)
acc_score

0.7432930036822725

In [170]:
# Confusion matrix for the hashing-based model.
# The class order is pinned explicitly (FAKE, then REAL), matching the other
# confusion-matrix cells in this notebook, so the row/column meaning does not
# depend on the default label ordering.
cf_matrix = confusion_matrix(y_test, y_pred, labels=['FAKE', 'REAL'])
cf_matrix

array([[729, 191],
       [297, 684]], dtype=int64)

In [172]:
# Per-class precision, recall and F1 for the hashing-based model.
clf_report = classification_report(y_true=y_test, y_pred=y_pred)
print(clf_report)

              precision    recall  f1-score   support

        FAKE       0.71      0.79      0.75       920
        REAL       0.78      0.70      0.74       981

    accuracy                           0.74      1901
   macro avg       0.75      0.74      0.74      1901
weighted avg       0.75      0.74      0.74      1901



In [173]:
# Spot check: predict the label of the row at position 1 of the hashed
# test matrix (list index, as in the other spot-check cells).
clf_hash.predict(hash_Xtest[[1]])

array(['FAKE'], dtype='<U4')