In [None]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# Download the NLTK 'stopwords' corpus so stopwords.words('english') can be read below.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Print the English stop-word list that will be filtered out during stemming.
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [None]:
# Load the dataset.
# The location lives in one named constant so it is easy to repoint when the
# notebook is run outside the environment that provides /content.
DATA_PATH = '/content/final_10000_full_columns.csv'

# NOTE(review): on_bad_lines='skip' drops malformed rows without any message,
# so the loaded row count can be smaller than the file's - check
# news_dataset.shape against the expected number of records.
news_dataset = pd.read_csv(DATA_PATH, engine='python', on_bad_lines='skip')

In [None]:
# Dimensions of the loaded frame as (rows, columns).
news_dataset.shape

(3378, 5)

In [None]:
# Preview the first five rows of the dataset.
news_dataset.head()

Unnamed: 0,id,title,author,text,label
0,1,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,2,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,3,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,4,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,5,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [None]:
# Count the missing values in each column.
news_dataset.isnull().sum()

Unnamed: 0,0
id,0
title,0
author,0
text,0
label,0


In [None]:
# Replace any null values with an empty string so the later string
# operations on 'author' and 'title' see text rather than NaN.
news_dataset = news_dataset.fillna('')


In [None]:
# Build the model input text: author name and news title joined by a single space.
news_dataset['content'] = news_dataset['author']+' '+news_dataset['title']

In [None]:
# Show the combined author + title text.
print(news_dataset['content'])

0      Darrell Lucus House Dem Aide: We Didn’t Even S...
1      Daniel J. Flynn FLYNN: Hillary Clinton, Big Wo...
2      Consortiumnews.com Why the Truth Might Get You...
3      Jessica Purkiss 15 Civilians Killed In Single ...
4      Howard Portnoy Iranian woman jailed for fictio...
                             ...                        
386    Kenneth Chang Visions of Life on Mars in Earth...
387    Christine Hauser After Obama Cancels Talk, Rod...
388    Patrick McGeehan New Jersey Will Increase Gas ...
389    Ian Hanchett ABC: Manchester Attack ’Likely to...
390    Nathanial Mauka Hidden in plain sight – The gl...
Name: content, Length: 391, dtype: object


In [None]:
# Split the frame into the target ('label') and the features (every other column).
Y = news_dataset['label']
X = news_dataset.drop(columns=['label'])

In [None]:
# Inspect the feature frame and the label series.
print(X)
print(Y)

      id                                              title  \
0      1  House Dem Aide: We Didn’t Even See Comey’s Let...   
1      2  FLYNN: Hillary Clinton, Big Woman on Campus - ...   
2      3                  Why the Truth Might Get You Fired   
3      4  15 Civilians Killed In Single US Airstrike Hav...   
4      5  Iranian woman jailed for fictional unpublished...   
..   ...                                                ...   
386  387  Visions of Life on Mars in Earth’s Depths - Th...   
387  388  After Obama Cancels Talk, Rodrigo Duterte of P...   
388  389  New Jersey Will Increase Gas Tax 23¢, Ending L...   
389  390  ABC: Manchester Attack ’Likely to Inflame Anti...   
390  391  Hidden in plain sight – The global depopulatio...   

                 author                                               text  \
0         Darrell Lucus  House Dem Aide: We Didn’t Even See Comey’s Let...   
1       Daniel J. Flynn  Ever get the feeling your life circles the rou...   
2    Cons

In [None]:
# Stemming: shared Porter stemmer instance used by stemming() below.
port_stem=PorterStemmer()


In [None]:
def stemming(content):
  """Normalise one text string into space-separated Porter stems.

  Every character that is not an ASCII letter is replaced by a space, the
  text is lower-cased and split on whitespace, English stop words are
  dropped, and each remaining word is stemmed with the module-level
  ``port_stem``.

  Args:
    content: Raw text (str) to normalise.

  Returns:
    A single string containing the stemmed words joined by one space.
  """
  # The stop-word list used to be rebuilt and scanned linearly for every
  # token. Load it once, keep it as a frozenset for O(1) membership tests,
  # and cache it on the function so later calls reuse it.
  stop_words = getattr(stemming, '_stop_words', None)
  if stop_words is None:
    stop_words = frozenset(stopwords.words('english'))
    stemming._stop_words = stop_words

  tokens = re.sub('[^a-zA-Z]', ' ', content).lower().split()
  return ' '.join(port_stem.stem(word) for word in tokens if word not in stop_words)

In [None]:
# Stem every entry of the 'content' column. The column is overwritten in place,
# so re-running this cell feeds already-stemmed text through stemming() again.
news_dataset['content'] = news_dataset['content'].apply(stemming)

In [None]:
# Show the stemmed text.
print(news_dataset['content'])

0      darrel lucu hous dem aid even see comey letter...
1      daniel j flynn flynn hillari clinton big woman...
2                 consortiumnew com truth might get fire
3      jessica purkiss civilian kill singl us airstri...
4      howard portnoy iranian woman jail fiction unpu...
                             ...                        
590    jafe arnoldski norepli blogger com authoritari...
591    chri tomlinson swedish journo sweden collaps w...
592    newsbiscuit editori team promis internet caree...
593    trip gabriel iowa trump voter unfaz controvers...
594    elliot bougi radic american grown marxist terr...
Name: content, Length: 595, dtype: object


In [None]:
# Rebind X to the stemmed text and Y to the labels, both taken as arrays via
# .values; any earlier X / Y bindings are replaced.
X=news_dataset['content'].values
Y=news_dataset['label'].values

In [None]:
# Inspect the stemmed documents that will be vectorized.
print(X)

['darrel lucu hous dem aid even see comey letter jason chaffetz tweet'
 'daniel j flynn flynn hillari clinton big woman campu breitbart'
 'consortiumnew com truth might get fire'
 'jessica purkiss civilian kill singl us airstrik identifi'
 'howard portnoy iranian woman jail fiction unpublish stori woman stone death adulteri'
 'daniel nussbaum jacki mason hollywood would love trump bomb north korea lack tran bathroom exclus video breitbart'
 'life life luxuri elton john favorit shark pictur stare long transcontinent flight'
 'alissa j rubin beno hamon win french socialist parti presidenti nomin new york time'
 'excerpt draft script donald trump q ampa black church pastor new york time'
 'megan twohey scott shane back channel plan ukrain russia courtesi trump associ new york time'
 'aaron klein obama organ action partner soro link indivis disrupt trump agenda'
 'chri tomlinson bbc comedi sketch real housew isi caus outrag'
 'amando flavio russian research discov secret nazi militari base

In [None]:
# Inspect the label array.
print(Y)

[1 0 1 1 1 0 1 0 0 0 0 0 1 1 1 0 0 1 1 0 1 0 0 1 0 1 0 1 0 0 0 1 0 0 0 0 1
 1 0 0 0 1 0 1 0 0 1 1 0 0 1 1 0 1 1 1 1 1 1 0 0 1 0 1 0 0 1 1 0 1 1 1 0 0
 0 0 1 1 0 0 0 0 1 1 0 0 1 0 1 0 0 1 0 1 0 0 1 0 0 0 1 0 0 1 0 1 0 1 0 0 0
 0 1 0 1 0 0 0 0 0 1 1 1 0 1 1 1 0 0 0 0 1 0 0 1 1 0 1 1 0 1 1 1 1 1 1 1 0
 1 0 1 1 0 0 0 0 1 0 0 1 0 1 1 1 1 0 0 0 0 1 1 0 1 1 0 1 0 1 1 1 0 0 0 0 1
 0 1 0 0 1 1 0 1 1 1 1 1 1 1 0 0 1 0 1 1 0 0 1 0 1 1 0 1 0 1 1 1 0 0 0 0 1
 0 1 0 0 1 0 1 1 0 1 0 1 0 1 1 0 1 1 0 0 1 0 0 0 0 0 1 0 1 0 0 1 1 0 0 1 0
 0 1 1 0 0 0 0 1 1 0 0 0 0 0 1 0 0 1 1 1 1 0 1 0 1 1 1 0 0 1 0 1 0 1 0 1 1
 0 0 0 1 0 0 0 0 1 0 0 1 0 1 0 0 1 0 0 0 0 1 1 1 0 0 1 0 1 0 0 0 0 0 0 0 0
 1 0 1 1 1 1 0 0 1 0 1 1 1 0 1 1 0 1 0 0 1 0 0 1 0 0 0 1 1 1 1 1 0 0 1 0 0
 0 1 1 1 0 0 1 0 1 1 0 0 1 1 1 1 0 0 1 0 0 1 1 1 0 0 0 1 1 1 0 0 0 0 0 1 1
 1 0 1 1 0 0 0 0 1 1 0 0 0 1 1 1 0 0 0 0 0 1 0 1 0 1 1 1 1 0 0 1 0 0 0 1 1
 1 0 0 0 0 1 1 0 1 0 0 0 1 0 1 0 1 0 0 0 1 1 1 1 0 0 1 1 0 0 1 1 1 0 1 1 1
 1 1 1 0 1 0 0 1 1 1 0 1 

In [None]:
# Shape of the label array (one entry per document).
Y.shape

(595,)

In [None]:
# Turn the stemmed text into TF-IDF features.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split further down, so its vocabulary and IDF weights are learned from the
# test documents as well. Consider splitting first and fitting on the
# training rows only.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

In [None]:
# Inspect the TF-IDF matrix (printed as coordinate / value pairs).
print(X)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 6024 stored elements and shape (595, 2743)>
  Coords	Values
  (0, 54)	0.284150949244751
  (0, 413)	0.3291905211342467
  (0, 506)	0.25525874415238103
  (0, 611)	0.30926019307344954
  (0, 642)	0.284150949244751
  (0, 831)	0.284150949244751
  (0, 1156)	0.22697715855343237
  (0, 1264)	0.2676119131474156
  (0, 1410)	0.2951194002739752
  (0, 1465)	0.30926019307344954
  (0, 2160)	0.30926019307344954
  (0, 2537)	0.2951194002739752
  (1, 255)	0.31609730671062225
  (1, 318)	0.1680197106847445
  (1, 371)	0.3662005613628044
  (1, 480)	0.2238841332311952
  (1, 607)	0.29039726336806543
  (1, 933)	0.6880590359676952
  (1, 1132)	0.2218955314985701
  (1, 2696)	0.28395682560752145
  (2, 502)	0.3459658244777175
  (2, 532)	0.45813605677608127
  (2, 913)	0.36352646651014536
  (2, 1001)	0.37046664595496126
  (2, 1565)	0.45813605677608127
  :	:
  (593, 544)	0.38446914824347483
  (593, 977)	0.40924633077575057
  (593, 1233)	0.40924633077575057
  (5

In [None]:
# Hold out 20% of the rows for testing. The split is stratified on Y and uses
# a fixed random_state so it is repeatable across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [None]:
# Training the model: logistic regression, constructed with its default arguments.
model=LogisticRegression()


In [None]:
# Fit the classifier on the training split.
model.fit(X_train,Y_train)

In [None]:
# Evaluation: accuracy score on the training data.
X_train_prediction = model.predict(X_train)
# accuracy_score is documented as (y_true, y_pred): ground truth first,
# predictions second. The arguments were previously passed the other way round.
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [None]:
# Report the accuracy measured on the training split.
print("Accuracy Score of the training data:", training_data_accuracy)

Accuracy Score of the training data: 0.9789915966386554


In [None]:
# Evaluation: accuracy score on the held-out testing data.
X_test_prediction = model.predict(X_test)
# accuracy_score is documented as (y_true, y_pred): ground truth first,
# predictions second. The arguments were previously passed the other way round.
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [None]:
# Report the accuracy measured on the testing split (message typo "Accuarcy" fixed).
print("Accuracy score of the testing data:", test_data_accuracy)

Accuarcy score of the testing data: 0.9243697478991597


In [None]:
# Making a predictive system: classify the first row of the test split.
X_new = X_test[0]
prediction = model.predict(X_new)
print(prediction)

# Label convention used by this notebook: 0 is reported as real, anything else as fake.
if prediction[0] == 0:
  print("The news is Real")
else:
  print("The news is Fake")

[0]
The news is Real


In [None]:
# Ask which test-split row to classify. int() raises ValueError on
# non-numeric input.
index = int(input("Enter the index of the news sample to predict: "))

# Fail early with a clear message when the index falls outside the test split.
# Negative indices that count from the end remain accepted, as before.
n_test = X_test.shape[0]
if not -n_test <= index < n_test:
    raise IndexError(f"index {index} is out of range for {n_test} test samples")

# Select the requested row; no explicit reshape is performed before predicting.
X_new = X_test[index]

prediction = model.predict(X_new)

print("Prediction:", prediction)

# Label convention used by this notebook: 0 is reported as real, anything else as fake.
if prediction[0] == 0:
    print("The news is Real")
else:
    print("The news is Fake")

Enter the index of the news sample to predict: 8
Prediction: [0]
The news is Real


In [None]:
# Show the ground-truth label for the same test row, using the index entered above.
print("Actual label:", Y_test[index])


Actual label: 0
