In [1]:
%pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re 
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [3]:
# Load the stop-word list, one word per line.
# The encoding is explicit so decoding does not depend on the platform default,
# and blank lines are skipped so no empty string ends up in the list.
with open('english.txt', 'r', encoding='utf-8') as file:
    stripped = (line.strip() for line in file)
    lines = [word for word in stripped if word]

In [4]:
# Show the loaded stop-word list.
print(lines)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [5]:
# Preprocessing: load the training set from train.csv
# (the path is relative to the current working directory).
df = pd.read_csv('train.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'train.csv'

In [None]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [None]:
# (rows, columns) of the frame.
df.shape

(20800, 5)

In [None]:
# Count missing values per column.
df.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [None]:
# Replace every missing value with a single space so that the later string
# concatenation of author and title never encounters NaN.
df = df.fillna(' ')

In [None]:
# Re-check the per-column missing-value counts after the fill.
df.isnull().sum()   

id        0
title     0
author    0
text      0
label     0
dtype: int64

In [None]:
# Merge author name and title into a single `content` feature, then drop the
# two source columns.
# The guard makes the cell safe to re-run: once `author` and `title` are gone,
# an unguarded second execution would raise KeyError.
if {'author', 'title'}.issubset(df.columns):
    df['content'] = df['author'] + ' ' + df['title']
    df = df.drop(columns=['author', 'title'])
df.head()

Unnamed: 0,id,text,label,content
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,1,Darrell Lucus House Dem Aide: We Didn’t Even S...
1,1,Ever get the feeling your life circles the rou...,0,"Daniel J. Flynn FLYNN: Hillary Clinton, Big Wo..."
2,2,"Why the Truth Might Get You Fired October 29, ...",1,Consortiumnews.com Why the Truth Might Get You...
3,3,Videos 15 Civilians Killed In Single US Airstr...,1,Jessica Purkiss 15 Civilians Killed In Single ...
4,4,Print \nAn Iranian woman has been sentenced to...,1,Howard Portnoy Iranian woman jailed for fictio...


In [None]:
# Use the article id as the row index.
# Guarded because `id` stops being a column once it is the index, so an
# unguarded second run of this cell would raise KeyError.
if 'id' in df.columns:
    df = df.set_index('id')

In [None]:
# Split the frame into the target column (Y) and everything else (X).
Y = df['label']
X = df.drop(columns='label')
print(X)


                                                    text   
id                                                         
0      House Dem Aide: We Didn’t Even See Comey’s Let...  \
1      Ever get the feeling your life circles the rou...   
2      Why the Truth Might Get You Fired October 29, ...   
3      Videos 15 Civilians Killed In Single US Airstr...   
4      Print \nAn Iranian woman has been sentenced to...   
...                                                  ...   
20795  Rapper T. I. unloaded on black celebrities who...   
20796  When the Green Bay Packers lost to the Washing...   
20797  The Macy’s of today grew from the union of sev...   
20798  NATO, Russia To Hold Parallel Exercises In Bal...   
20799    David Swanson is an author, activist, journa...   

                                                 content  
id                                                        
0      Darrell Lucus House Dem Aide: We Didn’t Even S...  
1      Daniel J. Flynn FLYNN: Hillary Clin

In [None]:
# Show the target labels.
print(Y)


id
0        1
1        0
2        1
3        1
4        1
        ..
20795    0
20796    0
20797    0
20798    1
20799    1
Name: label, Length: 20800, dtype: int64


In [None]:
# Stemming reduces a word to its base form, e.g. "playing" -> "play".
port_stem = PorterStemmer()


In [None]:
def stemming(content):
    """Return `content` normalised, stripped of stop words, and stemmed.

    Every character that is not an ASCII letter is replaced by a space, the
    text is lower-cased and split on whitespace, tokens found in the
    module-level `lines` stop-word collection are dropped, and each remaining
    token is stemmed with the module-level `port_stem`.

    Parameters
    ----------
    content : str
        Raw text to clean.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces.
    """
    # `lines` is a list, so testing each token against it is a linear scan.
    # A set gives constant-time lookups and still reflects the current `lines`.
    stop_words = frozenset(lines)
    tokens = re.sub('[^a-zA-Z]', ' ', content).lower().split()
    return ' '.join(
        port_stem.stem(token) for token in tokens if token not in stop_words
    )

In [None]:
# Replace the raw content column with its cleaned, stemmed form.
df['content'] = df['content'].apply(stemming)

In [None]:
# Inspect the first rows of the stemmed text.
df['content'].head()

id
0    darrel lucu hous dem aid even see comey letter...
1    daniel j flynn flynn hillari clinton big woman...
2               consortiumnew com truth might get fire
3    jessica purkiss civilian kill singl us airstri...
4    howard portnoy iranian woman jail fiction unpu...
Name: content, dtype: object

In [None]:
# Extract the stemmed text (features) and the labels (target) as arrays.
X = df['content'].values
Y = df['label'].values


In [None]:
# Convert the stemmed text into TF-IDF feature vectors.
# The vectorizer reads from df['content'] rather than from X: this cell rebinds
# X to the resulting sparse matrix, so using X as the input meant a second run
# fed that matrix back into the vectorizer instead of the text.
# NOTE(review): the vocabulary and IDF weights are learned from every row,
# including rows that later land in the test split. Consider splitting first
# and fitting the vectorizer on the training rows only.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['content'].values)

In [None]:
# Printing the sparse matrix lists its stored (row, column) entries and values.
print(X)

  (0, 15686)	0.28485063562728646
  (0, 13473)	0.2565896679337957
  (0, 8909)	0.3635963806326075
  (0, 8630)	0.29212514087043684
  (0, 7692)	0.24785219520671603
  (0, 7005)	0.21874169089359144
  (0, 4973)	0.233316966909351
  (0, 3792)	0.2705332480845492
  (0, 3600)	0.3598939188262559
  (0, 2959)	0.2468450128533713
  (0, 2483)	0.3676519686797209
  (0, 267)	0.27010124977708766
  (1, 16799)	0.30071745655510157
  (1, 6816)	0.1904660198296849
  (1, 5503)	0.7143299355715573
  (1, 3568)	0.26373768806048464
  (1, 2813)	0.19094574062359204
  (1, 2223)	0.3827320386859759
  (1, 1894)	0.15521974226349364
  (1, 1497)	0.2939891562094648
  (2, 15611)	0.41544962664721613
  (2, 9620)	0.49351492943649944
  (2, 5968)	0.3474613386728292
  (2, 5389)	0.3866530551182615
  (2, 3103)	0.46097489583229645
  :	:
  (20797, 13122)	0.2482526352197606
  (20797, 12344)	0.27263457663336677
  (20797, 12138)	0.24778257724396507
  (20797, 10306)	0.08038079000566466
  (20797, 9588)	0.174553480255222
  (20797, 9518)	0.295420

In [None]:
# Hold out 20% of the rows for testing. Stratifying on Y keeps the label
# proportions the same in both splits; random_state fixes the shuffle.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)


In [None]:
# Sanity-check the shapes of the four splits.
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

((16640, 17128), (4160, 17128), (16640,), (4160,))

In [None]:
# Model: fit a logistic-regression classifier on the training split.
model = LogisticRegression()
model.fit(X_train, Y_train)

# Accuracy on the data the model was fitted on.
# scikit-learn metrics take (y_true, y_pred) in that order.
train_prediction = model.predict(X_train)
train_accuracy = accuracy_score(Y_train, train_prediction)
print('Accuracy on training data : ', train_accuracy)

Accuracy on training data :  0.9865985576923076


In [None]:

# Accuracy on the held-out test split.
# scikit-learn metrics take (y_true, y_pred) in that order.
test_prediction = model.predict(X_test)
test_accuracy = accuracy_score(Y_test, test_prediction)
print('Accuracy on test data : ', test_accuracy)

Accuracy on test data :  0.9795673076923077


In [None]:

# Classify one held-out sample (row 30 of the test split) and report the
# verdict: label 0 is printed as real, anything else as fake.
X_new = X_test[30]
prediction = model.predict(X_new)
print(prediction)

print('The news is Real' if prediction[0] == 0 else 'The news is Fake')

[1]
The news is Fake
