# About Dataset

### train.csv: A full training dataset with the following attributes
         
- id: unique id for a news article

- title: the title of a news article

- author: author of the news article

- text: the text of the article; could be incomplete.

- label: a label that marks the article as potentially unreliable.

                     1: unreliable (Fake News)
                     0: reliable   (Real News)


In [77]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import re
import time
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the dataset

data = pd.read_csv('train.csv')
data

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1
...,...,...,...,...,...
20795,20795,Rapper T.I.: Trump a ’Poster Child For White S...,Jerome Hudson,Rapper T. I. unloaded on black celebrities who...,0
20796,20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",Benjamin Hoffman,When the Green Bay Packers lost to the Washing...,0
20797,20797,Macy’s Is Said to Receive Takeover Approach by...,Michael J. de la Merced and Rachel Abrams,The Macy’s of today grew from the union of sev...,0
20798,20798,"NATO, Russia To Hold Parallel Exercises In Bal...",Alex Ansary,"NATO, Russia To Hold Parallel Exercises In Bal...",1


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      20800 non-null  int64 
 1   title   20242 non-null  object
 2   author  18843 non-null  object
 3   text    20761 non-null  object
 4   label   20800 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 812.6+ KB


In [6]:
# data.describe()

# Analysing the dataset - EDA

In [8]:
data['label'].value_counts()

1    10413
0    10387
Name: label, dtype: int64

In [9]:
data.columns

Index(['id', 'title', 'author', 'text', 'label'], dtype='object')

In [11]:
data['title'][0]

'House Dem Aide: We Didn’t Even See Comey’s Letter Until Jason Chaffetz Tweeted It'

In [13]:
data['author'][0]

'Darrell Lucus'

In [12]:
data['text'][0]

'House Dem Aide: We Didn’t Even See Comey’s Letter Until Jason Chaffetz Tweeted It By Darrell Lucus on October 30, 2016 Subscribe Jason Chaffetz on the stump in American Fork, Utah ( image courtesy Michael Jolley, available under a Creative Commons-BY license) \nWith apologies to Keith Olbermann, there is no doubt who the Worst Person in The World is this week–FBI Director James Comey. But according to a House Democratic aide, it looks like we also know who the second-worst person is as well. It turns out that when Comey sent his now-infamous letter announcing that the FBI was looking into emails that may be related to Hillary Clinton’s email server, the ranking Democrats on the relevant committees didn’t hear about it from Comey. They found out via a tweet from one of the Republican committee chairmen. \nAs we now know, Comey notified the Republican chairmen and Democratic ranking members of the House Intelligence, Judiciary, and Oversight committees that his agency was reviewing emai

# Missing Data

In [14]:
data.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [15]:
data.fillna(' ', inplace=True)

In [16]:
data.isnull().sum()

id        0
title     0
author    0
text      0
label     0
dtype: int64

In [17]:
'ab' + 'cd'

'abcd'

In [28]:
'ab' + ' ' +  'cd'

'ab cd'

In [18]:
'ab' + ' '

'ab '

In [21]:
'ab '.replace(' ', '$')

'ab$'

# Duplicated data

In [23]:
data.duplicated().sum()

0

In [24]:
data[data.duplicated()]

Unnamed: 0,id,title,author,text,label


# Data  Manipulation

In [25]:
data

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1
...,...,...,...,...,...
20795,20795,Rapper T.I.: Trump a ’Poster Child For White S...,Jerome Hudson,Rapper T. I. unloaded on black celebrities who...,0
20796,20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",Benjamin Hoffman,When the Green Bay Packers lost to the Washing...,0
20797,20797,Macy’s Is Said to Receive Takeover Approach by...,Michael J. de la Merced and Rachel Abrams,The Macy’s of today grew from the union of sev...,0
20798,20798,"NATO, Russia To Hold Parallel Exercises In Bal...",Alex Ansary,"NATO, Russia To Hold Parallel Exercises In Bal...",1


In [26]:
data['author']   

0                                    Darrell Lucus
1                                  Daniel J. Flynn
2                               Consortiumnews.com
3                                  Jessica Purkiss
4                                   Howard Portnoy
                           ...                    
20795                                Jerome Hudson
20796                             Benjamin Hoffman
20797    Michael J. de la Merced and Rachel Abrams
20798                                  Alex Ansary
20799                                David Swanson
Name: author, Length: 20800, dtype: object

In [27]:
data['title']

0        House Dem Aide: We Didn’t Even See Comey’s Let...
1        FLYNN: Hillary Clinton, Big Woman on Campus - ...
2                        Why the Truth Might Get You Fired
3        15 Civilians Killed In Single US Airstrike Hav...
4        Iranian woman jailed for fictional unpublished...
                               ...                        
20795    Rapper T.I.: Trump a ’Poster Child For White S...
20796    N.F.L. Playoffs: Schedule, Matchups and Odds -...
20797    Macy’s Is Said to Receive Takeover Approach by...
20798    NATO, Russia To Hold Parallel Exercises In Bal...
20799                            What Keeps the F-35 Alive
Name: title, Length: 20800, dtype: object

In [30]:
data['author'] + ' ' + data['title']

0        Darrell Lucus House Dem Aide: We Didn’t Even S...
1        Daniel J. Flynn FLYNN: Hillary Clinton, Big Wo...
2        Consortiumnews.com Why the Truth Might Get You...
3        Jessica Purkiss 15 Civilians Killed In Single ...
4        Howard Portnoy Iranian woman jailed for fictio...
                               ...                        
20795    Jerome Hudson Rapper T.I.: Trump a ’Poster Chi...
20796    Benjamin Hoffman N.F.L. Playoffs: Schedule, Ma...
20797    Michael J. de la Merced and Rachel Abrams Macy...
20798    Alex Ansary NATO, Russia To Hold Parallel Exer...
20799              David Swanson What Keeps the F-35 Alive
Length: 20800, dtype: object

In [31]:
data['content'] = data['author'] + ' ' + data['title']

In [33]:
data.head()

Unnamed: 0,id,title,author,text,label,content
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,Darrell Lucus House Dem Aide: We Didn’t Even S...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,"Daniel J. Flynn FLYNN: Hillary Clinton, Big Wo..."
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,Consortiumnews.com Why the Truth Might Get You...
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,Jessica Purkiss 15 Civilians Killed In Single ...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,Howard Portnoy Iranian woman jailed for fictio...


In [35]:
data['content'][0]

'Darrell Lucus House Dem Aide: We Didn’t Even See Comey’s Letter Until Jason Chaffetz Tweeted It'

In [40]:
data

Unnamed: 0,id,title,author,text,label,content
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,Darrell Lucus House Dem Aide: We Didn’t Even S...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,"Daniel J. Flynn FLYNN: Hillary Clinton, Big Wo..."
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,Consortiumnews.com Why the Truth Might Get You...
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,Jessica Purkiss 15 Civilians Killed In Single ...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,Howard Portnoy Iranian woman jailed for fictio...
...,...,...,...,...,...,...
20795,20795,Rapper T.I.: Trump a ’Poster Child For White S...,Jerome Hudson,Rapper T. I. unloaded on black celebrities who...,0,Jerome Hudson Rapper T.I.: Trump a ’Poster Chi...
20796,20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",Benjamin Hoffman,When the Green Bay Packers lost to the Washing...,0,"Benjamin Hoffman N.F.L. Playoffs: Schedule, Ma..."
20797,20797,Macy’s Is Said to Receive Takeover Approach by...,Michael J. de la Merced and Rachel Abrams,The Macy’s of today grew from the union of sev...,0,Michael J. de la Merced and Rachel Abrams Macy...
20798,20798,"NATO, Russia To Hold Parallel Exercises In Bal...",Alex Ansary,"NATO, Russia To Hold Parallel Exercises In Bal...",1,"Alex Ansary NATO, Russia To Hold Parallel Exer..."


# Final Dataframe

In [41]:
news_df = data[['content', 'label']]
news_df

Unnamed: 0,content,label
0,Darrell Lucus House Dem Aide: We Didn’t Even S...,1
1,"Daniel J. Flynn FLYNN: Hillary Clinton, Big Wo...",0
2,Consortiumnews.com Why the Truth Might Get You...,1
3,Jessica Purkiss 15 Civilians Killed In Single ...,1
4,Howard Portnoy Iranian woman jailed for fictio...,1
...,...,...
20795,Jerome Hudson Rapper T.I.: Trump a ’Poster Chi...,0
20796,"Benjamin Hoffman N.F.L. Playoffs: Schedule, Ma...",0
20797,Michael J. de la Merced and Rachel Abrams Macy...,0
20798,"Alex Ansary NATO, Russia To Hold Parallel Exer...",1


In [44]:
news_df['content'][0]

'Darrell Lucus House Dem Aide: We Didn’t Even See Comey’s Letter Until Jason Chaffetz Tweeted It'

# Text Preprocessing Steps:


- Punctuation Removal
- Lower Case
- Tokenization
- Stopwords Removal
- Stemming

In [45]:
news_df['content'][0]

'Darrell Lucus House Dem Aide: We Didn’t Even See Comey’s Letter Until Jason Chaffetz Tweeted It'

In [49]:
# Sample

t = re.sub('[^a-z A-Z]', ' ', news_df['content'][0])
t

'Darrell Lucus House Dem Aide  We Didn t Even See Comey s Letter Until Jason Chaffetz Tweeted It'

In [52]:
t = t.lower()
t

'darrell lucus house dem aide  we didn t even see comey s letter until jason chaffetz tweeted it'

In [54]:
t = t.split()
print(t)

['darrell', 'lucus', 'house', 'dem', 'aide', 'we', 'didn', 't', 'even', 'see', 'comey', 's', 'letter', 'until', 'jason', 'chaffetz', 'tweeted', 'it']


In [55]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [None]:
# Final Function

In [72]:
ps = PorterStemmer()

def text_preprocessing(text):
    
    cleaned_data = re.sub('[^a-z A-Z]', ' ', text)
    lower_text = cleaned_data.lower()
    tokens = lower_text.split()
    stemmed_data = [ps.stem(word) for word in tokens if word not in stopwords.words('english')]
    pure_str = ' '.join(stemmed_data)
    
    return pure_str

In [73]:
text_preprocessing(news_df['content'][0])

'darrel lucu hous dem aid even see comey letter jason chaffetz tweet'

In [75]:
news_df['content'].apply(text_preprocessing)

0        darrel lucu hous dem aid even see comey letter...
1        daniel j flynn flynn hillari clinton big woman...
2                   consortiumnew com truth might get fire
3        jessica purkiss civilian kill singl us airstri...
4        howard portnoy iranian woman jail fiction unpu...
                               ...                        
20795    jerom hudson rapper trump poster child white s...
20796    benjamin hoffman n f l playoff schedul matchup...
20797    michael j de la merc rachel abram maci said re...
20798    alex ansari nato russia hold parallel exercis ...
20799                            david swanson keep f aliv
Name: content, Length: 20800, dtype: object

In [76]:
news_df

Unnamed: 0,content,label
0,Darrell Lucus House Dem Aide: We Didn’t Even S...,1
1,"Daniel J. Flynn FLYNN: Hillary Clinton, Big Wo...",0
2,Consortiumnews.com Why the Truth Might Get You...,1
3,Jessica Purkiss 15 Civilians Killed In Single ...,1
4,Howard Portnoy Iranian woman jailed for fictio...,1
...,...,...
20795,Jerome Hudson Rapper T.I.: Trump a ’Poster Chi...,0
20796,"Benjamin Hoffman N.F.L. Playoffs: Schedule, Ma...",0
20797,Michael J. de la Merced and Rachel Abrams Macy...,0
20798,"Alex Ansary NATO, Russia To Hold Parallel Exer...",1


In [79]:
import time

start = time.time()

news_df['content'] = news_df['content'].apply(text_preprocessing)

end = time.time() - start
print(f'Time taken to complete this process: {end} seconds.')

Time taken to complete this process: 68.31523776054382 seconds.


In [80]:
news_df

Unnamed: 0,content,label
0,darrel lucu hous dem aid even see comey letter...,1
1,daniel j flynn flynn hillari clinton big woman...,0
2,consortiumnew com truth might get fire,1
3,jessica purkiss civilian kill singl us airstri...,1
4,howard portnoy iranian woman jail fiction unpu...,1
...,...,...
20795,jerom hudson rapper trump poster child white s...,0
20796,benjamin hoffman n f l playoff schedul matchup...,0
20797,michael j de la merc rachel abram maci said re...,0
20798,alex ansari nato russia hold parallel exercis ...,1


# Separate the data and label

In [81]:
news_df['content']

0        darrel lucu hous dem aid even see comey letter...
1        daniel j flynn flynn hillari clinton big woman...
2                   consortiumnew com truth might get fire
3        jessica purkiss civilian kill singl us airstri...
4        howard portnoy iranian woman jail fiction unpu...
                               ...                        
20795    jerom hudson rapper trump poster child white s...
20796    benjamin hoffman n f l playoff schedul matchup...
20797    michael j de la merc rachel abram maci said re...
20798    alex ansari nato russia hold parallel exercis ...
20799                            david swanson keep f aliv
Name: content, Length: 20800, dtype: object

In [82]:
news_df['content'].values

array(['darrel lucu hous dem aid even see comey letter jason chaffetz tweet',
       'daniel j flynn flynn hillari clinton big woman campu breitbart',
       'consortiumnew com truth might get fire', ...,
       'michael j de la merc rachel abram maci said receiv takeov approach hudson bay new york time',
       'alex ansari nato russia hold parallel exercis balkan',
       'david swanson keep f aliv'], dtype=object)

In [84]:
news_df['label'].values

array([1, 0, 1, ..., 0, 1, 1], dtype=int64)

In [85]:
X = news_df['content'].values
y = news_df['label'].values

# Encode the textual data into vectors using TfidfVectorizer

In [86]:
vector = TfidfVectorizer()
vector.fit(X)

In [88]:
X = vector.transform(X)
X

<20800x17128 sparse matrix of type '<class 'numpy.float64'>'
	with 210687 stored elements in Compressed Sparse Row format>

In [89]:
print(X)

  (0, 15686)	0.28485063562728646
  (0, 13473)	0.2565896679337957
  (0, 8909)	0.3635963806326075
  (0, 8630)	0.29212514087043684
  (0, 7692)	0.24785219520671603
  (0, 7005)	0.21874169089359144
  (0, 4973)	0.233316966909351
  (0, 3792)	0.2705332480845492
  (0, 3600)	0.3598939188262559
  (0, 2959)	0.2468450128533713
  (0, 2483)	0.3676519686797209
  (0, 267)	0.27010124977708766
  (1, 16799)	0.30071745655510157
  (1, 6816)	0.1904660198296849
  (1, 5503)	0.7143299355715573
  (1, 3568)	0.26373768806048464
  (1, 2813)	0.19094574062359204
  (1, 2223)	0.3827320386859759
  (1, 1894)	0.15521974226349364
  (1, 1497)	0.2939891562094648
  (2, 15611)	0.41544962664721613
  (2, 9620)	0.49351492943649944
  (2, 5968)	0.3474613386728292
  (2, 5389)	0.3866530551182615
  (2, 3103)	0.46097489583229645
  :	:
  (20797, 13122)	0.2482526352197606
  (20797, 12344)	0.27263457663336677
  (20797, 12138)	0.24778257724396507
  (20797, 10306)	0.08038079000566466
  (20797, 9588)	0.174553480255222
  (20797, 9518)	0.295420

In [91]:
news_df['content'][0]

'darrel lucu hous dem aid even see comey letter jason chaffetz tweet'

In [92]:
news_df['content'][1]

'daniel j flynn flynn hillari clinton big woman campu breitbart'

In [94]:
news_df['content'][20798]

'alex ansari nato russia hold parallel exercis balkan'

# Splitting the data

In [96]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [97]:
X_train.shape, X_test.shape

((16640, 17128), (4160, 17128))

In [99]:
print(X_train)

  (0, 16996)	0.08592732658546715
  (0, 16462)	0.22659461042913195
  (0, 15295)	0.08431126902348997
  (0, 14457)	0.2911481622175399
  (0, 13216)	0.26345814215284585
  (0, 12136)	0.4007594909617104
  (0, 11074)	0.2935927067894611
  (0, 10399)	0.3219768275280276
  (0, 10306)	0.0830590692063111
  (0, 9263)	0.3170829514133382
  (0, 4334)	0.2989943819613284
  (0, 3407)	0.2877460193062496
  (0, 2453)	0.27412723528216965
  (0, 1012)	0.2733679885556474
  (1, 14822)	0.25693783181358953
  (1, 13285)	0.2621146156989141
  (1, 12739)	0.3077610518061034
  (1, 10577)	0.18822294073887524
  (1, 10234)	0.24642752666048326
  (1, 8222)	0.6326153588265743
  (1, 7824)	0.19284257070506372
  (1, 6633)	0.24613867690271277
  (1, 1728)	0.3362916285372642
  (1, 837)	0.25167609243564176
  (2, 16947)	0.25403995310175265
  :	:
  (16636, 277)	0.26249197175441774
  (16637, 16996)	0.11875104913510133
  (16637, 15295)	0.11651766729286954
  (16637, 14594)	0.4668797273883944
  (16637, 12906)	0.3737884244063731
  (16637, 11

# Model Building

In [100]:
model = LogisticRegression()
model.fit(X_train, y_train)

In [101]:
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

In [104]:
y_train[:8]

array([0, 0, 1, 0, 0, 0, 0, 1], dtype=int64)

In [105]:
y_train_pred[:8]

array([0, 0, 1, 0, 0, 0, 0, 1], dtype=int64)

In [106]:
y_test[:8]

array([1, 1, 0, 0, 1, 0, 1, 1], dtype=int64)

In [107]:
y_test_pred[:8]

array([1, 1, 0, 0, 1, 0, 1, 1], dtype=int64)

# Training and Test Evaluation

In [None]:
# Training Evaluation

In [108]:
confusion_matrix(y_train, y_train_pred)

array([[8089,  166],
       [  43, 8342]], dtype=int64)

In [109]:
accuracy_score(y_train, y_train_pred)

0.9874399038461539

In [110]:
print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

           0       0.99      0.98      0.99      8255
           1       0.98      0.99      0.99      8385

    accuracy                           0.99     16640
   macro avg       0.99      0.99      0.99     16640
weighted avg       0.99      0.99      0.99     16640



In [None]:
# Test Evaluation

In [111]:
confusion_matrix(y_test, y_test_pred)

array([[2045,   87],
       [  21, 2007]], dtype=int64)

In [112]:
accuracy_score(y_test, y_test_pred)

0.9740384615384615

In [113]:
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           0       0.99      0.96      0.97      2132
           1       0.96      0.99      0.97      2028

    accuracy                           0.97      4160
   macro avg       0.97      0.97      0.97      4160
weighted avg       0.97      0.97      0.97      4160



# Detection System

In [114]:
print(X_test)

  (0, 16348)	0.22583100441636614
  (0, 15186)	0.39848485655580945
  (0, 14932)	0.29893567442456387
  (0, 13473)	0.27475791963366003
  (0, 12771)	0.2883148928980967
  (0, 12305)	0.27716565528409937
  (0, 11684)	0.25367875912575993
  (0, 6565)	0.31054968535837807
  (0, 4625)	0.19641692014138545
  (0, 1465)	0.4354344005745577
  (0, 114)	0.27646465077777255
  (1, 15582)	0.1498653484999072
  (1, 14444)	0.26400020318193473
  (1, 13503)	0.4512102901935686
  (1, 13046)	0.2563994975288171
  (1, 12774)	0.2988350814382177
  (1, 10315)	0.23985909373735884
  (1, 3721)	0.37257665177690863
  (1, 809)	0.42107119803033977
  (1, 421)	0.4190032645727401
  (2, 16996)	0.10545861987603505
  (2, 15582)	0.13124949250519383
  (2, 15295)	0.10347523220532906
  (2, 13270)	0.2168795415288745
  (2, 12404)	0.376783399672724
  :	:
  (4157, 10177)	0.344964447796332
  (4157, 9062)	0.3421341069854292
  (4157, 7692)	0.3007953576697398
  (4157, 4951)	0.33945086484840337
  (4157, 4505)	0.3651807269959845
  (4157, 4174)	0.3

In [140]:
for i in vector.get_feature_names_out():
    print(i)

aa
aaa
aap
aapi
aargh
aaron
ab
abad
abandon
abandona
abandonado
abaten
abba
abbi
abbott
abc
abd
abdel
abdelmoumen
abdeslam
abdic
abduct
abductor
abdul
abe
abedi
abedin
abellera
abelson
aber
abgelegenen
abgesprochen
abhorr
abil
abinico
abl
aboard
abolish
abolit
abord
aborigin
abort
abound
abram
abramo
abramovi
abramson
abran
abri
abridg
abroad
abrog
abrupt
abruptli
absenc
absent
absente
absichtlich
absolut
absorb
abstain
absurd
absurdli
abu
abus
abyss
aca
acaba
academ
academi
academia
acc
acceler
accent
accept
access
accid
accident
accidentel
acclaim
accommod
accompani
accomplic
accomplish
acconci
accord
account
acct
accur
accus
acedemi
aceh
ach
achat
acheron
achiev
acid
ackerberg
acknowledg
aclara
aclu
acn
acoplan
acosta
acoust
acquir
acquisit
acquit
acquitt
acr
across
act
action
activ
activist
activistpost
actor
actress
actriz
actual
actuari
acuerdo
acuesta
acupunctur
acura
acut
ad
adachi
adalia
adam
adamski
adan
adapt
adazi
add
adderal
addestramento
addict
addit
address
adebayo
adeel

In [142]:
X_test.shape

(4160, 17128)

In [145]:
print(X_test[4159])

  (0, 15582)	0.1868085182590659
  (0, 13130)	0.4655130183597939
  (0, 12801)	0.4700680983275486
  (0, 11787)	0.32716285910150367
  (0, 10747)	0.5776786277147142
  (0, 4625)	0.2944283561145273


In [147]:
print(X_test)

  (0, 16348)	0.22583100441636614
  (0, 15186)	0.39848485655580945
  (0, 14932)	0.29893567442456387
  (0, 13473)	0.27475791963366003
  (0, 12771)	0.2883148928980967
  (0, 12305)	0.27716565528409937
  (0, 11684)	0.25367875912575993
  (0, 6565)	0.31054968535837807
  (0, 4625)	0.19641692014138545
  (0, 1465)	0.4354344005745577
  (0, 114)	0.27646465077777255
  (1, 15582)	0.1498653484999072
  (1, 14444)	0.26400020318193473
  (1, 13503)	0.4512102901935686
  (1, 13046)	0.2563994975288171
  (1, 12774)	0.2988350814382177
  (1, 10315)	0.23985909373735884
  (1, 3721)	0.37257665177690863
  (1, 809)	0.42107119803033977
  (1, 421)	0.4190032645727401
  (2, 16996)	0.10545861987603505
  (2, 15582)	0.13124949250519383
  (2, 15295)	0.10347523220532906
  (2, 13270)	0.2168795415288745
  (2, 12404)	0.376783399672724
  :	:
  (4157, 10177)	0.344964447796332
  (4157, 9062)	0.3421341069854292
  (4157, 7692)	0.3007953576697398
  (4157, 4951)	0.33945086484840337
  (4157, 4505)	0.3651807269959845
  (4157, 4174)	0.3

In [157]:
news_df.iloc[2]

content    consortiumnew com truth might get fire
label                                           1
Name: 2, dtype: object

In [156]:
input_data = X_test[8]
pred = model.predict(input_data)

if pred[0] == 0:
    print('Real News')
else:
    print('Fake News')

Fake News


In [None]:
# 2 Fake News
# 86 Fake News
# 80 Fake News