In [1]:
import nltk
import pandas as pd

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\avina\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
fake = pd.read_csv("Fake-210604-161841.csv")
genuine = pd.read_csv("True-210604-161650.csv")

In [4]:
fake['genuineness'] = 0
genuine['genuineness'] = 1

In [5]:
data = pd.concat([fake, genuine], axis=0)

In [6]:
display(data)

Unnamed: 0,title,text,subject,date,genuineness
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0
...,...,...,...,...,...
21412,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",1
21413,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",1
21414,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",1
21415,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",1


In [7]:
data=data.drop(['subject','date','title'], axis=1)

In [8]:
#preprocessing

In [9]:
# TOKENIZATION

In [10]:
from nltk.tokenize import word_tokenize
data['text'] = data['text'].apply(word_tokenize)

In [11]:
# STEMMING

In [12]:
from nltk.stem.snowball import SnowballStemmer
porter = SnowballStemmer("english", ignore_stopwords=False)

In [13]:
def stem_it(text):
    return [porter.stem(word) for word in text]

In [14]:
data['text']=data['text'].apply(stem_it)

In [15]:
data

Unnamed: 0,text,genuineness
0,"[donald, trump, just, couldn, t, wish, all, am...",0
1,"[hous, intellig, committe, chairman, devin, nu...",0
2,"[on, friday, ,, it, was, reveal, that, former,...",0
3,"[on, christma, day, ,, donald, trump, announc,...",0
4,"[pope, franci, use, his, annual, christma, day...",0
...,...,...
21412,"[brussel, (, reuter, ), -, nato, alli, on, tue...",1
21413,"[london, (, reuter, ), -, lexisnexi, ,, a, pro...",1
21414,"[minsk, (, reuter, ), -, in, the, shadow, of, ...",1
21415,"[moscow, (, reuter, ), -, vatican, secretari, ...",1


In [16]:
# Stopword removal

In [17]:
def stopword_remove(t):
    return [word for word in t if len(word)>>2]

In [18]:
data['text'] = data['text'].apply(stopword_remove)

In [19]:
data

Unnamed: 0,text,genuineness
0,"[donald, trump, just, couldn, wish, american, ...",0
1,"[hous, intellig, committe, chairman, devin, nu...",0
2,"[friday, reveal, that, former, milwauke, sheri...",0
3,"[christma, donald, trump, announc, that, would...",0
4,"[pope, franci, annual, christma, messag, rebuk...",0
...,...,...
21412,"[brussel, reuter, nato, alli, tuesday, welcom,...",1
21413,"[london, reuter, lexisnexi, provid, legal, reg...",1
21414,"[minsk, reuter, shadow, disus, soviet-era, fac...",1
21415,"[moscow, reuter, vatican, secretari, state, ca...",1


In [20]:
data['text'] = data['text'].apply(' '.join)

In [21]:
# Splitting up of data

In [22]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data['text'], data['genuineness'], test_size=0.25)
display(X_train.head())
print('\n')
display(y_train.head())

5784     feder appeal court judg neil gorsuch u.s. supr...
20203    left have thrown their women children liter op...
4605     presid zero left obama strike again.donald tru...
18988    washington reuter unit state militari offic sa...
14280    johannesburg reuter south african presid jacob...
Name: text, dtype: object





5784     1
20203    0
4605     0
18988    1
14280    1
Name: genuineness, dtype: int64

In [23]:
# Vectorization

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
my_tfidf = TfidfVectorizer( max_df=0.7)
tfidf_train = my_tfidf.fit_transform(X_train)
tfidf_test = my_tfidf.transform(X_test)

In [25]:
print(tfidf_train)

  (0, 57237)	0.05494647936288774
  (0, 42261)	0.03402550061719773
  (0, 80798)	0.0325740722172432
  (0, 73202)	0.01219007887389132
  (0, 43239)	0.02473787860402229
  (0, 69622)	0.031444513580325274
  (0, 31275)	0.043988932060584185
  (0, 73990)	0.0485320938078936
  (0, 29517)	0.031044685875709094
  (0, 24716)	0.01833553112095987
  (0, 59882)	0.03666979052214951
  (0, 49222)	0.03681609164380762
  (0, 32522)	0.018251852987001417
  (0, 20576)	0.05751345382068464
  (0, 15350)	0.04315680411619285
  (0, 59949)	0.018839632352459234
  (0, 48644)	0.014704783624749486
  (0, 23573)	0.022699364952082142
  (0, 78780)	0.028329182153759114
  (0, 51080)	0.026685796035449988
  (0, 61849)	0.014670304094433581
  (0, 13195)	0.011978571027135414
  (0, 51447)	0.0162856523537694
  (0, 13931)	0.02627201711051895
  (0, 86841)	0.015156052722036463
  :	:
  (33672, 58521)	0.11158688932014053
  (33672, 24418)	0.14316393078446044
  (33672, 86826)	0.022123209854347254
  (33672, 79418)	0.06266371318429674
  (33672, 8

In [26]:
# LogisticRegression

In [27]:
from sklearn.linear_model import LogisticRegression
model_1 = LogisticRegression(max_iter=900)
model_1.fit(tfidf_train, y_train)

LogisticRegression(max_iter=900)

In [28]:
from sklearn.metrics import accuracy_score
pred_1 = model_1.predict(tfidf_test)
cr1    = accuracy_score(y_test,pred_1)
print(cr1*100)

98.8596881959911


In [29]:
# PassiveAggressiveClassifier

In [30]:
from sklearn.linear_model import PassiveAggressiveClassifier
model = PassiveAggressiveClassifier(max_iter=50)
model.fit(tfidf_train, y_train)

PassiveAggressiveClassifier(max_iter=50)

In [31]:
y_pred = model.predict(tfidf_test)
accscore = accuracy_score(y_test, y_pred)
print('The accuracy of prediction is ',accscore*100)

The accuracy of prediction is  99.59910913140313


In [32]:
y_pred

array([0, 0, 1, ..., 0, 0, 0], dtype=int64)

In [33]:
y_test

12821    0
15207    0
16252    1
1062     0
22181    0
        ..
3869     1
1545     1
6468     0
16747    0
7725     0
Name: genuineness, Length: 11225, dtype: int64