The necessary libraries are imported and the required NLTK resources are downloaded

In [None]:
import pandas as pd
import nltk

# Tokenizer models needed by nltk's word_tokenize. Older NLTK releases load the
# 'punkt' resource, while newer releases look up 'punkt_tab' instead; fetching
# both keeps the notebook runnable on a fresh kernel with either version.
nltk.download('punkt')
nltk.download('punkt_tab')

Importing the required CSV data

In [47]:
# Read the two news corpora from the working directory.
real_data, fake_data = pd.read_csv(r"true_data.csv"), pd.read_csv(r"fake_data.csv")

# Preview the first five rows of each frame, real news first.
display(real_data.head(5), fake_data.head(5))

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


A label column (`value`: 1 = real, 0 = fake) is added to each dataset, the two datasets are concatenated, and the `title`, `subject` and `date` columns are dropped

In [48]:
# Tag every article with its class label: 1 for the real set, 0 for the fake set.
real_data['value'] = 1
fake_data['value'] = 0

# Stack the two frames row-wise with a fresh 0..n-1 index, then drop the
# title, subject and date columns.
data = pd.concat([real_data, fake_data], axis=0, ignore_index=True)
data = data.drop(columns=['title', 'subject', 'date'])

**Data processing code-section begins**

Tokenization of text data

In [49]:
from nltk.tokenize import word_tokenize

# Replace each article's raw text with its list of tokens.
tokenized_text = data['text'].apply(word_tokenize)
data['text'] = tokenized_text

display(tokenized_text)

0        [WASHINGTON, (, Reuters, ), -, The, head, of, ...
1        [WASHINGTON, (, Reuters, ), -, Transgender, pe...
2        [WASHINGTON, (, Reuters, ), -, The, special, c...
3        [WASHINGTON, (, Reuters, ), -, Trump, campaign...
4        [SEATTLE/WASHINGTON, (, Reuters, ), -, Preside...
                               ...                        
44893    [21st, Century, Wire, says, As, 21WIRE, report...
44894    [21st, Century, Wire, says, It, s, a, familiar...
44895    [Patrick, Henningsen, 21st, Century, WireRemem...
44896    [21st, Century, Wire, says, Al, Jazeera, Ameri...
44897    [21st, Century, Wire, says, As, 21WIRE, predic...
Name: text, Length: 44898, dtype: object

Applying stemming to the tokenized data

In [50]:
from nltk.stem.snowball import SnowballStemmer

# NOTE: despite the variable name, this is the Snowball English stemmer.
porter = SnowballStemmer("english", ignore_stopwords=False)

# Stem of every distinct token seen so far. The same words recur across the
# articles, so each distinct token is stemmed once and looked up afterwards.
# The cache is rebuilt whenever this cell is re-run.
_stem_cache = {}

def stemmer(text):
  """Return the stems of the tokens in `text`, preserving their order.

  text: iterable of string tokens (one tokenized article).
  Returns a new list holding one stem per input token.
  """
  stems = []
  for word in text:
    stem = _stem_cache.get(word)
    if stem is None:
      # First occurrence of this token: stem it and remember the result.
      stem = _stem_cache[word] = porter.stem(word)
    stems.append(stem)
  return stems

data['text'] = data['text'].apply(stemmer)

display(data['text'])

0        [washington, (, reuter, ), -, the, head, of, a...
1        [washington, (, reuter, ), -, transgend, peopl...
2        [washington, (, reuter, ), -, the, special, co...
3        [washington, (, reuter, ), -, trump, campaign,...
4        [seattle/washington, (, reuter, ), -, presid, ...
                               ...                        
44893    [21st, centuri, wire, say, as, 21wire, report,...
44894    [21st, centuri, wire, say, it, s, a, familiar,...
44895    [patrick, henningsen, 21st, centuri, wireremem...
44896    [21st, centuri, wire, say, al, jazeera, americ...
44897    [21st, centuri, wire, say, as, 21wire, predict...
Name: text, Length: 44898, dtype: object

Short tokens (two characters or fewer) are removed from the stemmed text, in place of a conventional stopword list

In [51]:
# Conventional stopwords are deliberately kept: in news text they can carry
# signal, and removing them may distort the results. Tokens are instead
# filtered by length only, which drops one- and two-character tokens
# (including most punctuation marks).

def stopwords_remover(text, min_length=3):
  """Keep only the tokens that are at least `min_length` characters long.

  text: iterable of string tokens.
  min_length: smallest token length to keep; the default of 3 drops tokens
    of one or two characters.
  Returns a new list of the kept tokens in their original order.
  """
  return [word for word in text if len(word) >= min_length]

# Run the length filter over every article's token list.
filtered_text = data['text'].apply(stopwords_remover)
data['text'] = filtered_text

display(filtered_text)

0        [washington, reuter, the, head, conserv, repub...
1        [washington, reuter, transgend, peopl, will, a...
2        [washington, reuter, the, special, counsel, in...
3        [washington, reuter, trump, campaign, advis, g...
4        [seattle/washington, reuter, presid, donald, t...
                               ...                        
44893    [21st, centuri, wire, say, 21wire, report, ear...
44894    [21st, centuri, wire, say, familiar, theme, wh...
44895    [patrick, henningsen, 21st, centuri, wireremem...
44896    [21st, centuri, wire, say, jazeera, america, w...
44897    [21st, centuri, wire, say, 21wire, predict, ne...
Name: text, Length: 44898, dtype: object

The tokens of each article are joined back into a single string, and the data is split into training and test sets

In [52]:
from sklearn.model_selection import train_test_split

# Join each token list back into one space-separated string. Entries that are
# already strings are left untouched: calling ' '.join on a string would
# insert a space between every character, silently corrupting the text if
# this cell is run a second time.
data['text'] = data['text'].apply(
    lambda tokens: tokens if isinstance(tokens, str) else ' '.join(tokens)
)

# A fixed random_state makes the split (and every metric computed from it)
# reproducible across kernel restarts; stratifying on the label keeps the
# real/fake proportions the same in the training and test sets.
x_train, x_test, y_train, y_test = train_test_split(
    data['text'],
    data['value'],
    test_size=0.25,
    random_state=42,
    stratify=data['value'],
)

Vectorization of data

In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer

# max_df=0.5 is the document-frequency ceiling handed to the vectorizer.
vectorizer = TfidfVectorizer(max_df=0.5)

# The vectorizer is fitted on the training split only; the test split is
# then transformed with that already-fitted vectorizer.
vector_train, vector_test = (
    vectorizer.fit_transform(x_train),
    vectorizer.transform(x_test),
)

print(vector_train)

  (0, 75076)	0.07867740834587342
  (0, 48777)	0.1359604456126613
  (0, 80073)	0.0740610406028035
  (0, 68327)	0.1400564966862215
  (0, 14939)	0.16327961690084994
  (0, 57369)	0.06932538381629769
  (0, 3772)	0.1938504853734042
  (0, 73989)	0.13280933711477677
  (0, 21296)	0.13934577859395747
  (0, 9038)	0.11611769344607537
  (0, 70226)	0.21707700922165027
  (0, 24669)	0.1903959814906681
  (0, 66098)	0.11771981560056426
  (0, 56029)	0.09533132608848814
  (0, 82466)	0.06941052575522184
  (0, 24577)	0.10157808690283564
  (0, 16326)	0.13097015622834776
  (0, 76970)	0.1081845497507408
  (0, 61989)	0.17691909272874423
  (0, 61450)	0.1213497463283406
  (0, 77844)	0.1138573227197121
  (0, 85432)	0.149137809352625
  (0, 66560)	0.27351618940357686
  (0, 64464)	0.13815509196340447
  (0, 17730)	0.08034787409235528
  :	:
  (33672, 81519)	0.03696681188929346
  (33672, 67615)	0.04096804092728339
  (33672, 29181)	0.025591971770606903
  (33672, 81408)	0.016580985621438404
  (33672, 51894)	0.022663164190

A logistic regression model is trained and used to predict the labels of the test data

In [54]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Train on the TF-IDF training matrix; fit() hands back the estimator itself.
model_1 = LogisticRegression(max_iter=1000).fit(vector_train, y_train)

# Predict the held-out split and express the accuracy as a percentage.
predicted_data_1 = model_1.predict(vector_test)
accuracy_1 = 100 * accuracy_score(y_test, predicted_data_1)

print(accuracy_1)

98.9042316258352


Using PassiveAggressiveClassifier

In [55]:
from sklearn.linear_model import PassiveAggressiveClassifier
# Imported here as well so this cell does not depend on an earlier cell
# having brought accuracy_score into scope.
from sklearn.metrics import accuracy_score

# The classifier shuffles the training data while fitting; pinning
# random_state removes that source of run-to-run variation in the accuracy.
model_2 = PassiveAggressiveClassifier(max_iter=1000, random_state=42)
model_2.fit(vector_train, y_train)

# Predict the held-out split and express the accuracy as a percentage.
predicted_data_2 = model_2.predict(vector_test)
accuracy_2 = accuracy_score(y_test, predicted_data_2)*100

print(accuracy_2)

99.65256124721603
