In [12]:
import pandas as pd
import spacy as sp
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

In [13]:
df_1 = pd.read_csv('Fake.csv')
df_2 = pd.read_csv('True.csv')

In [14]:
fake = 'Fake'
df_1['status'] = fake


true = 'True'
df_2['status'] = true
df_2.head()

Unnamed: 0,title,text,subject,date,status
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",True
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",True
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",True
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",True
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",True


In [15]:
tabels = [df_1, df_2]

data_table = pd.concat(tabels, axis = 0)
data_table.tail()

Unnamed: 0,title,text,subject,date,status
21412,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",True
21413,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",True
21414,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",True
21415,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",True
21416,Indonesia to buy $1.14 billion worth of Russia...,JAKARTA (Reuters) - Indonesia will buy 11 Sukh...,worldnews,"August 22, 2017",True


In [16]:
data_table.shape

(44898, 5)

In [17]:
data_table.status.value_counts()

status
Fake    23481
True    21417
Name: count, dtype: int64

In [18]:
min_value = 21417

Fake = data_table[data_table.status == 'Fake'].sample(min_value, random_state = 100)
Real = data_table[data_table.status == 'True'].sample(min_value, random_state = 100)


In [19]:
df = pd.concat([Fake, Real], axis = 0)
df

Unnamed: 0,title,text,subject,date,status
2341,House GOP Puts DISGUSTING Restriction On Who’...,Paul Ryan said that the House would implement ...,News,"March 2, 2017",Fake
3322,Trump Supporter Unleashes EXPLOSIVE Racism On...,Now that Trump is officially going to be the n...,News,"December 21, 2016",Fake
22639,"Trump, Sanders Win Big In Michigan – Trump, Cl...",21st Century Wire says Rich men love to gamble...,US_News,"March 8, 2016",Fake
14296,BREAKING…Internal Memo From OBAMA’S Corrupt EP...,Wow! Bernie and Hillary have been making hay o...,politics,"Mar 15, 2016",Fake
22375,Amerika: ‘Tolerant’ University Educators Exile...,"21st Century Wire says Since the late 1960 s, ...",US_News,"November 15, 2016",Fake
...,...,...,...,...,...
16304,Hungary's Jobbik supports EU deepening with vo...,BUDAPEST (Reuters) - Hungary s main opposition...,worldnews,"October 27, 2017",True
79,Trump aides hope win on taxes will stem slide ...,WASHINGTON (Reuters) - Near the end of Preside...,politicsNews,"December 20, 2017",True
12119,South Africa's Zuma says influence-peddling in...,JOHANNESBURG (Reuters) - South African Preside...,worldnews,"December 16, 2017",True
14147,Iran's Rouhani says foreign interference in Sy...,"SOCHI, Russia (Reuters) - Iranian President Ha...",worldnews,"November 22, 2017",True


In [20]:
df.shape

(42834, 5)

In [21]:
df.head()

Unnamed: 0,title,text,subject,date,status
2341,House GOP Puts DISGUSTING Restriction On Who’...,Paul Ryan said that the House would implement ...,News,"March 2, 2017",Fake
3322,Trump Supporter Unleashes EXPLOSIVE Racism On...,Now that Trump is officially going to be the n...,News,"December 21, 2016",Fake
22639,"Trump, Sanders Win Big In Michigan – Trump, Cl...",21st Century Wire says Rich men love to gamble...,US_News,"March 8, 2016",Fake
14296,BREAKING…Internal Memo From OBAMA’S Corrupt EP...,Wow! Bernie and Hillary have been making hay o...,politics,"Mar 15, 2016",Fake
22375,Amerika: ‘Tolerant’ University Educators Exile...,"21st Century Wire says Since the late 1960 s, ...",US_News,"November 15, 2016",Fake


In [22]:
df.status.value_counts()

status
Fake    21417
True    21417
Name: count, dtype: int64

In [23]:
print(df.isna().sum())
print(df.status.unique())

title      0
text       0
subject    0
date       0
status     0
dtype: int64
['Fake' 'True']


In [24]:
target = {'True' : 0,  'Fake' : 1}

df['status_value'] = df.status.map(target)
df.head()

Unnamed: 0,title,text,subject,date,status,status_value
2341,House GOP Puts DISGUSTING Restriction On Who’...,Paul Ryan said that the House would implement ...,News,"March 2, 2017",Fake,1
3322,Trump Supporter Unleashes EXPLOSIVE Racism On...,Now that Trump is officially going to be the n...,News,"December 21, 2016",Fake,1
22639,"Trump, Sanders Win Big In Michigan – Trump, Cl...",21st Century Wire says Rich men love to gamble...,US_News,"March 8, 2016",Fake,1
14296,BREAKING…Internal Memo From OBAMA’S Corrupt EP...,Wow! Bernie and Hillary have been making hay o...,politics,"Mar 15, 2016",Fake,1
22375,Amerika: ‘Tolerant’ University Educators Exile...,"21st Century Wire says Since the late 1960 s, ...",US_News,"November 15, 2016",Fake,1


In [25]:
df.shape

(42834, 6)

In [26]:
nlp = sp.load('en_core_web_sm')

def clear(text):
    doc = nlp(text)
    word = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        word.append(token.lemma_)
    return ' '.join(word)
        

In [27]:
df['clean_text'] = df.text.apply(clear)
df.head()

Unnamed: 0,title,text,subject,date,status,status_value,clean_text
2341,House GOP Puts DISGUSTING Restriction On Who’...,Paul Ryan said that the House would implement ...,News,"March 2, 2017",Fake,1,Paul Ryan say House implement completely trans...
3322,Trump Supporter Unleashes EXPLOSIVE Racism On...,Now that Trump is officially going to be the n...,News,"December 21, 2016",Fake,1,Trump officially go President United States su...
22639,"Trump, Sanders Win Big In Michigan – Trump, Cl...",21st Century Wire says Rich men love to gamble...,US_News,"March 8, 2016",Fake,1,21st Century Wire say Rich man love gamble wee...
14296,BREAKING…Internal Memo From OBAMA’S Corrupt EP...,Wow! Bernie and Hillary have been making hay o...,politics,"Mar 15, 2016",Fake,1,wow Bernie Hillary make hay Flint water crisis...
22375,Amerika: ‘Tolerant’ University Educators Exile...,"21st Century Wire says Since the late 1960 s, ...",US_News,"November 15, 2016",Fake,1,21st Century Wire say late 1960 s s accept fac...


In [28]:
X_train, X_test, Y_train, Y_test = train_test_split(df.clean_text, df.status_value, test_size = 0.25, random_state = 100, stratify = df.status_value)

In [29]:
print(f'X_train Shape:- {X_train.shape}')
print(f'X_test Shape:- {X_test.shape}')

X_train Shape:- (32125,)
X_test Shape:- (10709,)


In [30]:
Y_train.value_counts()

status_value
1    16063
0    16062
Name: count, dtype: int64

In [31]:
pipeline_1 = Pipeline([
    ('CountVecterizer', CountVectorizer()),
    ('knn', KNeighborsClassifier())
])

pipeline_1.fit(X_train, Y_train)

Y_pred = pipeline_1.predict(X_test)

print(classification_report(Y_test, Y_pred))

              precision    recall  f1-score   support

           0       0.76      0.81      0.78      5355
           1       0.79      0.75      0.77      5354

    accuracy                           0.78     10709
   macro avg       0.78      0.78      0.78     10709
weighted avg       0.78      0.78      0.78     10709



In [32]:
pipeline_2 = Pipeline([
    ('CountVecterizer', CountVectorizer()),
    ('MultinomialNB', MultinomialNB())
])

pipeline_2.fit(X_train, Y_train)

Y_pred = pipeline_2.predict(X_test)

print(classification_report(Y_test, Y_pred))

              precision    recall  f1-score   support

           0       0.95      0.95      0.95      5355
           1       0.95      0.95      0.95      5354

    accuracy                           0.95     10709
   macro avg       0.95      0.95      0.95     10709
weighted avg       0.95      0.95      0.95     10709



In [33]:
pipeline_3 = Pipeline([
    ('CountVecterizer', CountVectorizer()),
    ('RandomForestClassifier', RandomForestClassifier())
])

pipeline_3.fit(X_train, Y_train)

Y_pred = pipeline_3.predict(X_test)

print(classification_report(Y_test, Y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      5355
           1       1.00      0.99      0.99      5354

    accuracy                           0.99     10709
   macro avg       0.99      0.99      0.99     10709
weighted avg       0.99      0.99      0.99     10709



In [34]:
pipeline_4 = Pipeline([
    ('CountVecterizer', CountVectorizer()),
    ('DecisionTreeClassifier', DecisionTreeClassifier())
])

pipeline_4.fit(X_train, Y_train)

Y_pred = pipeline_4.predict(X_test)

print(classification_report(Y_test, Y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5355
           1       1.00      1.00      1.00      5354

    accuracy                           1.00     10709
   macro avg       1.00      1.00      1.00     10709
weighted avg       1.00      1.00      1.00     10709



In [None]:
pipeline_5 = Pipeline([
    ('CountVecterizer', CountVectorizer()),
    ('SVC', SVC())
])

pipeline_5.fit(X_train, Y_train)

Y_pred = pipeline_5.predict(X_test)

print(classification_report(Y_test, Y_pred))