In [23]:
import numpy as np
import pandas as pd
import itertools

In [24]:
data = pd.read_csv('news.csv')

In [25]:
data.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [26]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6335 entries, 0 to 6334
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  6335 non-null   int64 
 1   title       6335 non-null   object
 2   text        6335 non-null   object
 3   label       6335 non-null   object
dtypes: int64(1), object(3)
memory usage: 198.1+ KB


In [27]:
datas = data.rename(columns = {"Unnamed: 0":"Id"})

In [28]:
datas.head()

Unnamed: 0,Id,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [29]:
datas.describe()

Unnamed: 0,Id
count,6335.0
mean,5280.415627
std,3038.503953
min,2.0
25%,2674.5
50%,5271.0
75%,7901.0
max,10557.0


In [30]:
datas['Id'].isnull()

0       False
1       False
2       False
3       False
4       False
        ...  
6330    False
6331    False
6332    False
6333    False
6334    False
Name: Id, Length: 6335, dtype: bool

In [31]:
labels = datas.label
labels.head(10)

0    FAKE
1    FAKE
2    REAL
3    FAKE
4    REAL
5    FAKE
6    FAKE
7    REAL
8    REAL
9    REAL
Name: label, dtype: object

In [32]:
labels.value_counts()

REAL    3171
FAKE    3164
Name: label, dtype: int64

In [38]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(datas['text'], labels, test_size = 0.2, random_state = 1)

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [40]:
temp = TfidfVectorizer(stop_words = 'english', max_df = 0.1)
temp_train = temp.fit_transform(x_train)
temp_test = temp.transform(x_test)

In [48]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(random_state = 0)
lr.fit(temp_train,y_train)
y_pred = lr.predict(temp_test)
s_lr = round(lr.score(temp_train,y_train)*100, 2)

In [47]:
from sklearn.neighbors import KNeighborsClassifier
knc = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
knc.fit(temp_train,y_train)
y_pred = knc.predict(temp_test)
s_knc = round(knc.score(temp_train,y_train)*100, 2)

In [49]:
from sklearn.svm import SVC
svc = SVC(kernel = 'linear', random_state = 0)
svc.fit(temp_train,y_train)
y_pred = svc.predict(temp_test)
s_svc = round(svc.score(temp_train,y_train)*100, 2)

In [50]:
from sklearn.svm import SVC
ksvc = SVC(kernel = 'rbf', random_state = 0)
ksvc.fit(temp_train,y_train)
y_pred = ksvc.predict(temp_test)
s_ksvc = round(ksvc.score(temp_train,y_train)*100, 2)

In [53]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
dtc.fit(temp_train,y_train)
y_pred = dtc.predict(temp_test)
s_dtc = round(dtc.score(temp_train,y_train)*100, 2)

In [54]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
rfc.fit(temp_train,y_train)
y_pred = rfc.predict(temp_test)
s_rfc = round(rfc.score(temp_train,y_train)*100, 2)

In [55]:
from sklearn.linear_model import PassiveAggressiveClassifier
pac = PassiveAggressiveClassifier(max_iter = 50)
pac.fit(temp_train,y_train)
y_pred = pac.predict(temp_test)
s_pac = round(pac.score(temp_train,y_train)*100, 2)

In [56]:
result = pd.DataFrame({'Model':['Logistic Regression','K-Nearest Neighbors','Support vector Classifier','Kernel Support vector Classifier','Passive Aggressive Classifier','Decision Tree Classifier','Random Forest Classifier'],
                     'Score':[s_lr,s_knc,s_svc,s_ksvc,s_pac,s_dtc,s_rfc]})
results = result.sort_values(by = 'Score', ascending = False)
results = results.set_index('Score')
results.head(8)

Unnamed: 0_level_0,Model
Score,Unnamed: 1_level_1
100.0,Passive Aggressive Classifier
100.0,Decision Tree Classifier
99.84,Kernel Support vector Classifier
99.51,Random Forest Classifier
99.23,Support vector Classifier
96.05,Logistic Regression
57.2,K-Nearest Neighbors


In [63]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = pac, X = temp_train, y = y_train, cv = 10)
print("Scores:",accuracies)
print("Accuracy : {:.2f}%".format(accuracies.mean()*100))
print("Standard Deviation : {:2f}%".format(accuracies.std()*100))

Scores: [0.92110454 0.90729783 0.89940828 0.93491124 0.91913215 0.92899408
 0.9270217  0.94280079 0.88735178 0.94071146]
Accuracy : 92.09%
Standard Deviation : 1.716683%
