In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import PassiveAggressiveClassifier, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier

In [2]:
df = pd.read_csv('news.csv')

In [3]:
df = df.set_index('Unnamed: 0')

In [4]:
df

Unnamed: 0_level_0,title,text,label
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
...,...,...,...
4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL
8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE
8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE
4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL


In [5]:
df["label"].replace({"FAKE": -1, "REAL": 1}, inplace=True)

In [6]:
df

Unnamed: 0_level_0,title,text,label
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",-1
10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,-1
3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,1
10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",-1
875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,1
...,...,...,...
4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,1
8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,-1
8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,-1
4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",1


In [7]:
y = df.label

In [8]:
y

Unnamed: 0
8476    -1
10294   -1
3608     1
10142   -1
875      1
        ..
4490     1
8062    -1
8622    -1
4021     1
4330     1
Name: label, Length: 6335, dtype: int64

In [9]:
df = df.drop('label', axis=1)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], y, test_size=0.22, random_state=43)

In [11]:
y_test

Unnamed: 0
10269   -1
4977     1
10101   -1
7094    -1
3551     1
        ..
1360     1
1715     1
10440   -1
317      1
5989    -1
Name: label, Length: 1394, dtype: int64

In [12]:
#count vectorizer set
count_vectorizer = CountVectorizer(stop_words='english')
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)

In [13]:
#tdidf set
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.73)
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)

In [14]:
#hashing set
hash_vectorizer = HashingVectorizer(stop_words='english', alternate_sign=False)
hash_train = hash_vectorizer.fit_transform(X_train)
hash_test = hash_vectorizer.transform(X_test)

In [15]:
#logistic with countvectorizer
clf = LogisticRegression()
clf.fit(count_train,y_train)
pred_test = clf.predict(count_test)
score_lg_cnt = accuracy_score(y_test, pred_test)
print("accuracy:   %0.3f" % score_lg_cnt)

accuracy:   0.920


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [16]:
#logistic with tfidfvectorizer
tfidf_clf = LogisticRegression()
tfidf_clf.fit(tfidf_train,y_train)
tf_pred_test = tfidf_clf.predict(tfidf_test)
score_lg_tf = accuracy_score(y_test, tf_pred_test)
print("accuracy:   %0.3f" % score_lg_tf)

accuracy:   0.918


In [17]:
#logistic with hash
hash_clf = LogisticRegression()
hash_clf.fit(hash_train,y_train)
hash_pred_test = hash_clf.predict(hash_test)
score_lg_hs = accuracy_score(y_test, hash_pred_test)
print("accuracy:   %0.3f" % score_lg_hs)

accuracy:   0.917


In [18]:
#multinomial with tdif
mn_tfidf_clf = MultinomialNB(alpha=0.2)
mn_tfidf_clf.fit(tfidf_train, y_train)
mn_pred = mn_tfidf_clf.predict(tfidf_test)
score_mlt_tf = accuracy_score(y_test, mn_pred)
print("accuracy:   %0.3f" % score_mlt_tf)

accuracy:   0.891


In [19]:
#multinomial with count
mn_cnt_clf = MultinomialNB(alpha=0.2)
mn_cnt_clf.fit(count_train, y_train)
mn_cnt_pred = mn_cnt_clf.predict(count_test)
score_mlt_cnt = accuracy_score(y_test, mn_cnt_pred)
print("accuracy:   %0.3f" % score_mlt_cnt)

accuracy:   0.893


In [20]:
#multinomial with hash
mn_hash_clf = MultinomialNB(alpha=0.2)
mn_hash_clf.fit(hash_train, y_train)
mn_hash_pred = mn_hash_clf.predict(hash_test)
score_mlt_hs = accuracy_score(y_test, mn_hash_pred)
print("accuracy:   %0.3f" % score_mlt_hs)

accuracy:   0.839


In [21]:
#passiveaggressiveclassifier with tdidf
pa_tfidf_clf = PassiveAggressiveClassifier(max_iter=100)
pa_tfidf_clf.fit(tfidf_train, y_train)
pac_tf_pred = pa_tfidf_clf.predict(tfidf_test)
score_pac_tf = accuracy_score(y_test, pac_tf_pred)
print("accuracy:   %0.3f" % score_pac_tf)

accuracy:   0.938


In [22]:
#passiveaggressiveclassifier with count
pa_cnt_clf = PassiveAggressiveClassifier(max_iter=100)
pa_cnt_clf.fit(count_train, y_train)
pac_cnt_pred = pa_cnt_clf.predict(count_test)
score_pac_cnt = accuracy_score(y_test, pac_cnt_pred)
print("accuracy:   %0.3f" % score_pac_cnt)

accuracy:   0.902


In [23]:
#passiveaggressiveclassifier with hash
pa_hash_clf = PassiveAggressiveClassifier(max_iter=100)
pa_hash_clf.fit(hash_train, y_train)
pac_hash_pred = pa_hash_clf.predict(hash_test)
score_pac_hs = accuracy_score(y_test, pac_hash_pred)
print("accuracy:   %0.3f" % score_pac_hs)

accuracy:   0.925


In [24]:
#sgdc witn tf
sgd_tfidf_clf = SGDClassifier()
sgd_tfidf_clf.fit(tfidf_train, y_train)
pred_sgdc_tf = sgd_tfidf_clf.predict(tfidf_test)
score_sgdc_tf = accuracy_score(y_test, pred_sgdc_tf)
print("accuracy:   %0.3f" % score_sgdc_tf)

accuracy:   0.938


In [25]:
#sgdc witn count
sgd_cnt_clf = SGDClassifier()
sgd_cnt_clf.fit(count_train, y_train)
pred_sgdc_cnt = sgd_cnt_clf.predict(count_test)
score_sgdc_cnt = accuracy_score(y_test, pred_sgdc_cnt)
print("accuracy:   %0.3f" % score_sgdc_cnt)

accuracy:   0.912


In [26]:
#sgdc witn hash
sgd_hash_clf = SGDClassifier()
sgd_hash_clf.fit(hash_train, y_train)
pred_sgdc_hash = sgd_hash_clf.predict(hash_test)
score_sgdc_hash = accuracy_score(y_test, pred_sgdc_hash)
print("accuracy:   %0.3f" % score_sgdc_hash)

accuracy:   0.928


In [27]:
#linearsvc with tfidf
svc_tfidf_clf = LinearSVC()
svc_tfidf_clf.fit(tfidf_train, y_train)
pred_lsvc_tf = svc_tfidf_clf.predict(tfidf_test)
score_lsvc_tf = accuracy_score(y_test, pred_lsvc_tf)
print("accuracy:   %0.3f" % score_lsvc_tf)

accuracy:   0.939


In [28]:
#linearsvc with count
svc_cnt_clf = LinearSVC()
svc_cnt_clf.fit(count_train, y_train)
pred_lsvc_cnt = svc_cnt_clf.predict(count_test)
score_lsvc_cnt = accuracy_score(y_test, pred_lsvc_cnt)
print("accuracy:   %0.3f" % score_lsvc_cnt)

accuracy:   0.887




In [29]:
#linearsvc with hash
svc_hash_clf = LinearSVC()
svc_hash_clf.fit(hash_train, y_train)
pred_lsvc_hash = svc_hash_clf.predict(hash_test)
score_lsvc_hash = accuracy_score(y_test, pred_lsvc_hash)
print("accuracy:   %0.3f" % score_lsvc_hash)

accuracy:   0.932


In [30]:
#tree with tfidfvectorizer
tree_tf_clf = tree.DecisionTreeClassifier()
tree_tf_clf.fit(tfidf_train,y_train)
tree_tf_pred = tree_tf_clf.predict(tfidf_test)
score_tree_tf = accuracy_score(y_test, tree_tf_pred)
print("accuracy:   %0.3f" % score_tree_tf)

accuracy:   0.823


In [31]:
#tree with countvectorizer
tree_count_clf = tree.DecisionTreeClassifier()
tree_count_clf.fit(count_train,y_train)
tree_count_pred = tree_count_clf.predict(count_test)
score_tree_count = accuracy_score(y_test, tree_count_pred)
print("accuracy:   %0.3f" % score_tree_count)

accuracy:   0.813


In [32]:
#tree with hashvectorizer
tree_hash_clf = tree.DecisionTreeClassifier()
tree_hash_clf.fit(hash_train,y_train)
tree_hash_pred = tree_hash_clf.predict(hash_test)
score_tree_hash = accuracy_score(y_test, tree_hash_pred)
print("accuracy:   %0.3f" % score_tree_hash)

accuracy:   0.812


In [33]:
#forest with tf
forest_tf_clf = RandomForestClassifier()
forest_tf_clf.fit(tfidf_train,y_train)
forest_tf_pred = forest_tf_clf.predict(tfidf_test)
score_forest_tf = accuracy_score(y_test, forest_tf_pred)
print("accuracy:   %0.3f" % score_forest_tf)

accuracy:   0.915


In [34]:
#forest with countvectorizer
forest_count_clf = RandomForestClassifier()
forest_count_clf.fit(count_train,y_train)
forest_count_pred = forest_count_clf.predict(count_test)
score_forest_count = accuracy_score(y_test, forest_count_pred)
print("accuracy:   %0.3f" % score_forest_count)

accuracy:   0.905


In [35]:
#forest with hashvectorizer
forest_hash_clf = RandomForestClassifier()
forest_hash_clf.fit(hash_train,y_train)
forest_hash_pred = forest_hash_clf.predict(hash_test)
score_forest_hash = accuracy_score(y_test, forest_hash_pred)
print("accuracy:   %0.3f" % score_forest_hash)

accuracy:   0.892


In [36]:
#gbc with tfidfvectorizer
gbc_tf_clf = GradientBoostingClassifier()
gbc_tf_clf.fit(tfidf_train,y_train)
gbc_tf_pred = gbc_tf_clf.predict(tfidf_test)
score_gbc_tf = accuracy_score(y_test, gbc_tf_pred)
print("accuracy:   %0.3f" % score_gbc_tf)

accuracy:   0.900


In [37]:
#gbc with countvectorizer
gbc_count_clf = GradientBoostingClassifier()
gbc_count_clf.fit(count_train,y_train)
gbc_count_pred = gbc_count_clf.predict(count_test)
score_gbc_count = accuracy_score(y_test, gbc_count_pred)
print("accuracy:   %0.3f" % score_gbc_count)

accuracy:   0.900


In [38]:
#gbc with hashvectorizer
gbc_hash_clf = GradientBoostingClassifier()
gbc_hash_clf.fit(hash_train,y_train)
gbc_hash_pred = gbc_hash_clf.predict(hash_test)
score_gbc_hash = accuracy_score(y_test, gbc_hash_pred)
print("accuracy:   %0.3f" % score_gbc_hash)

accuracy:   0.901


In [39]:
#abc with tfidfvectorizer
abc_tf_clf = AdaBoostClassifier()
abc_tf_clf.fit(tfidf_train,y_train)
abc_tf_pred = abc_tf_clf.predict(tfidf_test)
score_abc_tf = accuracy_score(y_test, abc_tf_pred)
print("accuracy:   %0.3f" % score_abc_tf)

accuracy:   0.874


In [40]:
#abc with countvectorizer
abc_count_clf = AdaBoostClassifier()
abc_count_clf.fit(count_train,y_train)
abc_count_pred = abc_count_clf.predict(count_test)
score_abc_count = accuracy_score(y_test, abc_count_pred)
print("accuracy:   %0.3f" % score_abc_count)

accuracy:   0.879


In [41]:
#abc with hashvectorizer
abc_hash_clf = AdaBoostClassifier()
abc_hash_clf.fit(hash_train,y_train)
abc_hash_pred = abc_hash_clf.predict(hash_test)
score_abc_hash = accuracy_score(y_test, abc_hash_pred)
print("accuracy:   %0.3f" % score_abc_hash)

accuracy:   0.876


In [42]:
#bc with tfidfvectorizer
bc_tf_clf = BaggingClassifier()
bc_tf_clf.fit(tfidf_train,y_train)
bc_tf_pred = bc_tf_clf.predict(tfidf_test)
score_bc_tf = accuracy_score(y_test, bc_tf_pred)
print("accuracy:   %0.3f" % score_bc_tf)

accuracy:   0.870


In [43]:
#bc with countvectorizer
bc_count_clf = BaggingClassifier()
bc_count_clf.fit(count_train,y_train)
bc_count_pred = bc_count_clf.predict(count_test)
score_bc_count = accuracy_score(y_test, bc_count_pred)
print("accuracy:   %0.3f" % score_bc_count)

accuracy:   0.875


In [44]:
#bc with hashvectorizer
bc_hash_clf = BaggingClassifier()
bc_hash_clf.fit(hash_train,y_train)
bc_hash_pred = bc_hash_clf.predict(hash_test)
score_bc_hash = accuracy_score(y_test, bc_hash_pred)
print("accuracy:   %0.3f" % score_bc_hash)

accuracy:   0.885
