In [1]:
import os
import re
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.svm import SVC
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from nltk.tokenize import WordPunctTokenizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import AdaBoostClassifier 
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import CountVectorizer
barsize = '{l_bar}{bar:10}{r_bar}{bar:-10b}'
%matplotlib inline

In [2]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [3]:
os.listdir('final/')

['mcintire.csv', 'kaggle.csv', 'buzzfeed.csv', 'politifact.csv']

In [4]:
data1 = pd.read_csv(r"final/mcintire.csv")
data2 = pd.read_csv(r"final/buzzfeed.csv")
data3 = pd.read_csv(r"final/politifact.csv")
data4 = pd.read_csv(r"final/kaggle.csv")

In [5]:
# df.head()
# dff.head()

In [6]:
dataset = pd.concat([data1, data2])

In [7]:
dataset.sample(frac = 1)
dataset.reset_index(drop =True, inplace = True)

In [8]:
dataset.label.value_counts()

real    2388
fake    2388
Name: label, dtype: int64

In [9]:
real = dataset[dataset['label'] == 'real']
fake = dataset[dataset['label'] == 'fake']
print(len(fake), len(real))

2388 2388


In [10]:
# total = 100
# real = real.sample(n = total)
# fake = fake.sample(n = total)
# print(len(fake), len(real))

In [11]:
dataset = pd.concat([real, fake])

In [12]:
dataset.reset_index(drop =True, inplace = True)
dataset.label.value_counts()

real    2388
fake    2388
Name: label, dtype: int64

In [13]:
import re
dataset['title'] = dataset['title'].apply(lambda x: re.findall("[A-Za-z0-9]+", str(x)))
dataset['title'] = dataset['title'].apply(lambda x: " ".join(x).lower())
dataset['text'] = dataset['text'].apply(lambda x: re.findall("[A-Za-z0-9]+", str(x)))
dataset['text'] = dataset['text'].apply(lambda x: " ".join(x).lower())

In [14]:
dataset.head()
df = dataset.copy()

## PREPROCESSING

In [15]:
tok = WordPunctTokenizer()
pat1 = r'@[A-Za-z0-9]+'
pat2 = r'https?://[A-Za-z0-9./]+'
combined_pat = r'|'.join((pat1, pat2))

def tweet_cleaner(text):
    soup = BeautifulSoup(text, 'lxml')
    souped = soup.get_text()
    stripped = re.sub(combined_pat, '', souped)
    try:
        clean = stripped.decode("utf-8-sig").replace(u"\ufffd", "?")
    except:
        clean = stripped
    letters_only = re.sub("[^a-zA-Z]", " ", clean)
    lower_case = letters_only.lower()
    # During the letters_only process two lines above, it has created unnecessay white spaces,
    # I will tokenize and join together to remove unneccessary white spaces
    words = tok.tokenize(lower_case)
    return (" ".join(words)).strip()

In [16]:
df['title'] = df['title'].apply(lambda x : tweet_cleaner(str(x)))
df['text'] = df['text'].apply(lambda x : tweet_cleaner(str(x)))
# df['meta_data'] = df['meta_data'].apply(tweet_cleaner)

In [17]:
def all_X(row):
    try:
        return row['title']+" "+row["text"]
    except: print("error", row)
        #giving out the error fields.    

In [18]:
df["all_X"]= df.apply(all_X,axis=1)

In [19]:
df = df[['all_X', 'label']]

In [20]:
print(df.columns)

Index(['all_X', 'label'], dtype='object')


In [21]:
#Assign text to X variable and labels to y

X = df.all_X
y = df.label

In [22]:
#Intialize Count Vectorizer
count_vec = CountVectorizer(lowercase = True, stop_words="english", min_df = 2, ngram_range = (1, 1))
#Fit Count Vectorizer
dtm_cv = count_vec.fit_transform(X)
#Convert it to a pandas data frame
df_cv = pd.DataFrame(dtm_cv.toarray(), columns=count_vec.get_feature_names())

In [23]:
df_cv.head()

Unnamed: 0,aa,aab,aap,aaron,aarp,ab,aba,abaaoud,aback,abadi,...,zu,zucker,zuckerberg,zucman,zuesse,zulu,zurich,zwick,zy,zyuganov
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
df_cv.shape

(4776, 33426)

In [25]:
x_train, x_test, y_train, y_test = train_test_split(df_cv, y, test_size = 0.3, random_state = 0)

In [26]:
print(x_train.shape,
     x_test.shape)

(3343, 33426) (1433, 33426)


### Naive Bayes

In [27]:
#Initialize model
import time
tic = time.time()
model = MultinomialNB()
#Fit model with df_cv and y
model.fit(x_train, y_train)
#score the model
y_pred = model.predict(x_test)
toc = time.time()
print(toc - tic)
# accuracy = cross_val_score(estimator = model, X = x_train, y = y_train, cv=10)
print("Model accuracy : {:0.2f}%".format(accuracy_score(y_pred,y_test)*100))
# print("cross validation : {:0.2f}%".format(accuracy.mean()*100))
print(classification_report(y_test, y_pred))

1.3853652477264404
Model accuracy : 88.70%
              precision    recall  f1-score   support

        fake       0.90      0.87      0.89       727
        real       0.87      0.90      0.89       706

    accuracy                           0.89      1433
   macro avg       0.89      0.89      0.89      1433
weighted avg       0.89      0.89      0.89      1433



### SVM

In [28]:
tic = time.time()
svm = SVC(C=100, gamma = 0.0001)
svm.fit(x_train, y_train)
y_pred = svm.predict(x_test)
toc = time.time()
print(toc - tic)
# tic = time.time()
# accuracy = cross_val_score(estimator = svm, X = x_train, y = y_train, cv=10)
# toc = time.time()
# print(toc - tic)
print("Model accuracy : {:0.2f}%".format(accuracy_score(y_pred,y_test)*100))
# print("cross validation : {:0.2f}%".format(acuracy.mean()*100))
print(classification_report(y_test, y_pred))

288.43796944618225
Model accuracy : 90.16%
              precision    recall  f1-score   support

        fake       0.89      0.92      0.90       727
        real       0.92      0.88      0.90       706

    accuracy                           0.90      1433
   macro avg       0.90      0.90      0.90      1433
weighted avg       0.90      0.90      0.90      1433



In [29]:
# # defining parameter range 
# param_grid = {'C': [0.1, 1, 10, 100, 1000],  
#                'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
#               'kernel': ['rbf']}  
# grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3) 
# grid.fit(x_train, y_train)
# # print best parameter after tuning 
# print(grid.best_params_) 
# print(grid.best_estimator_)

### Ada Boost

In [42]:
import time
tic = time.time()
abc = AdaBoostClassifier(n_estimators = 50,learning_rate = 1)
abc.fit(x_train, y_train)
y_pred = abc.predict(x_test)
toc = time.time()
print(toc - tic)
# tic = time.time()
# acuracy = cross_val_score(estimator = abc, X = x_train, y = y_train, cv = 10)
# toc = time.time()
# print(toc - tic)
y_pred = abc.predict(x_test)
print("Model accuracy : {:0.2f}%".format(accuracy_score(y_pred,y_test)*100))
# print("cross validation : {:0.2f}%".format(acuracy.mean()*100))
print(classification_report(y_test, y_pred))

121.89146995544434
Model accuracy : 88.49%
              precision    recall  f1-score   support

        fake       0.87      0.91      0.89       727
        real       0.90      0.86      0.88       706

    accuracy                           0.88      1433
   macro avg       0.89      0.88      0.88      1433
weighted avg       0.89      0.88      0.88      1433



### Boosting

In [31]:
# gbc = GradientBoostingClassifier(max_depth = 2)
# gbc.fit(x_train, y_train)
# y_pred = gbc.predict(x_test)
# accuracy = cross_val_score(estimator = gbc, X = x_train, y = y_train, cv=10)
# print("Model accuracy : {:0.2f}%".format(accuracy_score(y_pred,y_test)*100))
# print("cross validation : {:0.2f}%".format(accuracy.mean()*100))

In [32]:
# print(classification_report(y_test, y_pred))

### Decision Tree

In [33]:
# dTree = DecisionTreeClassifier()
# dTree.fit(x_train,y_train)
# y_pred = dTree.predict(x_test)
# accuracy = cross_val_score(estimator = dTree, X = x_train, y = y_train, cv = 10)
# print("Model accuracy : {:0.2f}%".format(accuracy_score(y_pred,y_test)*100))
# print("cross validation : {:0.2f}%".format(accuracy.mean()*100))

In [34]:
# path = dTree.cost_complexity_pruning_path(x_train, y_train)
# ccp_alphas = path.ccp_alphas

# trees = []
# for ccp_alpha in tqdm(ccp_alphas, bar_format = barsize):
#     tree  = DecisionTreeClassifier(random_state = 0, ccp_alpha = ccp_alpha)
#     tree.fit(x_train, y_train)
#     trees.append(tree)
    
# train_score = [tree.score(x_train, y_train) for tree in tqdm(trees, bar_format = barsize)]
# test_score = [tree.score(x_test, y_test) for tree in tqdm(trees, bar_format = barsize)]
# # cross_val_scores = [cross_val_score(estimator = tree, X = x_train, y = y_train, cv = 4).mean() for tree in tqdm(trees, bar_format = barsize)]

# fig, ax = plt.subplots()
# ax.set_xlabel('alpha')
# ax.set_ylabel('accuracy')
# ax.set_title('accuracy vs alpha for training and testing sets')
# ax.plot(ccp_alphas, train_score, marker = 'o', label = 'train')
# ax.plot(ccp_alphas, test_score, marker = 'x', label = 'test')
# # ax.plot(ccp_alphas, cross_val_scores, marker = '*', label = 'cross_val')
# plt.legend(['train', 'test', 'cross'])
# plt.show()

In [35]:
# dTree = DecisionTreeClassifier(ccp_alpha = 0.03)
# dTree.fit(x_train,y_train)
# y_pred = dTree.predict(x_test)
# # accuracy = cross_val_score(estimator = dTree, X = x_train, y = y_train, cv = 10)
# print("Model accuracy : {:0.2f}%".format(accuracy_score(y_pred,y_test)*100))
# print("cross validation : {:0.2f}%".format(accuracy.mean()*100))

In [36]:
# print(classification_report(y_test, y_pred))

### KNN

In [37]:
# #neighbors randomly taken
# knn = KNeighborsClassifier(n_neighbors = 7) 
# knn.fit(x_train, y_train) 
# y_pred = knn.predict(x_test)
# accuracy = cross_val_score(estimator = knn, X = x_train, y = y_train, cv = 10)
# print("Model accuracy : {:0.2f}%".format(accuracy_score(y_pred,y_test)*100))
# print("cross validation : {:0.2f}%".format(accuracy.mean()*100))

In [38]:
# error_rate = []  
# # Will take some time 
# for i in tqdm(range(1, 40), bar_format = barsize): 
      
#     knn = KNeighborsClassifier(n_neighbors = i) 
#     knn.fit(x_train, y_train) 
#     pred_i = knn.predict(x_test) 
#     error_rate.append(np.mean(pred_i != y_test)) 

# plt.figure(figsize =(10, 6)) 
# plt.plot(range(1, 40), error_rate, color ='blue', 
#                 linestyle ='dashed', marker ='o', 
#          markerfacecolor ='red', markersize = 10) 
  
# plt.title('Error Rate vs. K Value') 
# plt.xlabel('K') 
# plt.ylabel('Error Rate') 

In [39]:
#error rate for k=1 is very high
#error rate is decreaing after k = 9

In [40]:
# knn = KNeighborsClassifier(n_neighbors = 36) 
# knn.fit(x_train, y_train) 
# y_pred = knn.predict(x_test)
# accuracy = cross_val_score(estimator = knn, X = x_train, y = y_train, cv=10)
# print("Model accuracy : {:0.2f}%".format(accuracy_score(y_pred,y_test)*100))
# print("cross validation : {:0.2f}%".format(acuracy.mean()*100))

In [41]:
# print(classification_report(y_test, y_pred))