In [61]:
import numpy as np 
import pandas as pd
import string
import re

from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils import shuffle

from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import RandomForestClassifier

import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from gensim.models import Word2Vec as w2v
from sklearn.decomposition import PCA
import seaborn as sns

from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer

from sklearn.pipeline import Pipeline

from sklearn.preprocessing import LabelEncoder
from sklearn import svm

from sklearn.linear_model import LogisticRegression

# Data Processing and Cleaning

In [62]:
true_data = pd.read_csv('archive/True.csv')
fake_data = pd.read_csv('archive/Fake.csv')
# combine into one data set, with true = 1 and fake = 0
true_data['Category'] = '1'
fake_data['Category'] = '0'
df = pd.concat([true_data, fake_data], axis = "columns")
df = df.drop(df.index[21417:])

In [63]:
df = pd.concat([fake_data, true_data]).reset_index(drop = True)
df.drop(df.columns[[2, 3]], axis = 1, inplace = True)
df.head()

Unnamed: 0,title,text,Category
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,0


In [64]:
stop_words = set(stopwords.words('english'))
st = SnowballStemmer('english')

def clean_data(df, col, clean_col):
    # shuffle data
    df = shuffle(df)
    
    # lowercase
    df[clean_col] = df[col].apply(lambda x: x.lower().strip())
    
    # punctuation
    df[clean_col] = df[clean_col].apply(lambda x: re.sub('[^a-zA-Z]', ' ', x))
    
    # stopwords
    df[clean_col] = df[clean_col].apply(lambda x: ' '.join(st.stem(text) for text in x.split() if text not in stop_words))
    
    return df

In [65]:
lemmatizer = WordNetLemmatizer()
# Simple Lemmatization
def simple_lemmatize(title):
    temp_list = []
    for word in nltk.word_tokenize(title):
        temp_list.append(lemmatizer.lemmatize(word))
    return ' '.join([str(elem) for elem in temp_list])

In [66]:
df = clean_data(df, "title", "Title Clean")
df.columns

Index(['title', 'text', 'Category', 'Title Clean'], dtype='object')

In [67]:
# nltk.download('punkt')
# nltk.download('omw-1.4')
def process_lemma(unprocessed_list, total_items):
    output_list = []
    count = 0
    for item in unprocessed_list:
        if item != ' ':            
            #Simple
            output_list.append(simple_lemmatize((item)))
        else:
            output_list.append(' ')
        
        count += 1
        
        if count % 10000 == 0:
            print("{:.2%}".format(count / total_items))
        
    return output_list

df['Title_Lemma'] = process_lemma(df['Title Clean'], 44898)
df['Title_Lemma']

22.27%
44.55%
66.82%
89.09%


20015    freshman orient racist ask asian student math ...
3980     trump fan open admit plan assassin hillari ele...
33817      embattl chicago prosecutor defeat elect primari
44744    guatemala top court side u n graft unit fight ...
3741     georg takei perfect respons trump demand hamil...
                               ...                        
18453    cuomo outrag iowan cheer trump said want poor ...
717        break trump kick reinc priebus white hous tweet
40174    egypt author challeng reuter casualti western ...
11788                    judg jeanin like law enforc video
14697    face follow obama gun control ralli ted cruz a...
Name: Title_Lemma, Length: 44898, dtype: object

In [68]:
# need to convert data to Series for Decision Tree Classifier
X = df[['Title_Lemma']].squeeze()
y = df['Category'].squeeze()

In [69]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Decision Tree Classifier

In [70]:
# Vectorizing and applying TF-IDF for decision tree classifier
max_depth = 100

pipe = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('model', DecisionTreeClassifier(criterion= 'entropy',
                                           max_depth = max_depth, 
                                           splitter='best', 
                                           random_state=0))])
# Fitting the model
model = pipe.fit(X_train, y_train)

# Accuracy
prediction = model.predict(X_test)
print("accuracy: {}%".format(round(accuracy_score(y_test, prediction)*100,2)))


accuracy: 88.29%


In [71]:
#accuracy is around 89%. To improve, try to fix clean data
# using lemma reduced to 88

# Naive Bayes

In [75]:
# naive bayes algo
# X_train_float = X_train.to_numpy()
# when using to_numpy(), results increase by 1%
model2 = make_pipeline(TfidfVectorizer(), MultinomialNB())
# nb = GaussianNB()
clf = model2.fit(X_train, y_train)

pred_NB = model2.predict(X_test)

In [76]:
# Print model attributes 
print('Classes: ', clf.classes_) # class labels known to the classifier

# Use score method to get accuracy of the model
print('--------------------------------------------------------')
score = model2.score(X_test, y_test)
print('Accuracy Score: ', score)
print('--------------------------------------------------------')

# Look at classification report to evaluate the model
print(classification_report(y_test, pred_NB))

print("accuracy: {}%".format(round(accuracy_score(y_test, pred_NB)*100,2)))

Classes:  ['0' '1']
--------------------------------------------------------
Accuracy Score:  0.9292873051224945
--------------------------------------------------------
              precision    recall  f1-score   support

           0       0.92      0.95      0.93      4704
           1       0.94      0.91      0.92      4276

    accuracy                           0.93      8980
   macro avg       0.93      0.93      0.93      8980
weighted avg       0.93      0.93      0.93      8980

accuracy: 92.93%


# Random Forest Classifier

In [58]:
# random forest classifier
#this is pretty much bag of words
vectorizer = CountVectorizer(min_df=1)
X_rfc = vectorizer.fit_transform(X_train)

clf = RandomForestClassifier(n_estimators = 100, class_weight = None, criterion = "entropy").fit(X_rfc, y_train)

pred_rfc = clf.predict(vectorizer.transform(X_test).toarray())

In [59]:
print("accuracy: {}%".format(round(accuracy_score(y_test, pred_rfc)*100,2)))
print(classification_report(y_test, pred_rfc))
# did rfc function rearrange the order of X_test? 

accuracy: 93.98%
              precision    recall  f1-score   support

           0       0.95      0.94      0.94      4660
           1       0.93      0.94      0.94      4320

    accuracy                           0.94      8980
   macro avg       0.94      0.94      0.94      8980
weighted avg       0.94      0.94      0.94      8980



# Logistic Regression

In [29]:
bow = CountVectorizer(tokenizer=lambda doc: doc)
bow_x = bow.fit_transform(df['Title Clean'])
words = bow.get_feature_names()
print(words)

[' ', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']




In [30]:
X_tr, X_te, y_tr, y_te = train_test_split(bow_x, df['Category'], test_size=0.2, random_state=0)
X_tr

<35918x27 sparse matrix of type '<class 'numpy.int64'>'
	with 669512 stored elements in Compressed Sparse Row format>

In [31]:
def simple_logistic_classify(X_tr, y_tr, X_test, y_test, description, _C=1.0):
    model = LogisticRegression(C=_C).fit(X_tr, y_tr)
    score = model.score(X_test, y_test)
    print('Test Score with', description, 'features', score)
    return model

simple_logistic_classify(X_tr, y_tr, X_te, y_te, "bow")

Test Score with bow features 0.7684855233853006


LogisticRegression()

# SVM

In [32]:
# SVM
Encoder = LabelEncoder()
Y_Train = Encoder.fit_transform(y_train)
Y_Test = Encoder.fit_transform(y_test)

In [33]:
#Also try naive bayes and rfc with tf-idf
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(df['Title Clean'])
Train_X_Tfidf = Tfidf_vect.transform(X_train)
Test_X_Tfidf = Tfidf_vect.transform(X_test)

print(Tfidf_vect.vocabulary_)

{'schumer': 3909, 'call': 644, 'trump': 4618, 'appoint': 204, 'offici': 3112, 'overse': 3178, 'puerto': 3509, 'rico': 3765, 'relief': 3668, 'least': 2567, 'media': 2791, 'worker': 4936, 'kill': 2462, 'job': 2389, 'report': 3692, 'without': 4925, 'border': 527, 'berkeley': 421, 'colleg': 865, 'thug': 4505, 'form': 1759, 'human': 2156, 'chain': 725, 'stop': 4288, 'white': 4889, 'student': 4315, 'attend': 275, 'class': 808, 'video': 4783, 'build': 605, 'owner': 3187, 'manag': 2726, 'arrest': 231, 'south': 4173, 'korean': 2488, 'fire': 1702, 'austrian': 290, 'parent': 3222, 'teacher': 4443, 'sacrific': 3849, 'young': 4982, 'girl': 1875, 'liber': 2602, 'teen': 4450, 'refuge': 3641, 'sexual': 4002, 'abus': 17, 'school': 3906, 'month': 2912, 'japan': 2361, 'busi': 626, 'lobbi': 2637, 'throw': 4503, 'weight': 4870, 'behind': 406, 'pm': 3356, 'abe': 6, 'wage': 4819, 'hike': 2092, 'plan': 3338, 'cuba': 1081, 'discuss': 1297, 'detent': 1242, 'wake': 4823, 'elect': 1438, 'left': 2574, 'go': 1892, 

In [34]:
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,Y_Train)
# predict the labels on validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Y_Test)*100)

SVM Accuracy Score ->  94.20935412026726


In [35]:
print(classification_report(Y_Test,predictions_SVM))

              precision    recall  f1-score   support

           0       0.95      0.94      0.94      4655
           1       0.93      0.95      0.94      4325

    accuracy                           0.94      8980
   macro avg       0.94      0.94      0.94      8980
weighted avg       0.94      0.94      0.94      8980

