# Import Libaries

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix, roc_auc_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Load Data

In [2]:
df_train = pd.read_csv(r'D:\Fake News Detection\Data\train.csv')
df_train.drop(['id','user_name','user_name_labelEncoder','user_name_freq'], axis='columns', inplace=True)
df_test = pd.read_csv(r'D:\Fake News Detection\Data\test.csv')
df_test.drop(['id','user_name','user_name_labelEncoder','user_name_freq'], axis='columns', inplace=True)

In [3]:
print(df_train.shape)
df_train.head()

(3869, 6)


Unnamed: 0.1,Unnamed: 0,post_message,num_like_post,num_comment_post,num_share_post,label
0,3943,"Sáng 2/3 , Bộ Y_tế phát đi thông_báo khẩn số 8...",510,22,94.0,0
1,1875,[ C ] ( < URL > ) á lại chết ở công Viên Yên_S...,11,0,0.0,0
2,4006,"Có cái vụ gạo này , mà các ông làm Thủ_tướng b...",14,0,0.0,0
3,4348,"Với hơn 700.000 dân , nhưng có tới 62 dự_án đa...",2,0,0.0,0
4,339,THỦ_PHẠM GÂY RA BỆNH VIÊM PHỔI CẤP TẠI TQ EM T...,5800,6000,21000.0,1


In [4]:
print(df_test.shape)
df_test.head()

(968, 6)


Unnamed: 0.1,Unnamed: 0,post_message,num_like_post,num_comment_post,num_share_post,label
0,255,Bình_Dương : Thanh_tra Sở Xây_dựng phát_hiện d...,33,10,3.0,0
1,4088,TIN CHẤN_ĐỘNG TỪ VŨ HÁN - TRUNG_QUỐC Dữ_liệu t...,6,0,0.0,1
2,600,Mỹ lập kỷ_lục mới về số ca COVID-19 trong vòng...,9,0,2.0,0
3,3856,Uỷ_ban Tư_pháp của Quốc_hội vừa phúc_đáp đơn k...,309,12,41.0,0
4,409,"Ngày 2.6 , Uỷ_ban soạn_thảo dự_luật của Hội_đồ...",8,0,1.0,0


In [5]:
data_train = df_train.post_message.values
data_test = df_test.post_message.values 

# TFIDF

In [6]:
tfidf = TfidfVectorizer()
X_train = tfidf.fit_transform(data_train)
X_test = tfidf.transform(data_test)
y_train = df_train.label.values
y_test = df_test.label.values

# Model

In [7]:
model = [ SVC(kernel = 'linear', probability = True), MultinomialNB(), LogisticRegression(), DecisionTreeClassifier(), 
         RandomForestClassifier(), CatBoostClassifier(verbose = 200), LGBMClassifier() ]

# Score

In [8]:
data = {'MODEL': [0], 'ACCURACY_SCORE': [0], 'F1_SCORE': [0], 'ROC_AUC_SCORE': [0],}
score = pd.DataFrame(data)

for i in range (0,len(model)):
    print( model[i].__class__.__name__,".....")
    model[i].fit(X_train, y_train)
    y_pred_proba = model[i].predict_proba(X_test) [:,1]
    new_row = {'MODEL': model[i].__class__.__name__, 
               'ACCURACY_SCORE': round(accuracy_score(y_test, y_pred_proba > 0.5), 4), 
               'F1_SCORE': round(f1_score(y_test, y_pred_proba > 0.5, average = "macro"), 4), 
               'ROC_AUC_SCORE': round(roc_auc_score(y_test, y_pred_proba), 4)}
    score = score.append(new_row, ignore_index = True)

SVC .....
MultinomialNB .....
LogisticRegression .....
DecisionTreeClassifier .....
RandomForestClassifier .....
CatBoostClassifier .....
Learning rate set to 0.018359
0:	learn: 0.6837787	total: 324ms	remaining: 5m 24s
200:	learn: 0.3321474	total: 58.5s	remaining: 3m 52s
400:	learn: 0.2850550	total: 1m 53s	remaining: 2m 49s
600:	learn: 0.2461101	total: 2m 49s	remaining: 1m 52s
800:	learn: 0.2107960	total: 3m 43s	remaining: 55.5s
999:	learn: 0.1851825	total: 4m 41s	remaining: 0us
LGBMClassifier .....


In [9]:
score.head(10)

Unnamed: 0,MODEL,ACCURACY_SCORE,F1_SCORE,ROC_AUC_SCORE
0,0,0.0,0.0,0.0
1,SVC,0.9008,0.7839,0.8913
2,MultinomialNB,0.844,0.4577,0.7114
3,LogisticRegression,0.8791,0.6781,0.8914
4,DecisionTreeClassifier,0.8347,0.6878,0.6888
5,RandomForestClassifier,0.8667,0.5989,0.881
6,CatBoostClassifier,0.8905,0.7345,0.8779
7,LGBMClassifier,0.8864,0.7491,0.8784
