# Import Libaries

In [1]:
!pip install transformers



In [2]:
import pandas as pd
import numpy as np
import torch
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD
from transformers import AutoModel, AutoTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

  return torch._C._cuda_getDeviceCount() > 0


# Load Data

In [3]:
df_train = pd.read_csv(r'D:\Fake News Detection\Data\train.csv')
df_train.drop(['id','user_name','user_name_labelEncoder','user_name_freq'], axis='columns', inplace=True)
df_test = pd.read_csv(r'D:\Fake News Detection\Data\test.csv')
df_test.drop(['id','user_name','user_name_labelEncoder','user_name_freq'], axis='columns', inplace=True)

In [4]:
print(df_train.shape)
df_train.head()

(3869, 6)


Unnamed: 0.1,Unnamed: 0,post_message,num_like_post,num_comment_post,num_share_post,label
0,3943,"Sáng 2/3 , Bộ Y_tế phát đi thông_báo khẩn số 8...",510,22,94.0,0
1,1875,[ C ] ( < URL > ) á lại chết ở công Viên Yên_S...,11,0,0.0,0
2,4006,"Có cái vụ gạo này , mà các ông làm Thủ_tướng b...",14,0,0.0,0
3,4348,"Với hơn 700.000 dân , nhưng có tới 62 dự_án đa...",2,0,0.0,0
4,339,THỦ_PHẠM GÂY RA BỆNH VIÊM PHỔI CẤP TẠI TQ EM T...,5800,6000,21000.0,1


In [5]:
print(df_test.shape)
df_test.head()

(968, 6)


Unnamed: 0.1,Unnamed: 0,post_message,num_like_post,num_comment_post,num_share_post,label
0,255,Bình_Dương : Thanh_tra Sở Xây_dựng phát_hiện d...,33,10,3.0,0
1,4088,TIN CHẤN_ĐỘNG TỪ VŨ HÁN - TRUNG_QUỐC Dữ_liệu t...,6,0,0.0,1
2,600,Mỹ lập kỷ_lục mới về số ca COVID-19 trong vòng...,9,0,2.0,0
3,3856,Uỷ_ban Tư_pháp của Quốc_hội vừa phúc_đáp đơn k...,309,12,41.0,0
4,409,"Ngày 2.6 , Uỷ_ban soạn_thảo dự_luật của Hội_đồ...",8,0,1.0,0


In [6]:
data_train = df_train.post_message
data_test = df_test.post_message
label_train =  df_train.label
label_test = df_test.label

# Pretrain

In [7]:
max_len_ids = 256
def convert_samples_to_ids(texts, tokenizer, max_seq_length = max_len_ids):
    input_ids, attention_masks = [], []
    for line in texts:
        inputs = tokenizer.encode_plus(line, padding = 'max_length', max_length = max_seq_length, truncation = True)
        input_ids.append(inputs['input_ids'])
        attention_masks.append(inputs['attention_mask'])

    return torch.tensor(input_ids, dtype = torch.long)

**PhoBERT**

In [8]:
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base", use_fast=False)
train_ids = convert_samples_to_ids (data_train, tokenizer)
test_ids = convert_samples_to_ids (data_test, tokenizer)
print (train_ids.shape)

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


torch.Size([3869, 256])


**TF-IDF + SVD**

In [9]:
max_len_tfidf = 256
clf = Pipeline ([ ('tfidf', TfidfVectorizer()),
                  ('svd', TruncatedSVD(n_components = max_len_tfidf, random_state=42)), ])

train_tfidf = clf.fit_transform (data_train)
test_tfidf = clf.transform (data_test)
print(train_tfidf.shape)

(3869, 256)


**Concat**

In [10]:
X_train = np.concatenate((train_ids, (train_tfidf)), axis=1)
X_test = np.concatenate((test_ids, (test_tfidf)), axis=1)
y_train = label_train
y_test = label_test
print(X_train.shape)

(3869, 512)


# Model

In [11]:
model = [RandomForestClassifier(), CatBoostClassifier(verbose = 200), LGBMClassifier(), XGBClassifier()]

# Score

In [12]:
data = {'MODEL': [0], 'ACCURACY_SCORE': [0], 'F1_SCORE': [0], 'ROC_AUC_SCORE': [0],}
score = pd.DataFrame(data)

for i in range (0,len(model)):
    print( model[i].__class__.__name__,".....")
    model[i].fit(X_train, y_train)
    y_pred_proba = model[i].predict_proba(X_test) [:,1]
    new_row = {'MODEL': model[i].__class__.__name__, 
               'ACCURACY_SCORE': round(accuracy_score(y_test, y_pred_proba > 0.5), 4), 
               'F1_SCORE': round(f1_score(y_test, y_pred_proba > 0.5, average = "macro"), 4), 
               'ROC_AUC_SCORE': round(roc_auc_score(y_test, y_pred_proba), 4)}
    score = score.append(new_row, ignore_index = True)

RandomForestClassifier .....
CatBoostClassifier .....
Learning rate set to 0.018359
0:	learn: 0.6810008	total: 162ms	remaining: 2m 41s
200:	learn: 0.2887310	total: 20.2s	remaining: 1m 20s
400:	learn: 0.2197766	total: 41.1s	remaining: 1m 1s
600:	learn: 0.1620535	total: 1m 3s	remaining: 42.4s
800:	learn: 0.1160417	total: 1m 27s	remaining: 21.6s
999:	learn: 0.0852685	total: 1m 49s	remaining: 0us
LGBMClassifier .....
XGBClassifier .....




In [13]:
score.head(5)

Unnamed: 0,MODEL,ACCURACY_SCORE,F1_SCORE,ROC_AUC_SCORE
0,0,0.0,0.0,0.0
1,RandomForestClassifier,0.8564,0.5395,0.8223
2,CatBoostClassifier,0.8936,0.741,0.8778
3,LGBMClassifier,0.8977,0.7585,0.8783
4,XGBClassifier,0.8946,0.7557,0.8773
