# Import Libraries

In [21]:
!pip install transformers
!pip install catboost
!pip install lightgbm
!pip install xgboost



In [22]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split, cross_val_score,  KFold
from sklearn.decomposition import TruncatedSVD
from transformers import AutoModel, AutoTokenizer
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer, TfidfTransformer
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier, Perceptron
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, VotingClassifier, StackingClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

# Load Data

In [23]:
# Read the train and test CSVs, discarding the id and user_name-derived columns
# right after loading so both frames end up with the same remaining columns.
df_train = pd.read_csv(r'D:\Fake News Detection\Data\train.csv').drop(
    columns=['id', 'user_name', 'user_name_labelEncoder', 'user_name_freq']
)
df_test = pd.read_csv(r'D:\Fake News Detection\Data\test.csv').drop(
    columns=['id', 'user_name', 'user_name_labelEncoder', 'user_name_freq']
)

In [24]:
# Sanity check on the training frame: (rows, columns) first, then the first five rows.
print(df_train.shape)
df_train.head(n=5)

(4080, 5)


Unnamed: 0,post_message,num_like_post,num_comment_post,num_share_post,label
0,Chủ_tịch UBND < URL > yêu_cầu Sở Nội_vụ < URL ...,9,1,0.0,0
1,"Ngày 28/6 , Công_an quận Đống_Đa ( Hà_Nội ) vừ...",30,0,3.0,0
2,COVID-19 đang tạo nên một tình_huống chưa từng...,10000,342,2900.0,0
3,Cuối_cùng cũng thừa_nhận ... Trích : “ Công_an...,64,14,4.0,0
4,Trung_Cộng đã quá vội_vã ? Tập_Cận_Bình trong ...,38,4,1.0,1


In [25]:
# Same sanity check for the test frame: (rows, columns) first, then the first five rows.
print(df_test.shape)
df_test.head(n=5)

(1020, 5)


Unnamed: 0,post_message,num_like_post,num_comment_post,num_share_post,label
0,TƯ_VẤN MÙA_THI : Cách nộp hồ_sơ để trúng_tuyển...,48,5,19.0,0
1,Chủ đầu_tư tại Hà_Nội tung sản_phẩm có mức giá...,25,1,1.0,0
2,"Trong 1 vụ xô_xát nhỏ thôi , em_trai này bị bắ...",170,51,68.0,1
3,VIỆT NAM ĐÃ CÓ 145 CA KHỎI COVID-19 Thông_tin ...,7393,13,236.0,0
4,1 . Nhiều thứ chúng_ta mong_mỏi có được lại có...,290,5,66.0,0


In [26]:
# Separate each frame into the post text column and the label column.
data_train, label_train = df_train["post_message"], df_train["label"]
data_test, label_test = df_test["post_message"], df_test["label"]

# Feature Extraction (Pretrained PhoBERT Tokenizer, TF-IDF)

In [27]:
max_len_ids = 256
def convert_samples_to_ids(texts, tokenizer, max_seq_length = max_len_ids):
    """Encode each text into a fixed-length row of token ids.

    Every text is passed to ``tokenizer.encode_plus`` with padding to
    ``max_length`` and truncation enabled, and only the resulting
    ``'input_ids'`` are kept. Attention masks are not part of the return
    value, so they are no longer collected.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Object exposing ``encode_plus(text, padding=...,
            max_length=..., truncation=...)`` that returns a mapping
            containing ``'input_ids'``.
        max_seq_length: Length each sequence is padded/truncated to.
            Defaults to ``max_len_ids``.

    Returns:
        A ``torch.long`` tensor with one row of token ids per input text.
        For an empty ``texts`` the result is an empty 1-D tensor.
    """
    input_ids = [
        tokenizer.encode_plus(
            line,
            padding='max_length',
            max_length=max_seq_length,
            truncation=True,
        )['input_ids']
        for line in texts
    ]
    return torch.tensor(input_ids, dtype=torch.long)

**PhoBERT**

In [28]:
# Load the PhoBERT tokenizer (slow implementation) and turn every post into
# a fixed-length row of token ids.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="vinai/phobert-base",
    use_fast=False,
)
train_ids = convert_samples_to_ids(texts=data_train, tokenizer=tokenizer)
test_ids = convert_samples_to_ids(texts=data_test, tokenizer=tokenizer)
print(train_ids.shape)

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


torch.Size([4080, 256])


**TF-IDF + SVD**

In [29]:
max_len_tfidf = 256
# TF-IDF term weights reduced to `max_len_tfidf` components with truncated SVD.
clf = Pipeline(steps=[
    ("tfidf", TfidfVectorizer()),
    ("svd", TruncatedSVD(n_components=max_len_tfidf, random_state=42)),
])

# Fit on the training posts only, then apply the fitted pipeline to the test posts.
train_tfidf = clf.fit_transform(data_train)
test_tfidf = clf.transform(data_test)
print(train_tfidf.shape)

(4080, 256)


**Concat**

In [30]:
# Put the token-id columns and the TF-IDF/SVD columns side by side (column-wise).
X_train = np.hstack([train_ids, train_tfidf])
X_test = np.hstack([test_ids, test_tfidf])
y_train, y_test = label_train, label_test
print(X_train.shape)

(4080, 512)


# Model

In [31]:
# CatBoost classifier; verbose=100 prints a training-progress line every 100 iterations.
# Other classifiers imported above that can be swapped in here:
# LGBMClassifier(), XGBClassifier()
model = CatBoostClassifier(verbose=100)

# K-Fold Cross-Validation

In [32]:
# 5-fold cross-validation on the training features, scored with ROC AUC.
score = cross_val_score(estimator=model, X=X_train, y=y_train, scoring='roc_auc', cv=5)
print(score)               # per-fold scores
print(round(score.mean(), 4))  # mean across folds, 4 decimals

Learning rate set to 0.017073
0:	learn: 0.6811944	total: 128ms	remaining: 2m 7s
100:	learn: 0.3695886	total: 10.1s	remaining: 1m 29s
200:	learn: 0.3020149	total: 19.9s	remaining: 1m 18s
300:	learn: 0.2594972	total: 29.6s	remaining: 1m 8s
400:	learn: 0.2241694	total: 39.2s	remaining: 58.5s
500:	learn: 0.1923478	total: 49.9s	remaining: 49.7s
600:	learn: 0.1629117	total: 58.4s	remaining: 38.8s
700:	learn: 0.1372685	total: 1m 7s	remaining: 28.8s
800:	learn: 0.1168970	total: 1m 16s	remaining: 19s
900:	learn: 0.0997928	total: 1m 25s	remaining: 9.44s
999:	learn: 0.0851954	total: 1m 35s	remaining: 0us
Learning rate set to 0.017073
0:	learn: 0.6811807	total: 119ms	remaining: 1m 59s
100:	learn: 0.3671807	total: 9.08s	remaining: 1m 20s
200:	learn: 0.3025103	total: 18.5s	remaining: 1m 13s
300:	learn: 0.2602848	total: 27.7s	remaining: 1m 4s
400:	learn: 0.2262991	total: 37.4s	remaining: 55.8s
500:	learn: 0.1933102	total: 46.1s	remaining: 45.9s
600:	learn: 0.1628431	total: 54.9s	remaining: 36.5s
700: