In [1]:
from google.colab import drive
import os
drive.mount('/content/gdrive')
path = "/content/gdrive/My Drive/Fake News Detection"
os.chdir(path)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## **Import**

In [2]:
import pandas as pd
import numpy as np
from scipy import sparse
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, GridSearchCV
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix, roc_auc_score
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier, Perceptron, LinearRegression, PassiveAggressiveClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

## **Data**

In [3]:
cols = ['post_message','num_char','num_url','num_hashtag','num_post','num_like',
        'num_cmt','num_share','pixel','num_image','hour','weekday','day','month','label']
df_train = pd.read_csv('dataset/raw/train.csv', usecols=cols) 
df_test = pd.read_csv('dataset/raw/test.csv', usecols=cols)

In [4]:
print(df_train.shape)
print(df_train.info())
df_train.head(10)

(3788, 15)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3788 entries, 0 to 3787
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   post_message  3788 non-null   object
 1   label         3788 non-null   int64 
 2   num_char      3788 non-null   int64 
 3   num_url       3788 non-null   int64 
 4   num_hashtag   3788 non-null   int64 
 5   num_post      3788 non-null   int64 
 6   num_like      3788 non-null   int64 
 7   num_cmt       3788 non-null   int64 
 8   num_share     3788 non-null   int64 
 9   pixel         3788 non-null   int64 
 10  num_image     3788 non-null   int64 
 11  hour          3788 non-null   int64 
 12  weekday       3788 non-null   int64 
 13  day           3788 non-null   int64 
 14  month         3788 non-null   int64 
dtypes: int64(14), object(1)
memory usage: 444.0+ KB
None


Unnamed: 0,post_message,label,num_char,num_url,num_hashtag,num_post,num_like,num_cmt,num_share,pixel,num_image,hour,weekday,day,month
0,Thủ_tướng CANADA đã chính_thức thông_báo : các...,1,116,0,0,1,37,3,2,0,0,15,6,24,4
1,Sửa Nghị_định 20 : Hồi_tố hàng nghìn tỷ đồng_b...,0,99,0,0,42,1,0,0,289080,0,2,2,20,4
2,Luật_sư cho rằng việc khai_báo nhỏ_giọt gây kh...,0,3276,1,0,45,930,12,160,0,1,13,1,15,3
3,"“ Thiên_tai , Nhân_họa hay khủng_hoảng niềm ti...",0,2668,0,0,1,2700,156,2100,0,0,18,2,3,2
4,Sẽ bán đấu_giá và cấp biển số xe theo sở_thích...,1,154,0,0,1,68,31,4,0,0,22,3,21,4
5,Các bác chia_sẻ cảm_nhận đóng thuế thu_nhập cá...,0,521,2,0,1,85,249,15,0,0,18,6,1,5
6,Việc triển_khai gói hỗ_trợ 62.000 tỷ cho lao_đ...,0,152,0,0,1,6,1,0,0,0,5,2,22,6
7,"Hơn 2 tháng qua , công_an các đơn_vị , địa_phư...",0,131,0,0,90,2,0,0,0,0,4,5,26,3
8,🔥 🔥 🔥 Liệu “ bóng_đá thời_Covid-19 ” có còn gi...,0,853,2,0,1,19,0,3,289440,1,4,5,14,5
9,THÔNG_TIN TỪ C.A TỈNH GIA_LAI Theo thông_báo c...,1,418,0,0,1,319,28,51,2073600,1,23,5,21,2


In [5]:
print(df_test.shape)
print(df_test.info())
df_test.head(10)

(948, 15)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 948 entries, 0 to 947
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   post_message  948 non-null    object
 1   label         948 non-null    int64 
 2   num_char      948 non-null    int64 
 3   num_url       948 non-null    int64 
 4   num_hashtag   948 non-null    int64 
 5   num_post      948 non-null    int64 
 6   num_like      948 non-null    int64 
 7   num_cmt       948 non-null    int64 
 8   num_share     948 non-null    int64 
 9   pixel         948 non-null    int64 
 10  num_image     948 non-null    int64 
 11  hour          948 non-null    int64 
 12  weekday       948 non-null    int64 
 13  day           948 non-null    int64 
 14  month         948 non-null    int64 
dtypes: int64(14), object(1)
memory usage: 111.2+ KB
None


Unnamed: 0,post_message,label,num_char,num_url,num_hashtag,num_post,num_like,num_cmt,num_share,pixel,num_image,hour,weekday,day,month
0,HÃY XÔNG KHI BỊ NHIỄM_VIRUS ( Tin thực_tế từ n...,1,3701,0,0,1,2700,640,12000,0,0,21,2,30,3
1,"Tôi cũng vài lần rơi vào trạng_thái đãng_trí ,...",0,170,0,0,1,7,0,0,0,0,3,4,10,6
2,Thêm 4 tỉnh_thành sẽ cho học_sinh đi học trở_l...,0,67,0,0,58,3180,47,116,518400,1,15,4,22,4
3,Sự_việc phân_chia vùng còn chưa ngã_ngũ thì ở ...,1,122,0,0,1,20,11,2,0,2,18,5,4,6
4,"16 h33p ngày 27/5 ( giờ_địa_phương ) , SpaceX ...",0,146,0,0,64,13,1,3,0,0,11,4,27,5
5,""" Chúng_ta cách_ly Vũ_Hán , nhưng đừng cách_ly...",0,318,1,4,1,23000,1300,4800,0,0,-1,-1,-1,-1
6,"CHUẨN_BỊ CHO "" CÚ HÍCH "" LỚN Theo dự_kiến của ...",0,1816,0,0,47,7334,37,199,177152,1,15,5,2,4
7,Tọa_Đàm Trực_Tuyến - THỜI_ĐIỂM VÀNG KÍCH_CẦU D...,0,65,0,0,1,85,4,18,921600,1,3,4,10,6
8,Cái kết sắp có_hậu về bảo_hiểm trách_nhiệm dân...,0,101,0,0,1,10,1,1,0,0,12,2,25,5
9,Công Phượng gửi tin_vui tới thầy Park khi duyê...,0,123,1,0,1,167,2,0,0,0,13,3,23,6


In [6]:
num_cols = ['num_char','num_url','num_hashtag','num_post',
            'num_like','num_cmt','num_share','pixel','num_image','hour','weekday','day','month']

for i in num_cols:
    scale = StandardScaler().fit(df_train[[i]])
    df_train[i] = scale.transform(df_train[[i]])  
    df_test[i] = scale.transform(df_test[[i]])

df_train.head(10)

Unnamed: 0,post_message,label,num_char,num_url,num_hashtag,num_post,num_like,num_cmt,num_share,pixel,num_image,hour,weekday,day,month
0,Thủ_tướng CANADA đã chính_thức thông_báo : các...,1,-0.446748,-0.372198,-0.223096,-0.472197,-0.289728,-0.216099,-0.282523,-0.445261,-0.368489,0.628546,1.019241,0.960062,-0.107563
1,Sửa Nghị_định 20 : Hồi_tố hàng nghìn tỷ đồng_b...,0,-0.45851,-0.372198,-0.223096,1.467858,-0.294094,-0.219293,-0.28335,0.64905,-0.368489,-1.444186,-0.929717,0.497905,-0.107563
2,Luật_sư cho rằng việc khai_báo nhỏ_giọt gây kh...,0,1.739626,0.609617,-0.223096,1.609813,-0.18143,-0.206519,-0.21716,-0.445261,0.567685,0.309664,-1.416957,-0.079792,-0.741229
3,"“ Thiên_tai , Nhân_họa hay khủng_hoảng niềm ti...",0,1.318957,-0.372198,-0.223096,-0.472197,0.033225,-0.053227,0.585398,-0.445261,-0.368489,1.106868,-0.929717,-1.466264,-1.374895
4,Sẽ bán đấu_giá và cấp biển số xe theo sở_thích...,1,-0.420456,-0.372198,-0.223096,-0.472197,-0.285969,-0.186293,-0.281695,-0.445261,-0.368489,1.744632,-0.442477,0.613444,-0.107563
5,Các bác chia_sẻ cảm_nhận đóng thuế thu_nhập cá...,0,-0.166532,1.591431,-0.223096,-0.472197,-0.283907,0.045773,-0.277145,-0.445261,-0.368489,1.106868,1.019241,-1.697342,0.526103
6,Việc triển_khai gói hỗ_trợ 62.000 tỷ cho lao_đ...,0,-0.421839,-0.372198,-0.223096,-0.472197,-0.293488,-0.218228,-0.28335,-0.445261,-0.368489,-0.965863,-0.929717,0.728984,1.159769
7,"Hơn 2 tháng qua , công_an các đơn_vị , địa_phư...",0,-0.436369,-0.372198,-0.223096,3.739143,-0.293973,-0.219293,-0.28335,-0.445261,-0.368489,-1.125304,0.532002,1.191141,-0.741229
8,🔥 🔥 🔥 Liệu “ bóng_đá thời_Covid-19 ” có còn gi...,0,0.063176,1.591431,-0.223096,-0.472197,-0.291911,-0.219293,-0.282109,0.650413,0.567685,-1.125304,0.532002,-0.195331,0.526103
9,THÔNG_TIN TỪ C.A TỈNH GIA_LAI Theo thông_báo c...,1,-0.237797,-0.372198,-0.223096,-0.472197,-0.255529,-0.189486,-0.262252,7.404341,0.567685,1.904073,0.532002,0.613444,-1.374895


## **Function**

In [7]:
def get_metrics(y_test, y_pred_proba, model):
    print('MODEL: ', model.__class__.__name__)
    print('ACCURACY_SCORE: ', round(accuracy_score(y_test, y_pred_proba >= 0.5), 4))
    print('F1_SCORE: ', round(f1_score(y_test, y_pred_proba >= 0.5, average = "macro"), 4))
    print('ROC_AUC_SCORE: ', round(roc_auc_score(y_test, y_pred_proba), 4))
    print('CONFUSION_MATRIX:\n', confusion_matrix(y_test, y_pred_proba >= 0.5),'\n')

In [8]:
def get_numeric_data(x):
    return [record[:-1].astype(float) for record in x]

def get_text_data(x):
    return [record[-1] for record in x]
    
transfomer_numeric = FunctionTransformer(get_numeric_data)
transformer_text = FunctionTransformer(get_text_data)

In [9]:
pipeline = Pipeline([
    ('features', FeatureUnion([
            ('numeric_features', Pipeline([
                ('selector', transfomer_numeric)
            ])),
            ('text_features', Pipeline([
                ('selector', transformer_text),
                ('tfidf', TfidfVectorizer(max_features=100000, ngram_range=(1,2))),
                #('svd', TruncatedSVD(n_components = 512, random_state=42))
            ]))
    ])),
    #('clf', LGBMClassifier())
])

##**Model**

In [10]:
cols = ['num_char','num_url','num_hashtag','num_post','num_like','num_cmt','num_share',
        'pixel','num_image','hour','weekday','day','month','post_message']

X_train_features = pd.DataFrame(df_train, columns=cols).to_numpy()
X_test_features = pd.DataFrame(df_test, columns=cols).to_numpy()

In [11]:
X_train = pipeline.fit_transform(X_train_features)
X_test = pipeline.transform(X_test_features)
y_train = df_train['label']
y_test = df_test['label']

print(X_train.shape)
print(X_test.shape)

(3788, 100013)
(948, 100013)


In [12]:
model = LGBMClassifier()
model.fit(X_train, y_train)
y_pred_proba = model.predict_proba(X_test) [:,1]
get_metrics(y_test, y_pred_proba, model)

MODEL:  LGBMClassifier
ACCURACY_SCORE:  0.904
F1_SCORE:  0.8076
ROC_AUC_SCORE:  0.9449
CONFUSION_MATRIX:
 [[764  17]
 [ 74  93]] 



## **Scores**

In [13]:
model = [SVC(kernel = 'linear', probability = True), LogisticRegression(), DecisionTreeClassifier(), 
         RandomForestClassifier(), LGBMClassifier(), XGBClassifier() ] 
         #CatBoostClassifier(verbose = 200) 

In [14]:
data = {'MODEL': [0], 'ACCURACY_SCORE': [0], 'F1_SCORE': [0], 'ROC_AUC_SCORE': [0],}
score = pd.DataFrame(data)

for i in range (0,len(model)):
    print( model[i].__class__.__name__,".....")
    model[i].fit(X_train, y_train)
    y_pred_proba = model[i].predict_proba(X_test) [:,1]
    new_row = {'MODEL': model[i].__class__.__name__, 
               'ACCURACY_SCORE': round(accuracy_score(y_test, y_pred_proba >= 0.5), 4), 
               'F1_SCORE': round(f1_score(y_test, y_pred_proba >= 0.5, average = "macro"), 4), 
               'ROC_AUC_SCORE': round(roc_auc_score(y_test, y_pred_proba), 4)}
    score = score.append(new_row, ignore_index = True)

SVC .....
LogisticRegression .....


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


DecisionTreeClassifier .....
RandomForestClassifier .....
LGBMClassifier .....
XGBClassifier .....


In [15]:
score.head(10)

Unnamed: 0,MODEL,ACCURACY_SCORE,F1_SCORE,ROC_AUC_SCORE
0,0,0.0,0.0,0.0
1,SVC,0.8945,0.7941,0.9344
2,LogisticRegression,0.8745,0.7045,0.9206
3,DecisionTreeClassifier,0.8259,0.698,0.6966
4,RandomForestClassifier,0.8407,0.5478,0.909
5,LGBMClassifier,0.904,0.8076,0.9449
6,XGBClassifier,0.8914,0.7635,0.9321
