In [None]:
# ==============================
# TRAIN MODELS FOR DATASET 1
# ==============================

import pandas as pd
import joblib

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

# ==============================
# 1. LOAD DATASET
# ==============================
# Name of the label column; every other column is used as a feature.
TARGET = "is_fake"

df = pd.read_csv("dataset1.csv")

# Separate the labels from the feature matrix. Missing feature values are
# replaced with 0; the label column itself is left untouched.
y = df[TARGET]
X = df.drop(columns=TARGET).fillna(0)

# ==============================
# 2. TRAIN TEST SPLIT
# ==============================
# Hold out 20% of the rows for evaluation. The split is stratified on y and
# uses a fixed seed so it is reproducible between runs.
split_options = {"test_size": 0.2, "stratify": y, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# ==============================
# 3. RANDOM FOREST
# ==============================
# fit() returns the estimator itself, so construction and training are chained.
rf = RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=42).fit(
    X_train, y_train
)

# Accuracy on the held-out rows, expressed as a percentage.
rf_pred = rf.predict(X_test)
rf_acc = 100 * accuracy_score(y_test, rf_pred)

# Persist the fitted forest to disk.
joblib.dump(rf, "rf_dataset1.pkl")

# ==============================
# 4. LIGHTGBM
# ==============================
# Hyper-parameters kept in one place so they are easy to inspect or tweak.
lgbm_params = {"n_estimators": 500, "learning_rate": 0.05, "random_state": 42}

lgbm = LGBMClassifier(**lgbm_params)
lgbm.fit(X_train, y_train)

# Accuracy on the held-out rows, expressed as a percentage.
lgbm_pred = lgbm.predict(X_test)
lgbm_acc = 100 * accuracy_score(y_test, lgbm_pred)

# Persist the fitted booster wrapper to disk.
joblib.dump(lgbm, "lgbm_dataset1.pkl")

# ==============================
# 5. XGBOOST
# ==============================
# No model is assumed to be the best here; the winner is picked from the
# measured accuracies in the results section.
xgb_params = {
    "n_estimators": 500,
    "max_depth": 6,
    "learning_rate": 0.05,
    "subsample": 0.9,
    "colsample_bytree": 0.9,
    "eval_metric": "logloss",
    "random_state": 42,
}

xgb = XGBClassifier(**xgb_params)
xgb.fit(X_train, y_train)

# Accuracy on the held-out rows, expressed as a percentage.
xgb_pred = xgb.predict(X_test)
xgb_acc = 100 * accuracy_score(y_test, xgb_pred)

# Persist the fitted classifier to disk.
joblib.dump(xgb, "xgb_dataset1.pkl")

# ==============================
# 6. PRINT RESULTS
# ==============================
# (display name, accuracy in percent), listed in reporting order.
results = [
    ("Random Forest", rf_acc),
    ("LightGBM", lgbm_acc),
    ("XGBoost", xgb_acc),
]

print("DATASET 1 MODEL ACCURACY")
print("-------------------------")
for model_name, model_acc in results:
    # Names are left-aligned to the longest one so the colons line up.
    print(f"{model_name:<13} : {model_acc:.2f}%")

# best_model keeps its (name, accuracy) shape for any later use.
best_model = max(results, key=lambda entry: entry[1])

# max() returns only the first entry when several share the top score, which
# would silently hide a tie. Report every model that reaches that score; with
# a single winner the printed line is unchanged.
top_models = [name for name, acc in results if acc == best_model[1]]

print("\nBest Model:", ", ".join(top_models))


[LightGBM] [Info] Number of positive: 554, number of negative: 74
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000345 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 280
[LightGBM] [Info] Number of data points in the train set: 628, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.882166 -> initscore=2.013100
[LightGBM] [Info] Start training from score 2.013100
DATASET 1 MODEL ACCURACY
-------------------------
Random Forest : 94.27%
LightGBM      : 94.27%
XGBoost       : 92.99%

Best Model: Random Forest


[LightGBM] [Info] Number of positive: 230, number of negative: 230
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000073 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 494
[LightGBM] [Info] Number of data points in the train set: 460, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
DATASET 2 MODEL ACCURACY
-------------------------
Random Forest : 92.24%
LightGBM      : 91.38%
XGBoost       : 94.83%

Best Model: XGBoost


# New section