In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.metrics import roc_auc_score, roc_curve, make_scorer, confusion_matrix, classification_report
from tqdm import tqdm

from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTENC
from imblearn.pipeline import Pipeline
from catboost import CatBoostClassifier
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)
chunk_size = 100000  # Adjust as needed
chunks = pd.read_csv('E:/Phat/data_electricity.csv', chunksize=chunk_size, low_memory=False, na_values='\\N')

data = pd.concat([chunk for chunk in chunks])
for col in data.select_dtypes(include=['float64']).columns:
    data[col] = data[col].astype(np.float32)

for col in data.select_dtypes(include=['int64']).columns:
    data[col] = data[col].astype(np.int32)
missing_value_cols = ['ma_ttcto', 'so_cot', 'so_hop', 'ngay_hhluc_vitri_ddo', 
                      'ngay_hhluc_khang','ngay_hhluc_diemdo', 'ngay_hhluc']
for col in missing_value_cols:
    data[col].fillna("Missing", inplace=True)
numerical_cols = ['id_chiso','id_bcs','hs_nhan','so_cto_chiso','ky','thang','nam',
                  'chiso_cu','chiso_moi','san_luong','sluong_ttiep','sluong_trphu',
                   'thd_le','sluong_1','sluong_2','sluong_3','ma_cto','so_cto_hso_cto',
                    'ma_cloai','so_pha','id_khang','id_ddo','kimua_cspk','csuat','so_cto_csuat_ddo',
                     'thang_csuat_ddo','gia_tri' ]

scaler = StandardScaler()
for col in numerical_cols:
    data[col] = scaler.fit_transform(data[[col]])
cat_cols = ['ma_dviqly','ma_dvictren','ma_ddo','bcs_chiso','loai_chiso',
            'ma_ttcto','ngay_dky','ngay_cky','ngay_tao_chiso','nguoi_tao_chiso',
            'ngay_sua_chiso','nguoi_sua_chiso','ma_cnang_chiso','so_cot','so_hop',
            'ma_tram','ngay_hluc_vitri_ddo','ngay_hhluc_vitri_ddo','ngay_bdong',
            'ngay_kdinh','ma_bdong','dong_dien','dien_ap','vh_cong','ten_khang',
            'ma_khang','ngay_hluc_khang','ngay_hhluc_khang','dia_chi','ngay_hluc_diem_do',
            'ngay_hhluc_diemdo','loai_giatri','ngay_hluc','ngay_hhluc']

oe = OrdinalEncoder()
for col in cat_cols:
    data[col] = oe.fit_transform(data[[col]].astype(str)).astype(int)
target = 'fraud'
X = data.drop(columns=[target])
y = data[target]


X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, stratify=y_temp, random_state=42)

In [4]:
from sklearn.utils import resample
from imblearn.over_sampling import SMOTENC
from catboost import CatBoostClassifier
import numpy as np

# Giả sử bạn đã có X_train, y_train và cat_indices là danh sách các cột categorical

# Tách class 0 và class 1
X_majority = X_train[y_train == 0]
y_majority = y_train[y_train == 0]
X_minority = X_train[y_train == 1]
y_minority = y_train[y_train == 1]

print("Original class distribution:")
print(y_train.value_counts())


Original class distribution:
fraud
0    13138492
1      168491
Name: count, dtype: int64


In [7]:
from sklearn.utils import resample

X_majority = X_train[y_train == 0]
y_majority = y_train[y_train == 0]
X_minority = X_train[y_train == 1]
y_minority = y_train[y_train == 1]

X_majority_sampled, y_majority_sampled = resample(
    X_majority, y_majority,
    replace=False,
    n_samples=1000000,  
    random_state=42
)

X_sampled = pd.concat([X_majority_sampled, X_minority])
y_sampled = pd.concat([y_majority_sampled, y_minority])
print("Sau khi sampling:")
print(y_sampled.value_counts())


Sau khi sampling:
fraud
0    1000000
1     168491
Name: count, dtype: int64


In [8]:
smote_nc = SMOTENC(
    categorical_features=cat_indices,
    random_state=42
)

X_resampled, y_resampled = smote_nc.fit_resample(X_sampled, y_sampled)

print("Sau khi SMOTENC:")
print(np.bincount(y_resampled))


MemoryError: Unable to allocate 196. GiB for an array with shape (168491, 156373) and data type float64

In [None]:
model = CatBoostClassifier(
    cat_features=cat_indices,
    iterations=1000,
    learning_rate=0.1,
    depth=8,
    eval_metric='AUC',
    random_state=42,
    verbose=100
)

model.fit(X_resampled, y_resampled, eval_set=(X_val, y_val), early_stopping_rounds=50)


In [None]:
# Dự đoán xác suất và label trên tập test
y_pred_proba = model.predict_proba(X_test)[:, 1]
y_pred_label = model.predict(X_test)


In [None]:
from sklearn.metrics import f1_score, roc_auc_score, classification_report

f1 = f1_score(y_test, y_pred_label)
auc = roc_auc_score(y_test, y_pred_proba)

print(f"F1-score: {f1:.4f}")
print(f"ROC AUC:  {auc:.4f}")
print(classification_report(y_test, y_pred_label, digits=4))


In [None]:
from sklearn.metrics import roc_curve, auc

fpr, tpr, _ = roc_curve(y_test, y_pred_proba)

# Lọc vùng fpr ≤ 0.2
mask = fpr <= 0.2
fpr_limited = fpr[mask]
tpr_limited = tpr[mask]

pauc = auc(fpr_limited, tpr_limited) / 0.2  # chuẩn hóa trên khoảng [0, 0.2]
print(f"partial AUC (FPR ≤ 0.1): {pauc:.4f}")


In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(6, 6))
plt.plot(fpr, tpr, label=f'ROC AUC = {auc:.4f}')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
plt.fill_between(fpr_limited, tpr_limited, alpha=0.3, color='orange', label=f'pAUC (≤0.1) = {pauc:.4f}')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
from sklearn.metrics import precision_score, recall_score, confusion_matrix

precision = precision_score(y_test, y_pred_label)
recall = recall_score(y_test, y_pred_label)
cm = confusion_matrix(y_test, y_pred_label)

print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print("Confusion Matrix:")
print(cm)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(5, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=['Pred 0', 'Pred 1'], 
            yticklabels=['Actual 0', 'Actual 1'])
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()
plt.show()
