In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from joblib import dump
from sklearn.metrics import precision_score, recall_score

# suppress warnings
import warnings
warnings.filterwarnings('ignore')

In [4]:
fraud = pd.read_csv("https://cs307.org/lab-06/data/fraud.csv")
fraud_train, fraud_test = train_test_split(
    fraud,
    test_size=0.20,
    random_state=42,
    stratify=fraud["Fraud"],
)

# create X and y for train
X_train = fraud_train.drop("Fraud", axis=1)
y_train = fraud_train["Fraud"]

# create X and y for test
X_test = fraud_test.drop("Fraud", axis=1)
y_test = fraud_test["Fraud"]

In [89]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
optimal_params = {
    'n_estimators': 200,
    'max_depth': 10,
    'min_samples_split': 5,
    'min_samples_leaf': 2,
}

# 使用优化后的参数创建随机森林实例
optimized_random_forest = RandomForestClassifier(**optimal_params)

# 创建使用SelectFromModel的Pipeline
# 注意，我们用优化后的随机森林来选择特征
pipeline = Pipeline([
    ('feature_selection', SelectFromModel(optimized_random_forest)),
    ('random_forest', RandomForestClassifier(**optimal_params))  # 同样的优化参数用于最终的分类器
])

pipeline = Pipeline([
    ('random_forest', RandomForestClassifier())  # 使用平衡随机森林分类器并设置参数
])

# 训练模型
pipeline.fit(X_train, y_train)

In [94]:
y_pred = pipeline.predict(X_test)
print("Precision: ", precision_score(y_test, y_pred))
print("Recall: ", recall_score(y_test, y_pred))

Precision:  0.984375
Recall:  0.7974683544303798


In [71]:
dump(pipeline, "credit-fraud.joblib")

['credit-fraud.joblib']

In [134]:
from sklearn.model_selection import GridSearchCV
from imblearn.ensemble import BalancedRandomForestClassifier
param_grid = {
    'n_estimators': [300],
    'min_samples_split': [5],
    'min_samples_leaf': [1],
    'bootstrap': [True]
}

# 创建一个基础的随机森林分类器
base_rf = RandomForestClassifier()

# 创建GridSearchCV对象
grid_search = GridSearchCV(estimator=base_rf, param_grid=param_grid,
                           cv=6, scoring='f1', verbose=1, n_jobs=-1)

# 执行网格搜索来找到最优参数
grid_search.fit(X_train, y_train)

Fitting 6 folds for each of 1 candidates, totalling 6 fits


In [135]:
print("Best parameters found: ", grid_search.best_params_)
y_pred = grid_search.predict(X_test)
print("Precision: ", precision_score(y_test, y_pred))
print("Recall: ", recall_score(y_test, y_pred))

Best parameters found:  {'bootstrap': True, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}
Precision:  0.9846153846153847
Recall:  0.810126582278481


In [None]:
dump(pipeline, "credit-fraud.joblib")

['credit-fraud.joblib']

In [174]:
# 创建一个基础的随机森林分类器
base_rf = RandomForestClassifier()

# 执行网格搜索来找到最优参数
base_rf.fit(X_train, y_train)

In [176]:
y_pred = base_rf.predict(X_test)
print("Precision: ", precision_score(y_test, y_pred))
print("Recall: ", recall_score(y_test, y_pred))

Precision:  0.984375
Recall:  0.7974683544303798


In [161]:
dump(base_rf, "credit-fraud.joblib")

['credit-fraud.joblib']

In [189]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score, recall_score
base_rf = RandomForestClassifier()

# 创建BalancedRandomForestClassifier实例
balanced_rf = BalancedRandomForestClassifier()

# 创建VotingClassifier集成模型
ensemble_model = VotingClassifier(
    estimators=[('base_rf', base_rf), ('balanced_rf', balanced_rf)], voting='soft')

# 训练集成模型
ensemble_model.fit(X_train, y_train)

# 对测试集进行预测
y_pred = ensemble_model.predict(X_test)

KeyboardInterrupt: 

In [169]:
print("Precision: ", precision_score(y_test, y_pred))
print("Recall: ", recall_score(y_test, y_pred))

Precision:  0.9571428571428572
Recall:  0.8481012658227848


In [170]:
dump(ensemble_model, "credit-fraud.joblib")

['credit-fraud.joblib']

In [191]:
from sklearn.utils import resample

# 假设 X_train, y_train 是你的训练数据和标签

# 将训练数据分成多数类和少数类
X_train_majority = X_train[y_train == 0]
X_train_minority = X_train[y_train == 1]

y_train_majority = y_train[y_train == 0]
y_train_minority = y_train[y_train == 1]

# 对少数类进行过采样
X_train_minority_upsampled, y_train_minority_upsampled = resample(X_train_minority,
                                                                   y_train_minority,
                                                                   replace=True,     # 采样替换，意味着可以选择相同的样本多次
                                                                   n_samples=len(X_train_majority),    # 增加至多数类的数量
                                                                   random_state=123) # 随机状态确保可重复性

# 对多数类进行欠采样（可选）
X_train_majority_downsampled, y_train_majority_downsampled = resample(X_train_majority,
                                                                       y_train_majority,
                                                                       replace=False,    # 无替换采样，意味着不可以选择相同的样本多次
                                                                       n_samples=len(X_train_minority),  # 减少至少数类的数量
                                                                       random_state=123) # 随机状态确保可重复性

# 合并重新采样后的多数类和少数类
X_train_resampled = np.vstack((X_train_minority_upsampled, X_train_majority_downsampled))
y_train_resampled = np.hstack((y_train_minority_upsampled, y_train_majority_downsampled))

# 现在，X_train_resampled, y_train_resampled 为平衡后的训练集，可以用于训练模型


In [192]:
# 创建一个基础的随机森林分类器
re_rf = RandomForestClassifier(n_estimators=300)

# 执行网格搜索来找到最优参数
re_rf.fit(X_train_resampled, y_train_resampled)

In [193]:
y_pred = re_rf.predict(X_test)
print("Precision: ", precision_score(y_test, y_pred))
print("Recall: ", recall_score(y_test, y_pred))

Precision:  0.22950819672131148
Recall:  0.8860759493670886


In [200]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score, recall_score
base_rf = RandomForestClassifier()

# 创建BalancedRandomForestClassifier实例
balanced_rf = re_rf

# 创建VotingClassifier集成模型
ensemble_model = VotingClassifier(
    estimators=[('base_rf', base_rf), ('balanced_rf', balanced_rf)], voting='soft')

# 训练集成模型
ensemble_model.fit(X_train, y_train)

# 对测试集进行预测
y_pred = ensemble_model.predict(X_test)

In [201]:
print("Precision: ", precision_score(y_test, y_pred))
print("Recall: ", recall_score(y_test, y_pred))

Precision:  0.9846153846153847
Recall:  0.810126582278481
