In [1]:
# 安装必要的库
!pip install pandas numpy scikit-learn xgboost lightgbm

# 导入库
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.utils.class_weight import compute_class_weight

# Load the training set, the test set and the sample submission file
train = pd.read_csv('/content/train.csv')
test = pd.read_csv('/content/test.csv')
sample_submission = pd.read_csv('/content/sample_submission.csv')

# Inspect the basic schema of both frames.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
print("训练集信息：")
train.info()
print("\n测试集信息：")
test.info()




Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



训练集信息：
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 370703 entries, 0 to 370702
Data columns (total 24 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   id          370703 non-null  int64  
 1   trans_num   370703 non-null  object 
 2   trans_date  370703 non-null  object 
 3   trans_time  370703 non-null  object 
 4   unix_time   370703 non-null  int64  
 5   category    370703 non-null  object 
 6   amt         370703 non-null  float64
 7   cc_num      370703 non-null  int64  
 8   first       370703 non-null  object 
 9   last        370703 non-null  object 
 10  gender      370703 non-null  object 
 11  street      370703 non-null  object 
 12  city        370703 non-null  object 
 13  state       370703 non-null  object 
 14  zip         370703 non-null  int64  
 15  lat         370703 non-null  float64
 16  long        370703 non-null  float64
 17  city_pop    370703 non-null  int64  
 18  job         370703 non-null  object 


In [2]:
# Show the column names of both frames so the date/time fields can be confirmed
print("训练集列名称：", train.columns.tolist())
print("测试集列名称：", test.columns.tolist())

# Name of the date column; adjust it here if the dataset uses a different name
# (the time-of-day column 'trans_time' is referenced directly below)
time_column_name = 'trans_date'

# Convert the date column of both frames with pd.to_datetime
for frame in (train, test):
    frame[time_column_name] = pd.to_datetime(frame[time_column_name])

# Derive calendar features and the hour of day, identically for train and test
for frame in (train, test):
    date_values = frame[time_column_name]
    frame['year'] = date_values.dt.year
    frame['month'] = date_values.dt.month
    frame['day'] = date_values.dt.day
    # The hour is the text before the first ':' in trans_time, cast to int
    frame['hour'] = frame['trans_time'].str.split(':').str[0].astype(int)

# 地理距离计算
def haversine(lat1, lon1, lat2, lon2, radius=6371):
    """Great-circle distance between two points given in decimal degrees.

    Implements the haversine formula. The arguments may be scalars or
    array-likes that support arithmetic (NumPy arrays, pandas Series); they
    are combined element-wise.

    Args:
        lat1, lon1: Latitude and longitude of the first point(s), in degrees.
        lat2, lon2: Latitude and longitude of the second point(s), in degrees.
        radius: Radius of the sphere. The result is expressed in the same
            unit. Defaults to 6371, i.e. the Earth's mean radius in kilometres.

    Returns:
        The distance(s) along the surface of the sphere.
    """
    phi1, phi2 = np.radians(lat1), np.radians(lat2)
    dphi = np.radians(lat2 - lat1)
    dlambda = np.radians(lon2 - lon1)
    a = np.sin(dphi / 2)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(dlambda / 2)**2
    # Floating-point rounding can push `a` marginally above 1 for (near-)antipodal
    # points, which would make sqrt(1 - a) evaluate to NaN. Clamp to the valid range.
    a = np.clip(a, 0.0, 1.0)
    return 2 * radius * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

# Haversine distance between the (lat, long) and (merch_lat, merch_long) coordinates of each row
train['distance'] = haversine(train['lat'], train['long'], train['merch_lat'], train['merch_long'])
test['distance'] = haversine(test['lat'], test['long'], test['merch_lat'], test['merch_long'])

# Encode the categorical features as integer codes.
# The code table is built from train and test together. Encoding each frame on
# its own numbers the values by that frame's own set of categories, so the same
# value (e.g. a merchant or job) could receive different codes in train and test
# whenever the two sets differ, and a model fitted on train would misread test.
categorical_features = ['category', 'state', 'job', 'gender', 'merchant']
for col in categorical_features:
    shared_categories = (
        pd.concat([train[col], test[col]], ignore_index=True)
        .astype('category')
        .cat.categories
    )
    # Missing values keep the code -1, as with .cat.codes
    train[col] = pd.Categorical(train[col], categories=shared_categories).codes
    test[col] = pd.Categorical(test[col], categories=shared_categories).codes

# Drop the columns that are not used as model features
drop_cols = ['trans_num', 'first', 'last', 'street', 'city', 'dob', time_column_name, 'trans_time']
train = train.drop(columns=drop_cols)
test = test.drop(columns=drop_cols)

# Inspect the preprocessed training frame.
# DataFrame.info() prints its own report and returns None, so it is called
# directly; print(train.info()) would add a stray "None" line to the output.
print("预处理后的训练数据：")
train.info()


训练集列名称： ['id', 'trans_num', 'trans_date', 'trans_time', 'unix_time', 'category', 'amt', 'cc_num', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'lat', 'long', 'city_pop', 'job', 'dob', 'merchant', 'merch_lat', 'merch_long', 'is_fraud']
测试集列名称： ['id', 'trans_num', 'trans_date', 'trans_time', 'unix_time', 'category', 'amt', 'cc_num', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'lat', 'long', 'city_pop', 'job', 'dob', 'merchant', 'merch_lat', 'merch_long']
预处理后的训练数据：
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 370703 entries, 0 to 370702
Data columns (total 21 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   id          370703 non-null  int64  
 1   unix_time   370703 non-null  int64  
 2   category    370703 non-null  int8   
 3   amt         370703 non-null  float64
 4   cc_num      370703 non-null  int64  
 5   gender      370703 non-null  int8   
 6   state       370703 non-null  int8   
 7   zip  

In [5]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [6]:
# Target vector, and feature matrix made of every remaining column
y = train['is_fraud']
X = train.drop(columns=['is_fraud'])

# Hold out 20% of the rows for validation, stratified on the target
from sklearn.model_selection import train_test_split
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

# Column positions of the categorical features, passed to CatBoost as cat_features
cat_feature_names = ['category', 'state', 'job', 'gender', 'merchant']
categorical_features_indices = list(map(X_train.columns.get_loc, cat_feature_names))

# Weight of class 1 = share of class 0 divided by share of class 1 in y_train
class_share = y_train.value_counts(normalize=True)
positive_class_weight = class_share[0] / class_share[1]

# Configure the CatBoost classifier
from catboost import CatBoostClassifier
catboost_params = dict(
    iterations=500,
    learning_rate=0.05,
    depth=7,
    loss_function='Logloss',
    eval_metric='F1',
    random_seed=42,
    cat_features=categorical_features_indices,
    verbose=100,
    class_weights=[1, positive_class_weight],
)
catboost_model = CatBoostClassifier(**catboost_params)

# Fit CatBoost, evaluating on the validation split with early stopping after 50 rounds
catboost_model.fit(
    X_train,
    y_train,
    eval_set=(X_val, y_val),
    early_stopping_rounds=50,
)

# Predict the validation labels with the fitted CatBoost model
y_val_pred_cat = catboost_model.predict(X_val)

# Report the F1 score and the per-class classification report on the validation split
from sklearn.metrics import f1_score, classification_report
print("CatBoost 验证集 F1-Score:")
catboost_val_f1 = f1_score(y_val, y_val_pred_cat)
print(catboost_val_f1)
print("\n分类报告：")
catboost_val_report = classification_report(y_val, y_val_pred_cat)
print(catboost_val_report)

# Set up the other base models of the ensemble
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from lightgbm import LGBMClassifier

rf_model = RandomForestClassifier(
    n_estimators=500,
    max_depth=10,
    random_state=42,
    class_weight='balanced',
    # Build the 500 trees on all available cores instead of a single one;
    # random_state still fixes the seed of every tree.
    n_jobs=-1
)

lgbm_model = LGBMClassifier(
    class_weight='balanced',
    random_state=42,
    n_estimators=500,
    learning_rate=0.05,
    max_depth=7
)

# Combine the three models in a voting classifier (ensemble)
voting_model = VotingClassifier(
    estimators=[
        ('rf', rf_model),
        ('lgbm', lgbm_model),
        ('catboost', catboost_model)
    ],
    voting='soft'  # soft voting: combine the predicted probabilities
)

# Fit the ensemble on the training split
voting_model.fit(X_train, y_train)

# Predict the validation labels
y_val_pred_voting = voting_model.predict(X_val)

# Evaluate the ensemble on the validation split
print("投票分类器 验证集 F1-Score:")
print(f1_score(y_val, y_val_pred_voting))
print("\n分类报告：")
print(classification_report(y_val, y_val_pred_voting))


0:	learn: 0.9360366	test: 0.9371242	best: 0.9371242 (0)	total: 863ms	remaining: 7m 10s
100:	learn: 0.9770580	test: 0.9773416	best: 0.9773416 (100)	total: 1m 13s	remaining: 4m 51s
200:	learn: 0.9816086	test: 0.9812256	best: 0.9812256 (200)	total: 2m 2s	remaining: 3m 1s
300:	learn: 0.9838133	test: 0.9819900	best: 0.9820806 (274)	total: 2m 55s	remaining: 1m 56s
400:	learn: 0.9854456	test: 0.9832525	best: 0.9835012 (357)	total: 3m 48s	remaining: 56.4s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.9835011767
bestIteration = 357

Shrink model to first 358 iterations.
CatBoost 验证集 F1-Score:
0.9363692723891112

分类报告：
              precision    recall  f1-score   support

           0       1.00      0.99      0.99     65681
           1       0.89      0.98      0.94      8460

    accuracy                           0.98     74141
   macro avg       0.95      0.98      0.96     74141
weighted avg       0.99      0.98      0.99     74141

[LightGBM] [Info] Number of positi

In [7]:
# Predict the test labels with the voting ensemble
# (switch to catboost_model.predict(test) to submit the CatBoost-only predictions)
test_pred = voting_model.predict(test)

# Put the predictions into a copy of the sample submission and write it to disk
submission = sample_submission.assign(is_fraud=test_pred)
submission.to_csv('submission.csv', index=False)

print("提交文件已保存为 submission.csv")


提交文件已保存为 submission.csv
