# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
import xgboost as xgb

# Memory-optimisation helper
def reduce_mem_usage(df):
    """Downcast the numeric columns of ``df`` to smaller dtypes.

    Signed-integer columns are narrowed to the smallest of int8 / int16 /
    int32 whose (inclusive) range holds the column's min and max.  A column
    that does not fit int32 keeps its current dtype, so no value is ever
    wrapped or truncated.

    Floating-point columns whose values lie strictly inside the float16 range
    are stored as float32 (float16 itself is deliberately never produced);
    every other float column is stored as float64.

    Columns of any other dtype (object, bool, unsigned integer, datetime,
    categorical and other pandas extension dtypes) are left untouched.

    Args:
        df: DataFrame to shrink. Its columns are reassigned in place.

    Returns:
        The same DataFrame object, for call chaining.
    """
    half_precision = np.finfo(np.float16)
    for col in df.columns:
        col_type = df[col].dtype
        # Only plain NumPy signed-int / float columns are safe to compare
        # against numeric limits; extension dtypes (category, string, ...)
        # are not instances of np.dtype and are skipped.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'if':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'i':
            for candidate in (np.int8, np.int16, np.int32):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # No candidate fits (or the column is empty): keep the dtype
            # rather than forcing int32 and corrupting large values.
        else:
            # An all-NaN column makes both comparisons False -> float64.
            if c_min > half_precision.min and c_max < half_precision.max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
    return df

# def reduce_mem_usage(df):
# 	""" iterate through all the columns of a dataframe and modify the data type
# 		to reduce memory usage.
# 	"""
# 	start_mem = df.memory_usage().sum() / 1024 ** 2
# 	print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
#
# 	for col in df.columns:
# 		col_type = df[col].dtype
#
# 		if col_type != object:
# 			c_min = df[col].min()
# 			c_max = df[col].max()
# 			if str(col_type)[:3] == 'int':
# 				if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
# 					df[col] = df[col].astype(np.int8)
# 				elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
# 					df[col] = df[col].astype(np.int16)
# 				elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
# 					df[col] = df[col].astype(np.int32)
# 				elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
# 					df[col] = df[col].astype(np.int64)
# 			else:
# 				if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
# 					df[col] = df[col].astype(np.float16)
# 				elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
# 					df[col] = df[col].astype(np.float32)
# 				else:
# 					df[col] = df[col].astype(np.float64)
# 		else:
# 			df[col] = df[col].astype('category')
# 	end_mem = df.memory_usage().sum() / 1024 ** 2
# 	print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
# 	print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
#
# 	return df


# Load the training data (the CSV must be downloaded beforehand).
DATA_PATH = "C://Users/Lenovo/Documents/Tencent Files/1953846283/FileRecv/insurance/1_data_mysql/input/train.csv"
data = pd.read_csv(DATA_PATH)

# Shrink numeric column dtypes before any further processing.
data = reduce_mem_usage(data)

# Feature engineering: ordinal-encode the vehicle age buckets and mark the
# two code-like columns as categorical so XGBoost treats them as categories.
# NOTE: any Vehicle_Age value missing from the mapping becomes NaN.
data['Vehicle_Age'] = data['Vehicle_Age'].map({'< 1 Year':0, '1-2 Year':1, '> 2 Years':2})
data['Region_Code'] = data['Region_Code'].astype('category')
data['Policy_Sales_Channel'] = data['Policy_Sales_Channel'].astype('category')

# Label-encode the remaining string columns. The encoder is refitted for each
# column, so after these lines `le` only holds the Vehicle_Damage mapping.
le = LabelEncoder()
data['Gender'] = le.fit_transform(data['Gender'])
data['Vehicle_Damage'] = le.fit_transform(data['Vehicle_Damage'])

# Stratified 80/20 train/validation split.
X = data.drop(['id','Response'], axis=1)
y = data['Response']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Class-imbalance weight (negatives / positives), derived from the training
# labels only so that validation rows do not influence a hyperparameter.
# Series.sum() is vectorised, unlike the builtin sum() over a Series.
n_pos = int(y_train.sum())
n_neg = len(y_train) - n_pos

# XGBoost parameters.
params = {
    'objective':'binary:logistic',
    'eval_metric':'auc',
    'max_depth':6,
    'learning_rate':0.1,
    'subsample':0.8,
    'colsample_bytree':0.8,
    'scale_pos_weight':n_neg / n_pos,  # compensate for the class imbalance
    # 'hist' together with device='cuda' selects GPU training; this pair
    # is used here in place of the older 'gpu_hist' tree method.
    'tree_method':'hist',
    'device':'cuda',
    'random_state':42
}

# Build DMatrix objects; enable_categorical=True is required because X
# contains pandas 'category' columns.
dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)
dval = xgb.DMatrix(X_val, label=y_val, enable_categorical=True)

# Train with early stopping monitored on the validation set.
model = xgb.train(params, dtrain, num_boost_round=1000,
                  evals=[(dval, 'eval')],
                  early_stopping_rounds=50,
                  verbose_eval=50)

# Predict on the validation set using only the trees up to the best
# early-stopped iteration. The range is passed explicitly so the score does
# not depend on the installed XGBoost version's default for predict().
y_pred = model.predict(dval, iteration_range=(0, model.best_iteration + 1))

# Validation AUC.
auc = roc_auc_score(y_val, y_pred)
print(f'Validation AUC: {auc:.4f}')  # the author expected AUC > 0.85