In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import sweetviz as sv
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings("ignore")

In [2]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [None]:
df = reduce_mem_usage(pd.read_csv('credit_card_transactions-ibm_v2.csv'))

In [None]:
report = sv.analyze(df)
report.show_html('Sweetviz_Report.html')

file:///C:/Users/manju/OneDrive%20-%20McGill%20University/Desktop/scientificProject/Sweetviz_Report.html

In [None]:
card = reduce_mem_usage(pd.read_csv('sd254_cards.csv'))
user = reduce_mem_usage(pd.read_csv('sd254_users.csv'))

In [None]:
## Data preprocessing for user
user["User"] = range(2000) # create User ID to join with 'card'
## Remove the dollar signs
user["Yearly Income - Person"] = user["Yearly Income - Person"].str.replace("$", "").astype(float)
user["Total Debt"] = user["Total Debt"].str.replace("$", "").astype(float)
user["User_Location_Income"] = user["Per Capita Income - Zipcode"].str.replace("$", "").astype(float)
## Define new variable indicating users retirement status
user['Retired'] = 'No'
user.loc[user['Current Age'] > user['Retirement Age'], 'Retired'] = 'Yes'
## Define variables that are the ratio of their income, debt, and the income level at their location
user['Person_Location_Income_ratio'] = user["Yearly Income - Person"]/(user["User_Location_Income"])
user['Person_Income_toDebt'] = user["Yearly Income - Person"]/(user["Total Debt"])
user['Location_Income_toDebt'] = user["User_Location_Income"]/(user["Total Debt"])

## Select variables use for further analysis
user = user[['User', 'Gender', "Current Age", "Retirement Age" ,"Retired", "User_Location_Income", 'Yearly Income - Person', "Total Debt", "Num Credit Cards", 'Person_Location_Income_ratio','Person_Income_toDebt','Location_Income_toDebt']]

In [None]:
## Data preprocessing for cards
card["User_Card"] = card['User'].astype(str) + '_' + card['CARD INDEX'].astype(str) ## Create card id to join with transaction data
## Remove the dollor sign
card["Credit Limit"] = card["Credit Limit"].str.replace("$", "").astype(float)
card['Expire_Year'] = pd.to_datetime(card['Expires'], format='%m/%Y').dt.year
card['Expire_Month'] = pd.to_datetime(card['Expires'], format='%m/%Y').dt.month
card['Open_Year'] = pd.to_datetime(card['Acct Open Date'], format='%m/%Y').dt.year
card['Open_Month'] = pd.to_datetime(card['Acct Open Date'], format='%m/%Y').dt.month


Select variables of interest
card = card[["User_Card", "User", 'Card Brand', "Card Type", "Credit Limit"]]

In [None]:
## Left join with 'user'
card = card.merge(user, on='User', how='left')
card = card.drop(columns=['User'])


t = pd.read_csv('User0_credit_card_transactions.csv')

In [None]:
## Create the 'User_Card' index to join with crdit card information
df["User_Card"] = df['User'].astype(str) + '_' + df['Card'].astype(str)
# Remove the dollar sign
df["Amount"] = df["Amount"].str.replace("$", "").astype(float)


Keep data with positive amount
df = df[df['Amount'] > 0]

In [None]:
df = df.merge(card, on='User_Card', how='left')

Remove columns that will not be used
df = df.drop(columns=['User', 'Card', 'User_Card', "Errors?", "Merchant Name", "Merchant State", "Zip", 'MCC'])

In [None]:
df = df.drop(columns=['User', 'Card', 'User_Card'])
df.head()

In [None]:
df['Merchant State']=df['Merchant State'].fillna('unknown')
df['Zip']=df['Merchant State'].fillna('0')
df['Errors?']=df['Errors?'].fillna('unknown')
df['Apartment']=df['Apartment'].fillna('0')
df['Location_Income_toDebt']=df['Location_Income_toDebt'].fillna(df['Location_Income_toDebt'].mean())

In [None]:
df["Transcation_Time"] = df["Year"].astype(str) + '-' + df["Month"].astype(str) + '-' + df["Day"].astype(str) + ' ' + df["Time"]
df["Transcation_Time"] = pd.to_datetime(df["Transcation_Time"])

## Day of the weel
df["Weekday"] = df["Transcation_Time"].dt.day_name()

## Split the time of day into 8 different periods based on hour
df['Time_of_Day'] = ''
df.loc[(df['Transcation_Time'].dt.hour >= 23) | (df['Transcation_Time'].dt.hour < 2), 'Time_of_Day'] = 'Midnight'
df.loc[(df['Transcation_Time'].dt.hour >= 2) & (df['Transcation_Time'].dt.hour < 5), 'Time_of_Day'] = 'Early Morning'
df.loc[(df['Transcation_Time'].dt.hour >= 5) & (df['Transcation_Time'].dt.hour < 8), 'Time_of_Day'] = 'Morning'
df.loc[(df['Transcation_Time'].dt.hour >= 8) & (df['Transcation_Time'].dt.hour < 11), 'Time_of_Day'] = 'Late Morning'
df.loc[(df['Transcation_Time'].dt.hour >= 11) & (df['Transcation_Time'].dt.hour < 14), 'Time_of_Day'] = 'Noon'
df.loc[(df['Transcation_Time'].dt.hour >= 14) & (df['Transcation_Time'].dt.hour < 17), 'Time_of_Day'] = 'Afternoon'
df.loc[(df['Transcation_Time'].dt.hour >= 17) & (df['Transcation_Time'].dt.hour < 20), 'Time_of_Day'] = 'Evening'
df.loc[(df['Transcation_Time'].dt.hour >= 20) & (df['Transcation_Time'].dt.hour < 23), 'Time_of_Day'] = 'Late Night'


In [None]:
df['Per Capita Income - Zipcode'] = df['Per Capita Income - Zipcode'].str.replace("$", "").astype(float)

In [None]:
df_new = reduce_mem_usage(df.drop(columns=["Merchant Name", "Merchant State", "Zip",'Time','Merchant City','Acct Open Date','Expires','Address','City','State']))

In [None]:
df_new.to_csv('df_new', index=False)

In [3]:
df_new = reduce_mem_usage(pd.read_csv('df_new'))
df_new["Is Fraud?"] = df_new["Is Fraud?"].apply(lambda x: 1 if x == 'Yes' else 0)
df_new['transaction_hour'] = pd.to_datetime(df_new['Transcation_Time']).dt.hour
df_new = df_new.drop(columns=['Transcation_Time'])

Mem. usage decreased to 4511.89 Mb (46.1% reduction)


In [4]:
X = df_new.drop(columns=['Is Fraud?'])
y = df_new['Is Fraud?']
X = reduce_mem_usage(X)

Mem. usage decreased to 4163.03 Mb (1.6% reduction)


In [None]:
y.to_csv('y.csv', index=False)

In [5]:
X.replace([np.inf, -np.inf], np.nan, inplace=True)

In [6]:
X['Person_Location_Income_ratio']=X['Person_Location_Income_ratio'].fillna(0)
X['Person_Income_toDebt']=X['Person_Income_toDebt'].fillna(0)
X['Location_Income_toDebt']=X['Location_Income_toDebt'].fillna(0)
X = X.drop(columns=['Person','Card Number'])

In [7]:
X.to_csv('X', index=False)

### data clean 

numerical_columns_selector = selector(dtype_exclude=object)
categorical_columns_selector = selector(dtype_include=object)
numerical_columns = numerical_columns_selector(X)
categorical_columns = categorical_columns_selector(X)
for col in categorical_columns:
    X[col] = X[col].astype(str)
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")
numerical_preprocessor = StandardScaler()
preprocessor = ColumnTransformer(
    transformers=[
        ("one-hot-encoder", categorical_preprocessor, categorical_columns),
        ("standard_scaler", numerical_preprocessor, numerical_columns)
    ]
)
## Transform the variable
X_preprocessed = preprocessor.fit_transform(X)
feature_names = preprocessor.get_feature_names_out()

In [8]:
from datetime import datetime
datatime = datetime.now()

In [9]:

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer

# 选择器帮助选择数据类型
numerical_columns_selector = selector(dtype_exclude=object)
categorical_columns_selector = selector(dtype_include=object)

# 选择数值型和类别型列
numerical_columns = numerical_columns_selector(X)
categorical_columns = categorical_columns_selector(X)

# 将类别型变量转换为字符串类型
for col in categorical_columns:
    X[col] = X[col].astype(str)

# 类别型预处理器
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore", sparse_output=True)

# 数值型预处理器
standard_scaler_columns = ['Amount', 'Credit Limit', 'Per Capita Income - Zipcode', 'Yearly Income - Person', 'Total Debt', 'FICO Score']
min_max_scaler_columns = ['Latitude', 'Longitude']

# 标准化处理
standard_scaler = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # 处理缺失值
    ('scaler', StandardScaler())
])

# 正则化处理
min_max_scaler = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # 处理缺失值
    ('scaler', MinMaxScaler())
])

In [36]:

# 日期处理（如果需要）
from sklearn.base import BaseEstimator, TransformerMixin

class CustomDateTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        current_year = datetime.now().year
        X = X.copy()
        X['Card_Age'] = current_year - X['Open_Year']
        X['Expires_In'] = X['Expire_Year'] - current_year
        return X[['Card_Age', 'Expires_In']]

    def get_feature_names_out(self, input_features=None):
        return ['Card_Age', 'Expires_In']

# 使用自定义转换器
date_transformer = CustomDateTransformer()

# ColumnTransformer配置
preprocessor = ColumnTransformer(
    transformers=[
        ("one-hot-encoder", OneHotEncoder(handle_unknown="ignore"), categorical_columns),
        ("standard_scaler", StandardScaler(), standard_scaler_columns),
        ("min_max_scaler", MinMaxScaler(), min_max_scaler_columns),
        ("date_processor", date_transformer, ['Open_Year', 'Expire_Year'])
    ],
    remainder='passthrough'
)

X = reduce_mem_usage(X)

NameError: name 'categorical_columns' is not defined

In [None]:
X.info()

In [None]:
# 转换变量
X_preprocessed_new = preprocessor.fit_transform(X)

In [35]:

# 获取新的特征名称，用于模型训练后查看特征重要性等信息
feature_names_new = preprocessor.get_feature_names_out()

NameError: name 'preprocessor' is not defined

类别型变量：使用 OneHotEncoder 转换。
标准化数值型变量：选择了一些关键的财务指标和评分进行标凈化。
正则化数值型变量：选择了地理位置坐标进行正则化处理，以保证它们的比例和范围是统一的。
日期处理：如果需要处理日期信息，这里提供了一个函数转换器来计算日期相关的新特征，比如信用卡的有效期等。

In [12]:
pd.DataFrame(feature_names_new).to_csv('feature_names', index=False)
pd.DataFrame(X_preprocessed_new).to_csv('X_preprocessed', index=False)

### get sample

In [3]:
def resample_split(rsd, X):
    # Calculate the desired number of fraud cases based on the desired proportion
    desired_proportion = 0.05
    total_samples = 50000
    fraud_samples = int(total_samples * desired_proportion)
    
    # Create RandomUnderSampler with the desired sampling strategy
    rus = RandomUnderSampler(sampling_strategy={0: total_samples - fraud_samples, 1: fraud_samples}, random_state = rsd)
    
    # Apply random undersampling to the original dataset
    X_resampled, y_resampled = rus.fit_resample(X, y)
    
    # Split the resampled data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, stratify=y_resampled, random_state = rsd)
    
    return X_train, X_test, y_train, y_test, feature_names_new

In [4]:
## model evaluation
def cus_metrics(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    
    return accuracy, auc, precision, recall, f1

In [5]:
import random
sd = random_numbers = random.sample(range(1000, 10000), 5)
sd

[6171, 2292, 4399, 8125, 2492]

### LightGBM

In [45]:
X_preprocessed_new = reduce_mem_usage(pd.read_csv('X_preprocessed'))
y = reduce_mem_usage(pd.read_csv('y'))
y = y.drop(columns=['Unnamed: 0'])
feature_names_new  = reduce_mem_usage(pd.read_csv('feature_names'))

Mem. usage decreased to 4186.29 Mb (74.4% reduction)
Mem. usage decreased to 116.29 Mb (68.7% reduction)
Mem. usage decreased to  0.00 Mb (0.0% reduction)


In [46]:
feature_names_new_list = feature_names_new.iloc[:,0].tolist()
X_preprocessed = X_preprocessed_new.copy()
X_preprocessed.columns = feature_names_new_list

In [68]:
X_preprocessed_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24386900 entries, 0 to 24386899
Data columns (total 88 columns):
 #   Column  Dtype  
---  ------  -----  
 0   0       float16
 1   1       float16
 2   2       float16
 3   3       float16
 4   4       float16
 5   5       float16
 6   6       float16
 7   7       float16
 8   8       float16
 9   9       float16
 10  10      float16
 11  11      float16
 12  12      float16
 13  13      float16
 14  14      float16
 15  15      float16
 16  16      float16
 17  17      float16
 18  18      float16
 19  19      float16
 20  20      float16
 21  21      float16
 22  22      float16
 23  23      float16
 24  24      float16
 25  25      float16
 26  26      float16
 27  27      float16
 28  28      float16
 29  29      float16
 30  30      float16
 31  31      float16
 32  32      float16
 33  33      float16
 34  34      float16
 35  35      float16
 36  36      float16
 37  37      float16
 38  38      float16
 39  39      float16
 40

### SHAP evaluation

In [47]:
import shap

results_SHAP = {}
important_features_overall = {}  # To track overall importance across all seeds

for rsd in sd:
    ## Perform Undersampling to make the data more balance
    X_train, X_test, y_train, y_test, feature_names = resample_split(rsd,X_preprocessed_new)
    
    ## Create LightGBM dataset
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_test, y_test, reference = lgb_train)
    
    ## Define the parameters
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': {'binary_logloss', 'binary_error', 'auc'},
        'num_leaves': 31,
        'learning_rate': 0.1,
        'lambda_l1': 0.01, ## Avoid overfitting
        'feature_fraction': 0.9,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'verbose': 0
    }
    
    model = lgb.train(params, lgb_train, num_boost_round=300, valid_sets=[lgb_train, lgb_eval])
    
    # SHAP 值分析
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_train)
    
    # 得到平均绝对 SHAP 值，表示整体特征重要性
    shap_sum = np.abs(shap_values).mean(axis=0)
    shap_sum = pd.DataFrame(shap_sum)

    # 确保 feature_names 是一个列表或类似数组的结构，且长度与 shap_sum 一致
    # 创建 DataFrame
    importance_df = pd.concat([feature_names_new, shap_sum], axis=1)
    importance_df.columns = ['feature', 'shap_importance']

    # 按重要性排序
    importance_df.sort_values(by='shap_importance', ascending=False, inplace=True)

    #importance_df = pd.DataFrame([feature_names, shap_sum.tolist()]).T
    #importance_df.columns = ['feature', 'shap_importance']
    #importance_df = importance_df.sort_values('shap_importance', ascending=False)
    
    # 保存每次结果
    top_features = importance_df.head(20)['feature'].tolist()  # 取重要性最高的20个特征
    important_features_overall[rsd] = top_features  # Tracking feature importance
    
    # Predict the model using the testing sets
    y_pred = model.predict(X_test)
    y_pred = (y_pred > 0.5).astype(int)
    
    # Metrics
    accuracy, auc, precision, recall, f1 = cus_metrics(y_test, y_pred)
    
    # 保存结果
    results_SHAP[f"Seed {rsd}"] = {
        'Accuracy': accuracy,
        'AUC': auc,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Top 20 Important Features by SHAP': top_features
    }
    print('Round finished for seed:', rsd)

# 分析哪些特征在多个随机种子中重复出现
final_selected_features = pd.Series([feat for sublist in important_features_overall.values() for feat in sublist])
final_selected_features_SHAP = final_selected_features.value_counts().index.tolist()[:20]  # 选择出现次数最多的前20个特征

# 输出最终选定的特征
print("Final selected features based on SHAP importance across all seeds:", final_selected_features)


Round finished for seed: 6171
Round finished for seed: 2292
Round finished for seed: 4399
Round finished for seed: 8125
Round finished for seed: 2492
Final selected features based on SHAP importance across all seeds: 0                                   remainder__MCC
1     one-hot-encoder__Use Chip_Online Transaction
2                                  remainder__Year
3                      remainder__transaction_hour
4                          standard_scaler__Amount
                          ...                     
95                          remainder__Current Age
96    standard_scaler__Per Capita Income - Zipcode
97                remainder__Year PIN last Changed
98         remainder__Person_Location_Income_ratio
99                        min_max_scaler__Latitude
Length: 100, dtype: object


In [48]:
#X_preprocessed_new_df = pd.DataFrame(X_preprocessed_new, columns=feature_names_new_list)

X_selected_SHAP = X_preprocessed[final_selected_features_SHAP]

# 检查所选特征的数据集
print(X_selected_SHAP.head())

# 可选：保存新的特征数据集
#X_selected.to_csv('X_selected.csv', index=False)
#print("Selected features dataset saved as X_selected.csv.")


   remainder__MCC  standard_scaler__Credit Limit  remainder__Birth Year  \
0          5300.0                       0.742676                 1966.0   
1          5412.0                       0.742676                 1966.0   
2          5412.0                       0.742676                 1966.0   
3          5652.0                       0.742676                 1966.0   
4          5912.0                       0.742676                 1966.0   

   one-hot-encoder__Use Chip_Online Transaction  \
0                                           0.0   
1                                           0.0   
2                                           0.0   
3                                           0.0   
4                                           0.0   

   standard_scaler__Per Capita Income - Zipcode  standard_scaler__FICO Score  \
0                                      0.448242                     1.113281   
1                                      0.448242                     1.113281   
2 

In [49]:
result_new_SHAP = {}
for rsd in sd:
    ## Perform Undersampling to make the data more balance
    X_train, X_test, y_train, y_test, feature_names = resample_split(rsd, X_selected_SHAP)
    
    ## Create LightGBM dataset
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_test, y_test, reference = lgb_train)
    
    ## Define the parameters
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': {'binary_logloss', 'binary_error', 'auc'},
        'num_leaves': 31,
        'learning_rate': 0.1,
        'lambda_l1': 0.01, ## Avoid overfitting
        'feature_fraction': 0.9,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'verbose': 0
    }
    
    ## Train the model
    model = lgb.train(params, lgb_train, num_boost_round = 300, valid_sets = lgb_train)
    
    ## Predict the model using the testing sets
    y_pred = model.predict(X_test)
    y_pred = (y_pred > 0.5).astype(int)  # Convert probabilities to binary predictions
    
    ## obtain the model performance metrics
    accuracy, auc, precision, recall, f1 = cus_metrics(y_test, y_pred)  
    ## save the result
    result_new_SHAP[f"Seed {rsd}"] = {
        'Accuracy': accuracy,
        'AUC': auc,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
    }
    print('Round finished for seed:', rsd)

Round finished for seed: 6171
Round finished for seed: 2292
Round finished for seed: 4399
Round finished for seed: 8125
Round finished for seed: 2492


### build-in evaluation

In [50]:
import lightgbm as lgb

results_b = {}
important_features_overall = {}  # To track overall importance across all seeds

for rsd in sd:
    ## Perform Undersampling to balance the data
    X_train, X_test, y_train, y_test, feature_names = resample_split(rsd, X_preprocessed_new)
    
    ## Create LightGBM dataset
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
    
    ## Define the parameters
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': {'binary_logloss', 'binary_error', 'auc'},
        'num_leaves': 31,
        'learning_rate': 0.1,
        'lambda_l1': 0.01,  # To avoid overfitting
        'feature_fraction': 0.9,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'verbose': 0
    }
    
    ## Train the model
    model = lgb.train(params, lgb_train, num_boost_round=300, valid_sets=[lgb_train, lgb_eval])
    
    ## Get feature importance
    feature_importance = model.feature_importance(importance_type='gain')
    feature_names_list = feature_names.iloc[:,0].tolist()
    importance_df = pd.DataFrame({'feature': feature_names_list, 'importance': feature_importance})
    importance_df.sort_values(by='importance', ascending=False, inplace=True)
    
    ## Save the results for this round
    top_features = importance_df.head(20)['feature'].tolist()  # Get the top 20 important features
    important_features_overall[rsd] = top_features  # Tracking feature importance
    
    ## Predict using the model
    y_pred = model.predict(X_test)
    y_pred = (y_pred > 0.5).astype(int)
    
    ## Calculate metrics
    accuracy, auc, precision, recall, f1 = cus_metrics(y_test, y_pred)
    
    ## Store results
    results_b[f"Seed {rsd}"] = {
        'Accuracy': accuracy,
        'AUC': auc,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Top 20 Important Features by Gain': top_features
    }
    print('Round finished for seed:', rsd)

## Identify the most frequently important features across all seeds
final_selected_features = pd.Series([feat for sublist in important_features_overall.values() for feat in sublist])
final_selected_features_b = final_selected_features.value_counts().index.tolist()[:20]  # Select the top 20 most frequently important features

## Output the final selected features
print("Final selected features based on gain importance across all seeds:", final_selected_features)


Round finished for seed: 6171
Round finished for seed: 2292
Round finished for seed: 4399
Round finished for seed: 8125
Round finished for seed: 2492
Final selected features based on gain importance across all seeds: 0                                   remainder__MCC
1     one-hot-encoder__Use Chip_Online Transaction
2                          standard_scaler__Amount
3                                  remainder__Year
4                      remainder__transaction_hour
                          ...                     
95                        date_processor__Card_Age
96                     standard_scaler__Total Debt
97                 remainder__Person_Income_toDebt
98                     remainder__Num Credit Cards
99                       remainder__Retirement Age
Length: 100, dtype: object


In [51]:
X_selected_b = X_preprocessed[final_selected_features_b]

# 检查所选特征的数据集
print(X_selected_b.head())

# 可选：保存新的特征数据集
#X_selected.to_csv('X_selected.csv', index=False)
#print("Selected features dataset saved as X_selected.csv.")

   remainder__MCC  standard_scaler__Per Capita Income - Zipcode  \
0          5300.0                                      0.448242   
1          5412.0                                      0.448242   
2          5412.0                                      0.448242   
3          5652.0                                      0.448242   
4          5912.0                                      0.448242   

   remainder__Zipcode  remainder__Month  standard_scaler__Total Debt  \
0             91750.0               9.0                      1.31543   
1             91750.0               9.0                      1.31543   
2             91750.0               9.0                      1.31543   
3             91750.0               9.0                      1.31543   
4             91750.0               9.0                      1.31543   

   one-hot-encoder__Use Chip_Online Transaction  remainder__Day  \
0                                           0.0             1.0   
1                             

In [52]:
result_new_b = {}
for rsd in sd:
    ## Perform Undersampling to make the data more balance
    X_train, X_test, y_train, y_test, feature_names = resample_split(rsd, X_selected_b)
    
    ## Create LightGBM dataset
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_test, y_test, reference = lgb_train)
    
    ## Define the parameters
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': {'binary_logloss', 'binary_error', 'auc'},
        'num_leaves': 31,
        'learning_rate': 0.1,
        'lambda_l1': 0.01, ## Avoid overfitting
        'feature_fraction': 0.9,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'verbose': 0
    }
    
    ## Train the model
    model = lgb.train(params, lgb_train, num_boost_round = 300, valid_sets = lgb_train)
    
    ## Predict the model using the testing sets
    y_pred = model.predict(X_test)
    y_pred = (y_pred > 0.5).astype(int)  # Convert probabilities to binary predictions
    
    ## obtain the model performance metrics
    accuracy, auc, precision, recall, f1 = cus_metrics(y_test, y_pred)  
    ## save the result
    result_new_b[f"Seed {rsd}"] = {
        'Accuracy': accuracy,
        'AUC': auc,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
    }
    print('Round finished for seed:', rsd)

Round finished for seed: 6171
Round finished for seed: 2292
Round finished for seed: 4399
Round finished for seed: 8125
Round finished for seed: 2492


### Permutation importance evaluation 

In [53]:
from sklearn.inspection import permutation_importance
import lightgbm as lgb
import numpy as np
import pandas as pd

results_p = {}
important_features_overall = {}  # To track overall importance across all seeds

for rsd in sd:
    ## Perform Undersampling to balance the data
    X_train, X_test, y_train, y_test, feature_names = resample_split(rsd, X_preprocessed_new)
    
    ## Define the model
    model = lgb.LGBMClassifier(
        boosting_type='gbdt',
        objective='binary',
        num_leaves=31,
        learning_rate=0.1,
        lambda_l1=0.01,  # To avoid overfitting
        feature_fraction=0.9,
        bagging_fraction=0.8,
        bagging_freq=5,
        verbose=0,
        n_estimators=300,
        random_state=rsd,
    )
    
    ## Train the model
    model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], eval_metric='auc')
    
    ## Compute Permutation Importance
    perm_importance = permutation_importance(model, X_test, y_test, n_repeats=10, random_state=rsd, n_jobs=-1)
    
    ## Organize results into DataFrame
    feature_names_list = feature_names.iloc[:,0].tolist()
    importance_df = pd.DataFrame({'feature': feature_names_list, 'importance': perm_importance.importances_mean})
    importance_df.sort_values(by='importance', ascending=False, inplace=True)
    
    ## Save the results for this round
    top_features = importance_df.head(20)['feature'].tolist()  # Get the top 20 important features
    important_features_overall[rsd] = top_features  # Tracking feature importance
    
    ## Predict using the model
    y_pred = model.predict(X_test)
    
    ## Calculate metrics
    accuracy, auc, precision, recall, f1 = cus_metrics(y_test, y_pred)
    
    ## Store results
    results_p[f"Seed {rsd}"] = {
        'Accuracy': accuracy,
        'AUC': auc,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Top 20 Important Features by Permutation Importance': top_features
    }
    print('Round finished for seed:', rsd)

## Identify the most frequently important features across all seeds
final_selected_features = pd.Series([feat for sublist in important_features_overall.values() for feat in sublist])
final_selected_features_p = final_selected_features.value_counts().index.tolist()[:20]  # Select the top 20 most frequently important features

## Output the final selected features
print("Final selected features based on permutation importance across all seeds:", final_selected_features)


Round finished for seed: 6171
Round finished for seed: 2292
Round finished for seed: 4399
Round finished for seed: 8125
Round finished for seed: 2492
Final selected features based on permutation importance across all seeds: 0     one-hot-encoder__Use Chip_Online Transaction
1                                   remainder__MCC
2                          standard_scaler__Amount
3                                  remainder__Year
4                      remainder__transaction_hour
                          ...                     
95                 one-hot-encoder__Weekday_Sunday
96                         remainder__Expire_Month
97                       remainder__Retirement Age
98                              remainder__Zipcode
99                           remainder__Birth Year
Length: 100, dtype: object


### 1. **"permutation_importance 函数需要一个实现了 fit 方法的估计器对象"**
`permutation_importance` 是 `scikit-learn` 库中的一个函数，用来评估模型中各个特征的重要性。这个函数通过改变每个特征的值顺序（即排列）并计算这种改变对模型预测性能的影响来工作。为了能够进行这种评估，`permutation_importance` 需要能够自主地对数据进行预测，因此它要求传入的参数（即模型）必须有一个 `fit` 方法。`fit` 方法用于训练模型，即根据提供的数据调整模型的内部参数以最好地预测目标变量。

### 2. **"而 LightGBM 的 lgb.train 方法返回的是一个 Booster 对象，这个对象并没有 fit 方法"**
LightGBM 是一个高性能的梯度提升框架，它支持多种类型的接口。`lgb.train` 是 LightGBM 的底层接口，主要用于更精细的模型训练控制。使用 `lgb.train` 训练出来的模型是一个 `Booster` 对象，这是 LightGBM 的一个核心组件，用于持有和管理梯度提升模型的数据。虽然 `Booster` 对象可以用于预测，但它是专为 LightGBM 设计的，没有实现 `fit` 方法。`fit` 方法是 `scikit-learn` 风格的模型通常具备的，用于训练模型。

由于 `permutation_importance` 需要一个具有 `fit` 方法的模型对象来进行特征重要性的评估，因此不能直接使用由 `lgb.train` 返回的 `Booster` 对象。相反，我们需要使用 LightGBM 的 `LGBMClassifier` 或 `LGBMRegressor`，这些都是与 `scikit-learn` 兼容的高层接口，支持 `fit` 方法，并可以无缝与 `scikit-learn` 的工具如 `permutation_importance` 配合使用。这样，你就可以直接在 LightGBM 模型上使用 `scikit-learn` 的各种功能，包括模型评估和特征选择等。

LightGBM
LightGBM 是一个高性能的梯度提升框架，由 Microsoft 开发，用于构建和训练梯度提升树模型。这个框架主要面向两类用户：那些需要高效、快速训练大规模数据的研究人员和工程师，以及需要精细控制模型训练过程的高级用户。它支持各种自定义的优化和配置，包括但不限于并行训练、GPU 加速、处理大规模数据集等。

lgb.LGBMClassifier
lgb.LGBMClassifier 是 LightGBM 提供的一个高层次 API，它是一个分类器，封装了 LightGBM 的功能，使之符合 scikit-learn 的接口。这种封装的主要好处是能够让 LightGBM 的模型与 scikit-learn 的其他功能（如交叉验证、网格搜索以及其他各种模型评估和特征选择方法）无缝集成。简而言之，LGBMClassifier 提供了一种简便的方式来使用 LightGBM，同时保持与 scikit-learn 生态系统的兼容性。

主要区别和使用场景
兼容性：LGBMClassifier 与 scikit-learn 完全兼容，支持所有依赖于 scikit-learn 接口的方法和特性（例如使用 .fit()、.predict() 方法），而 lgb.train 返回的 Booster 对象则专门用于 LightGBM 内部使用，不直接支持 scikit-learn 的模式。
易用性：LGBMClassifier 使得用户可以更方便地利用 scikit-learn 的诸多工具，如参数搜索、模型评估等，而不需要深入了解 LightGBM 的内部细节。
功能性：使用 lgb.train 可能会更加复杂，但它提供了更高的灵活性和控制能力，适合需要对模型训练过程进行精细控制的场景。

In [54]:

X_selected_p = X_preprocessed[final_selected_features_p]

# 检查所选特征的数据集
print(X_selected_p.head())

# 可选：保存新的特征数据集
#X_selected.to_csv('X_selected.csv', index=False)
#print("Selected features dataset saved as X_selected.csv.")

   one-hot-encoder__Use Chip_Online Transaction  standard_scaler__Amount  \
0                                           0.0                 1.103516   
1                                           0.0                -0.062988   
2                                           0.0                 0.935059   
3                                           0.0                 1.041016   
4                                           0.0                 0.744141   

   remainder__Year  remainder__transaction_hour  \
0           2002.0                          6.0   
1           2002.0                          6.0   
2           2002.0                          6.0   
3           2002.0                         17.0   
4           2002.0                          6.0   

   one-hot-encoder__Use Chip_Swipe Transaction  remainder__MCC  \
0                                          1.0          5300.0   
1                                          1.0          5412.0   
2                                     

In [55]:
result_new_p = {}
for rsd in sd:
    ## Perform Undersampling to make the data more balance
    X_train, X_test, y_train, y_test, feature_names = resample_split(rsd, X_selected_p)
    
    ## Create LightGBM dataset
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_test, y_test, reference = lgb_train)
    
    ## Define the parameters
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': {'binary_logloss', 'binary_error', 'auc'},
        'num_leaves': 31,
        'learning_rate': 0.1,
        'lambda_l1': 0.01, ## Avoid overfitting
        'feature_fraction': 0.9,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'verbose': 0
    }
    
    ## Train the model
    model = lgb.train(params, lgb_train, num_boost_round = 300, valid_sets = lgb_train)
    
    ## Predict the model using the testing sets
    y_pred = model.predict(X_test)
    y_pred = (y_pred > 0.5).astype(int)  # Convert probabilities to binary predictions
    
    ## obtain the model performance metrics
    accuracy, auc, precision, recall, f1 = cus_metrics(y_test, y_pred)  
    ## save the result
    result_new_p[f"Seed {rsd}"] = {
        'Accuracy': accuracy,
        'AUC': auc,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
    }
    print('Round finished for seed:', rsd)

Round finished for seed: 6171
Round finished for seed: 2292
Round finished for seed: 4399
Round finished for seed: 8125
Round finished for seed: 2492


### SHAP result compare

In [56]:
flattened_results = []
for seed, data in results_SHAP.items():
    row = {
        'Seed': seed,
        'Accuracy': data['Accuracy'],
        'AUC': data['AUC'],
        'Precision': data['Precision'],
        'Recall': data['Recall'],
        'F1 Score': data['F1 Score']
    }
    flattened_results.append(row)

## Create a DataFrame from the flattened dictionary
lightGBM_SHAP = pd.DataFrame(flattened_results)

In [57]:
flattened_results = []
for seed, data in result_new_SHAP.items():
    row = {
        'Seed': seed,
        'Accuracy': data['Accuracy'],
        'AUC': data['AUC'],
        'Precision': data['Precision'],
        'Recall': data['Recall'],
        'F1 Score': data['F1 Score']
    }
    flattened_results.append(row)

## Create a DataFrame from the flattened dictionary
lightGBM_new_SHAP = pd.DataFrame(flattened_results)

In [58]:
lightGBM_SHAP

Unnamed: 0,Seed,Accuracy,AUC,Precision,Recall,F1 Score
0,Seed 6171,0.9772,0.799474,0.912121,0.602,0.725301
1,Seed 2292,0.9766,0.799158,0.895833,0.602,0.720096
2,Seed 4399,0.9758,0.797789,0.877193,0.6,0.712589
3,Seed 8125,0.9762,0.797053,0.889881,0.598,0.715311
4,Seed 2492,0.9756,0.797684,0.872093,0.6,0.7109


In [59]:
lightGBM_new_SHAP

Unnamed: 0,Seed,Accuracy,AUC,Precision,Recall,F1 Score
0,Seed 6171,0.9777,0.807316,0.906158,0.618,0.734839
1,Seed 2292,0.9769,0.808789,0.88102,0.622,0.729191
2,Seed 4399,0.976,0.798842,0.880117,0.602,0.714964
3,Seed 8125,0.9761,0.794158,0.89426,0.592,0.712395
4,Seed 2492,0.9755,0.791,0.885196,0.586,0.705174


### Build_in rusult compare

In [60]:
flattened_results = []
for seed, data in results_b.items():
    row = {
        'Seed': seed,
        'Accuracy': data['Accuracy'],
        'AUC': data['AUC'],
        'Precision': data['Precision'],
        'Recall': data['Recall'],
        'F1 Score': data['F1 Score']
    }
    flattened_results.append(row)

## Create a DataFrame from the flattened dictionary
lightGBM_b = pd.DataFrame(flattened_results)

In [61]:
flattened_results = []
for seed, data in result_new_b.items():
    row = {
        'Seed': seed,
        'Accuracy': data['Accuracy'],
        'AUC': data['AUC'],
        'Precision': data['Precision'],
        'Recall': data['Recall'],
        'F1 Score': data['F1 Score']
    }
    flattened_results.append(row)

## Create a DataFrame from the flattened dictionary
lightGBM_new_b = pd.DataFrame(flattened_results)

In [62]:
lightGBM_b

Unnamed: 0,Seed,Accuracy,AUC,Precision,Recall,F1 Score
0,Seed 6171,0.9772,0.799474,0.912121,0.602,0.725301
1,Seed 2292,0.9766,0.799158,0.895833,0.602,0.720096
2,Seed 4399,0.9758,0.797789,0.877193,0.6,0.712589
3,Seed 8125,0.9762,0.797053,0.889881,0.598,0.715311
4,Seed 2492,0.9756,0.797684,0.872093,0.6,0.7109


In [63]:
lightGBM_new_b

Unnamed: 0,Seed,Accuracy,AUC,Precision,Recall,F1 Score
0,Seed 6171,0.9771,0.807,0.89049,0.618,0.729634
1,Seed 2292,0.976,0.796947,0.884615,0.598,0.713604
2,Seed 4399,0.9761,0.803632,0.871795,0.612,0.719154
3,Seed 8125,0.9749,0.786895,0.878419,0.578,0.697226
4,Seed 2492,0.9737,0.782474,0.855856,0.57,0.684274


### Permutation result compare

In [64]:
flattened_results = []
for seed, data in results_p.items():
    row = {
        'Seed': seed,
        'Accuracy': data['Accuracy'],
        'AUC': data['AUC'],
        'Precision': data['Precision'],
        'Recall': data['Recall'],
        'F1 Score': data['F1 Score']
    }
    flattened_results.append(row)

## Create a DataFrame from the flattened dictionary
lightGBM_p = pd.DataFrame(flattened_results)

In [65]:
flattened_results = []
for seed, data in result_new_p.items():
    row = {
        'Seed': seed,
        'Accuracy': data['Accuracy'],
        'AUC': data['AUC'],
        'Precision': data['Precision'],
        'Recall': data['Recall'],
        'F1 Score': data['F1 Score']
    }
    flattened_results.append(row)

## Create a DataFrame from the flattened dictionary
lightGBM_new_p = pd.DataFrame(flattened_results)

In [66]:
lightGBM_p

Unnamed: 0,Seed,Accuracy,AUC,Precision,Recall,F1 Score
0,Seed 6171,0.9768,0.797368,0.906061,0.598,0.720482
1,Seed 2292,0.9762,0.799895,0.883041,0.604,0.71734
2,Seed 4399,0.9752,0.789895,0.879518,0.584,0.701923
3,Seed 8125,0.9758,0.788316,0.900621,0.58,0.705596
4,Seed 2492,0.9752,0.793684,0.870588,0.592,0.704762


In [67]:
lightGBM_new_p

Unnamed: 0,Seed,Accuracy,AUC,Precision,Recall,F1 Score
0,Seed 6171,0.977,0.809789,0.881356,0.624,0.730679
1,Seed 2292,0.9767,0.808684,0.876056,0.622,0.727485
2,Seed 4399,0.9761,0.800789,0.878261,0.606,0.71716
3,Seed 8125,0.9773,0.807105,0.895652,0.618,0.731361
4,Seed 2492,0.9757,0.793,0.885886,0.59,0.708283


XGboost

import xgboost as xgb
result = {}
for rsd in sd:
    ## Perform Undersampling to make the data more balanced
    X_train, X_test, y_train, y_test, feature_names = resample_split(rsd)
        
    ## Create XGBoost DMatrix
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
        
    # Define the parameters
    params = {
            'booster': 'gbtree',
            'objective': 'binary:logistic',
            'eval_metric': ['logloss', 'error', 'auc'],
            'max_depth': 6,
            'learning_rate': 0.1,
            'reg_alpha': 0.01,  # L1 regularization to avoid overfitting
            'subsample': 0.8,
            'colsample_bytree': 0.9,
            'verbosity': 0
        }
        
    ## Train the model
    model = xgb.train(params, dtrain, num_boost_round=3000, evals=[(dtrain, 'train')])
        
    ## Predict the model using the testing sets
    y_pred = model.predict(dtest)
    y_pred = (y_pred > 0.5).astype(int)  # Convert probabilities to binary predictions
    
    ## Obtain the model performance metrics
    accuracy, auc, precision, recall, f1 = cus_metrics(y_test, y_pred)

    ## Get the top 20 feature importance
    feature_importance = model.get_score(importance_type='gain')
    feature_importance_dict = dict(zip(feature_names, feature_importance))
    top20feat = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)[:20]
    
    ## Save the result
    result[f"Seed {rsd}"] = {
        'Accuracy': accuracy,
        'AUC': auc,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Top 20 Important Features': top20feat
    }
    print('finish')

flattened_results = []
for seed, data in result.items():
    row = {
        'Seed': seed,
        'Accuracy': data['Accuracy'],
        'AUC': data['AUC'],
        'Precision': data['Precision'],
        'Recall': data['Recall'],
        'F1 Score': data['F1 Score']
    }
    for i, (feat_name, feat_value) in enumerate(data['Top 20 Important Features']):
        row[f'Top{i+1} Feature Name'] = feat_name
        row[f'Top{i+1} Feature Value'] = feat_value
    flattened_results.append(row)


XGB_df = pd.DataFrame(flattened_results)

XGB_df.to_csv('XGB_result.csv', index=False)

X_preprocessed = reduce_mem_usage(pd.read_csv('X_preprocessed'))
y = reduce_mem_usage(pd.read_csv('y'))

y = y.iloc[:, 1]

Autoencoders
These unsupervised deep learning models learn about efficient data representations by reconstructing input data. They can be trained on normal fraudulent data and will flag instances of high reconstruction errors as possible fraud. They are good for identifying new types of fraud and detecting subtle anomalies.

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score

result = {}

for rsd in sd:
    ## Perform Undersampling to make the data more balanced
    X_train, X_test, y_train, y_test, feature_names = resample_split(rsd)
    
    ## Normalize data
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    ## Autoencoder Model Definition
    input_dim = X_train_scaled.shape[1]
    input_layer = Input(shape=(input_dim,))
    encoded = Dense(14, activation='relu')(input_layer)
    decoded = Dense(input_dim, activation='sigmoid')(encoded)
    autoencoder = Model(inputs=input_layer, outputs=decoded)
    
    ## Encoder Model
    encoder = Model(inputs=input_layer, outputs=encoded)
    
    ## Compile the model
    optimizer = Adam(learning_rate=0.001)

    # Compile the model with the defined optimizer
    autoencoder.compile(optimizer=optimizer, loss='mean_squared_error')

    
    ## Train the model
    autoencoder.fit(X_train_scaled, X_train_scaled,
                    epochs=100,
                    batch_size=32,
                    shuffle=True,
                    validation_data=(X_test_scaled, X_test_scaled),
                    verbose=0)
    
    ## Encoding the test set to detect anomalies (fraud)
    encoded_test = encoder.predict(X_test_scaled)
    reconstructions = autoencoder.predict(X_test_scaled)
    mse = np.mean(np.power(X_test_scaled - reconstructions, 2), axis=1)
    mse_threshold = np.percentile(mse, 95)  # Threshold for anomaly detection
    y_pred = (mse > mse_threshold).astype(int)
    
    ## Obtain the model performance metrics
    accuracy = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    ## Save the result
    result[f"Seed {rsd}"] = {
        'Accuracy': accuracy,
        'AUC': auc,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    }
    print('Finished Seed:', rsd)


In [None]:
flattened_results = []
for seed, data in result.items():
    row = {
        'Seed': seed,
        'Accuracy': data['Accuracy'],
        'AUC': data['AUC'],
        'Precision': data['Precision'],
        'Recall': data['Recall'],
        'F1 Score': data['F1 Score']
    }
    flattened_results.append(row)


## Create a DataFrame from the flattened dictionary
autoencoder_result = pd.DataFrame(flattened_results)

In [None]:
autoencoder_result

In [None]:
autoencoder_result.to_csv('autoencoder_result.csv', index=False)