In [None]:
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import gc
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Let pandas display up to 500 columns and 500 rows before truncating output.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

# Plot styling: seaborn defaults, white-grid background, seaborn colour codes.
sns.set()
sns.set_style('whitegrid')
sns.set_color_codes()

In [None]:
# Load the transaction and identity tables (train and test) from the mounted Drive folder.
train_trans = pd.read_csv("/content/drive/MyDrive/프로젝트/train_transaction.csv")
train_idf = pd.read_csv("/content/drive/MyDrive/프로젝트/train_identity.csv")
test_trans = pd.read_csv("/content/drive/MyDrive/프로젝트/test_transaction.csv")
test_idf = pd.read_csv("/content/drive/MyDrive/프로젝트/test_identity.csv")

In [None]:
# Collect every column whose name starts with 'V' (taken from the train frame)
# and keep those columns aside in their own frames.
v_cols = list(filter(lambda name: name.startswith('V'), train_trans.columns))
train_v, test_v = train_trans[v_cols], test_trans[v_cols]

# Remove the V columns from the original transaction frames.
train_trans, test_trans = train_trans.drop(columns=v_cols), test_trans.drop(columns=v_cols)

In [None]:
# TransactionDT floor-divided by the number of seconds in a day.
SECONDS_PER_DAY = 24 * 60 * 60
train_trans['TransactionDay'] = train_trans['TransactionDT'].floordiv(SECONDS_PER_DAY)
test_trans['TransactionDay'] = test_trans['TransactionDT'].floordiv(SECONDS_PER_DAY)

In [None]:
# Make identity column names consistent: any column whose name contains 'id'
# gets its hyphens replaced by underscores; other names are left untouched.
_fix_id_name = lambda name: name.replace('-', '_') if 'id' in name else name
train_idf.columns = list(map(_fix_id_name, train_idf.columns))
test_idf.columns = list(map(_fix_id_name, test_idf.columns))

In [None]:
# Build D{i}N = TransactionDay - D{i} for i in 1..15, skipping D9.
# The computation does not depend on any other column, so a single pass over
# the D indices is enough (the former outer loop over all columns only
# repeated the exact same assignments once per column).
for i in range(1, 16):
    if i == 9:
        continue
    train_trans[f'D{i}N'] = train_trans['TransactionDay'] - train_trans[f'D{i}']
    test_trans[f'D{i}N'] = test_trans['TransactionDay'] - test_trans[f'D{i}']

In [None]:
import datetime

# Reference date that TransactionDT (in seconds) is offset from in this notebook.
START_DATE = datetime.datetime.strptime('2017-11-30', '%Y-%m-%d')

# Vectorised conversion: add the whole TransactionDT column to START_DATE at
# once instead of building one timedelta per row through `apply`.
train_datetime = START_DATE + pd.to_timedelta(train_trans['TransactionDT'], unit='s')
test_datetime = START_DATE + pd.to_timedelta(test_trans['TransactionDT'], unit='s')

# DT_M is a running month index: months elapsed since the start of 2017
# (January 2017 -> 1, January 2018 -> 13, ...).
train_trans['DT_M'] = (train_datetime.dt.year - 2017) * 12 + train_datetime.dt.month
test_trans['DT_M'] = (test_datetime.dt.year - 2017) * 12 + test_datetime.dt.month

In [None]:
# Column-name based feature groups, all derived from the train frame.
id_feature = [c for c in train_trans.columns if 'id_' in c]
v_feature = [c for c in train_trans.columns if 'V' in c]
card_feature = [c for c in train_trans.columns if 'card' in c]
C_feature = [c for c in train_trans.columns if 'C' in c and c != 'ProductCD']

# D{i}: a 'D' followed only by digits.
D_feature = [c for c in train_trans.columns if c.startswith('D') and c[1:].isdigit()]
# D{i}N: the derived columns are created with an UPPERCASE 'N' suffix, so a
# lowercase 'n' substring test never matches them (the list was always empty
# and the D{i}N columns ended up in D_feature instead).
Dn_feature = [c for c in train_trans.columns
              if c.startswith('D') and c.endswith('N') and c[1:-1].isdigit()]
# M{i}: an 'M' followed only by digits. A plain substring test for 'M' also
# picks up 'DT_M', which the M-value mapping would then turn into all-NaN.
M_feature = [c for c in train_trans.columns if c.startswith('M') and c[1:].isdigit()]

In [None]:
# Convert the categorical M values to integers.
mapping = {'T': 1, 'F': 0, 'M0': 0, 'M1': 1, 'M2': 2}
for col in M_feature:
    for frame in (train_trans, test_trans):
        # `.map` turns every value that is not a key of `mapping` into NaN, so
        # a column that is already numeric (e.g. 'DT_M', or an M column when
        # this cell is re-run) would be wiped out. Leave numeric columns as is.
        if pd.api.types.is_numeric_dtype(frame[col]):
            continue
        frame[col] = frame[col].map(mapping)

In [None]:
def encode_FE(df1, df2, cols):
    """Frequency-encode `cols`, adding a float32 `<col>_FE` column to both frames.

    For each column the relative frequency of every non-null value is computed
    over the concatenation of `df1[col]` and `df2[col]`; the value -1 is always
    mapped to -1. Values with no entry in the table (including NaN) become NaN.
    Both frames are modified in place and each new column name is printed.
    """
    for col in cols:
        combined = pd.concat([df1[col], df2[col]])
        freq_by_value = combined.value_counts(dropna=True, normalize=True).to_dict()
        freq_by_value[-1] = -1  # sentinel value keeps its sentinel meaning
        out_name = col + '_FE'
        for frame in (df1, df2):
            frame[out_name] = frame[col].map(freq_by_value).astype('float32')
        print(out_name, ', ', end='')

In [None]:
def encode_CB(col1, col2, df1=None, df2=None):
    """Combine two columns into one label-encoded `<col1>_<col2>` column.

    The string forms of `col1` and `col2` are joined with '_' and the result
    is integer-encoded with codes shared between both frames (sorted
    factorization over the concatenation of `df1` and `df2`). Both frames are
    modified in place and the new column name is printed.

    Parameters
    ----------
    col1, col2 : str
        Names of the columns to combine.
    df1, df2 : DataFrame, optional
        Frames to modify. When omitted, the module-level `train_trans` /
        `test_trans` are looked up at call time. (Binding them as default
        values would capture the objects that existed when the function was
        defined, which go stale as soon as those names are reassigned.)
    """
    if df1 is None:
        df1 = train_trans
    if df2 is None:
        df2 = test_trans

    nm = col1 + '_' + col2
    df1[nm] = df1[col1].astype(str) + '_' + df1[col2].astype(str)
    df2[nm] = df2[col1].astype(str) + '_' + df2[col2].astype(str)

    # Encode here, on exactly the frames that were passed in, instead of
    # delegating to a helper that falls back to its own default frames.
    codes, _ = pd.concat([df1[nm], df2[nm]], axis=0).factorize(sort=True)
    code_dtype = 'int32' if codes.size and codes.max() > 32000 else 'int16'
    n_first = len(df1)
    df1[nm] = codes[:n_first].astype(code_dtype)
    df2[nm] = codes[n_first:].astype(code_dtype)
    print(nm, ', ', end='')

In [None]:
def encode_LE(col, train=None, test=None, verbose=True):
    """Label-encode `col` in place with integer codes shared by both frames.

    Codes come from a sorted factorization of the concatenated train and test
    values; missing values receive -1. The column is stored as int16, or as
    int32 when the largest code exceeds 32000.

    Note: a later cell in this notebook defines another `encode_LE`, which
    replaces this one once that cell has run.

    Parameters
    ----------
    col : str
        Column to encode.
    train, test : DataFrame, optional
        Frames to modify. When omitted, the module-level `train_trans` /
        `test_trans` are looked up at call time rather than frozen at
        definition time.
    verbose : bool
        Print the column name when done.
    """
    if train is None:
        train = train_trans
    if test is None:
        test = test_trans

    codes, _ = pd.concat([train[col], test[col]], axis=0).factorize(sort=True)
    code_dtype = 'int32' if codes.max() > 32000 else 'int16'
    n_train = len(train)
    train[col] = codes[:n_train].astype(code_dtype)
    test[col] = codes[n_train:].astype(code_dtype)
    del codes
    gc.collect()
    if verbose:
        print(col, ', ', end='')

In [None]:
def encode_AG(main_columns, uids, aggregations=('mean',), train_df=None, test_df=None,
              fillna=True, usena=False):
    """Add group statistics of `main_columns` per `uids` value to both frames.

    For every (main_column, uid, aggregation) combination, the statistic is
    computed over the concatenation of train and test and stored, in place, as
    a float32 column named `<main_column>_<uid>_<aggregation>`. Each new
    column name is printed.

    Parameters
    ----------
    main_columns : iterable of str
        Columns to aggregate.
    uids : iterable of str
        Columns to group by (one at a time).
    aggregations : iterable of str
        Aggregation names understood by pandas `groupby(...).agg`.
    train_df, test_df : DataFrame, optional
        Frames to modify. When omitted, the module-level `train_trans` /
        `test_trans` are looked up at call time rather than frozen at
        definition time.
    fillna : bool
        Replace missing results with -1.
    usena : bool
        Treat -1 in `main_column` as missing before aggregating.
    """
    if train_df is None:
        train_df = train_trans
    if test_df is None:
        test_df = test_trans

    for main_column in main_columns:
        for col in uids:
            # The combined frame does not depend on the aggregation: build it once.
            combined = pd.concat([train_df[[col, main_column]], test_df[[col, main_column]]])
            if usena:
                combined.loc[combined[main_column] == -1, main_column] = np.nan

            for agg_type in aggregations:
                new_col_name = main_column + '_' + col + '_' + agg_type
                stat_by_uid = combined.groupby(col)[main_column].agg(agg_type).to_dict()

                for frame in (train_df, test_df):
                    encoded = frame[col].map(stat_by_uid).astype('float32')
                    if fillna:
                        # Assign the filled result back: `frame[c].fillna(..., inplace=True)`
                        # acts on a temporary and is not guaranteed to reach the frame.
                        encoded = encoded.fillna(-1)
                    frame[new_col_name] = encoded

                print("'" + new_col_name + "'", ', ', end='')

In [None]:
def encode_AG2(main_columns, uids, train_df=None, test_df=None):
    """Add the number of distinct `main_column` values per `uid` to both frames.

    The distinct count is taken over the concatenation of train and test and
    stored, in place, as a float32 column named `<uid>_<main_column>_ct`.
    Each new column name is printed.

    Parameters
    ----------
    main_columns : iterable of str
        Columns whose distinct values are counted.
    uids : iterable of str
        Columns to group by (one at a time).
    train_df, test_df : DataFrame, optional
        Frames to modify. When omitted, the module-level `train_trans` /
        `test_trans` are looked up at call time rather than frozen at
        definition time.
    """
    if train_df is None:
        train_df = train_trans
    if test_df is None:
        test_df = test_trans

    for main_column in main_columns:
        for col in uids:
            new_col_name = col + '_' + main_column + '_ct'
            comb = pd.concat([train_df[[col, main_column]], test_df[[col, main_column]]], axis=0)
            distinct_by_uid = comb.groupby(col)[main_column].nunique().to_dict()
            train_df[new_col_name] = train_df[col].map(distinct_by_uid).astype('float32')
            test_df[new_col_name] = test_df[col].map(distinct_by_uid).astype('float32')
            print(new_col_name + ', ', end='')

In [None]:
def combine_card_information(df1, df2):
    """Add a `card_information` column joining card1..card6 with '_'.

    Each card column is converted to its string form and the six strings are
    concatenated with '_' separators. Both frames are modified in place. The
    resulting column holds strings; it is not label-encoded here.
    """
    cols = ['card1', 'card2', 'card3', 'card4', 'card5', 'card6']
    for frame in (df1, df2):
        as_text = frame[cols].astype(str)
        # Column-wise string concatenation: one vectorised operation per
        # column instead of one Python-level join per row.
        joined = as_text[cols[0]]
        for name in cols[1:]:
            joined = joined + '_' + as_text[name]
        frame['card_information'] = joined


In [None]:
# Adds the 'card_information' key column to both transaction frames (in place).
combine_card_information(train_trans, test_trans)

In [None]:
def group_and_merge_stat(train_trans, test_trans, group_cols, target_col='TransactionAmt', agg_funcs=['mean', 'std']):
    """Attach per-group statistics of `target_col` to the train and test frames.

    Statistics are computed independently for each frame (train statistics
    from train rows only, test statistics from test rows only) and left-merged
    back on `group_cols`. New columns are named
    `<group_cols joined by '_'>_<target_col>_<agg_func>`.

    Parameters
    ----------
    train_trans, test_trans : DataFrame
        Input frames; they are not modified.
    group_cols : list of str
        Columns to group by.
    target_col : str
        Column to aggregate.
    agg_funcs : list of str
        Aggregation names understood by pandas `groupby(...).agg`.

    Returns
    -------
    (DataFrame, DataFrame)
        New train and test frames with the statistic columns appended.
    """
    group_cols = list(group_cols)
    agg_funcs = list(agg_funcs)
    prefix = '_'.join(group_cols)
    stat_cols = [f"{prefix}_{target_col}_{agg_func}" for agg_func in agg_funcs]

    def _attach(frame):
        # Statistics of the target column per group, one column per aggregation.
        stats = frame.groupby(group_cols)[target_col].agg(agg_funcs)
        stats.columns = stat_cols
        # If the statistic columns already exist (e.g. the cell was re-run),
        # replace them instead of letting merge create '_x'/'_y' duplicates.
        base = frame.drop(columns=stat_cols, errors='ignore')
        return base.merge(stats.reset_index(), on=group_cols, how='left')

    return _attach(train_trans), _attach(test_trans)


In [None]:
# Statistics of the target column per (C column, card-information) pair.
def group_and_merge_stat_with_card_info(train_trans, test_trans, C_cols, card_info_col, target_col='TransactionAmt', agg_funcs=['mean', 'std']):
    """Attach `target_col` statistics grouped by each C column together with `card_info_col`.

    For every column in `C_cols`, this groups by `[col, card_info_col]` and
    merges the statistics back, producing columns named
    `<col>_<card_info_col>_<target_col>_<agg_func>`. The work is delegated to
    `group_and_merge_stat`, which builds exactly these names for that pair of
    group columns, so the per-frame logic lives in one place.

    Returns
    -------
    (DataFrame, DataFrame)
        New train and test frames with all statistic columns appended.
    """
    for col in C_cols:
        train_trans, test_trans = group_and_merge_stat(
            train_trans, test_trans, [col, card_info_col], target_col, agg_funcs
        )
    return train_trans, test_trans

In [None]:
# Group and merge.
# `C_cols` is not defined anywhere else in this notebook, so a fresh run would
# stop here with a NameError. C1..C10 are the columns whose
# `C{i}_card_information_TransactionAmt_*` statistics the scaling step expects.
# NOTE(review): assumes C1..C10 exist in the transaction frames - confirm.
C_cols = [f'C{i}' for i in range(1, 11)]
train_trans, test_trans = group_and_merge_stat_with_card_info(train_trans, test_trans, C_cols, 'card_information')

In [None]:
# Mean/std of TransactionAmt per (card_information, addr1, D1N) group,
# computed separately for the train and the test frame.
d1n_group_keys = ['card_information', 'addr1', 'D1N']
d1n_stat_names = {
    'mean': 'card_information_addr1_D1N_TransactionAmt_mean',
    'std': 'card_information_addr1_D1N_TransactionAmt_std',
}

grouped = (
    train_trans.groupby(d1n_group_keys)['TransactionAmt']
    .agg(['mean', 'std'])
    .reset_index()
    .rename(columns=d1n_stat_names)
)
grouped2 = (
    test_trans.groupby(d1n_group_keys)['TransactionAmt']
    .agg(['mean', 'std'])
    .reset_index()
    .rename(columns=d1n_stat_names)
)

# Merge the statistics back into the original frames.
train_trans = train_trans.merge(grouped, on=d1n_group_keys, how='left')
test_trans = test_trans.merge(grouped2, on=d1n_group_keys, how='left')

In [None]:
# Mean/std (the function's default aggregations) of dist1 per
# (card_information, addr1) group, merged back into each frame.
uid_cols = ['card_information', 'addr1']
train_trans, test_trans = group_and_merge_stat(train_trans, test_trans, uid_cols, 'dist1')

In [None]:
# Group and merge.
email_cols = ['P_emaildomain', 'R_emaildomain']
uid_cols = ['card_information']

# For each email-domain column, compute TransactionAmt statistics per
# (email domain, card_information) group.
for email_col in email_cols:
    train_trans, test_trans = group_and_merge_stat(train_trans, test_trans, [email_col] + uid_cols, 'TransactionAmt')

In [None]:
# Group and merge.
uid_cols = ['card_information', 'addr1', 'DT_M']

# TransactionAmt statistics per (card_information, addr1, DT_M) group.
train_trans, test_trans = group_and_merge_stat(train_trans, test_trans, uid_cols, 'TransactionAmt')

In [None]:
# Remove columns that are no longer needed (their information now lives in
# 'card_information' and the group statistics).
drop_cols = ['P_emaildomain', 'R_emaildomain', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6']
# errors='ignore' keeps this cell re-runnable: a second run would otherwise
# raise KeyError because the columns are already gone.
train_trans = train_trans.drop(columns=drop_cols, errors='ignore')
test_trans = test_trans.drop(columns=drop_cols, errors='ignore')

In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

def encode_LE(columns, train_df=None, test_df=None, verbose=True):
    """Label-encode one or more columns in place with codes shared by both frames.

    This definition replaces the `encode_LE` defined earlier in the notebook.
    To keep calls written for that earlier version working (a single column
    name and no frames, as `encode_CB` does), `columns` may be a single string
    and the frames are optional.

    Codes come from a sorted factorization of the concatenated train and test
    values; missing values receive -1. Each column is stored as int16, or as
    int32 when its largest code exceeds 32000.

    Parameters
    ----------
    columns : str or iterable of str
        Column name(s) to encode.
    train_df, test_df : DataFrame, optional
        Frames to modify. When omitted, the module-level `train_trans` /
        `test_trans` are looked up at call time.
    verbose : bool
        Print each column name when done.
    """
    if isinstance(columns, str):
        columns = [columns]
    if train_df is None:
        train_df = train_trans
    if test_df is None:
        test_df = test_trans

    n_train = len(train_df)
    for col in columns:
        codes, _ = pd.concat([train_df[col], test_df[col]], axis=0).factorize(sort=True)
        code_dtype = 'int32' if codes.max() > 32000 else 'int16'
        train_df[col] = codes[:n_train].astype(code_dtype)
        test_df[col] = codes[n_train:].astype(code_dtype)
        del codes
        gc.collect()
        if verbose:
            print(col, ', ', end='')

# Columns that need label encoding (encoded in place in both frames).
label_cols = ['ProductCD', 'card_information', 'TransactionID']
encode_LE(label_cols, train_trans, test_trans)


# Numeric columns that need standard scaling.
scale_cols = [
    'TransactionDT', 'TransactionAmt', 'C1_card_information_TransactionAmt_mean', 'C1_card_information_TransactionAmt_std',
    'C2_card_information_TransactionAmt_mean', 'C2_card_information_TransactionAmt_std', 'C3_card_information_TransactionAmt_mean', 'C3_card_information_TransactionAmt_std',
    'C4_card_information_TransactionAmt_mean', 'C4_card_information_TransactionAmt_std', 'C5_card_information_TransactionAmt_mean', 'C5_card_information_TransactionAmt_std',
    'C6_card_information_TransactionAmt_mean', 'C6_card_information_TransactionAmt_std', 'C7_card_information_TransactionAmt_mean', 'C7_card_information_TransactionAmt_std',
    'C8_card_information_TransactionAmt_mean', 'C8_card_information_TransactionAmt_std', 'C9_card_information_TransactionAmt_mean', 'C9_card_information_TransactionAmt_std',
    'C10_card_information_TransactionAmt_mean', 'C10_card_information_TransactionAmt_std', 'card_information_addr1_D1N_TransactionAmt_mean', 'card_information_addr1_D1N_TransactionAmt_std',
    'card_information_addr1_dist1_mean', 'card_information_addr1_dist1_std', 'P_emaildomain_card_information_TransactionAmt_mean', 'P_emaildomain_card_information_TransactionAmt_std',
    'R_emaildomain_card_information_TransactionAmt_mean', 'R_emaildomain_card_information_TransactionAmt_std', 'card_information_addr1_DT_M_TransactionAmt_mean', 'card_information_addr1_DT_M_TransactionAmt_std'
]

# Fit the scaler on the train frame only, then apply the same transform to test.
scaler = StandardScaler()
train_trans[scale_cols] = scaler.fit_transform(train_trans[scale_cols])
test_trans[scale_cols] = scaler.transform(test_trans[scale_cols])

In [None]:
# import lightgbm as lgb
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import roc_auc_score

# # LightGBM 모델 학습
# target_col = 'isFraud'
# X_train = train_trans.drop(columns=[target_col])
# y_train = train_trans[target_col]
# X_test = test_trans.drop(columns=[target_col], errors='ignore')

# params = {
#     'objective': 'binary',
#     'metric': 'auc',
#     'num_iterations': 100,
# }

# train_data = lgb.Dataset(X_train, label=y_train)
# lgbm_model = lgb.train(params, train_data)

# # 테스트 데이터에 대한 예측
# y_test_pred = lgbm_model.predict(X_test, num_iteration=lgbm_model.best_iteration)

# # 피처 중요도 출력
# importance = lgbm_model.feature_importance(importance_type='gain')
# feature_names = X_train.columns
# feature_importances = pd.DataFrame({'feature': feature_names, 'importance': importance})
# feature_importances = feature_importances.sort_values(by='importance', ascending=False)

In [None]:
# # 피처 중요도 시각화
# plt.figure(figsize=(12, 20))  # 그래프 크기를 키움
# plt.barh(feature_importances['feature'], feature_importances['importance'])
# plt.xlabel('Importance')
# plt.ylabel('Feature')
# plt.title('Feature Importances')
# plt.gca().invert_yaxis()
# plt.xticks(fontsize=10)
# plt.yticks(fontsize=8)
# plt.show()

In [None]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# Separate the features from the target.
target_col = 'isFraud'
X = train_trans.drop(columns=[target_col])
y = train_trans[target_col]

# Hold out 20% of the rows for validation (random split, fixed seed).
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=47)

# LightGBM parameters.
# NOTE(review): 'bagging_fraction' is set but 'bagging_freq' is not; per the
# LightGBM parameter docs bagging only takes effect when bagging_freq is
# non-zero - confirm whether bagging is intended here.
params = {
    'num_leaves': 491,
    'min_child_weight': 0.03454472573214212,
    'feature_fraction': 0.3797454081646243,
    'bagging_fraction': 0.4181193142567742,
    'min_data_in_leaf': 106,
    'objective': 'binary',
    'max_depth': -1,
    'learning_rate': 0.006883242363721497,
    "boosting_type": "gbdt",
    "bagging_seed": 11,
    "metric": 'auc',
    "verbosity": -1,
    'reg_alpha': 0.3899927210061127,
    'reg_lambda': 0.6485237330340494,
    'random_state': 47
}

# Build the LightGBM datasets; the validation set references the training set.
train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data)

# Train for up to 500 rounds, stopping early after 50 rounds without
# improvement and logging the evaluation every 50 rounds.
lgbm_model = lgb.train(
    params,
    train_data,
    num_boost_round=500,
    valid_sets=[train_data, valid_data],
    valid_names=['train', 'valid'],
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=50)
    ]
)

# Predict on the validation data using the best iteration found during training.
y_valid_pred = lgbm_model.predict(X_valid, num_iteration=lgbm_model.best_iteration)

# Compute the AUC score.
auc_score = roc_auc_score(y_valid, y_valid_pred)
print(f'Validation AUC: {auc_score}')

# Gain-based feature importances, sorted from most to least important.
importance = lgbm_model.feature_importance(importance_type='gain')
feature_names = X_train.columns
feature_importances = (
    pd.DataFrame({'feature': feature_names, 'importance': importance})
    .sort_values(by='importance', ascending=False)
)

# Keep only the top 20 features.
top_n = 20
top_features = feature_importances.head(top_n)

# Horizontal bar chart, most important feature at the top.
fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(top_features['feature'], top_features['importance'])
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
ax.set_title(f'Top {top_n} Feature Importances')
ax.invert_yaxis()
ax.tick_params(axis='x', labelsize=10)
ax.tick_params(axis='y', labelsize=10)
plt.show()