In [1]:
# magic column V301 id와 묶어서
# productCD  transactionamt 묶어서
# card4
# Transform id_30, id_31¶
# addr2 특정 지역 무적권 사기
# id 30 id31

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Preprocessing, modelling and evaluating
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_score, KFold
from xgboost import XGBClassifier
import xgboost as xgb
import hashlib
import os
import gc

In [3]:
# From https://www.kaggle.com/gemartin/load-data-reduce-memory-usage

def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
#        else:
#            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [4]:
train_transaction = pd.read_csv('train_transaction.csv', index_col='TransactionID')
train_identity = pd.read_csv('train_identity.csv', index_col='TransactionID')
train = train_transaction.merge(train_identity, how='left', left_index=True, right_index=True)
del train_transaction, train_identity
gc.collect()

0

In [5]:
test_transaction = pd.read_csv('test_transaction.csv', index_col='TransactionID')
test_identity = pd.read_csv('test_identity.csv', index_col='TransactionID')
test = test_transaction.merge(test_identity, how='left', left_index=True, right_index=True)
del test_transaction, test_identity
gc.collect()

0

In [6]:
test.columns = test.columns.str.replace('-', '_')

In [7]:
selected_features = ['TransactionDT',
 'TransactionAmt','card1','card2','card3','card5',
 'addr1','addr2',
 'C1','C2','C3','C4','C5','C6','C7','C8','C9','C10','C11','C12','C13','C14',
 'D1','D2','D3','D4','D5','D6','D7','D8','D9','D10','D11','D12','D13','D14','D15',
 
 'V307',
 
 'ProductCD',
 'card4',
 'card6',
 'P_emaildomain',
 'R_emaildomain', 
 'dist1','id_30','id_31', 'id_32', 'id_33', 'id_34',
 'DeviceType','DeviceInfo',]

len(selected_features)

51

In [8]:
train = train[selected_features + ['isFraud']]
test = test[selected_features ]

In [9]:
train = train.fillna(-999)
test = test.fillna(-999)

Card_device 조합에서 V301 적용

In [10]:
train['card1'] = train['card1'].fillna(0)
train['card2'] = train['card2'].fillna(0)
train['card3'] = train['card3'].fillna(0)
train['card5'] = train['card5'].fillna(0)
train['card4'] = train['card4'].fillna('nan')
train['card6'] = train['card6'].fillna('nan')

test['card1'] = test['card1'].fillna(0)
test['card2'] = test['card2'].fillna(0)
test['card3'] = test['card3'].fillna(0)
test['card5'] = test['card5'].fillna(0)
test['card4'] = test['card4'].fillna('nan')
test['card6'] = test['card6'].fillna('nan')

def card_info_hash(x):
    s = (str(int(x['card1']))+
         str(int(x['card2']))+
         str(int(x['card3']))+
         str(x['card4'])+
         str(int(x['card5']))+
         str(x['card6']))
    h = hashlib.sha256(s.encode('utf-8')).hexdigest()[0:15]
    return h

def device_hash(x):
    s =  str(x['id_30'])+str(x['id_31'])+str(x['id_32'])+str(x['id_33'])+str( x['DeviceType'])+ str(x['DeviceInfo'])
    h = hashlib.sha256(s.encode('utf-8')).hexdigest()[0:15]
    return h

train['card_hash'] = train.apply(lambda x: card_info_hash(x), axis=1   )
train['device_hash'] = train.apply(lambda x: device_hash(x), axis=1   )

test['card_hash'] = test.apply(lambda x: card_info_hash(x), axis=1   )
test['device_hash'] = test.apply(lambda x: device_hash(x), axis=1   )

In [11]:
train['V307_diff'] = train['V307'].diff().shift(-1)
train['difference'] = train['V307_diff'] - train['TransactionAmt']

test['v307_diff'] = test['V307'].diff().shift(-1)
test['difference'] = test['v307_diff'] - test['TransactionAmt']

In [12]:
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

Memory usage of dataframe is 256.81 MB
Memory usage after optimization is: 115.45 MB
Decreased by 55.0%
Memory usage of dataframe is 216.48 MB
Memory usage after optimization is: 98.58 MB
Decreased by 54.5%


Extract target variable¶


In [13]:
from sklearn.preprocessing import LabelEncoder

# 범주형 열을 숫자형으로 변환
def encode_categorical_columns(df):
    label_encoders = {}
    for column in df.select_dtypes(include=['object']).columns:
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column].astype(str))
        label_encoders[column] = le
    return df, label_encoders

# train 데이터 인코딩
train, _ = encode_categorical_columns(train)

# test 데이터 인코딩
test_encoded, _ = encode_categorical_columns(test)

In [14]:
train_columns = set(train.columns)
test_columns = set(test_encoded.columns)

column_difference = train_columns - test_columns

print("Columns in train but not in test_encoded:", column_difference)


Columns in train but not in test_encoded: {'isFraud', 'V307_diff'}


In [15]:
train_columns = train.shape[1]
test_columns = test_encoded.shape[1]

print("Number of columns in train:", train_columns)
print("Number of columns in test_encoded:", test_columns)

Number of columns in train: 56
Number of columns in test_encoded: 55


In [16]:
y = train['isFraud']
X = train.drop('isFraud', axis=1)

In [17]:
from sklearn.model_selection import train_test_split
import lightgbm as lgb
# train, valid split
X_tr, X_v, y_tr, y_v = train_test_split(X, y, test_size=0.25,
random_state=2024,
stratify=y)

# 데이터 셋 생성
train_data = lgb.Dataset(X_tr, label=y_tr)
valid_data = lgb.Dataset(X_v, label=y_v)


params = {
'objective': 'binary',
'metric': 'auc'
}

# 모델 학습
callbacks = [lgb.early_stopping(stopping_rounds=100),
lgb.log_evaluation(period=100)]
clf = lgb.train(params, train_data,
num_boost_round = 10000,
valid_sets = [train_data, valid_data],
callbacks=callbacks)

[LightGBM] [Info] Number of positive: 15497, number of negative: 427408
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.057119 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10343
[LightGBM] [Info] Number of data points in the train set: 442905, number of used features: 55
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.034989 -> initscore=-3.317093
[LightGBM] [Info] Start training from score -3.317093
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.940915	valid_1's auc: 0.925415
[200]	training's auc: 0.960158	valid_1's auc: 0.938779
[300]	training's auc: 0.97071	valid_1's auc: 0.945266
[400]	training's auc: 0.978117	valid_1's auc: 0.949553
[500]	training's auc: 0.983636	valid_1's auc: 0.952362
[600]	training's auc: 0.987071	valid_1's auc: 0.954919
[700]	training's auc: 0.989964	valid_1's auc: 0.

In [18]:

# test 제출
preds = clf.predict(test_encoded)
result = pd.read_csv("sample_submission.csv")
result ['isFraud'] = preds
result .to_csv('result.csv', index=False)