In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Preprocessing, modelling and evaluating
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_score, KFold
from xgboost import XGBClassifier
import xgboost as xgb
import hashlib
import os
import gc

In [2]:

def reduce_mem_usage(df, use_float16=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that fits.

    The frame is modified in place and also returned, so both
    ``reduce_mem_usage(df)`` and ``df = reduce_mem_usage(df)`` work.

    Only columns backed by plain NumPy signed-integer, unsigned-integer or
    floating-point dtypes are touched.  Object, boolean, datetime, categorical
    and other extension-typed columns are left exactly as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns should be downcast.
    use_float16 : bool, default True
        When True, float columns whose min/max fit the float16 range are stored
        as float16.  float16 keeps only ~3 significant decimal digits, so pass
        False to stop at float32 when precision matters.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.

    Notes
    -----
    * Range checks are inclusive, so a column whose max is exactly 127 is
      stored as int8.
    * A column is never widened: the new dtype is applied only if it is
      strictly smaller than the current one.
    * Float downcasting is chosen from the value range only; it does not
      preserve precision.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = ((np.float16,) if use_float16 else ()) + (np.float32, np.float64)

    for col in df.columns:
        series = df[col]
        col_type = series.dtype

        # Skip everything that is not a plain NumPy int/uint/float column
        # (object, bool, datetime, category, nullable extension dtypes, ...).
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue
        # min()/max() of an empty column are NaN; there is nothing to measure.
        if len(series) == 0:
            continue

        c_min = series.min()
        c_max = series.max()

        if col_type.kind == 'f':
            if np.isnan(c_min):
                # Column holds only NaN, which every float width can represent.
                target = float_types[0]
            else:
                # Columns containing +/-inf match no candidate and stay as-is.
                target = next(
                    (t for t in float_types
                     if np.finfo(t).min <= c_min and c_max <= np.finfo(t).max),
                    None,
                )
        else:
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            # Compare as Python ints to avoid mixed signed/unsigned NumPy
            # scalar comparisons.
            lo, hi = int(c_min), int(c_max)
            target = next(
                (t for t in candidates
                 if np.iinfo(t).min <= lo and hi <= np.iinfo(t).max),
                None,
            )

        # Only ever shrink a column; never widen it.
        if target is not None and np.dtype(target).itemsize < col_type.itemsize:
            df[col] = series.astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df

In [3]:
# Load the two training tables, both keyed by TransactionID.
train_transaction = pd.read_csv('train_transaction.csv', index_col='TransactionID')
train_identity = pd.read_csv('train_identity.csv', index_col='TransactionID')

# Left join on the index: every transaction row is kept, and identity columns
# are filled in only where a matching TransactionID exists.
train = pd.merge(
    train_transaction,
    train_identity,
    how='left',
    left_index=True,
    right_index=True,
)

# Drop the source frames and ask the garbage collector to reclaim them now.
del train_transaction
del train_identity
gc.collect()

0

In [4]:
# Load the two test tables, both keyed by TransactionID.
test_transaction = pd.read_csv('test_transaction.csv', index_col='TransactionID')
test_identity = pd.read_csv('test_identity.csv', index_col='TransactionID')

# Left join on the index: every transaction row is kept, and identity columns
# are filled in only where a matching TransactionID exists.
test = pd.merge(
    test_transaction,
    test_identity,
    how='left',
    left_index=True,
    right_index=True,
)

# Drop the source frames and ask the garbage collector to reclaim them now.
del test_transaction
del test_identity
gc.collect()

0

In [5]:
# Replace every hyphen in the test column names with an underscore.
# NOTE(review): presumably this aligns test column names with train's — confirm.
test.columns = test.columns.map(lambda name: name.replace('-', '_'))

In [6]:
# Build a composite 'card' key by concatenating the string form of card1..card6.
# NOTE(review): the parts are joined with no separator, so different
# (card1, card2, ...) combinations could in principle produce the same string;
# missing values appear as the literal text produced by astype(str).
card_key = train['card1'].astype(str)
for part in ('card2', 'card3', 'card4', 'card5', 'card6'):
    card_key = card_key + train[part].astype(str)
train['card'] = card_key

In [7]:
# Display the composite card key built from card1..card6.
train.loc[:, 'card']

TransactionID
2987000        13926nan150.0discover142.0credit
2987001     2755404.0150.0mastercard102.0credit
2987002            4663490.0150.0visa166.0debit
2987003     18132567.0150.0mastercard117.0debit
2987004     4497514.0150.0mastercard102.0credit
                           ...                 
3577535              6550nan150.0visa226.0debit
3577536     10444225.0150.0mastercard224.0debit
3577537     12037595.0150.0mastercard224.0debit
3577538      7826481.0150.0mastercard224.0debit
3577539    15066170.0150.0mastercard102.0credit
Name: card, Length: 590540, dtype: object

In [13]:
# Display the product code, composite card key, amount and V307 side by side.
train.loc[:, ['ProductCD', 'card', 'TransactionAmt', 'V307']]

Unnamed: 0_level_0,ProductCD,card,TransactionAmt,V307
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2987000,W,13926nan150.0discover142.0credit,68.50,117.000000
2987001,W,2755404.0150.0mastercard102.0credit,29.00,0.000000
2987002,W,4663490.0150.0visa166.0debit,59.00,0.000000
2987003,W,18132567.0150.0mastercard117.0debit,50.00,1758.000000
2987004,H,4497514.0150.0mastercard102.0credit,50.00,0.000000
...,...,...,...,...
3577535,W,6550nan150.0visa226.0debit,49.00,47.950001
3577536,W,10444225.0150.0mastercard224.0debit,39.50,0.000000
3577537,W,12037595.0150.0mastercard224.0debit,30.95,0.000000
3577538,W,7826481.0150.0mastercard224.0debit,117.00,2903.500000


In [37]:
# Keep only the columns needed for this inspection.
subset_df = train.loc[:, ['ProductCD', 'card', 'TransactionAmt', 'V307']]

# Order rows by the 'card' key (a string column, so the order is lexicographic).
sorted_df = subset_df.sort_values('card')

# Group by 'ProductCD' and show the first 80 rows of each group.
grouped_df = sorted_df.groupby(by='ProductCD')
grouped_df.head(n=80)

Unnamed: 0_level_0,ProductCD,card,TransactionAmt,V307
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3169988,W,10000111.0150.0mastercard117.0debit,29.000,0.000000
3328484,C,10003555.0128.0visa226.0debit,39.394,42.777401
3337343,C,10003555.0128.0visa226.0debit,10.755,82.171204
3337365,C,10003555.0128.0visa226.0debit,19.093,92.926003
3337822,C,10003555.0128.0visa226.0debit,19.093,112.018700
...,...,...,...,...
3390972,C,10054289.0185.0visa226.0credit,36.614,0.000000
3259505,S,10057225.0150.0mastercard224.0debit,15.000,0.000000
3464372,S,10057225.0150.0mastercard224.0debit,50.000,0.000000
3084417,S,10057225.0150.0mastercard224.0debit,25.000,0.000000


In [39]:
# Display the V307 column on its own.
train.loc[:, 'V307']

TransactionID
2987000     117.000000
2987001       0.000000
2987002       0.000000
2987003    1758.000000
2987004       0.000000
              ...     
3577535      47.950001
3577536       0.000000
3577537       0.000000
3577538    2903.500000
3577539     279.950012
Name: V307, Length: 590540, dtype: float64