In [1]:
import cupy, cudf, cuml

In [2]:
%%time
# LOAD TRAIN
train_data = cudf.read_csv('../input/ieee-fraud-detection/train_transaction.csv')
# LOAD TEST
test_data = cudf.read_csv('../input/ieee-fraud-detection/test_transaction.csv')

CPU times: user 1.5 s, sys: 1.19 s, total: 2.69 s
Wall time: 18.1 s


In [3]:
# Show every column name of the training table, then the class balance of the target.
print(
    train_data.columns.to_list(),
    train_data['isFraud'].value_counts(),
    sep='\n',
)

['TransactionID', 'isFraud', 'TransactionDT', 'TransactionAmt', 'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2', 'dist1', 'dist2', 'P_emaildomain', 'R_emaildomain', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9', 'D10', 'D11', 'D12', 'D13', 'D14', 'D15', 'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V29', 'V30', 'V31', 'V32', 'V33', 'V34', 'V35', 'V36', 'V37', 'V38', 'V39', 'V40', 'V41', 'V42', 'V43', 'V44', 'V45', 'V46', 'V47', 'V48', 'V49', 'V50', 'V51', 'V52', 'V53', 'V54', 'V55', 'V56', 'V57', 'V58', 'V59', 'V60', 'V61', 'V62', 'V63', 'V64', 'V65', 'V66', 'V67', 'V68', 'V69', 'V70', 'V71', 'V72', 'V73', 'V74', 'V75', 'V76', 'V77', 'V78', 'V79', 'V80', 'V81', 'V

Types of columns:

- TransactionID: identifier of the transaction
- TransactionDT: timedelta from a given reference datetime (not an actual timestamp)
- TransactionAmt: transaction payment amount in USD
- ProductCD: product code, the product for each transaction
- card1 - card6: payment card information, such as card type, card category, issue bank, country, etc.
- addr1, addr2: address
- dist1, dist2: distance
- P_emaildomain and R_emaildomain: purchaser and recipient email domains
- C1-C14: counting, such as how many addresses are found to be associated with the payment card, etc. The actual meaning is masked.
- D1-D15: timedelta, such as days between previous transaction, etc.
- M1-M9: match, such as names on card and address, etc.
- Vxxx: Vesta engineered rich features, including ranking, counting, and other entity relations.

Categorical Features:
- ProductCD
- card1 - card6
- addr1, addr2
- P_emaildomain
- R_emaildomain
- M1 - M9



In [4]:
# Keep only the columns in which fewer than 10 % of the training rows are missing.
good_cols = [
    name
    for name in train_data.columns
    if 100 * train_data[name].isna().sum() / len(train_data) < 10
]
print(good_cols)

['TransactionID', 'isFraud', 'TransactionDT', 'TransactionAmt', 'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'D1', 'V95', 'V96', 'V97', 'V98', 'V99', 'V100', 'V101', 'V102', 'V103', 'V104', 'V105', 'V106', 'V107', 'V108', 'V109', 'V110', 'V111', 'V112', 'V113', 'V114', 'V115', 'V116', 'V117', 'V118', 'V119', 'V120', 'V121', 'V122', 'V123', 'V124', 'V125', 'V126', 'V127', 'V128', 'V129', 'V130', 'V131', 'V132', 'V133', 'V134', 'V135', 'V136', 'V137', 'V279', 'V280', 'V281', 'V282', 'V283', 'V284', 'V285', 'V286', 'V287', 'V288', 'V289', 'V290', 'V291', 'V292', 'V293', 'V294', 'V295', 'V296', 'V297', 'V298', 'V299', 'V300', 'V301', 'V302', 'V303', 'V304', 'V305', 'V306', 'V307', 'V308', 'V309', 'V310', 'V311', 'V312', 'V313', 'V314', 'V315', 'V316', 'V317', 'V318', 'V319', 'V320', 'V321']


In [5]:
# NOTE(review): looks like a leftover scratch cell — it only prints an empty list and
# none of the cells visible here depend on it; candidate for deletion.
print([])

[]


In [6]:
!pip install xgboost



In [7]:
# Adversarial validation set-up: stack train and test rows and tag each row with its
# origin (0 = train, 1 = test) so a classifier can later try to tell them apart.
#
# Features used: the low-missingness 'V' columns from `good_cols` plus the amount.
# Variants tried earlier and left disabled: all C/V/D columns, TransactionDT,
# ProductCD, card1-card6, and removing 'C3'.
feature_cols = [name for name in good_cols if name.startswith('V')] + ['TransactionAmt']
print(feature_cols)

adv_cv_data = cudf.concat(
    [train_data[feature_cols], test_data[feature_cols]],
    axis=0,
)
# Train rows come first in the concatenation, so the zeros precede the ones.
adv_cv_data['label'] = cupy.array(
    [0] * len(train_data) + [1] * len(test_data)
)
def freqeuncy_encoder(df, col):
    '''Replace a categorical column with the occurrence count of each of its values.

    Parameters
    ----------
    df : DataFrame
        Frame holding the column; it is modified in place.
    col : str
        Name of the column to encode.

    Returns
    -------
    DataFrame
        The same ``df`` object, with ``df[col]`` replaced by per-value counts.
        Missing values stay missing because ``value_counts`` skips them.
    '''
    # Map through the counts Series itself instead of a Python dict: cuDF rejects
    # ``to_dict()`` (it does not implicitly copy to host memory), while both pandas
    # and cuDF accept a Series as the mapping.
    df[col] = df[col].map(df[col].value_counts())
    return df
    
# if 'ProductCD' in feature_cols:
#     adv_cv_data = freqeuncy_encoder(adv_cv_data, 'ProductCD')

# for col in [f'card{i}' for i in range(1, 7)]:
#     adv_cv_data = freqeuncy_encoder(adv_cv_data, col)
    

# Report the column set, then the row count before and after discarding every row
# that still contains a missing value.
print(adv_cv_data.columns)
print(adv_cv_data.shape)
adv_cv_data = adv_cv_data.dropna()
print(adv_cv_data.shape)

# The raw frames are not referenced by the cells visible after this point, so drop
# them and run the garbage collector. Re-running earlier cells afterwards requires
# reloading the CSV files.
import gc
del train_data, test_data
gc.collect()

['V95', 'V96', 'V97', 'V98', 'V99', 'V100', 'V101', 'V102', 'V103', 'V104', 'V105', 'V106', 'V107', 'V108', 'V109', 'V110', 'V111', 'V112', 'V113', 'V114', 'V115', 'V116', 'V117', 'V118', 'V119', 'V120', 'V121', 'V122', 'V123', 'V124', 'V125', 'V126', 'V127', 'V128', 'V129', 'V130', 'V131', 'V132', 'V133', 'V134', 'V135', 'V136', 'V137', 'V279', 'V280', 'V281', 'V282', 'V283', 'V284', 'V285', 'V286', 'V287', 'V288', 'V289', 'V290', 'V291', 'V292', 'V293', 'V294', 'V295', 'V296', 'V297', 'V298', 'V299', 'V300', 'V301', 'V302', 'V303', 'V304', 'V305', 'V306', 'V307', 'V308', 'V309', 'V310', 'V311', 'V312', 'V313', 'V314', 'V315', 'V316', 'V317', 'V318', 'V319', 'V320', 'V321', 'TransactionAmt']
Index(['V95', 'V96', 'V97', 'V98', 'V99', 'V100', 'V101', 'V102', 'V103',
       'V104', 'V105', 'V106', 'V107', 'V108', 'V109', 'V110', 'V111', 'V112',
       'V113', 'V114', 'V115', 'V116', 'V117', 'V118', 'V119', 'V120', 'V121',
       'V122', 'V123', 'V124', 'V125', 'V126', 'V127', 'V128', 'V1

0

In [8]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [9]:
from cuml.model_selection import train_test_split as train_test_split

In [10]:
%time
import xgboost as xgb
params = {"device": "cuda"}
auc_scores = []
for _ in range(10):
    X_train, X_test, y_train, y_test = train_test_split(
    adv_cv_data.drop(['label'], axis=1).astype('float64'),
    adv_cv_data['label'].astype('int'),
    train_size=0.8)
    model = xgb.XGBClassifier(**params)
    model.fit(X_train, y_train)
    from cuml.metrics import roc_auc_score
    auc_score = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    auc_scores.append(auc_score)


CPU times: user 4 µs, sys: 2 µs, total: 6 µs
Wall time: 9.78 µs


  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()


In [11]:
# One hold-out ROC AUC per repeated split of the adversarial-validation loop
# (an AUC of 0.5 would mean train and test rows cannot be told apart).
print(auc_scores)

[0.5901880264282227, 0.5923685431480408, 0.5916910767555237, 0.590609073638916, 0.5910504460334778, 0.591681182384491, 0.5917620062828064, 0.5901205539703369, 0.5922631025314331, 0.5903685688972473]


In [25]:
# Rank the features of the adversarial classifier by importance.
# `model` and `X_train` are the objects left behind by the last training repeat.
importances = model.feature_importances_

# (feature name, importance) pairs ordered from most to least important
feature_importances = sorted(
    zip(X_train.columns, importances),
    key=lambda pair: pair[1],
    reverse=True,
)

# Tabulate the ranking and render it
print("Most Important Features:")
importance_df = cudf.DataFrame(
    {
        "feature": [name for name, _score in feature_importances],
        "importance": [score for _name, score in feature_importances],
    }
)
importance_df = importance_df.sort_values("importance", ascending=False)
display(importance_df)

Most Important Features:


Unnamed: 0,feature,importance
0,V295,0.046352
1,V133,0.034945
2,V105,0.034891
3,V302,0.032949
4,V294,0.032942
...,...,...
82,V111,0.003762
83,V108,0.003045
84,V120,0.002545
85,V107,0.000000
