In [172]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import gc
import json
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import KFold
from itertools import product


In [173]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            if str(col_type)[:3] == 'int':
                df[col] = pd.to_numeric(df[col], downcast="integer")
            else:
                df[col] = pd.to_numeric(df[col], downcast="float")

                    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: 
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.
              format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [174]:
DIR = "../ieee-fraud-detection"
train_transaction = reduce_mem_usage(pd.read_csv(f'{DIR}/train_transaction.csv', index_col='TransactionID'))
test_transaction = reduce_mem_usage(pd.read_csv(f'{DIR}/test_transaction.csv', index_col='TransactionID'))
train_identity = reduce_mem_usage(pd.read_csv(f'{DIR}/train_identity.csv', index_col='TransactionID'))
test_identity = reduce_mem_usage(pd.read_csv(f'{DIR}/test_identity.csv', index_col='TransactionID'))
sample_submission = reduce_mem_usage(pd.read_csv(f'{DIR}/sample_submission.csv', index_col='TransactionID'))

train = train_transaction.merge(train_identity, how='left', left_index=True, right_index=True)
test = test_transaction.merge(test_identity, how='left', left_index=True, right_index=True)
dataset = pd.concat([train.drop("isFraud", axis=1), test], ignore_index=True, sort=False)
del train_transaction, train_identity
del test_transaction, test_identity
gc.collect()

Mem. usage decreased to 918.55 Mb (48.3% reduction)
Mem. usage decreased to 787.65 Mb (48.2% reduction)
Mem. usage decreased to 32.46 Mb (28.0% reduction)
Mem. usage decreased to 31.94 Mb (28.0% reduction)
Mem. usage decreased to  5.80 Mb (25.0% reduction)


303

# 1. Email feature engineering

In [175]:
class Email_Engineering(BaseEstimator, TransformerMixin):
    """
    对太多的域名进行了降纬分类，处理
    
    credit to ``https://www.kaggle.com/amirhmi/a-comprehensive-guide-to-get-0-9492``
    """
    def __init__(self, file_path, names):
        if not isinstance(names, list):
            self.names = list(names)
        else:
            self.names = names
        self.us_emails = set(['gmail', 'net', 'edu'])
        
        with open(file_path) as f:
            self.emails = json.load(f)
            
    def fit(self, x, y=None):
        return self
    
    def transform(self, x):
        for c in self.names:
            x[c+"_bin"] = x[c].map(self.emails)
            x[c + '_suffix'] = x[c].map(lambda x: str(x).split('.')[-1]) 
            x[c + '_suffix'] = x[c + '_suffix'].map(lambda x: x if str(x) not in us_emails else 'us')
            
        x['is_proton_mail'] = ((x['P_emaildomain'] == 'protonmail.com') | \
                                    (x['R_emaildomain']  == 'protonmail.com')).astype(np.int8)
            
        return x
    
    
class Browser_Engineering(BaseEstimator, TransformerMixin):
    """
    对浏览器进行了处理
    
    credit to ``https://www.kaggle.com/amirhmi/a-comprehensive-guide-to-get-0-9492``
    """
    def __init__(self, file_path, name, verbose=1):
        self.name = name
        self.verbose = verbose
        
        with open(file_path) as f:
            self.latest_browser = set(map(str.strip, f.readlines()))
                    
    def fit(self, x, y=None):
        return self
    
    def transform(self, x):
        nan_mask = x[self.name].isnull()
        x['is_latest_browser'] = x[self.name].fillna("NaN")
        x['is_latest_browser'] = x['is_latest_browser'].map(lambda y: 1 if y in self.latest_browser else 0)
        x['is_latest_browser'] = x['is_latest_browser'].astype(np.int8)
        x.loc[nan_mask, 'is_latest_browser'] = np.nan
        if self.verbose:
            print(f"Summarize: # of 1 = {x['is_latest_browser'].sum()}, # of NaN = {x['is_latest_browser'].isnull().sum()}")
        return x
    
class Std_2var_Engineering(BaseEstimator, TransformerMixin):
    """
    双变量交互（std）
    
    credit to ``https://www.kaggle.com/amirhmi/a-comprehensive-guide-to-get-0-9492``
    """
    def __init__(self, numerical_features, categorical_features, verbose=1):
        self.n_feas = list(numerical_features)
        self.c_feas = list(categorical_features)
        self.verbose = verbose
                    
    def fit(self, x, y=None):
        return self
    
    def transform(self, x):
        for a, b in product(self.n_feas, self.c_feas):
            nan_mask = x[a].isnull() | x[b].isnull()
            name = a+"_to_std_"+b
            x[name] = x[a] / x.groupby([b])[a].transform('std')
            x.loc[nan_mask, name] = np.nan
            if self.verbose:
                print(f"Generate: {name}")
        return x
    
class Mean_2var_Engineering(BaseEstimator, TransformerMixin):
    """
    双变量交互（mean）
    credit to ``https://www.kaggle.com/amirhmi/a-comprehensive-guide-to-get-0-9492``
    """
    def __init__(self, numerical_features, categorical_features, verbose=1):
        self.n_feas = list(numerical_features)
        self.c_feas = list(categorical_features)
        self.verbose = verbose
                    
    def fit(self, x, y=None):
        return self
    
    def transform(self, x): 
        for a, b in product(self.n_feas, self.c_feas):
            nan_mask = x[a].isnull() | x[b].isnull()
            name = a+"_to_mean_"+b
            x[name] = x[a] / x.groupby([b])[a].transform('mean')
            x.loc[nan_mask, name] = np.nan
            if self.verbose:
                print(f"Generate: {name}")
        return x
    
class Add_2var_Engineering(BaseEstimator, TransformerMixin):
    """
    双分类变量交互
    credit to ``https://www.kaggle.com/amirhmi/a-comprehensive-guide-to-get-0-9492``
    """
    def __init__(self, feature_pairs, verbose=1):
        self.pairs = list(feature_pairs)
        self.verbose = verbose
                    
    def fit(self, x, y=None):
        return self
    
    def transform(self, x): 
        for feas in self.pairs:
            name= None
            if len(feas) == 2:
                a, b = feas
                nan_mask = x[a].isnull() | x[b].isnull()
                name = a+"_"+b
                x[name] = x[a].astype(str) + "_" + x[b].astype(str)
            elif len(feas) == 3:
                a, b, c = feas
                nan_mask = x[a].isnull() | x[b].isnull() | x[c].isnull()
                name = a+"_"+b+"_"+c
                x[name] = x[a].astype(str) + "_" + x[b].astype(str) + "_" + x[c].astype(str)
                
            x.loc[nan_mask, name] = np.nan
            if self.verbose:
                print(f"Generate: {name}")
        return x
    
class Count_Engineering(BaseEstimator, TransformerMixin):
    """
    添加分类变量的频率信息
    credit to ``https://www.kaggle.com/cdeotte/200-magical-models-santander-0-920``
    """
    def __init__(self, categorical_features, verbose=1):
        self.names = list(categorical_features)
        self.verbose = verbose
        self.counts = dict()
                    
    def fit(self, x, y=None):
        for c in self.names:
            self.counts[c] = x[c].value_counts(dropna=False)
        return self
    
    def transform(self, x): 
        for c in self.names:
            name= c+"_count"
            nan_mask = x[c].isnull()
            if not (c in self.counts):
                self.counts[c] = x[c].value_counts(dropna=False)
                
            if name in x.columns:
                name += "X"
            x[name] = x[c].map(self.counts[c])
            x.loc[nan_mask, name] = np.nan
            if self.verbose:
                print(f"Generate: {name}")
        return x
    
class Drop_Features(BaseEstimator, TransformerMixin):
    """
    删除一些的特征
    
    credit to ``https://www.kaggle.com/amirhmi/a-comprehensive-guide-to-get-0-9492``
    """
    def __init__(self, percentage, percentage_dup, verbose=1):
        self.perc = percentage
        self.perc_dup = percentage_dup
        self.verbose = verbose
                    
    def fit(self, x, y=None):
        missing_values = x.isnull().sum() / len(x)
        missing_drop_cols = list(missing_values[missing_values > self.perc].keys())
        if "isFraud" in missing_drop_cols:
            missing_drop_cols.remove("isFraud")
        self.dropped_cols = missing_drop_cols            
        duplicate_drop_cols = [col for col in x.columns if x[col].value_counts(dropna=False, normalize=True).values[0] > self.perc_dup]
        if "isFraud" in duplicate_drop_cols:
            duplicate_drop_cols.remove("isFraud")
        self.dropped_cols.extend(duplicate_drop_cols)
        if self.verbose:
            print(f"Summarize: {len(missing_drop_cols)} columns have missing value(%) > {self.perc}")
            print(f"Summarize: {len(duplicate_drop_cols)} columns have duplicate value(%) > {self.perc_dup}")
        
        return self
    
    def transform(self, x):
        return x.drop(self.dropped_cols, axis=1)

In [176]:
E1 = Email_Engineering(f"{DIR}/email.json", ['P_emaildomain', 'R_emaildomain'])
train = E1.transform(train)
test = E1.transform(test)

In [177]:
B1 = Browser_Engineering(f"{DIR}/latest_browsers.txt", "id_31", verbose=1)
train = B1.transform(train)
test = B1.transform(test)

Summarize: # of 1 = 36598.0, # of NaN = 450258
Summarize: # of 1 = 20568.0, # of NaN = 370066


In [178]:
D1 = Drop_Features(percentage=0.8, percentage_dup=0.9,verbose=1)
D1.fit(dataset)
train = D1.transform(train)
test = D1.transform(test)

Summarize: 145 columns have missing value(%) > 0.8
Summarize: 71 columns have duplicate value(%) > 0.9


In [179]:
M1 = Mean_2var_Engineering(numerical_features=["TransactionAmt", "id_02", "D15"],
                          categorical_features=['card1', 'card4', 'addr1'])
train = M1.transform(train)
test = M1.transform(test)

Generate: TransactionAmt_to_mean_card1
Generate: TransactionAmt_to_mean_card4
Generate: TransactionAmt_to_mean_addr1
Generate: id_02_to_mean_card1
Generate: id_02_to_mean_card4
Generate: id_02_to_mean_addr1
Generate: D15_to_mean_card1
Generate: D15_to_mean_card4
Generate: D15_to_mean_addr1
Generate: TransactionAmt_to_mean_card1
Generate: TransactionAmt_to_mean_card4
Generate: TransactionAmt_to_mean_addr1
Generate: id_02_to_mean_card1
Generate: id_02_to_mean_card4
Generate: id_02_to_mean_addr1
Generate: D15_to_mean_card1
Generate: D15_to_mean_card4
Generate: D15_to_mean_addr1


In [180]:
A1 = Add_2var_Engineering(feature_pairs=[('card1', 'card4')])
train = A1.transform(train)
test = A1.transform(test)

Generate: card1_card4
Generate: card1_card4


In [181]:
A2 = Add_2var_Engineering(feature_pairs=[('card1_card4', 'card3', 'card5')])
train = A2.transform(train)
test = A2.transform(test)
A3 = Add_2var_Engineering(feature_pairs=[('card1_card4_card3_card5', 'addr1', 'addr2')])
train = A3.transform(train)
test = A3.transform(test)

Generate: card1_card4_card3_card5
Generate: card1_card4_card3_card5
Generate: card1_card4_card3_card5_addr1_addr2
Generate: card1_card4_card3_card5_addr1_addr2


In [182]:
# 未来数据？？？
train['TransactionAmt_check'] = np.where(train['TransactionAmt'].isin(test['TransactionAmt']), 1, 0)
test['TransactionAmt_check']  = np.where(test['TransactionAmt'].isin(train['TransactionAmt']), 1, 0)

In [183]:
train['TransactionAmt'] = np.log1p(train['TransactionAmt'])
test['TransactionAmt'] = np.log1p(test['TransactionAmt'])  

In [184]:
C1 = Count_Engineering(categorical_features=['id_36'])
C2 = Count_Engineering(categorical_features=['id_01', 'id_31', 'id_35', 'id_36'])
C1.fit(dataset)
train = C1.transform(train)
test = C1.transform(test)
train = C2.transform(train)
test = C2.transform(test)

Generate: id_36_count
Generate: id_36_count
Generate: id_01_count
Generate: id_31_count
Generate: id_35_count
Generate: id_36_countX
Generate: id_01_count
Generate: id_31_count
Generate: id_35_count
Generate: id_36_countX


In [185]:
#看不懂的操作
for col in ['card1']: 
    valid_card = pd.concat([train[[col]], test[[col]]])
    valid_card = valid_card[col].value_counts()
    valid_card = valid_card[valid_card > 2]
    valid_card = list(valid_card.index)

    train[col] = np.where(train[col].isin(test[col]), train[col], np.nan)
    test[col]  = np.where(test[col].isin(train[col]), test[col], np.nan)

    train[col] = np.where(train[col].isin(valid_card), train[col], np.nan)
    test[col]  = np.where(test[col].isin(valid_card), test[col], np.nan)

In [186]:
numerical_columns = list(test.select_dtypes(exclude=['object']).columns)

train[numerical_columns] = train[numerical_columns].fillna(train[numerical_columns].median())
test[numerical_columns] = test[numerical_columns].fillna(train[numerical_columns].median())


In [187]:
# 众数填充
categorical_columns = list(filter(lambda x: x not in numerical_columns, list(test.columns)))
train[categorical_columns] = train[categorical_columns].fillna(train[categorical_columns].mode())
test[categorical_columns] = test[categorical_columns].fillna(train[categorical_columns].mode())

In [188]:
from sklearn.preprocessing import LabelEncoder

# 很有问题啊，这个假设的是有序分类变量，才能这样
for col in categorical_columns:
    le = LabelEncoder()
    le.fit(list(train[col].astype(str).values) + list(test[col].astype(str).values))
    train[col] = le.transform(list(train[col].astype(str).values))
    test[col] = le.transform(list(test[col].astype(str).values))

# 2. 训练模型

In [189]:
labels = train["isFraud"]
train.drop(["isFraud"], axis=1, inplace=True)

X_train, y_train = train, labels
del train, labels
gc.collect()

275

In [190]:
lgb_submission=sample_submission.copy()
lgb_submission['isFraud'] = 0
n_fold = 5
folds = KFold(n_fold)

In [None]:
for fold_n, (train_index, valid_index) in enumerate(folds.split(X_train)):
    print(fold_n)
    
    X_train_, X_valid = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_train_, y_valid = y_train.iloc[train_index], y_train.iloc[valid_index]
    dtrain = lgb.Dataset(X_train, label=y_train)
    dvalid = lgb.Dataset(X_valid, label=y_valid)
    
    lgbclf = lgb.LGBMClassifier(
            num_leaves= 512,
            n_estimators=512,
            max_depth=9,
            learning_rate=0.064,
            subsample=0.85,
            colsample_bytree=0.85,
            boosting_type= "gbdt",
            reg_alpha=0.3,
            reg_lamdba=0.243
    )
    
    X_train_, X_valid = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_train_, y_valid = y_train.iloc[train_index], y_train.iloc[valid_index]
    lgbclf.fit(X_train_,y_train_)
    
    del X_train_,y_train_
    
    print('finish train')
    pred=lgbclf.predict_proba(test)[:,1]
    val=lgbclf.predict_proba(X_valid)[:,1]
    print('finish pred')
    del lgbclf, X_valid
    print('ROC accuracy: {}'.format(roc_auc_score(y_valid, val)))
    del val,y_valid
    lgb_submission['isFraud'] = lgb_submission['isFraud']+ pred/n_fold
    del pred
    gc.collect()

0


In [None]:
lgb_submission.insert(0, "TransactionID", np.arange(3663549, 3663549 + 506691))
lgb_submission.to_csv('prediction.csv', index=False)