In [105]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import gc
import json
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion
from itertools import product

In [2]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)  
                    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: 
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.
              format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [3]:
DIR = "../ieee-fraud-detection"
train_transaction = reduce_mem_usage(pd.read_csv(f'{DIR}/train_transaction.csv', index_col='TransactionID'))
test_transaction = reduce_mem_usage(pd.read_csv(f'{DIR}/test_transaction.csv', index_col='TransactionID'))
train_identity = reduce_mem_usage(pd.read_csv(f'{DIR}/train_identity.csv', index_col='TransactionID'))
test_identity = reduce_mem_usage(pd.read_csv(f'{DIR}/test_identity.csv', index_col='TransactionID'))
sample_submission = reduce_mem_usage(pd.read_csv(f'{DIR}/sample_submission.csv', index_col='TransactionID'))

train = train_transaction.merge(train_identity, how='left', left_index=True, right_index=True)
test = test_transaction.merge(test_identity, how='left', left_index=True, right_index=True)
dataset = pd.concat([train.drop("isFraud", axis=1), test], ignore_index=True, sort=False)
del train_transaction, train_identity
del test_transaction, test_identity
gc.collect()

Mem. usage decreased to 544.60 Mb (69.3% reduction)
Mem. usage decreased to 474.52 Mb (68.8% reduction)
Mem. usage decreased to 26.41 Mb (41.5% reduction)
Mem. usage decreased to 25.98 Mb (41.5% reduction)
Mem. usage decreased to  4.83 Mb (37.5% reduction)


42

# 1. Email feature engineering

In [135]:
class Email_Engineering(BaseEstimator, TransformerMixin):
    """
    对太多的域名进行了降纬分类，处理
    
    credit to ``https://www.kaggle.com/amirhmi/a-comprehensive-guide-to-get-0-9492``
    """
    def __init__(self, file_path, names):
        if not isinstance(names, list):
            self.names = list(names)
        else:
            self.names = names
        self.us_emails = set(['gmail', 'net', 'edu'])
        
        with open(file_path) as f:
            self.emails = json.load(f)
            
    def fit(self, x, y=None):
        return self
    
    def transform(self, x):
        for c in self.names:
            x[c+"_bin"] = x[c].map(self.emails)
            x[c + '_suffix'] = x[c].map(lambda x: str(x).split('.')[-1]) 
            x[c + '_suffix'] = x[c + '_suffix'].map(lambda x: x if str(x) not in us_emails else 'us')
            
        x['is_proton_mail'] = ((x['P_emaildomain'] == 'protonmail.com') | \
                                    (x['R_emaildomain']  == 'protonmail.com')).astype(np.int8)
            
        return x
    
    
class Browser_Engineering(BaseEstimator, TransformerMixin):
    """
    对浏览器进行了处理
    
    credit to ``https://www.kaggle.com/amirhmi/a-comprehensive-guide-to-get-0-9492``
    """
    def __init__(self, file_path, name, verbose=1):
        self.name = name
        self.verbose = verbose
        
        with open(file_path) as f:
            self.latest_browser = set(map(str.strip, f.readlines()))
                    
    def fit(self, x, y=None):
        return self
    
    def transform(self, x):
        nan_mask = x[self.name].isnull()
        x['is_latest_browser'] = x[self.name].fillna("NaN")
        x['is_latest_browser'] = x['is_latest_browser'].map(lambda y: 1 if y in self.latest_browser else 0)
        x['is_latest_browser'] = x['is_latest_browser'].astype(np.int8)
        x.loc[nan_mask, 'is_latest_browser'] = np.nan
        if self.verbose:
            print(f"Summarize: # of 1 = {x['is_latest_browser'].sum()}, # of NaN = {x['is_latest_browser'].isnull().sum()}")
        return x
    
class Std_2var_Engineering(BaseEstimator, TransformerMixin):
    """
    双变量交互（std）
    
    credit to ``https://www.kaggle.com/amirhmi/a-comprehensive-guide-to-get-0-9492``
    """
    def __init__(self, numerical_features, categorical_features, verbose=1):
        self.n_feas = list(numerical_features)
        self.c_feas = list(categorical_features)
        self.verbose = verbose
                    
    def fit(self, x, y=None):
        return self
    
    def transform(self, x):
        for a, b in product(self.n_feas, self.c_feas):
            nan_mask = x[a].isnull() | x[b].isnull()
            name = a+"_to_std_"+b
            x[name] = x[a] / x.groupby([b])[a].transform('std')
            x.loc[nan_mask, name] = np.nan
            if self.verbose:
                print(f"Generate: {name}")
        return x
    
class Mean_2var_Engineering(BaseEstimator, TransformerMixin):
    """
    双变量交互（mean）
    credit to ``https://www.kaggle.com/amirhmi/a-comprehensive-guide-to-get-0-9492``
    """
    def __init__(self, numerical_features, categorical_features, verbose=1):
        self.n_feas = list(numerical_features)
        self.c_feas = list(categorical_features)
        self.verbose = verbose
                    
    def fit(self, x, y=None):
        return self
    
    def transform(self, x): 
        for a, b in product(self.n_feas, self.c_feas):
            nan_mask = x[a].isnull() | x[b].isnull()
            name = a+"_to_mean_"+b
            x[name] = x[a] / x.groupby([b])[a].transform('mean')
            x.loc[nan_mask, name] = np.nan
            if self.verbose:
                print(f"Generate: {name}")
        return x
    
class Add_2var_Engineering(BaseEstimator, TransformerMixin):
    """
    双分类变量交互
    credit to ``https://www.kaggle.com/amirhmi/a-comprehensive-guide-to-get-0-9492``
    """
    def __init__(self, feature_pairs, verbose=1):
        self.pairs = list(feature_pairs)
        self.verbose = verbose
                    
    def fit(self, x, y=None):
        return self
    
    def transform(self, x): 
        for feas in self.pairs:
            name= None
            if len(feas) == 2:
                a, b = feas
                nan_mask = x[a].isnull() | x[b].isnull()
                name = a+"_"+b
                x[name] = x[a].astype(str) + "_" + x[b].astype(str)
            elif len(feas) == 3:
                a, b, c = feas
                nan_mask = x[a].isnull() | x[b].isnull() | x[c].isnull()
                name = a+"_"+b+"_"+c
                x[name] = x[a].astype(str) + "_" + x[b].astype(str) + "_" + x[c].astype(str)
                
            x.loc[nan_mask, name] = np.nan
            if self.verbose:
                print(f"Generate: {name}")
        return x
    
class Count_Engineering(BaseEstimator, TransformerMixin):
    """
    添加分类变量的频率信息
    credit to ``https://www.kaggle.com/cdeotte/200-magical-models-santander-0-920``
    """
    def __init__(self, categorical_features, verbose=1):
        self.names = list(categorical_features)
        self.verbose = verbose
        self.counts = dict()
                    
    def fit(self, x, y=None):
        for c in self.names:
            self.counts[c] = x[c].value_counts(dropna=False)
        return self
    
    def transform(self, x): 
        for c in self.names:
            name= c+"_count"
            nan_mask = x[c].isnull()
            if not (c in self.counts):
                self.counts[c] = x[c].value_counts(dropna=False)
                
            if name in x.columns:
                name += "X"
            x[name] = x[c].map(self.counts[c])
            x.loc[nan_mask, name] = np.nan
            if self.verbose:
                print(f"Generate: {name}")
        return x
    
class Drop_Features(BaseEstimator, TransformerMixin):
    """
    删除一些的特征
    
    credit to ``https://www.kaggle.com/amirhmi/a-comprehensive-guide-to-get-0-9492``
    """
    def __init__(self, percentage, percentage_dup, verbose=1):
        self.perc = percentage
        self.perc_dup = percentage_dup
        self.verbose = verbose
                    
    def fit(self, x, y=None):
        missing_values = x.isnull().sum() / len(x)
        missing_drop_cols = list(missing_values[missing_values > self.perc].keys())
        if "isFraud" in missing_drop_cols:
            missing_drop_cols.remove("isFraud")
        self.dropped_cols = missing_drop_cols            
        duplicate_drop_cols = [col for col in x.columns if x[col].value_counts(dropna=False, normalize=True).values[0] > self.perc_dup]
        if "isFraud" in duplicate_drop_cols:
            duplicate_drop_cols.remove("isFraud")
        self.dropped_cols.extend(duplicate_drop_cols)
        if self.verbose:
            print(f"Summarize: {len(missing_drop_cols)} columns have missing value(%) > {self.perc}")
            print(f"Summarize: {len(duplicate_drop_cols)} columns have duplicate value(%) > {self.perc_dup}")
        
        return self
    
    def transform(self, x):
        return x.drop(self.dropped_cols, axis=1)

In [94]:
E1 = Email_Engineering(f"{DIR}/email.json", ['P_emaildomain', 'R_emaildomain'])
train = E1.transform(train)
test = E1.transform(test)

In [95]:
B1 = Browser_Engineering(f"{DIR}/latest_browsers.txt", "id_31", verbose=1)
train = B1.transform(train)
test = B1.transform(test)

Summarize: # of 1 = 36598.0, # of NaN = 450258
Summarize: # of 1 = 20568.0, # of NaN = 370066


In [None]:
D1 = Drop_Features(percentage=0.8, percentage_dup=0.9,verbose=1)
D1.fit(dataset)
train = D1.transform(train)
test = D1.transform(test)

In [113]:
M1 = Mean_2var_Engineering(numerical_features=["TransactionAmt", "id_02", "D15"],
                          categorical_features=['card1', 'card4', 'addr1'])
train = M1.transform(train)
test = M1.transform(test)

In [122]:
A1 = Add_2var_Engineering(feature_pairs=[('card1', 'card4')])
train = A1.transform(train)
test = A1.transform(test)

Generate: card1_card4
Generate: card1_card4


In [125]:
A2 = Add_2var_Engineering(feature_pairs=[('card1_card4', 'card3', 'card5')])
train = A2.transform(train)
test = A2.transform(test)
A3 = Add_2var_Engineering(feature_pairs=[('card1_card4_card3_card5', 'addr1', 'addr2')])
train = A3.transform(train)
test = A3.transform(test)

Generate: card1_card4_card3_card5
Generate: card1_card4_card3_card5
Generate: card1_card4_card3_card5_addr1_addr2
Generate: card1_card4_card3_card5_addr1_addr2


In [129]:
# 未来数据？？？
train['TransactionAmt_check'] = np.where(train['TransactionAmt'].isin(test['TransactionAmt']), 1, 0)
test['TransactionAmt_check']  = np.where(test['TransactionAmt'].isin(train['TransactionAmt']), 1, 0)

In [130]:
train['TransactionAmt'] = np.log1p(train['TransactionAmt'])
test['TransactionAmt'] = np.log1p(test['TransactionAmt'])  

In [136]:
C1 = Count_Engineering(categorical_features=['id_36'])
C2 = Count_Engineering(categorical_features=['id_01', 'id_31', 'id_35', 'id_36'])
C1.fit(dataset)
train = C1.transform(train)
test = C1.transform(test)
train = C2.transform(train)
test = C2.transform(test)

Generate: id_36_count
Generate: id_36_count
Generate: id_01_count
Generate: id_31_count
Generate: id_35_count
Generate: id_36_countX
Generate: id_01_count
Generate: id_31_count
Generate: id_35_count
Generate: id_36_countX


In [137]:
#看不懂的操作
for col in ['card1']: 
    valid_card = pd.concat([train[[col]], test[[col]]])
    valid_card = valid_card[col].value_counts()
    valid_card = valid_card[valid_card > 2]
    valid_card = list(valid_card.index)

    train[col] = np.where(train[col].isin(test[col]), train[col], np.nan)
    test[col]  = np.where(test[col].isin(train[col]), test[col], np.nan)

    train[col] = np.where(train[col].isin(valid_card), train[col], np.nan)
    test[col]  = np.where(test[col].isin(valid_card), test[col], np.nan)

In [139]:
numerical_columns = list(test.select_dtypes(exclude=['object']).columns)

train[numerical_columns] = train[numerical_columns].fillna(train[numerical_columns].median())
test[numerical_columns] = test[numerical_columns].fillna(train[numerical_columns].median())


KeyError: "['C3' 'V14' 'V27' 'V28' 'V65' 'V68' 'V88' 'V98' 'V101' 'V102' 'V103'\n 'V104' 'V105' 'V106' 'V107' 'V108' 'V109' 'V110' 'V111' 'V112' 'V113'\n 'V114' 'V115' 'V116' 'V117' 'V118' 'V119' 'V120' 'V121' 'V122' 'V123'\n 'V124' 'V125' 'V129' 'V132' 'V133' 'V134' 'V135' 'V136' 'V137' 'V281'\n 'V284' 'V286' 'V290' 'V293' 'V295' 'V297' 'V298' 'V299' 'V300' 'V301'\n 'V305' 'V309' 'V311' 'V316' 'V318' 'V319' 'V320' 'V321'] not in index"

In [None]:
categorical_columns = list(filter(lambda x: x not in numerical_columns, list(test.columns)))