## 导入所需的包

In [51]:
from autox import AutoX
from autox.file_io import read_data_from_path
from autox.process_data import Feature_type_recognition
from autox.util import log, reduce_mem_usage
from autox.feature_engineer import FeatureStat
from autox.feature_engineer import FeatureCount
from autox.models.regressor import CrossLgbRegression
from autox.ensemble.stacking import StackingRegressor
from autox.models.classifier import CrossLgbBiClassifier, CrossXgbBiClassifier, CrossTabnetBiClassifier
import json
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from lightgbm import LGBMRegressor
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

## 初始化AutoX类

In [3]:
relations = [
    {
            "related_to_main_table": "true", # 是否为和主表的关系
            "left_entity": "train_transaction.csv",  # 左表名字
            "left_on": ["TransactionID"],  # 左表拼表键
            "right_entity": "train_identity.csv",  # 右表名字
            "right_on": ["TransactionID"], # 右表拼表键
            "type": "1-1" # 左表与右表的连接关系
        },
    {
            "related_to_main_table": "true", # 是否为和主表的关系
            "left_entity": "test_transaction.csv",  # 左表名字
            "left_on": ["TransactionID"],  # 左表拼表键
            "right_entity": "test_identity.csv",  # 右表名字
            "right_on": ["TransactionID"], # 右表拼表键
            "type": "1-1" # 左表与右表的连接关系
        }
]

In [4]:
# 选择数据集
data_name = 'kaggle_ieee'
path = f'./data/{data_name}'

autox = AutoX(target = 'isFraud', train_name = 'train_transaction.csv', test_name = 'test_transaction.csv', 
               id = ['TransactionID'], path = path, relations = relations)

   INFO ->  [+] read test_identity.csv
   INFO ->  Memory usage of dataframe is 44.39 MB
   INFO ->  Memory usage after optimization is: 9.84 MB
   INFO ->  Decreased by 77.8%
   INFO ->  table = test_identity.csv, shape = (141907, 41)
   INFO ->  [+] read sample_submission.csv
   INFO ->  Memory usage of dataframe is 7.73 MB
   INFO ->  Memory usage after optimization is: 2.90 MB
   INFO ->  Decreased by 62.5%
   INFO ->  table = sample_submission.csv, shape = (506691, 2)
   INFO ->  [+] read train_identity.csv
   INFO ->  Memory usage of dataframe is 45.12 MB
   INFO ->  Memory usage after optimization is: 10.00 MB
   INFO ->  Decreased by 77.8%
   INFO ->  table = train_identity.csv, shape = (144233, 41)
   INFO ->  [+] read test_transaction.csv
   INFO ->  Memory usage of dataframe is 1519.24 MB
   INFO ->  Memory usage after optimization is: 425.24 MB
   INFO ->  Decreased by 72.0%
   INFO ->  table = test_transaction.csv, shape = (506691, 393)
   INFO ->  [+] read train_transacti

## 特征工程

In [12]:
df = autox.dfs_['train_test']
feature_type = autox.info_['feature_type']['train_test']
id_ = autox.info_['id']
target = autox.info_['target']

### target_encoding

In [13]:
# from autox.feature_engineer import FeatureTargetEncoding

In [14]:
# featureTE = FeatureTargetEncoding()
# featureTE.fit(df, 'orders_3h_15h', df_feature_type = feature_type, silence_cols = id_, select_all=False)

In [15]:
# log(featureTE.get_ops())

In [16]:
# 手动修改配置
# featureCount.set_keys([[],[]])

In [17]:
# FE_te = featureTE.transform(df)

In [18]:
# FE_te

### rank特征

In [19]:
from autox.feature_engineer import FeatureRank

In [20]:
featureRank = FeatureRank()
featureRank.fit(df, df_feature_type = feature_type, select_all = False)

In [21]:
cnt = 0
for key_ in featureRank.get_ops().keys():
    cnt += len(featureRank.get_ops()[key_])
cnt

4260

In [22]:
# FE_rank = featureRank.transform(df)

In [23]:
# FE_rank.head()

### 统计特征

In [24]:
featureStat = FeatureStat()
featureStat.fit(df, df_feature_type = feature_type, silence_group_cols = id_ + [target], 
                silence_agg_cols = id_ + [target], select_all = False)


In [25]:
featureStat.get_ops().keys()

dict_keys(['ProductCD', 'P_emaildomain', 'R_emaildomain', 'id_30', 'id_31', 'id_33', 'DeviceInfo', 'id-30', 'id-31', 'id-33'])

In [26]:
cnt = 0
for key_ in featureStat.get_ops().keys():
    cnt += len(featureStat.get_ops()[key_])
cnt

4690

In [27]:
# 手动修改配置
# featureGroupby.set_keys()

In [28]:
# FE_stat = featureStat.transform(df)

In [29]:
# FE_stat.head()

### count特征

In [30]:
featureCount = FeatureCount()
featureCount.fit(df, degree=2, df_feature_type = feature_type, silence_cols = id_ + [target], select_all=False)

In [31]:
log(featureCount.get_ops())

   INFO ->  [['ProductCD', 'card4'], ['ProductCD', 'card6'], ['ProductCD', 'P_emaildomain'], ['ProductCD', 'R_emaildomain'], ['ProductCD', 'M1'], ['ProductCD', 'M2'], ['ProductCD', 'M3'], ['ProductCD', 'M4'], ['ProductCD', 'M5'], ['ProductCD', 'M6'], ['ProductCD', 'M7'], ['ProductCD', 'M8'], ['ProductCD', 'M9'], ['ProductCD', 'id_12'], ['ProductCD', 'id_15'], ['ProductCD', 'id_16'], ['ProductCD', 'id_23'], ['ProductCD', 'id_27'], ['ProductCD', 'id_28'], ['ProductCD', 'id_29'], ['ProductCD', 'id_30'], ['ProductCD', 'id_31'], ['ProductCD', 'id_33'], ['ProductCD', 'id_34'], ['ProductCD', 'id_35'], ['ProductCD', 'id_36'], ['ProductCD', 'id_37'], ['ProductCD', 'id_38'], ['ProductCD', 'DeviceType'], ['ProductCD', 'DeviceInfo'], ['ProductCD', 'id-12'], ['ProductCD', 'id-15'], ['ProductCD', 'id-16'], ['ProductCD', 'id-23'], ['ProductCD', 'id-27'], ['ProductCD', 'id-28'], ['ProductCD', 'id-29'], ['ProductCD', 'id-30'], ['ProductCD', 'id-31'], ['ProductCD', 'id-33'], ['ProductCD', 'id-34'], ['Pr

In [32]:
len(featureCount.get_ops())

2116

In [33]:
# 手动修改配置
# featureCount.set_keys([[],[]])

In [34]:
# FE_count = featureCount.transform(df)

In [35]:
# FE_count.head()

## 特征合并

In [36]:
from autox.process_data import feature_combination

In [37]:
df_list = [df]
# df_list = [df, FE_count, FE_stat, FE_rank]

FE_all = feature_combination(df_list)

In [38]:
FE_all.shape

(1097231, 472)

## train和test数据切分

In [40]:
from autox.process_data import train_test_divide

In [41]:
train_length = autox.info_['shape_of_train']
train, test = train_test_divide(FE_all, train_length)

In [42]:
train.shape, test.shape

((590540, 472), (506691, 472))

## 特征过滤

In [43]:
from autox.process_data import feature_filter

In [44]:
used_features = feature_filter(train, test, id_, target)

100%|██████████| 426/426 [06:26<00:00,  1.10it/s]
   INFO ->  filtered features: ['TransactionID', 'isFraud', 'TransactionDT', 'id_01', 'id_02', 'id_03', 'id_04', 'id_05', 'id_06', 'id_07', 'id_08', 'id_09', 'id_10', 'id_11', 'id_13', 'id_14', 'id_17', 'id_18', 'id_19', 'id_20', 'id_21', 'id_22', 'id_24', 'id_25', 'id_26', 'id_32']


## 查看最终使用的特征

In [45]:
len(used_features)

400

In [46]:
used_features

['TransactionAmt',
 'card1',
 'card2',
 'card3',
 'card5',
 'addr1',
 'addr2',
 'dist1',
 'dist2',
 'C1',
 'C2',
 'C3',
 'C4',
 'C5',
 'C6',
 'C7',
 'C8',
 'C9',
 'C10',
 'C11',
 'C12',
 'C13',
 'C14',
 'D1',
 'D2',
 'D3',
 'D4',
 'D5',
 'D6',
 'D7',
 'D8',
 'D9',
 'D10',
 'D11',
 'D12',
 'D13',
 'D14',
 'D15',
 'V1',
 'V2',
 'V3',
 'V4',
 'V5',
 'V6',
 'V7',
 'V8',
 'V9',
 'V10',
 'V11',
 'V12',
 'V13',
 'V14',
 'V15',
 'V16',
 'V17',
 'V18',
 'V19',
 'V20',
 'V21',
 'V22',
 'V23',
 'V24',
 'V25',
 'V26',
 'V27',
 'V28',
 'V29',
 'V30',
 'V31',
 'V32',
 'V33',
 'V34',
 'V35',
 'V36',
 'V37',
 'V38',
 'V39',
 'V40',
 'V41',
 'V42',
 'V43',
 'V44',
 'V45',
 'V46',
 'V47',
 'V48',
 'V49',
 'V50',
 'V51',
 'V52',
 'V53',
 'V54',
 'V55',
 'V56',
 'V57',
 'V58',
 'V59',
 'V60',
 'V61',
 'V62',
 'V63',
 'V64',
 'V65',
 'V66',
 'V67',
 'V68',
 'V69',
 'V70',
 'V71',
 'V72',
 'V73',
 'V74',
 'V75',
 'V76',
 'V77',
 'V78',
 'V79',
 'V80',
 'V81',
 'V82',
 'V83',
 'V84',
 'V85',
 'V86',
 'V87',


## 模型训练

In [52]:
model_lgb = CrossLgbBiClassifier()
model_lgb.fit(train[used_features], train[target], tuning=False)

   INFO ->  (590540, 400)


Training on fold 1
Training until validation scores don't improve for 150 rounds
[100]	training's auc: 0.870914	valid_1's auc: 0.858531
[200]	training's auc: 0.885236	valid_1's auc: 0.869557
[300]	training's auc: 0.893103	valid_1's auc: 0.877265
[400]	training's auc: 0.898723	valid_1's auc: 0.882655
[500]	training's auc: 0.902938	valid_1's auc: 0.886648
[600]	training's auc: 0.906642	valid_1's auc: 0.890071
[700]	training's auc: 0.909995	valid_1's auc: 0.893169
[800]	training's auc: 0.912443	valid_1's auc: 0.89551
[900]	training's auc: 0.914933	valid_1's auc: 0.897679
[1000]	training's auc: 0.917149	valid_1's auc: 0.899653
[1100]	training's auc: 0.9194	valid_1's auc: 0.901604
[1200]	training's auc: 0.921115	valid_1's auc: 0.902974
[1300]	training's auc: 0.923097	valid_1's auc: 0.904674
[1400]	training's auc: 0.924758	valid_1's auc: 0.906087
[1500]	training's auc: 0.926161	valid_1's auc: 0.907344
[1600]	training's auc: 0.927565	valid_1's auc: 0.908556
[1700]	training's auc: 0.929065	val

## 查看模型特征重要性

In [53]:
fimp = model_lgb.feature_importances_

In [54]:
fimp

Unnamed: 0,feature,fold_1,fold_2,fold_3,fold_4,fold_5,average
0,card2,6121,5839,5580,5690,5822,5810.4
1,card1,4818,5083,4700,4615,4663,4775.8
2,addr1,4524,4697,4824,4639,4522,4641.2
3,TransactionAmt,4139,4327,4293,4135,4140,4206.8
4,C13,2851,2868,2943,2890,2772,2864.8
...,...,...,...,...,...,...,...
395,V241,0,0,0,0,0,0.0
396,V305,0,0,0,0,0,0.0
397,V27,0,0,0,0,0,0.0
398,V89,0,0,0,0,0,0.0


## 模型预测

In [55]:
predict_lgb = model_lgb.predict(test[used_features])

In [56]:
predict = predict_lgb

## 预测结果后处理

In [57]:
from autox.process_data import clip_label

In [58]:
min_ = autox.info_['min_target']
max_ = autox.info_['max_target']
predict = clip_label(predict, min_, max_)

In [59]:
min_, max_

(0, 1)

## 获取sub

In [60]:
sub = test[id_]
sub[target] = predict
sub.index = range(len(sub))

In [61]:
sub.head()

Unnamed: 0,TransactionID,isFraud
0,3663549,0.004643
1,3663550,0.003027
2,3663551,0.004011
3,3663552,0.00111
4,3663553,0.003344


In [62]:
sub.shape

(506691, 2)

In [63]:
sub[target].max(), sub[target].min()

(0.9999280926655512, 5.87678498901363e-06)

## 输出结果

In [64]:
tag = "ieee_lgb"

In [65]:
path = f'./sub/sub_{tag}.csv'
sub.to_csv(path, index = False)