In [None]:
from main.utils.analysis import *
%matplotlib inline

In [None]:
import psutil

# Report host memory and CPU figures.

# Memory usage (byte counts divided by 1024**3 for display)
mem = psutil.virtual_memory()
print(f"总内存: {mem.total / (1024**3):.1f} GB")
print(f"已用内存: {mem.used / (1024**3):.1f} GB")

# CPU usage
print(f"CPU核心数: {psutil.cpu_count(logical=True)}")
# cpu_percent() with no interval measures against the previous call, so the
# first call in a fresh kernel returns a meaningless 0.0. Sampling over a
# short blocking interval gives a real reading.
print(f"当前CPU占用: {psutil.cpu_percent(interval=0.5)}%")

In [None]:
# Read the CSV and keep its '0' column as a plain Python list.
# NOTE(review): `feas` is reassigned from the model's feature names in the
# following cell, so the value built here is overwritten there.
feas = list(pd.read_csv("data/tx待筛选指标.csv")['0'])

In [None]:
from joblib import load

# NOTE(review): joblib.load unpickles arbitrary objects; only load model
# files that come from a trusted source.
model = load('tx process wo balanced/mx_mixedmodel.pkl')

# Columns to read: the model's feature names followed by the extra
# identifier / label / date columns listed here.
# Assumes model.feature_name() returns a list — TODO confirm.
feas = [
    *model.feature_name(),
    'matchingid', 'org', 'target', 'apply_date', 'idnumber', 'mobile',
]

In [None]:
# Keyword arguments for get_dataset; `use_cols` is the `feas` list built above.
params = dict(
    data_pth='data/tx_unfilterdata.csv',
    date_colName='apply_date',
    y_colName='target',
    org_colName='org',
    data_encode='utf-8',
    key_colNames=['matchingid'],
    use_cols=feas,
)
data = get_dataset(**params)

## 1. 数据分析(EDA)

### 1.1 去除异常月份

In [None]:
# Thresholds passed to drop_abnormal_ym.
# NOTE(review): presumably minimum bad-sample / total-sample counts per
# year-month — confirm against the helper's definition.
params = dict(data=data, minYmBadsample=10, minYmSample=0)
data = drop_abnormal_ym(**params)

### 1.2 机构逐月坏样率概览

In [None]:
import os

# This cell and later ones write CSV reports into "tx process last/".
# DataFrame.to_csv does not create missing directories, so make sure the
# folder exists; otherwise a fresh checkout fails on the first write.
os.makedirs("tx process last", exist_ok=True)

# Compute the per-organisation statistics, preview two rows, and save the table.
datasetStatis = org_analysis(data)
display(datasetStatis.head(2))
datasetStatis.to_csv("tx process last/datasetStatis.csv", index=False)

In [None]:
# Keep only the rows whose new_org is NOT one of the organisations listed below.
data = data.loc[
    ~data['new_org'].isin([
        '202412050001_光大信用卡_分期',
        '202408260001_上海银行',
        '202412040001_中原银行',
        '202412040002_中原银行',
        '202406140001_分期乐_欺诈',
        '202403280002_长银消金_唯品会',
        '202504210002_华通_360',
        '202408130001_洋钱罐',
        '202411040001_汇登数字',
        '202412090001_广州农商行',
    ])
]

### 1.3 变量分机构、渠道和总体缺失率概览

In [None]:
# Mapping from channel name to the new_org values that belong to it.
# The last entry ('整体') is every organisation still present in `data`.
channel = {
    '银行': [
        '202407100001_青岛银行', '202412110001_浦发信用卡_贷前',
        '20241205_光大信用卡_提额', '202401210004_华夏银行（京东金条）',
        '202503110003-上海银行信贷', '202503240001-晋商银行信用卡',
        '202412110002_浦发信用卡_贷中监控', '202503110001-上海银行信贷',
        '202502250001-友利银行', '202410090001_华晨东亚汽车金融',
    ],

    # NOTE(review): '202405290001_恒昌上海' is listed in both '24非银' and
    # '36非银' — confirm which channel it belongs to.
    '24非银': [
        '20240328_长银消金_百融', '202403280001_长银消金_360', '202504240001-南银法巴',
        '202411150001_哈银金租', '202405290001_恒昌上海',
        '20240802_度小满', '202504210001_华通_乐信',
        '202504090004_360_低利率', '202409260001_滴滴金融_新户', '202409260002_滴滴金融_老户',
    ],

    # '202503120002-久恒融担' appeared twice in this list; the repeated entry
    # has been removed so the organisation is listed once.
    # NOTE(review): if the second entry was meant to be a different id, add it back.
    '36非银': [
        '202504210003_华通_拍拍贷', '202504090003_360_高利率',
        '202503120002-久恒融担', '202503120003-久恒融担', '202503120001-久恒融担',
        '202504010001-久恒融担', '202407090001_分期乐_贷中',
        '202408300001_时光金科', '202412300001_爱租机', '202412300002_爱租机',
        '202409060001_恒昌北京_复贷', '202405290001_恒昌上海', '202409060002_恒昌北京_新客',
    ],

    '整体': list(data.new_org.unique()),
}

In [None]:
# Run the missing-value check, save both result tables, and preview one row of each.
miss_org, miss_channel = missing_check(data=data, channel=channel)
miss_org.to_csv('tx process last/miss_org.csv', index=False)
miss_channel.to_csv('tx process last/miss_channel.csv', index=False)
display(miss_org.head(1), miss_channel.head(1))

### 1.3.1 去除高缺失率变量

In [None]:
# Filter features using the missing-value tables computed above.
# NOTE(review): exact meaning of `ratio` / `cnt` is defined by the helper — confirm there.
data = drop_highmiss_features(
    data=data,
    ratio=0.6,
    cnt=16,
    miss_org=miss_org,
    miss_channel=miss_channel,
)

### 1.4 变量分机构和渠道和总体iv概览

In [None]:
# Compute IV tables (quantile method, 5 bins), preview two rows of each, then save them.
res_iv_org, res_iv_channel = detect_iv(
    data=data,
    method='quantile',
    bins=5,
    channel=channel,
)
display(res_iv_org.head(2), res_iv_channel.head(2))
res_iv_org.to_csv('tx process last/iv_org.csv', index=False)
res_iv_channel.to_csv("tx process last/iv_channel.csv", index=False)

### 1.4.1 去除低iv变量

In [None]:
# Filter features using the IV tables computed above.
# NOTE(review): threshold semantics (miniv_org / miniv_channel / cnt) live in the helper — confirm there.
data = drop_lowiv_features(
    data=data,
    miniv_org=0.02,
    miniv_channel=0.05,
    cnt=10,
    res_iv_org=res_iv_org,
    res_iv_channel=res_iv_channel,
)

### 1.5 变量分机构和渠道psi概览

In [None]:
# Compute the PSI tables and save both to CSV.
res_psi_org, res_psi_channel = detect_psi(data=data, channel=channel)
res_psi_channel.to_csv('tx process last/psi_channel.csv', index=False)
res_psi_org.to_csv("tx process last/psi_org.csv", index=False)

### 1.5.1 去除高psi变量

In [None]:
# Filter features using the per-organisation PSI table (the channel table is not passed here).
data = drop_highpsi_features(
    data=data,
    res_psi_org=res_psi_org,
    cnt=10,
    ratio=0.1,
)

### 1.6 变量总体相似性概览

In [None]:
# Run the Pearson correlation check with max_corr=0.85 and save both outputs.
indices, corr = detect_correlation(data=data, method='pearson', max_corr=0.85)
corr.to_csv('tx process last/correlation.csv', index=False)

# `indices` is wrapped in a DataFrame before it is written out.
indices = pd.DataFrame(indices)
indices.to_csv("tx process last/indices.csv", index=False)

In [None]:
# Reload `indices` from the CSV written by the previous cell.
indices = pd.read_csv(filepath_or_buffer="tx process last/indices.csv")

### 1.6.1 去除高相似性变量

In [None]:
# Filter correlated features, passing the channel-level IV table and the '整体' channel key.
# NOTE(review): how IV is used to choose between correlated features is decided inside the helper — confirm there.
data = drop_highcorrelation_features(
    data=data,
    res_iv_channel=res_iv_channel,
    indices=indices,
    channel='整体',
)

In [None]:
# Display the shape of `data` after the filtering steps above.
data.shape

### 2.5 null importance去除高噪音变量

In [None]:
# Only the first element of the helper's return value is kept as the new `data`.
# NOTE(review): presumably the filtered frame; the remaining elements are discarded — confirm.
data = drop_highnoise_features(
    data=data,
    n_estimators=100,
    max_depth=5,
)[0]