In [109]:
# -*- coding: utf-8 -*-
# model
# author = 'huangth'

import os
import time

import numpy as np
import pandas as pd

import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import mean_absolute_error
from catboost import CatBoostRegressor

# Root directory of the project data (Windows path, trailing separator included).
BASE_PATH = 'D:\\Azy\\'
# Raw training set, read eagerly from a hard-coded absolute path when this cell runs.
tt=pd.read_csv("D:\\Azy\\RawData\\train_dataset.csv")
# Directory from which get_feature() loads the prepared CSV files.
ETL_DATA_PATH = os.path.join(BASE_PATH, "EtlData")


def get_feature(name):
    """Load ``<ETL_DATA_PATH>/<name>.csv`` into a DataFrame.

    Args:
        name: File stem, without the ``.csv`` extension.

    Returns:
        The DataFrame produced by ``pandas.read_csv``.
    """
    csv_path = os.path.join(ETL_DATA_PATH, "{}.csv".format(name))
    return pd.read_csv(csv_path)


def lgb_mae_model(train_df, test_df, params):
    """Train LightGBM with 10-fold CV and return fold-averaged test predictions.

    Args:
        train_df: Training frame holding the id column '用户编码', the target
            column '信用分' and the feature columns.
        test_df: Test frame holding '用户编码' and the feature columns.
        params: Parameter dict passed unchanged to ``lgb.train``. It must make
            LightGBM report the ``l1`` metric, because the per-fold score is
            read from ``best_score['valid_0']['l1']``.

    Returns:
        numpy.ndarray with one prediction per test row (mean over all folds).
    """
    NFOLDS = 10
    train_label = train_df['信用分']
    # The folds are contiguous (shuffle=False). scikit-learn >= 0.24 raises a
    # ValueError when random_state is given together with shuffle=False, and
    # the seed never influenced the split, so it is no longer passed.
    kfold = KFold(n_splits=NFOLDS, shuffle=False)
    kf = kfold.split(train_df, train_label)

    train = train_df.drop(['用户编码', '信用分'], axis=1)
    test = test_df.drop(['用户编码'], axis=1)

    cv_pred = np.zeros(test.shape[0])
    valid_best_l1_all = 0

    for i, (train_fold, validate) in enumerate(kf):
        print("model: lgb_mae. fold: ", i , "training...")
        X_train, label_train = train.iloc[train_fold], train_label.iloc[train_fold]
        X_validate, label_validate = train.iloc[validate], train_label.iloc[validate]

        dtrain = lgb.Dataset(X_train, label_train)
        dvalid = lgb.Dataset(X_validate, label_validate, reference=dtrain)

        bst = lgb.train(params, dtrain, num_boost_round=10000, valid_sets=dvalid, verbose_eval=-1, early_stopping_rounds=50)
        # Accumulate test predictions and the best validation L1 of this fold.
        cv_pred += bst.predict(test, num_iteration=bst.best_iteration)
        valid_best_l1_all += bst.best_score['valid_0']['l1']

    cv_pred /= NFOLDS
    valid_best_l1_all /= NFOLDS
    print("lgb_mae cv score for valid is: ", 1/(1+valid_best_l1_all))

    print("----------------------------------------")
    print("----------------------------------------")
    print("lgb_mae  feature importance：")
    # NOTE: the importances come from the booster of the LAST fold only.
    fea_importances = pd.DataFrame({
        'column': train.columns,
        'importance': bst.feature_importance(importance_type='split', iteration=bst.best_iteration)
    }).sort_values(by='importance', ascending=False)
    print(fea_importances)
    print("----------------------------------------")
    print("----------------------------------------")

    return cv_pred


def lgb_mse_model(train_df, test_df, params):
    """Train LightGBM with 10-fold CV and return fold-averaged test predictions.

    The training objective comes entirely from ``params``; this function only
    differs from its MAE sibling in the labels it prints.

    Args:
        train_df: Training frame holding the id column '用户编码', the target
            column '信用分' and the feature columns.
        test_df: Test frame holding '用户编码' and the feature columns.
        params: Parameter dict passed unchanged to ``lgb.train``. It must make
            LightGBM report the ``l1`` metric, because the per-fold score is
            read from ``best_score['valid_0']['l1']``.

    Returns:
        numpy.ndarray with one prediction per test row (mean over all folds).
    """
    NFOLDS = 10
    train_label = train_df['信用分']
    # The folds are contiguous (shuffle=False). scikit-learn >= 0.24 raises a
    # ValueError when random_state is given together with shuffle=False, and
    # the seed never influenced the split, so it is no longer passed.
    kfold = KFold(n_splits=NFOLDS, shuffle=False)
    kf = kfold.split(train_df, train_label)

    train = train_df.drop(['用户编码', '信用分'], axis=1)
    test = test_df.drop(['用户编码'], axis=1)

    cv_pred = np.zeros(test.shape[0])
    valid_best_l1_all = 0

    for i, (train_fold, validate) in enumerate(kf):
        print("model:lgb_mse. fold: ", i , "training...")
        X_train, label_train = train.iloc[train_fold], train_label.iloc[train_fold]
        X_validate, label_validate = train.iloc[validate], train_label.iloc[validate]

        dtrain = lgb.Dataset(X_train, label_train)
        dvalid = lgb.Dataset(X_validate, label_validate, reference=dtrain)

        bst = lgb.train(params, dtrain, num_boost_round=10000, valid_sets=dvalid, verbose_eval=-1, early_stopping_rounds=50)
        # Accumulate test predictions and the best validation L1 of this fold.
        cv_pred += bst.predict(test, num_iteration=bst.best_iteration)
        valid_best_l1_all += bst.best_score['valid_0']['l1']

    cv_pred /= NFOLDS
    valid_best_l1_all /= NFOLDS
    print("lgb_mse cv score for valid is: ", 1/(1+valid_best_l1_all))

    print("----------------------------------------")
    print("----------------------------------------")
    print("lgb_mse  feature importance：")
    # NOTE: the importances come from the booster of the LAST fold only.
    fea_importances = pd.DataFrame({
        'column': train.columns,
        'importance': bst.feature_importance(importance_type='split', iteration=bst.best_iteration)
    }).sort_values(by='importance', ascending=False)
    print(fea_importances)
    print("----------------------------------------")
    print("----------------------------------------")

    return cv_pred


def xgb_mae_model(train_df, test_df, params):
    """Train an XGBoost regressor with 5-fold CV and average the test predictions.

    Previously the function stopped after computing the out-of-fold error and
    implicitly returned ``None``; it now averages the per-fold predictions,
    prints the score and returns the predictions.

    Args:
        train_df: Training frame holding the id column '用户编码', the target
            column '信用分' and the feature columns.
        test_df: Test frame holding '用户编码' and the feature columns.
        params: Keyword arguments forwarded to ``xgb.XGBRegressor``.

    Returns:
        list with one prediction per test row (mean over all folds).
    """
    NFOLDS = 5
    train_label = train_df['信用分']
    # The folds are contiguous (shuffle=False). scikit-learn >= 0.24 raises a
    # ValueError when random_state is given together with shuffle=False, and
    # the seed never influenced the split, so it is no longer passed.
    kfold = KFold(n_splits=NFOLDS, shuffle=False)
    kf = kfold.split(train_df, train_label)

    train = train_df.drop(['用户编码', '信用分'], axis=1)
    test = test_df.drop(['用户编码'], axis=1)

    preds_list = []
    # Out-of-fold predictions: every training row is predicted exactly once,
    # by the model that did not see it.
    oof = np.zeros(train_df.shape[0])
    for i, (train_fold, validate) in enumerate(kf):
        print("model: xgb_mae. fold: ", i , "training...")
        X_train, label_train = train.iloc[train_fold], train_label.iloc[train_fold]
        X_validate, label_validate = train.iloc[validate], train_label.iloc[validate]

        gbm = xgb.XGBRegressor(**params)
        gbm.fit(X_train, label_train, eval_set=[(X_train, label_train), (X_validate, label_validate)],
                early_stopping_rounds=200, verbose=500)

        oof[validate] = gbm.predict(X_validate)
        preds_list.append(gbm.predict(test))

    fold_mae_error = mean_absolute_error(train_label, oof)

    # Average the per-fold test predictions row by row.
    cv_pred = list(np.mean(preds_list, axis=0))

    print("xgb_mae cv score for valid is: ", 1/(1+fold_mae_error))

    return cv_pred


def cat_model(train_df, test_df, params):
    """Train a CatBoost regressor with 5-fold CV and average the test predictions.

    Args:
        train_df: Training frame holding the id column '用户编码', the target
            column '信用分' and the feature columns.
        test_df: Test frame holding '用户编码' and the feature columns.
        params: Keyword arguments forwarded to ``CatBoostRegressor``.

    Returns:
        list with one prediction per test row (mean over all folds).
    """
    NFOLDS = 5
    train_label = train_df['信用分']
    # The folds are contiguous (shuffle=False). scikit-learn >= 0.24 raises a
    # ValueError when random_state is given together with shuffle=False, and
    # the seed never influenced the split, so it is no longer passed.
    kfold = KFold(n_splits=NFOLDS, shuffle=False)
    kf = kfold.split(train_df, train_label)

    train = train_df.drop(['用户编码', '信用分'], axis=1)
    test = test_df.drop(['用户编码'], axis=1)

    preds_list = []
    # Out-of-fold predictions: every training row is predicted exactly once,
    # by the model that did not see it.
    oof = np.zeros(train_df.shape[0])
    for i, (train_fold, validate) in enumerate(kf):
        # The label used to say "xgb_mae" (copy-paste leftover).
        print("model: cat. fold: ", i , "training...")
        X_train, label_train = train.iloc[train_fold], train_label.iloc[train_fold]
        X_validate, label_validate = train.iloc[validate], train_label.iloc[validate]

        cat = CatBoostRegressor(**params)
        cat.fit(X_train, label_train, eval_set=[(X_train, label_train), (X_validate, label_validate)],
                early_stopping_rounds=1000, verbose=500)

        oof[validate] = cat.predict(X_validate)
        preds_list.append(cat.predict(test))

    fold_mae_error = mean_absolute_error(train_label, oof)

    # Average the per-fold test predictions row by row.
    cv_pred = list(np.mean(preds_list, axis=0))

    print("cat cv score for valid is: ", 1/(1+fold_mae_error))

    return cv_pred


def model_bagging(pred1, pred2):
    """Return the blended prediction, which currently is ``pred1`` unchanged.

    ``pred2`` is accepted but does not influence the result.
    """
    return pred1





In [113]:
# -*- coding: utf-8 -*-
# 特征工程
# author = 'huangth'

import os
import time
import gc

import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')


# Root directory of the project data (Windows path, trailing separator included).
BASE_PATH = 'D:\\Azy\\'
# Directory the raw competition CSVs are read from.
RAW_DATA_PATH = os.path.join(BASE_PATH, "RawData")
# Directory the processed train/test CSVs are written to.
ETL_DATA_PATH = os.path.join(BASE_PATH, "EtlData")


class Processing(object):
    """Feature-engineering pipeline: raw CSVs in, model-ready CSVs out."""

    # Read data
    @staticmethod
    def _get_data_(name):
        """Read ``<RAW_DATA_PATH>/<name>.csv`` and return it as a DataFrame."""
        data_name = os.path.join(RAW_DATA_PATH, '{}.csv'.format(name))
        return pd.read_csv(data_name)

    # Change column dtypes to save memory
    @staticmethod
    def _reduce_mem_usage_(df, verbose=True):
        """Downcast numeric columns of ``df`` in place to narrower dtypes.

        A column is cast to the first candidate dtype whose range strictly
        contains the column's minimum and maximum. Columns whose dtype is not
        listed in ``numerics`` are left untouched.

        NOTE(review): float columns may end up as float16, which keeps only
        about three significant decimal digits; confirm this precision loss is
        acceptable for the monetary columns.

        Args:
            df: Frame to shrink (modified in place).
            verbose: When true, print the resulting memory footprint.

        Returns:
            The same ``df`` object.
        """
        numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
        int_candidates = (np.int8, np.int16, np.int32, np.int64)
        float_candidates = (np.float16, np.float32)
        start_mem = df.memory_usage().sum() / 1024**2
        for col in df.columns:
            col_type = df[col].dtypes
            if col_type not in numerics:
                continue
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                for candidate in int_candidates:
                    limits = np.iinfo(candidate)
                    if c_min > limits.min and c_max < limits.max:
                        df[col] = df[col].astype(candidate)
                        break
            else:
                for candidate in float_candidates:
                    limits = np.finfo(candidate)
                    if c_min > limits.min and c_max < limits.max:
                        df[col] = df[col].astype(candidate)
                        break
                else:
                    # Nothing narrower fits (or min/max are NaN).
                    df[col] = df[col].astype(np.float64)
        end_mem = df.memory_usage().sum() / 1024**2
        if verbose:
            reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
            print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
        return df

    # Fill abnormal values with the mode
    @staticmethod
    def _mode_fill_(df, col):
        """Replace zeros in ``df[col]`` with the most frequent non-zero value.

        The previous version assigned the Series returned by ``mode()``
        directly; pandas aligns that Series on the index, so the zero rows
        received NaN instead of the mode. The mode is now taken as a scalar
        and computed over the non-zero values only, so it can never be the
        placeholder 0 itself.

        Args:
            df: Frame to patch (modified in place).
            col: Name of the column to patch.

        Returns:
            The same ``df`` object. If the column has no non-zero value it is
            left unchanged.
        """
        is_zero = df[col] == 0
        modes = df.loc[~is_zero, col].mode()
        if not modes.empty:
            df.loc[is_zero, col] = modes.iloc[0]
        return df

    # Hand-crafted features
    @staticmethod
    def _feature_(df):
        """Add the '缴费方式' (payment method) flag to ``df``.

        The flag is 1 when the last payment amount is a non-zero multiple of
        10 and 0 otherwise.

        The old expression ``(a % 10 == 0) & a != 0`` parsed as
        ``((a % 10 == 0) & a) != 0`` and wrote through chained indexing; both
        are replaced by an explicit mask and a single ``.loc`` assignment.
        """
        amount = df['缴费用户最近一次缴费金额（元）']
        df['缴费方式'] = 0
        df.loc[(amount % 10 == 0) & (amount != 0), '缴费方式'] = 1
        return df

    # Age binning
    @staticmethod
    def _group_age_(x):
        """Map an age to a bucket: <=18 -> 1, <=30 -> 2, <=35 -> 3, <=45 -> 4, else 5."""
        if x <= 18:
            return 1
        elif x <= 30:
            return 2
        elif x <= 35:
            return 3
        elif x <= 45:
            return 4
        else:
            return 5

    # Long-tail data handling
    @staticmethod
    def _log_feature_(df):
        """Apply ``log1p`` in place to the billing and app-usage count columns."""
        user_bill_features = ['缴费用户最近一次缴费金额（元）', '用户近6个月平均消费值（元）',
                              '用户账单当月总费用（元）', '用户当月账户余额（元）']
        log_features = ['当月网购类应用使用次数', '当月金融理财类应用使用总次数', '当月视频播放类应用使用次数']
        for col in user_bill_features + log_features:
            df[col] = df[col].map(np.log1p)
        return df

    @staticmethod
    def _search_(df):
        """Build the final feature frame from the preprocessed data.

        The order of the assignments defines the column order of the result,
        so it is kept exactly as before. As a side effect the column
        '交通类应用使用次数' is also added to ``df``.

        Args:
            df: Preprocessed frame containing the raw columns used below.

        Returns:
            A new DataFrame with the id, the target and the selected features.
        """
        six_month_avg = df["用户近6个月平均消费值（元）"]
        current_bill = df["用户账单当月总费用（元）"]
        # Average of the five months before the current one, recovered from
        # the six-month average and the current bill.
        prior_five_month_avg = (six_month_avg * 6 - current_bill) / 5

        rf = pd.DataFrame()
        rf["用户编码"] = df["用户编码"]
        rf["信用分"] = df["信用分"]
        rf["当前消费动荡"] = prior_five_month_avg - current_bill
        rf["用户近6个月平均消费值（元）"] = prior_five_month_avg
        rf["用户账单当月总费用（元）"] = current_bill
        rf["用户网龄（月）"] = df["用户网龄（月）"]
        # NOTE(review): the target column name ends with a space; kept as is
        # because it is part of the written CSV header.
        rf["当月通话交往圈人数 "] = df["当月通话交往圈人数"]
        rf["用户年龄"] = df["用户年龄"]
        rf['缴费用户最近一次缴费金额（元）'] = df['缴费用户最近一次缴费金额（元）']
        rf["当月金融理财类应用使用总次数"] = df["当月金融理财类应用使用总次数"]
        rf['当月网购类应用使用次数'] = df['当月网购类应用使用次数']
        rf['当月视频播放类应用使用次数'] = df['当月视频播放类应用使用次数']
        rf['用户话费敏感度'] = df['用户话费敏感度']
        # One-hot encode the sensitivity level.
        rf = pd.get_dummies(rf, columns=["用户话费敏感度"])
        rf['近三个月月均商场出现次数'] = df['近三个月月均商场出现次数']
        rf['当月话费使用率'] = df['用户账单当月总费用（元）'] / (df['用户当月账户余额（元）'] + 1)
        rf["当月旅游资讯类应用使用次数"] = df["当月旅游资讯类应用使用次数"]
        rf["交通类应用使用次数"] = df['当月飞机类应用使用次数'] + df['当月火车类应用使用次数']
        # NOTE(review): this product is 1 only when NEITHER place was visited,
        # which is the opposite of what the column name suggests - confirm.
        rf['是否去过高档商场'] = (1 - df['当月是否逛过福州仓山万达']) * (1 - df['当月是否到过福州山姆会员店'])
        df["交通类应用使用次数"] = rf["交通类应用使用次数"]
        rf['当月是否景点游览'] = df['当月是否景点游览']
        # A combined stadium/cinema expression used to be assigned here and
        # was overwritten on the very next line; only the effective value stays.
        rf["当月是否体育场馆消费"] = df["当月是否体育场馆消费"]
        rf["当月是否看电影"] = df["当月是否看电影"]
        rf["是否4G不健康客户"] = df["是否4G不健康客户"]
        rf["缴费用户当前是否欠费缴费"] = df["缴费用户当前是否欠费缴费"]
        rf["当月物流快递类应用使用次数"] = df["当月物流快递类应用使用次数"]
        return rf

    @staticmethod
    def get_recharge_way(item):
        """Classify a payment amount: -1 for 0, 1 for a multiple of 10, else 0."""
        if item == 0:
            return -1
        # Whether the amount is divisible by 10
        if item % 10 == 0:
            return 1
        else:
            return 0

    def get_processing(self):
        """Run the whole pipeline and write ``train_data.csv`` / ``test_data.csv``.

        Reads the raw train and test sets, stacks them so both get identical
        preprocessing, builds the features and writes the two halves to
        ``ETL_DATA_PATH``.
        """
        train_df = self._reduce_mem_usage_(self._get_data_('train_dataset'))
        test_df = self._reduce_mem_usage_(self._get_data_('test_dataset'))

        # Remember where the training rows end instead of hard-coding 50000.
        n_train = len(train_df)

        # Placeholder target so train and test share the same columns.
        test_df['信用分'] = -1
        data = pd.concat([train_df, test_df], axis=0, ignore_index=True)

        del train_df, test_df
        gc.collect()

        for col in ('用户账单当月总费用（元）', '用户年龄', '用户话费敏感度',
                    '缴费用户最近一次缴费金额（元）'):
            data = self._mode_fill_(data, col)
        data = self._feature_(data)

        data = self._log_feature_(data)
        data = self._search_(data)
        train, test = data[:n_train], data[n_train:]
        test = test.drop(['信用分'], axis=1)

        train_data_name = os.path.join(ETL_DATA_PATH, 'train_data.csv')
        test_data_name = os.path.join(ETL_DATA_PATH, 'test_data.csv')
        train.to_csv(train_data_name, index=False)
        test.to_csv(test_data_name, index=False)
        print('Gen train shape: {}, test shape: {}'.format(train.shape, test.shape))
        print('features num: ', test.shape[1] - 1)


# Entry point: run the feature-engineering pipeline and report the elapsed time.
if  __name__ == "__main__":
    t0 = time.time()  # start timestamp used for the elapsed-time report below
    processing = Processing()
    processing.get_processing()
    print("Feature engineering has finished!")
    print("Cost {} s.".format(time.time() - t0))

Mem. usage decreased to  2.81 Mb (75.4% reduction)
Mem. usage decreased to  2.81 Mb (74.6% reduction)
Gen train shape: (50000, 28), test shape: (50000, 27)
features num:  26
Feature engineering has finished!
Cost 3.4923670291900635 s.


In [116]:

# Load the prepared train/test tables.
train_data = get_feature(name="train_data")
test_data = get_feature(name="test_data")

print('Gen train shape: {}, test shape: {}'.format(train_data.shape, test_data.shape))
print('features num: ', test_data.shape[1] - 1)

# CatBoost parameters (the old comment called them "lgb_mae" parameters).
ctb_params = {
    'n_estimators': 10000,
    'learning_rate': 0.01,
    'random_seed': 5570,
    'reg_lambda': 5,
    'subsample': 0.7,
    'bootstrap_type': 'Bernoulli',
    'boosting_type': 'Plain',
    'one_hot_max_size': 10,
    'rsm': 0.5,
    'leaf_estimation_iterations': 5,
    'use_best_model': True,
    'max_depth': 6,
    'verbose': -1,
    'thread_count': 4
}
cat_mae_pred = cat_model(train_data, test_data, ctb_params)

# Only the CatBoost predictions are used; nothing is blended in.
bagging_pred = cat_mae_pred

# .copy() makes the submission frame independent of test_data, so adding the
# score column does not write into a slice of another frame.
test_data_sub = test_data[['用户编码']].copy()
test_data_sub['score'] = bagging_pred
test_data_sub.columns = ['id', 'score']
# Scores are written unrounded here.
test_data_sub[['id', 'score']].to_csv('3.13_cat5.csv', index=False)


Gen train shape: (50000, 29), test shape: (50000, 28)
features num:  27
model: xgb_mae. fold:  0 training...
0:	learn: 613.4190934	test: 613.4190934	test1: 613.0101538	best: 613.0101538 (0)	total: 144ms	remaining: 23m 57s
500:	learn: 20.7709637	test: 20.7709637	test1: 21.1281294	best: 21.1281294 (500)	total: 14.3s	remaining: 4m 31s
1000:	learn: 19.2295591	test: 19.2295591	test1: 19.6656169	best: 19.6656169 (1000)	total: 27.5s	remaining: 4m 7s
1500:	learn: 18.8173254	test: 18.8173254	test1: 19.4209050	best: 19.4209050 (1500)	total: 40.3s	remaining: 3m 47s
2000:	learn: 18.5246280	test: 18.5246280	test1: 19.3096321	best: 19.3096321 (2000)	total: 53.6s	remaining: 3m 34s
2500:	learn: 18.2874143	test: 18.2874143	test1: 19.2375921	best: 19.2375921 (2500)	total: 1m 6s	remaining: 3m 18s
3000:	learn: 18.0759012	test: 18.0759012	test1: 19.1910297	best: 19.1910105 (2998)	total: 1m 18s	remaining: 3m 4s
3500:	learn: 17.8862067	test: 17.8862067	test1: 19.1562983	best: 19.1558057 (3497)	total: 1m 31s	

4500:	learn: 17.5512386	test: 17.5512386	test1: 19.1953729	best: 19.1951348 (4493)	total: 1m 52s	remaining: 2m 17s
5000:	learn: 17.3953362	test: 17.3953362	test1: 19.1946454	best: 19.1935275 (4925)	total: 2m 5s	remaining: 2m 5s
5500:	learn: 17.2468680	test: 17.2468680	test1: 19.1946004	best: 19.1935275 (4925)	total: 2m 18s	remaining: 1m 53s
6000:	learn: 17.1041578	test: 17.1041578	test1: 19.1946791	best: 19.1935200 (5880)	total: 2m 31s	remaining: 1m 41s
6500:	learn: 16.9691565	test: 16.9691565	test1: 19.1942159	best: 19.1935200 (5880)	total: 2m 44s	remaining: 1m 28s
7000:	learn: 16.8386903	test: 16.8386903	test1: 19.1945781	best: 19.1932342 (6606)	total: 2m 57s	remaining: 1m 16s
7500:	learn: 16.7133230	test: 16.7133230	test1: 19.1982726	best: 19.1932342 (6606)	total: 3m 10s	remaining: 1m 3s
Stopped by overfitting detector  (1000 iterations wait)

bestTest = 19.19323418
bestIteration = 6606

Shrink model to first 6607 iterations.
model: xgb_mae. fold:  4 training...
0:	learn: 613.452334

In [115]:
# Round every score to the nearest integer, then write the submission file.
test_data_sub['score'] = test_data_sub['score'].round().astype('int64')
test_data_sub[['id', 'score']].to_csv('3.17_cat5.csv', index=False)

In [102]:
# Summary statistics of the score column, as a quick look at its distribution.
test_data_sub['score'].describe()

count    50000.000000
mean       617.883780
std         38.283659
min        463.000000
25%        596.000000
50%        627.000000
75%        646.000000
max        700.000000
Name: score, dtype: float64

In [82]:
# -*- coding: utf-8 -*-
# 特征工程
# author = 'huangth'

import os
import time
import gc

import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')


# Root directory of the project data (Windows path, trailing separator included).
BASE_PATH = 'D:\\Azy\\'
# Directory the raw competition CSVs are read from.
RAW_DATA_PATH = os.path.join(BASE_PATH, "RawData")
# Directory the processed train/test CSVs are written to.
ETL_DATA_PATH = os.path.join(BASE_PATH, "EtlData")
def kuaidi(item):
    """Bucket a count into four levels.

    Returns 0 for exactly 0, 1 for (0, 10], 2 for (10, 100] and 3 for values
    above 100. Any other input (a negative number or NaN) yields ``None``.
    """
    if item == 0:
        level = 0
    elif 0 < item <= 10:
        level = 1
    elif 10 < item <= 100:
        level = 2
    elif item > 100:
        level = 3
    else:
        level = None
    return level


class Processing(object):
    """Feature-engineering pipeline: raw CSVs in, model-ready CSVs out."""

    # Read data
    @staticmethod
    def _get_data_(name):
        """Read ``<RAW_DATA_PATH>/<name>.csv`` and return it as a DataFrame."""
        data_name = os.path.join(RAW_DATA_PATH, '{}.csv'.format(name))
        return pd.read_csv(data_name)

    # Change column dtypes to save memory
    @staticmethod
    def _reduce_mem_usage_(df, verbose=True):
        """Downcast numeric columns of ``df`` in place to narrower dtypes.

        A column is cast to the first candidate dtype whose range strictly
        contains the column's minimum and maximum. Columns whose dtype is not
        listed in ``numerics`` are left untouched.

        NOTE(review): float columns may end up as float16, which keeps only
        about three significant decimal digits; confirm this precision loss is
        acceptable for the monetary columns.

        Args:
            df: Frame to shrink (modified in place).
            verbose: When true, print the resulting memory footprint.

        Returns:
            The same ``df`` object.
        """
        numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
        int_candidates = (np.int8, np.int16, np.int32, np.int64)
        float_candidates = (np.float16, np.float32)
        start_mem = df.memory_usage().sum() / 1024**2
        for col in df.columns:
            col_type = df[col].dtypes
            if col_type not in numerics:
                continue
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                for candidate in int_candidates:
                    limits = np.iinfo(candidate)
                    if c_min > limits.min and c_max < limits.max:
                        df[col] = df[col].astype(candidate)
                        break
            else:
                for candidate in float_candidates:
                    limits = np.finfo(candidate)
                    if c_min > limits.min and c_max < limits.max:
                        df[col] = df[col].astype(candidate)
                        break
                else:
                    # Nothing narrower fits (or min/max are NaN).
                    df[col] = df[col].astype(np.float64)
        end_mem = df.memory_usage().sum() / 1024**2
        if verbose:
            reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
            print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
        return df

    # Fill abnormal values with the mode
    @staticmethod
    def _mode_fill_(df, col):
        """Replace zeros in ``df[col]`` with the most frequent non-zero value.

        The previous version assigned the Series returned by ``mode()``
        directly; pandas aligns that Series on the index, so the zero rows
        received NaN instead of the mode. The mode is now taken as a scalar
        and computed over the non-zero values only.

        Returns:
            The same ``df`` object. If the column has no non-zero value it is
            left unchanged.
        """
        is_zero = df[col] == 0
        modes = df.loc[~is_zero, col].mode()
        if not modes.empty:
            df.loc[is_zero, col] = modes.iloc[0]
        return df

    # Hand-crafted features (currently a pass-through)
    @staticmethod
    def _feature_(df):
        """Return ``df`` unchanged."""
        return df

    @staticmethod
    def get_recharge_way(item):
        """Classify a payment amount: -1 for 0, 1 for a multiple of 10, else 0."""
        if item == 0:
            return -1
        # Whether the amount is divisible by 10
        if item % 10 == 0:
            return 1
        else:
            return 0

    # Long-tail data handling (currently a pass-through)
    @staticmethod
    def _log_feature_(df):
        """Return ``df`` unchanged."""
        return df

    @staticmethod
    def _gethob_(df):
        """Return ``df`` unchanged."""
        return df

    def get_processing(self):
        """Run the pipeline and write ``train_data.csv`` / ``test_data.csv``.

        Reads the raw train and test sets, stacks them so both get identical
        preprocessing, clips extreme usage counts, repairs near-zero billing
        values, adds the '消费稳定性2' ratio and writes the two halves to
        ``ETL_DATA_PATH``.
        """
        train_df = self._reduce_mem_usage_(self._get_data_('train_dataset'))
        test_df = self._reduce_mem_usage_(self._get_data_('test_dataset'))

        # Remember where the training rows end instead of hard-coding 50000.
        n_train = len(train_df)

        # Placeholder target so train and test share the same columns.
        test_df['信用分'] = -1
        data = pd.concat([train_df, test_df], axis=0, ignore_index=True)

        del train_df, test_df
        gc.collect()

        # Cap each usage counter at its 99.97th percentile.
        for col in ['当月网购类应用使用次数', '当月金融理财类应用使用总次数', '当月视频播放类应用使用次数',
                    "当月旅游资讯类应用使用次数", '当月飞机类应用使用次数', '当月火车类应用使用次数',
                    "当月物流快递类应用使用次数", '近三个月月均商场出现次数']:
            high = np.percentile(data[col].values, 99.97)
            data.loc[data[col] > high, col] = high

        # Replace billing values <= 1 with the column mean. A single .loc
        # assignment replaces the chained indexing used before, which is not
        # guaranteed to write into `data`.
        for col in ("用户近6个月平均消费值（元）", "用户账单当月总费用（元）"):
            # The mean is taken in float64: the columns may have been downcast
            # to float16 above, and summing that many float16 values can
            # presumably overflow - TODO confirm on the real data.
            col_mean = data[col].astype('float64').mean()
            data.loc[data[col] <= 1, col] = col_mean

        # Relative deviation of the current bill from the six-month average;
        # the +1 terms keep the denominator away from zero.
        data["消费稳定性2"] = (data["用户账单当月总费用（元）"] - data["用户近6个月平均消费值（元）"] + 1) / (data["用户近6个月平均消费值（元）"] + 1)

        data = self._log_feature_(data)

        train, test = data[:n_train], data[n_train:]
        test = test.drop(['信用分'], axis=1)

        train_data_name = os.path.join(ETL_DATA_PATH, 'train_data.csv')
        test_data_name = os.path.join(ETL_DATA_PATH, 'test_data.csv')
        train.to_csv(train_data_name, index=False)
        test.to_csv(test_data_name, index=False)
        print('Gen train shape: {}, test shape: {}'.format(train.shape, test.shape))
        print('features num: ', test.shape[1] - 1)


# Entry point: run the feature-engineering pipeline and report the elapsed time.
if  __name__ == "__main__":
    t0 = time.time()  # start timestamp used for the elapsed-time report below
    processing = Processing()
    processing.get_processing()
    print("Feature engineering has finished!")
    print("Cost {} s.".format(time.time() - t0))

Mem. usage decreased to  2.81 Mb (75.4% reduction)
Mem. usage decreased to  2.81 Mb (74.6% reduction)
Gen train shape: (50000, 32), test shape: (50000, 31)
features num:  30
Feature engineering has finished!
Cost 2.5257692337036133 s.
