In [643]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from  sklearn.model_selection import train_test_split
import lightgbm as lgb
from bubbly.bubbly import bubbleplot
from plotly.offline import iplot
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

In [644]:
root = "/Users/dinglu/Documents/code/DataMining/Risk_Prediction_Of_Illegal_Fund_Raising/data/"
path_base_info = root + "train/base_info.csv"
path_annual_report_info = root + "train/annual_report_info.csv"
path_change_info = root + "train/change_info.csv"
path_news_info = root + "train/news_info.csv"
path_other_info = root + "train/other_info.csv"
path_tax_info = root + "train/tax_info.csv"
path_entprise_evaluate = root + "entprise_evaluate.csv"
path_entprise_info = root + "train/entprise_info.csv"

In [1014]:
class BaseInfo:
    def __init__(self, data, type='train'):
        self.data = data
        self.data_type = {
            'opfrom': 'time',
            'opto': 'time',
            'reccap': 'int64',
            'enttypeminu': 'category',
            'venind': 'category',
            'enttypeitem': 'category',
            'empnum': 'int64',
            'regcap': 'int64',
            'industryco': 'category',
            'oploc': 'category',
            'oplocdistrict': 'category',
            'regtype': 'category',
            'townsign': 'category',
            'adbusign': 'category',
            'jobid': 'category',
            'orgid': 'category',
            'state': 'category',
            'enttype': 'category',
            'dom': 'category',
            'industryphy': 'category',
            'enttypegb': 'category',
            'opform': 'category'
        }
        self.useless_columns = [
            'ptbusscope',
            'midpreindcode',
            'protype',
            'forreccap',
            'congro',
            'forregcap',
            'exenum',
            'parnum',
            'compform',
            'opscope',
            'id',
        ]

        if type == 'test':
            self.useless_columns.remove('id')
            self.useless_columns.append('score')

        return

    def fill_nan(self, name, value, column_type):
        self.data[name] = self.data[name].fillna(value)
        self.data[name] = self.data[name].astype(column_type)
        return

    def label_encoder(self, name, column_type):
        label_encode = LabelEncoder()
        value_data = self.data[self.data[name].isnull() == 0]
        null_data = self.data[self.data[name].isnull() != 0]
        value_data[name] = label_encode.fit_transform(value_data[name])
        self.data = pd.concat([null_data, value_data])
        self.data[name] = self.data[name].astype(column_type)
        return

    def drop_columns(self, drop_columns):
        self.data.drop(drop_columns, axis=1, inplace=True)
        return

    def unify_time(self, name):
        value_data = self.data[self.data[name].isnull() == 0]
        null_data = self.data[self.data[name].isnull() != 0]
        value_data[name] = value_data[name].apply(
            lambda x: x if len(x) > 10 else (x + " 00:00:00"))
        value_data[name] = value_data[name].apply(
            lambda x: x if x is None else datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
        value_data[name] = value_data[name].apply(lambda x: x.year)
        self.data = pd.concat([value_data, null_data])
        return


    def process_v1(self):
        for name in self.data_type.keys():
            if self.data_type.get(name) == 'category':
                self.label_encoder(name,'category')
            elif self.data_type.get(name) == 'int64':
                mean = self.data[self.data[name].isnull() == 0][name].mean()
                self.fill_nan(name, mean, 'int64')
            elif self.data_type.get(name) == 'time':
                self.unify_time(name)
                mode = self.data[name].mode()[0]
                self.fill_nan(name, mode, 'int64')

        self.data['diff_year'] = (self.data['opto'] - self.data['opfrom']).astype('int64')
        self.drop_columns(self.useless_columns)
        return self.data

def get_balance_data(data_frame):
    """

    :param data_frame:
    :return:
    """
    pos_data = data_frame[data_frame['label'] == 1]
    neg_data = data_frame[data_frame['label'] == 0]
    neg_data = neg_data.sample(n=len(pos_data), axis=0, random_state=2020, replace=True)
    data_frame = pd.concat([neg_data, pos_data])
    return data_frame


def probed_data(data_frame, model):
    label = data_frame['label']
    data_frame.drop(['label'], inplace=True, axis=1)
    prob = pd.Series(model.predict_proba(data_frame)[:, 1], name='prob')
    data_frame = pd.concat([data_frame.reset_index(),label.reset_index(),prob.reset_index()],axis=1)
    return data_frame


def predict_result(model, test_data, result_path):
    """

    :param model:
    :param test_data:
    :param result_path:
    :return:
    """

    test_id = test_data['id']
    test_data.drop(['id'], inplace=True, axis=1)
    test_prob = model.predict_proba(test_data)[:, 1]
    result = pd.DataFrame({'id': test_id, 'score': test_prob})
#     result = result.groupby("id").agg('mean').reset_index()
    result.to_csv(result_path, index=False)
    return

In [1015]:
df_entprise_info = pd.read_csv(path_entprise_info)
df_base_info = pd.read_csv(path_base_info)
df_annual_report_info = pd.read_csv(path_annual_report_info)
df_entprise_evaluate = pd.read_csv(path_entprise_evaluate)
df_tax_info = pd.read_csv(path_tax_info)
df_other_info = pd.read_csv(path_other_info)
df_news_info = pd.read_csv(path_news_info)
df_change_info = pd.read_csv(path_change_info)

In [1016]:
train_data = pd.merge(df_entprise_info, df_base_info, how='inner', on='id')
test_data = pd.merge(df_entprise_evaluate, df_base_info, how='inner', on='id')

In [1017]:
base_info = BaseInfo(train_data)
train_data = base_info.process_v1()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [994]:
train_data = get_balance_data(train_data)

In [995]:
train_x, val_x, train_y, val_y = split_data(train_data, split_ratio=0.1)

In [1013]:
lgb_param_list = {
    'boosting_type':'gbdt',
    'objective_type':'binary',
    'n_estimators':200,
    'learning_rate':0.01,
    'max_depth':5,
    'num_leaves':31,
    'subsample':0.7, 
    'colsample_bytree':0.5,
    'subsample_freq':1, 
    'min_split_gain':0.5,
    'min_child_samples':50, 
    'reg_alpha':3.5, 
    'reg_lambda':3.0,
    'random_state':2019, 
    'n_jobs':-1
}

lgb_model = lgb.LGBMClassifier(**lgb_param_list)
lgb_clf = lgb_model.fit(train_x,
                        train_y,
                        eval_set=[(train_x,train_y),(val_x,val_y)],
#                         categorical_feature = categorical_columns,
                        eval_metric="binary_error",
                        early_stopping_rounds=10,
                        verbose=10)

Training until validation scores don't improve for 10 rounds
[10]	training's binary_error: 0.0305949	training's binary_logloss: 0.625462	valid_1's binary_error: 0.035533	valid_1's binary_logloss: 0.627177
Early stopping, best iteration is:
[7]	training's binary_error: 0.0300283	training's binary_logloss: 0.644165	valid_1's binary_error: 0.035533	valid_1's binary_logloss: 0.645504


In [1018]:
prob_train_data = probed_data(train_data, lgb_clf)

In [None]:
prob_train_data[]

In [962]:
base_info = BaseInfo(test_data,'test')
test_data = base_info.process_v1()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [963]:
predict_result(lgb_clf, test_data, root + "result/baseline_7.csv")