In [372]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from  sklearn.model_selection import train_test_split
import lightgbm as lgb
from bubbly.bubbly import bubbleplot
from plotly.offline import iplot
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from sklearn.metrics import confusion_matrix 
import pylab as pl

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

In [373]:
# Machine-specific absolute data directory. Every path below (and the result
# path used at the end of the notebook) is built from it by plain string
# concatenation, so the trailing slash is required.
root = "/Users/dinglu/Documents/code/DataMining/Risk_Prediction_Of_Illegal_Fund_Raising/data/"

# Tables stored under the "train/" sub-directory.
path_base_info = f"{root}train/base_info.csv"
path_annual_report_info = f"{root}train/annual_report_info.csv"
path_change_info = f"{root}train/change_info.csv"
path_news_info = f"{root}train/news_info.csv"
path_other_info = f"{root}train/other_info.csv"
path_tax_info = f"{root}train/tax_info.csv"
path_entprise_info = f"{root}train/entprise_info.csv"

# This file sits directly under the data root, not under "train/".
path_entprise_evaluate = f"{root}entprise_evaluate.csv"

In [440]:
class BaseInfo:
    """Feature cleaner for the enterprise base-info table.

    Wraps a DataFrame and turns its raw columns into model features: time
    columns become years, numeric columns are mean-imputed and cast to
    integers, categorical columns are label encoded.  Every step rebinds
    ``self.data`` to a new frame, so the DataFrame handed to the constructor
    is never modified and the row order of the input is preserved.

    NOTE(review): a fresh ``LabelEncoder`` is fitted per column on every call,
    so two instances (e.g. one for train, one for test) can assign different
    codes to the same raw value -- confirm this is acceptable for the model.
    """

    def __init__(self, data, type='train'):
        """
        :param data: DataFrame holding the raw columns listed in ``data_type``
            and ``useless_columns``.
        :param type: ``'train'`` (default) drops ``id`` together with the other
            unused columns; ``'test'`` keeps ``id`` and drops ``score`` instead.
        """
        self.data = data
        # Processing rule per column: 'time' -> year number, 'int64' ->
        # mean-imputed integer, 'category' -> label-encoded categorical.
        self.data_type = {
            'opfrom': 'time',
            'opto': 'time',
            'reccap': 'int64',
            'enttypeminu': 'category',
            'venind': 'category',
            'enttypeitem': 'category',
            'empnum': 'int64',
            'regcap': 'int64',
            'industryco': 'category',
            'oploc': 'category',
            'oplocdistrict': 'category',
            'regtype': 'category',
            'townsign': 'category',
            'adbusign': 'category',
            'jobid': 'category',
            'orgid': 'category',
            'state': 'category',
            'enttype': 'category',
            'dom': 'category',
            'industryphy': 'category',
            'enttypegb': 'category',
            'opform': 'category'
        }
        # Columns removed at the very end of feature processing.
        # Entries that used to be listed here but are disabled (so these
        # columns are kept as features): orgid, industryco, dom, enttypegb,
        # enttypeitem, opfrom, state, adbusign, jobid, regtype, empnum, venind,
        # enttypeminu, oploc, enttype, oplocdistrict.
        self.useless_columns = [
            'ptbusscope',
            'midpreindcode',
            'protype',
            'forreccap',
            'congro',
            'forregcap',
            'exenum',
            'parnum',
            'compform',
            'opscope',
            'id'
        ]

        if type == 'test':
            self.useless_columns.remove('id')
            self.useless_columns.append('score')

    def fill_nan(self, name, value, column_type):
        """Fill missing entries of column ``name`` with ``value`` and cast it.

        :param name: column to process.
        :param value: replacement for missing entries.
        :param column_type: dtype passed to ``astype`` after filling.
        """
        filled = self.data[name].fillna(value).astype(column_type)
        self.data = self.data.assign(**{name: filled})

    def label_encoder(self, name, column_type):
        """Label-encode the non-missing values of column ``name``.

        Missing entries stay missing; rows keep their original position.

        :param name: column to encode.
        :param column_type: dtype passed to ``astype`` on the encoded column.
        """
        column = self.data[name]
        present = column.notna().to_numpy()
        codes = LabelEncoder().fit_transform(column[present])
        if present.all():
            encoded = pd.Series(codes, index=column.index)
        else:
            # Integer codes cannot sit next to NaN, so fall back to floats for
            # columns that still contain missing values.
            padded = np.full(len(column), np.nan)
            padded[present] = codes
            encoded = pd.Series(padded, index=column.index)
        self.data = self.data.assign(**{name: encoded.astype(column_type)})

    def drop_columns(self, drop_columns):
        """Remove the given columns from ``self.data``.

        :param drop_columns: list of column names to remove.
        """
        self.data = self.data.drop(columns=drop_columns)

    @staticmethod
    def _to_year(value):
        """Parse a 'YYYY-MM-DD' or 'YYYY-MM-DD HH:MM:SS' string into its year."""
        # Date-only strings (10 characters or fewer) get a midnight time part
        # so a single strptime format covers both spellings.
        text = value if len(value) > 10 else value + " 00:00:00"
        return datetime.datetime.strptime(text, '%Y-%m-%d %H:%M:%S').year

    def unify_time(self, name):
        """Replace the timestamp strings in column ``name`` by their year.

        Missing entries are skipped and stay missing; rows keep their position.

        :param name: column holding the timestamp strings.
        """
        years = self.data[name].map(self._to_year, na_action='ignore')
        self.data = self.data.assign(**{name: years})

    def _process_features(self, fill_missing_categories):
        """Shared pipeline behind ``feature_process_v1`` / ``feature_process_v2``.

        :param fill_missing_categories: when true, missing categorical values
            are replaced by the string ``'-1'`` before encoding.
        :return: the processed DataFrame (also stored in ``self.data``).
        """
        for name, kind in self.data_type.items():
            if kind == 'category':
                if fill_missing_categories:
                    self.fill_nan(name, '-1', 'str')
                self.label_encoder(name, 'category')
            elif kind == 'int64':
                # mean() skips missing values, i.e. it is the mean of the
                # observed entries only.
                self.fill_nan(name, self.data[name].mean(), 'int64')
            elif kind == 'time':
                self.unify_time(name)
                # Missing years are imputed with the most frequent year.
                self.fill_nan(name, self.data[name].mode()[0], 'int64')

        diff_year = (self.data['opto'] - self.data['opfrom']).astype('int64')
        self.data = self.data.assign(diff_year=diff_year)
        self.drop_columns(self.useless_columns)
        return self.data

    def feature_process_v1(self):
        """Run the pipeline leaving missing categorical values as missing.

        :return: the processed DataFrame.
        """
        return self._process_features(fill_missing_categories=False)

    def feature_process_v2(self):
        """Run the pipeline treating missing categorical values as the
        separate category ``'-1'``.

        :return: the processed DataFrame.
        """
        return self._process_features(fill_missing_categories=True)
    
def split_data(data_frame, split_ratio):
    """Split a labelled frame into train/validation features and labels.

    The input frame is left untouched (its ``label`` column is not removed),
    so it can still be used after the split.

    :param data_frame: DataFrame containing a ``label`` column.
    :param split_ratio: passed to ``train_test_split`` as ``test_size``.
    :return: ``(train_data, val_data, train_label, val_label)``; the labels are
        single-column DataFrames.
    """
    label = data_frame[['label']]
    features = data_frame.drop(columns=['label'])
    # Fixed seed keeps the split reproducible between runs.
    train_data, val_data, train_label, val_label = train_test_split(features,
                                                                    label,
                                                                    test_size=split_ratio,
                                                                    random_state=2020)
    return train_data, val_data, train_label, val_label



def get_balance_data(data_frame):
    """Balance the classes by resampling the negative rows.

    As many ``label == 0`` rows as there are ``label == 1`` rows are drawn
    with a fixed seed.  Rows are drawn without replacement whenever enough
    negatives exist, so no negative row is duplicated (duplicates could end up
    on both sides of a later train/validation split); replacement is only used
    when there are fewer negatives than positives.

    :param data_frame: DataFrame containing a ``label`` column with 0/1 values.
    :return: new DataFrame with the sampled negatives followed by all positives.
    """
    pos_data = data_frame[data_frame['label'] == 1]
    neg_data = data_frame[data_frame['label'] == 0]
    needs_replacement = len(neg_data) < len(pos_data)
    neg_data = neg_data.sample(n=len(pos_data),
                               axis=0,
                               random_state=2020,
                               replace=needs_replacement)
    return pd.concat([neg_data, pos_data])


def predict_data(data_frame, model):
    """Score a labelled frame and attach probability and hard prediction.

    The input frame is not modified.  The result has a fresh 0..n-1 index; the
    previous index is kept once as the leading column produced by
    ``reset_index()`` (the repeated ``reset_index()`` calls of the earlier
    implementation produced several duplicate copies of that column).

    :param data_frame: DataFrame with the feature columns plus ``label``.
    :param model: fitted classifier exposing ``predict_proba``.
    :return: DataFrame with the features followed by ``label``, ``prob``
        (probability of class 1) and ``pred`` (1 when ``prob > 0.5``, else 0).
    """
    features = data_frame.drop(columns=['label'])
    prob = model.predict_proba(features)[:, 1]

    result = features.reset_index()
    # Positional assignment: rows of `result` are in the same order as the input.
    result['label'] = data_frame['label'].to_numpy()
    result['prob'] = prob
    result['pred'] = (prob > 0.5).astype(int)
    return result


def plot_confusion_matrix(data_frame):
    """Print and plot the confusion matrix of a scored frame.

    Also prints the per-class accuracy, i.e. the share of truly negative
    (resp. positive) rows that were predicted as negative (resp. positive).

    :param data_frame: DataFrame with 0/1 columns ``label`` (truth) and
        ``pred`` (prediction).
    :return: None
    """
    label = data_frame['label']
    pred = data_frame['pred']
    # labels=[0, 1] forces a 2x2 matrix even if one class is absent.
    conf_matrix = confusion_matrix(label, pred, labels=[0, 1])
    print(conf_matrix)

    # scikit-learn layout: rows are the true class, columns the predicted one.
    neg_to_neg = conf_matrix[0, 0]
    neg_to_pos = conf_matrix[0, 1]
    pos_to_neg = conf_matrix[1, 0]
    pos_to_pos = conf_matrix[1, 1]

    neg_total = neg_to_neg + neg_to_pos
    pos_total = pos_to_pos + pos_to_neg
    # A class without any true samples has no defined accuracy.
    neg_acc = neg_to_neg / neg_total if neg_total else float('nan')
    pos_acc = pos_to_pos / pos_total if pos_total else float('nan')

    print("neg acc is: %f" % (neg_acc))
    print("pos acc is: %f" % (pos_acc))

    _, ax = plt.subplots()
    # fmt='d' shows the cell counts as plain integers.
    sns.heatmap(conf_matrix, annot=True, fmt='d', ax=ax)
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title('Confusion Matrix')
    ax.xaxis.set_ticklabels(['0', '1'])
    ax.yaxis.set_ticklabels(['0', '1'])
    plt.show()
    return


def predict_result(model, test_data, result_path):
    """Score the test frame and write an ``id``/``score`` CSV.

    The input frame is not modified.  Rows are written one per input row;
    repeated ids are not aggregated.

    :param model: fitted classifier exposing ``predict_proba``.
    :param test_data: DataFrame with an ``id`` column plus the feature columns.
    :param result_path: destination path of the CSV file.
    :return: None
    """
    features = test_data.drop(columns=['id'])
    test_prob = model.predict_proba(features)[:, 1]
    # Pair ids and scores by position, independent of the frame's index.
    result = pd.DataFrame({'id': test_data['id'].to_numpy(), 'score': test_prob})
    result.to_csv(result_path, index=False)
    return

def plot_SHAP(tree_model, val_data):
    """Draw a SHAP summary plot for a tree model on the given data.

    :param tree_model: fitted tree-based model accepted by ``shap.TreeExplainer``.
    :param val_data: feature matrix the SHAP values are computed for.
    :return: None
    """
    # Imported here because the notebook's import cell does not import shap.
    import shap

    values = shap.TreeExplainer(tree_model).shap_values(val_data)
    # The second argument is the feature matrix the values belong to.
    shap.summary_plot(values, val_data)
    return

In [441]:
# Read every raw CSV into its own DataFrame; names and paths are listed in the
# same order, and the files are read in that order.
(
    df_entprise_info,
    df_base_info,
    df_annual_report_info,
    df_entprise_evaluate,
    df_tax_info,
    df_other_info,
    df_news_info,
    df_change_info,
) = [
    pd.read_csv(csv_path)
    for csv_path in (
        path_entprise_info,
        path_base_info,
        path_annual_report_info,
        path_entprise_evaluate,
        path_tax_info,
        path_other_info,
        path_news_info,
        path_change_info,
    )
]

In [442]:
# `train_data` must be built here: nothing earlier in the notebook defines it,
# so on a fresh kernel this cell would otherwise fail with a NameError.
# NOTE(review): assumes entprise_info.csv holds `id` + `label` and that
# base_info.csv is keyed by `id` -- confirm against the data files.
train_data = df_entprise_info.merge(df_base_info, on='id', how='left')

base_info = BaseInfo(train_data)
train_data = base_info.feature_process_v2()

In [None]:
# Equalise the number of label-0 and label-1 rows before splitting.
train_data = get_balance_data(data_frame=train_data)

In [None]:
# Hold out 10% of the rows for validation.
train_x, val_x, train_y, val_y = split_data(data_frame=train_data, split_ratio=0.1)

In [2]:
# Hyper-parameters forwarded verbatim to the LightGBM classifier.
lgb_param_list = dict(
    boosting_type='gbdt',
    objective_type='binary',
    n_estimators=200,
    learning_rate=0.08,
    max_depth=5,
    num_leaves=31,
    subsample=0.7,
    colsample_bytree=0.5,
    subsample_freq=1,
    min_split_gain=0.5,
    min_child_samples=50,
    reg_alpha=3.5,
    reg_lambda=3.0,
    random_state=2019,
    n_jobs=-1,
)

lgb_model = lgb.LGBMClassifier(**lgb_param_list)

# Both the training and the validation split are evaluated on every round.
# No explicit `categorical_feature` list is passed.
# NOTE(review): `early_stopping_rounds` / `verbose` as fit() keywords are only
# accepted by older LightGBM releases -- confirm the installed version.
lgb_clf = lgb_model.fit(
    train_x,
    train_y,
    eval_set=[(train_x, train_y), (val_x, val_y)],
    eval_metric="binary_error",
    early_stopping_rounds=10,
    verbose=10,
)

In [None]:
# Score the rows of `train_data` with the fitted classifier.
predict_train_data = predict_data(data_frame=train_data, model=lgb_clf)

In [446]:
# Confusion matrix of the predictions computed in the previous cell.
plot_confusion_matrix(data_frame=predict_train_data)

In [None]:
# `test_data` must be built here: nothing earlier in the notebook defines it,
# so on a fresh kernel this cell would otherwise fail with a NameError.
# NOTE(review): assumes entprise_evaluate.csv holds `id` + `score` and that
# base_info.csv is keyed by `id` -- confirm against the data files.
test_data = df_entprise_evaluate.merge(df_base_info, on='id', how='left')

base_info = BaseInfo(test_data, 'test')
test_data = base_info.feature_process_v2()
predict_result(lgb_clf, test_data, root + 'result/baseline_8.csv')