In [1]:
import pandas as pd
import numpy as np

In [2]:
def load_csv(file_path, converters):
    """Read a UTF-8 encoded CSV file into a DataFrame.

    Args:
        file_path: Path of the CSV file to read.
        converters: Mapping of column name to a callable applied to that
            column's raw values; forwarded unchanged to ``pd.read_csv``.

    Returns:
        The parsed ``pandas.DataFrame``. ``index_col=False`` is passed so
        that no column of the file is used as the index.
    """
    read_options = {
        'converters': converters,
        'index_col': False,
        'encoding': 'utf-8',
    }
    return pd.read_csv(file_path, **read_options)

In [3]:
# Load the id table that is later left-joined with base_info on 'id'.
# The empty converters dict means pandas infers every column's dtype.
train_id = load_csv('../ccf_data/entprise_info.csv', converters={})

In [4]:
# Load base_info.csv. The listed code columns go through the `str` converter
# so they are kept as raw strings rather than being parsed as numbers.
base_info = load_csv(
    '../ccf_data/base_info.csv',
    converters={
        column_name: str
        for column_name in (
            'oplocdistrict', 'enttype', 'enttypegb',
            'state', 'orgid', 'jobid', 'adbusign',
            'townsign', 'regtype',
        )
    },
)

## base info

In [5]:
# Industry sub-category code: normalise into a discrete (string) variable.
# Missing codes become the sentinel '-1'. The result is assigned back to the
# column instead of calling fillna(inplace=True) on the selected Series:
# under pandas Copy-on-Write that call fills a temporary, leaves the NaNs in
# the frame, and the integer cast below would then fail.
base_info['industryco'] = (
    base_info['industryco']
    .fillna(-1)
    .astype('int')   # drop the float representation caused by NaN
    .astype('str')
)

In [6]:
# Enterprise type sub-class: normalise into a discrete (string) variable.
# Missing values become the sentinel '-1'. Assign the result back rather than
# using fillna(inplace=True) on the selected Series, which only fills a
# temporary under pandas Copy-on-Write and would make the int cast fail.
base_info['enttypeitem'] = (
    base_info['enttypeitem']
    .fillna(-1)
    .astype('int')   # drop the float representation caused by NaN
    .astype('str')
)

In [7]:
# Number of employees: normalise as a continuous (integer) variable.
# Missing values become the sentinel -1. Assign the result back rather than
# using fillna(inplace=True) on the selected Series, which only fills a
# temporary under pandas Copy-on-Write and would make the int cast fail.
base_info['empnum'] = base_info['empnum'].fillna(-1).astype('int')

In [8]:
# Organisational form: normalise into a discrete (string) variable.
# Missing values become the sentinel '-1'. Assign the result back rather than
# using fillna(inplace=True) on the selected Series, which only fills a
# temporary under pandas Copy-on-Write and would make the int cast fail.
base_info['compform'] = (
    base_info['compform']
    .fillna(-1)
    .astype('int')   # drop the float representation caused by NaN
    .astype('str')
)

In [9]:
# Number of partners: normalise as a continuous (integer) variable.
# Missing values become the sentinel -1. Assign the result back rather than
# using fillna(inplace=True) on the selected Series, which only fills a
# temporary under pandas Copy-on-Write and would make the int cast fail.
base_info['parnum'] = base_info['parnum'].fillna(-1).astype('int')

In [10]:
# Number of executing persons: normalise as a continuous (integer) variable.
# Missing values become the sentinel -1. Assign the result back rather than
# using fillna(inplace=True) on the selected Series, which only fills a
# temporary under pandas Copy-on-Write and would make the int cast fail.
base_info['exenum'] = base_info['exenum'].fillna(-1).astype('int')

In [11]:
# Risk industry: normalise into a discrete (string) variable.
# Missing values become the sentinel '-1'. Assign the result back rather than
# using fillna(inplace=True) on the selected Series, which only fills a
# temporary under pandas Copy-on-Write and would make the int cast fail.
base_info['venind'] = (
    base_info['venind']
    .fillna(-1)
    .astype('int')   # drop the float representation caused by NaN
    .astype('str')
)

In [12]:
# Enterprise type fine-grained class: normalise into a discrete (string) variable.
# Missing values become the sentinel '-1'. Assign the result back rather than
# using fillna(inplace=True) on the selected Series, which only fills a
# temporary under pandas Copy-on-Write and would make the int cast fail.
base_info['enttypeminu'] = (
    base_info['enttypeminu']
    .fillna(-1)
    .astype('int')   # drop the float representation caused by NaN
    .astype('str')
)

In [13]:
# Project type: normalise into a discrete (string) variable.
# Missing values become the sentinel '-1'. Assign the result back rather than
# using fillna(inplace=True) on the selected Series, which only fills a
# temporary under pandas Copy-on-Write and would make the int cast fail.
base_info['protype'] = (
    base_info['protype']
    .fillna(-1)
    .astype('int')   # drop the float representation caused by NaN
    .astype('str')
)

In [14]:
# Set missing values of the various capital-amount columns to the sentinel -1.
# The filled sub-frame is assigned back in one step; calling
# fillna(inplace=True) on each selected Series would only fill a temporary
# under pandas Copy-on-Write and leave the NaNs in base_info.
cols = ['regcap', 'reccap', 'forreccap', 'forregcap', 'congro']
base_info[cols] = base_info[cols].fillna(-1)

In [15]:
# Show the dtype of every base_info column after the normalisation cells.
base_info.dtypes

id                object
oplocdistrict     object
industryphy       object
industryco        object
dom               object
opscope           object
enttype           object
enttypeitem       object
opfrom            object
opto              object
state             object
orgid             object
jobid             object
adbusign          object
townsign          object
regtype           object
empnum             int32
compform          object
parnum             int32
exenum             int32
opform            object
ptbusscope       float64
venind            object
enttypeminu       object
midpreindcode    float64
protype           object
oploc             object
regcap           float64
reccap           float64
forreccap        float64
forregcap        float64
congro           float64
enttypegb         object
dtype: object

In [16]:
# Drop the empty-valued columns.
# DataFrame.drop with errors='ignore' keeps this cell re-runnable: the
# previous `del base_info[col]` loop raised KeyError on a second execution
# because the columns were already gone.
drop_list = ['opform','ptbusscope','midpreindcode']

base_info = base_info.drop(columns=drop_list, errors='ignore')

In [17]:
# List the columns that remain in base_info.
base_info.columns

Index(['id', 'oplocdistrict', 'industryphy', 'industryco', 'dom', 'opscope',
       'enttype', 'enttypeitem', 'opfrom', 'opto', 'state', 'orgid', 'jobid',
       'adbusign', 'townsign', 'regtype', 'empnum', 'compform', 'parnum',
       'exenum', 'venind', 'enttypeminu', 'protype', 'oploc', 'regcap',
       'reccap', 'forreccap', 'forregcap', 'congro', 'enttypegb'],
      dtype='object')

In [18]:
# Write base_info to CSV; index=False omits the row index from the file.
base_info.to_csv('../code/fea_explore/base_info.csv', index=False)

## 根据标签探索非法集资风险特征

In [21]:
# List the base_info columns available for the label-based exploration.
base_info.columns

Index(['id', 'oplocdistrict', 'industryphy', 'industryco', 'dom', 'opscope',
       'enttype', 'enttypeitem', 'opfrom', 'opto', 'state', 'orgid', 'jobid',
       'adbusign', 'townsign', 'regtype', 'empnum', 'compform', 'parnum',
       'exenum', 'venind', 'enttypeminu', 'protype', 'oploc', 'regcap',
       'reccap', 'forreccap', 'forregcap', 'congro', 'enttypegb'],
      dtype='object')

In [22]:
# Left-join base_info onto train_id by 'id': every row of train_id is kept,
# and ids without a base_info match get NaN in the joined columns.
train_dataset = train_id.merge(base_info, how='left', on='id')

In [24]:
# List the columns of the merged frame.
train_dataset.columns

Index(['id', 'label', 'oplocdistrict', 'industryphy', 'industryco', 'dom',
       'opscope', 'enttype', 'enttypeitem', 'opfrom', 'opto', 'state', 'orgid',
       'jobid', 'adbusign', 'townsign', 'regtype', 'empnum', 'compform',
       'parnum', 'exenum', 'venind', 'enttypeminu', 'protype', 'oploc',
       'regcap', 'reccap', 'forreccap', 'forregcap', 'congro', 'enttypegb'],
      dtype='object')

In [49]:
# Count how often each parnum value occurs within each label group.
train_dataset.groupby('label')['parnum'].value_counts()

label  parnum
0      -1        12758
        2          549
        1          399
        3          108
        4           28
        5           16
        6            3
        7            2
        8            2
        9            2
        13           2
        27           2
        46           2
        10           1
        12           1
        14           1
        22           1
        23           1
        25           1
        33           1
        37           1
        41           1
        44           1
        49           1
1      -1          437
        2          292
        3           86
        4           37
        5           13
        6           11
                 ...  
        10           4
        12           4
        19           4
        9            3
        20           3
        21           3
        24           3
        30           3
        32           3
        38           3
        40           3
        41          

## other info


In [68]:
# Load other_info.csv, reading the 'id' column through the str converter.
other_info = load_csv('../ccf_data/other_info.csv', converters={'id': str})

In [69]:
# Display the loaded other_info frame.
other_info

Unnamed: 0,id,legal_judgment_num,brand_num,patent_num
0,f000950527a6feb6d340f91da09e61347d8200cd2f0d1602,4.0,,
1,f000950527a6feb608dd9322b74a99f60851207f36a3c94c,1.0,,
2,d8071a739aa75a3b9f23966f8dae78fd226c272515b9c255,2.0,,
3,216bd2aaf4d079242209b1496f81a36c7abed9dd0bb65ed3,,1.0,
4,e9f7b28ec10e0470de9631c789f49acdd4e7cf9ed6db094b,,2.0,
5,d8071a739aa75a3bcf6fb0041ee883243251d30025ab9d45,1.0,,
6,f000950527a6feb6f97af739bb95531db891a11df80bdb8b,3.0,,
7,f000950527a6feb65929509d9be855bf75b7337d4465843e,47.0,,
8,f000950527a6feb6a81704c38a21ae17f4f09eaa9aa77bea,,3.0,
9,f000950527a6feb6c0ab37e6eb4a4bf25ee20f0ff9ff070d,,1.0,


In [70]:
# Columns of other_info to be filled and cast to float64.
convert_col = ['legal_judgment_num', 'brand_num', 'patent_num' ]

In [71]:
# Replace missing values with the sentinel -1 and store the columns as
# float64. The result is assigned back in one step; calling
# fillna(inplace=True) on each selected Series would only fill a temporary
# under pandas Copy-on-Write and leave the NaNs in other_info.
other_info[convert_col] = other_info[convert_col].fillna(-1).astype('float64')

In [72]:
# Show the dtype of every other_info column after the conversion.
other_info.dtypes

id                     object
legal_judgment_num    float64
brand_num             float64
patent_num            float64
dtype: object

In [73]:
# Write other_info to CSV; index=False omits the row index from the file.
other_info.to_csv('../code/fea_explore/other_info.csv', index=False)