In [1]:
import pandas as pd
import numpy as np

In [2]:
def load_csv(file_path, converters=None):
    """Read a UTF-8 encoded CSV file into a DataFrame.

    Parameters
    ----------
    file_path : str, path-like or file-like
        Forwarded unchanged to ``pd.read_csv``.
    converters : dict, optional
        Mapping of column name to a callable applied to each raw cell of that
        column, forwarded to ``pd.read_csv``. ``None`` (the default) applies
        no per-column conversion, so callers no longer need to pass ``{}``.

    Returns
    -------
    pandas.DataFrame
        The parsed file. ``index_col=False`` is passed so that no column of
        the file is used as the index.
    """
    return pd.read_csv(file_path, converters=converters, index_col=False, encoding='utf-8')

In [3]:
# Enterprise id list used as the left side of the training merge further down.
train_id = load_csv(file_path='../ccf_data/entprise_info.csv', converters={})

In [4]:
# Columns that are read through the `str` converter instead of pandas' type inference.
_base_info_str_columns = ['oplocdistrict', 'enttype', 'enttypegb',
                          'state', 'orgid', 'jobid', 'adbusign',
                          'townsign', 'regtype']
base_info = load_csv('../ccf_data/base_info.csv',
                     converters={name: str for name in _base_info_str_columns})

## base info

In [5]:
# Industry fine-grained class code: normalise into a discrete (string) variable.
# The filled Series is assigned back rather than calling fillna(inplace=True) on
# the `base_info[...]` selection: with pandas Copy-on-Write that chained in-place
# call only touches a temporary, the NaNs survive, and astype('int') then raises.
base_info['industryco'] = base_info['industryco'].fillna(-1).astype('int').astype('str')

In [6]:
# Enterprise type sub-category: normalise into a discrete (string) variable.
# The filled Series is assigned back rather than calling fillna(inplace=True) on
# the `base_info[...]` selection: with pandas Copy-on-Write that chained in-place
# call only touches a temporary, the NaNs survive, and astype('int') then raises.
base_info['enttypeitem'] = base_info['enttypeitem'].fillna(-1).astype('int').astype('str')

In [7]:
# Number of employees: normalise as a continuous (integer) variable, -1 = missing.
# The filled Series is assigned back rather than calling fillna(inplace=True) on
# the `base_info[...]` selection: with pandas Copy-on-Write that chained in-place
# call only touches a temporary, the NaNs survive, and astype('int') then raises.
base_info['empnum'] = base_info['empnum'].fillna(-1).astype('int')

In [8]:
# Organisational form: normalise into a discrete (string) variable.
# The filled Series is assigned back rather than calling fillna(inplace=True) on
# the `base_info[...]` selection: with pandas Copy-on-Write that chained in-place
# call only touches a temporary, the NaNs survive, and astype('int') then raises.
base_info['compform'] = base_info['compform'].fillna(-1).astype('int').astype('str')

In [9]:
# Number of partners: normalise as a continuous (integer) variable, -1 = missing.
# The filled Series is assigned back rather than calling fillna(inplace=True) on
# the `base_info[...]` selection: with pandas Copy-on-Write that chained in-place
# call only touches a temporary, the NaNs survive, and astype('int') then raises.
base_info['parnum'] = base_info['parnum'].fillna(-1).astype('int')

In [10]:
# Number of executive persons: normalise as a continuous (integer) variable, -1 = missing.
# The filled Series is assigned back rather than calling fillna(inplace=True) on
# the `base_info[...]` selection: with pandas Copy-on-Write that chained in-place
# call only touches a temporary, the NaNs survive, and astype('int') then raises.
base_info['exenum'] = base_info['exenum'].fillna(-1).astype('int')

In [11]:
# Risk industry: normalise into a discrete (string) variable.
# The filled Series is assigned back rather than calling fillna(inplace=True) on
# the `base_info[...]` selection: with pandas Copy-on-Write that chained in-place
# call only touches a temporary, the NaNs survive, and astype('int') then raises.
base_info['venind'] = base_info['venind'].fillna(-1).astype('int').astype('str')

In [12]:
# Enterprise type fine-grained class: normalise into a discrete (string) variable.
# The filled Series is assigned back rather than calling fillna(inplace=True) on
# the `base_info[...]` selection: with pandas Copy-on-Write that chained in-place
# call only touches a temporary, the NaNs survive, and astype('int') then raises.
base_info['enttypeminu'] = base_info['enttypeminu'].fillna(-1).astype('int').astype('str')

In [13]:
# Project type: normalise into a discrete (string) variable.
# The filled Series is assigned back rather than calling fillna(inplace=True) on
# the `base_info[...]` selection: with pandas Copy-on-Write that chained in-place
# call only touches a temporary, the NaNs survive, and astype('int') then raises.
base_info['protype'] = base_info['protype'].fillna(-1).astype('int').astype('str')

In [14]:
# Fill missing values of the various registered-capital amount columns with -1.
# The filled columns are assigned back in one step: a chained
# `base_info[col].fillna(-1, inplace=True)` only modifies a temporary under
# pandas Copy-on-Write and would silently leave the NaNs in place.
cols = ['regcap', 'reccap', 'forreccap', 'forregcap', 'congro']
base_info[cols] = base_info[cols].fillna(-1)

In [15]:
# Check the column dtypes after the normalisation cells above.
base_info.dtypes

id                object
oplocdistrict     object
industryphy       object
industryco        object
dom               object
opscope           object
enttype           object
enttypeitem       object
opfrom            object
opto              object
state             object
orgid             object
jobid             object
adbusign          object
townsign          object
regtype           object
empnum             int32
compform          object
parnum             int32
exenum             int32
opform            object
ptbusscope       float64
venind            object
enttypeminu       object
midpreindcode    float64
protype           object
oploc             object
regcap           float64
reccap           float64
forreccap        float64
forregcap        float64
congro           float64
enttypegb         object
dtype: object

In [16]:
# Drop the empty columns.
drop_list = ['opform','ptbusscope','midpreindcode']
# A single drop call; like `del`, it raises KeyError if a listed column is absent.
base_info = base_info.drop(columns=drop_list)

In [17]:
# Confirm which columns remain after dropping the empty ones.
base_info.columns

Index(['id', 'oplocdistrict', 'industryphy', 'industryco', 'dom', 'opscope',
       'enttype', 'enttypeitem', 'opfrom', 'opto', 'state', 'orgid', 'jobid',
       'adbusign', 'townsign', 'regtype', 'empnum', 'compform', 'parnum',
       'exenum', 'venind', 'enttypeminu', 'protype', 'oploc', 'regcap',
       'reccap', 'forreccap', 'forregcap', 'congro', 'enttypegb'],
      dtype='object')

In [18]:
# Persist the cleaned base info; the index carries no information here, so it is not written.
base_info.to_csv(path_or_buf='../code/fea_explore/base_info.csv', index=False)

## 根据标签探索非法集资风险特征

In [21]:
# List the base_info columns that will be available for the label-based exploration.
base_info.columns

Index(['id', 'oplocdistrict', 'industryphy', 'industryco', 'dom', 'opscope',
       'enttype', 'enttypeitem', 'opfrom', 'opto', 'state', 'orgid', 'jobid',
       'adbusign', 'townsign', 'regtype', 'empnum', 'compform', 'parnum',
       'exenum', 'venind', 'enttypeminu', 'protype', 'oploc', 'regcap',
       'reccap', 'forreccap', 'forregcap', 'congro', 'enttypegb'],
      dtype='object')

In [22]:
# Left join: keep every row of train_id and attach the matching base_info columns by id.
train_dataset = train_id.merge(base_info, how='left', on='id')

In [24]:
# Columns of the merged training frame.
train_dataset.columns

Index(['id', 'label', 'oplocdistrict', 'industryphy', 'industryco', 'dom',
       'opscope', 'enttype', 'enttypeitem', 'opfrom', 'opto', 'state', 'orgid',
       'jobid', 'adbusign', 'townsign', 'regtype', 'empnum', 'compform',
       'parnum', 'exenum', 'venind', 'enttypeminu', 'protype', 'oploc',
       'regcap', 'reccap', 'forreccap', 'forregcap', 'congro', 'enttypegb'],
      dtype='object')

In [49]:
# Frequency of each `parnum` value within each label group.
parnum_by_label = train_dataset.groupby('label')['parnum']
parnum_by_label.value_counts()

label  parnum
0      -1        12758
        2          549
        1          399
        3          108
        4           28
        5           16
        6            3
        7            2
        8            2
        9            2
        13           2
        27           2
        46           2
        10           1
        12           1
        14           1
        22           1
        23           1
        25           1
        33           1
        37           1
        41           1
        44           1
        49           1
1      -1          437
        2          292
        3           86
        4           37
        5           13
        6           11
                 ...  
        10           4
        12           4
        19           4
        9            3
        20           3
        21           3
        24           3
        30           3
        32           3
        38           3
        40           3
        41          

## other info


In [68]:
# `id` is read through the `str` converter rather than pandas' type inference.
other_info = load_csv(file_path='../ccf_data/other_info.csv', converters=dict(id=str))

In [69]:
# Preview the first rows only instead of rendering the whole frame.
other_info.head(10)

Unnamed: 0,id,legal_judgment_num,brand_num,patent_num
0,f000950527a6feb6d340f91da09e61347d8200cd2f0d1602,4.0,,
1,f000950527a6feb608dd9322b74a99f60851207f36a3c94c,1.0,,
2,d8071a739aa75a3b9f23966f8dae78fd226c272515b9c255,2.0,,
3,216bd2aaf4d079242209b1496f81a36c7abed9dd0bb65ed3,,1.0,
4,e9f7b28ec10e0470de9631c789f49acdd4e7cf9ed6db094b,,2.0,
5,d8071a739aa75a3bcf6fb0041ee883243251d30025ab9d45,1.0,,
6,f000950527a6feb6f97af739bb95531db891a11df80bdb8b,3.0,,
7,f000950527a6feb65929509d9be855bf75b7337d4465843e,47.0,,
8,f000950527a6feb6a81704c38a21ae17f4f09eaa9aa77bea,,3.0,
9,f000950527a6feb6c0ab37e6eb4a4bf25ee20f0ff9ff070d,,1.0,


In [70]:
# Count columns of other_info that get the missing-value fill and float cast below.
convert_col = [
    'legal_judgment_num',
    'brand_num',
    'patent_num',
]

In [71]:
# Replace missing counts with the sentinel -1 and make the dtype explicit.
# The result is assigned back: a chained `other_info[col].fillna(-1, inplace=True)`
# only modifies a temporary under pandas Copy-on-Write, and because float64
# accepts NaN the failure would go unnoticed.
other_info[convert_col] = other_info[convert_col].fillna(-1).astype('float64')

In [72]:
# Verify that the three count columns are float64 after the fill/cast.
other_info.dtypes

id                     object
legal_judgment_num    float64
brand_num             float64
patent_num            float64
dtype: object

#### 发现重复项

In [80]:
# Boolean mask of rows whose `id` already occurred in an earlier row.
isDuplicated = other_info.duplicated(subset='id')
# Reuse the mask instead of computing it a second time.
print(other_info[isDuplicated])

                                                    id  legal_judgment_num  \
836   e9f7b28ec10e04707ba878b89e6c2d362b107a817342f9c6                12.0   
1120  f000950527a6feb63702b1f6c1dabe5ea196d320bbbff425                27.0   

      brand_num  patent_num  
836         9.0         8.0  
1120       70.0        77.0  


In [81]:
# Show every row belonging to the two ids reported as duplicated.
duplicated_ids = [
    'e9f7b28ec10e04707ba878b89e6c2d362b107a817342f9c6',
    'f000950527a6feb63702b1f6c1dabe5ea196d320bbbff425',
]
other_info.loc[other_info['id'].isin(duplicated_ids)]

Unnamed: 0,id,legal_judgment_num,brand_num,patent_num
835,e9f7b28ec10e04707ba878b89e6c2d362b107a817342f9c6,2.0,9.0,8.0
836,e9f7b28ec10e04707ba878b89e6c2d362b107a817342f9c6,12.0,9.0,8.0
1119,f000950527a6feb63702b1f6c1dabe5ea196d320bbbff425,27.0,68.0,77.0
1120,f000950527a6feb63702b1f6c1dabe5ea196d320bbbff425,27.0,70.0,77.0


In [82]:
# On inspection the later record of each duplicated id looks more trustworthy,
# so only the last occurrence per id is kept.
other_info = other_info.drop_duplicates(subset='id', keep='last')

In [83]:
# Re-check the two previously duplicated ids: one row each should remain.
duplicated_ids = [
    'e9f7b28ec10e04707ba878b89e6c2d362b107a817342f9c6',
    'f000950527a6feb63702b1f6c1dabe5ea196d320bbbff425',
]
other_info.loc[other_info['id'].isin(duplicated_ids)]

Unnamed: 0,id,legal_judgment_num,brand_num,patent_num
836,e9f7b28ec10e04707ba878b89e6c2d362b107a817342f9c6,12.0,9.0,8.0
1120,f000950527a6feb63702b1f6c1dabe5ea196d320bbbff425,27.0,70.0,77.0


In [84]:
# Persist the de-duplicated other_info without the row index.
other_info.to_csv(path_or_buf='../code/fea_explore/other_info.csv', index=False)

## news info

In [85]:
# `id` is read through the `str` converter rather than pandas' type inference.
news_info = load_csv(file_path='../ccf_data/news_info.csv', converters=dict(id=str))

In [86]:
# Preview the first rows only instead of rendering the whole frame.
news_info.head(10)

Unnamed: 0,id,positive_negtive,public_date
0,f000950527a6feb62669d6a175fe6fdccd1eb4f7ca8e5016,积极,2016-12-30
1,f000950527a6feb6e8bd9919e2ca363359bcfa997a0f9de7,中立,2017-08-09
2,f000950527a6feb6e8bd9919e2ca363359bcfa997a0f9de7,消极,2016-02-29
3,d8071a739aa75a3bcf6fb0041ee883243251d30025ab9d45,中立,2018-06-08
4,f000950527a6feb6d71de3382afa0bc5ff87bb65477f698a,积极,2015-06-29
5,f000950527a6feb6d71de3382afa0bc5ff87bb65477f698a,积极,2015-06-15
6,f000950527a6feb65929509d9be855bf75b7337d4465843e,积极,2019-10-26
7,f000950527a6feb65929509d9be855bf75b7337d4465843e,积极,2017-11-01
8,f000950527a6feb68b2de1c0fb54cf2919d4029742a7a1c4,中立,2018-04-20
9,f000950527a6feb68b2de1c0fb54cf2919d4029742a7a1c4,中立,2018-01-08


In [88]:
# Number of news rows per sentiment value.
news_info['positive_negtive'].value_counts()

积极    5350
中立    4133
消极    1035
Name: positive_negtive, dtype: int64

In [89]:
# News sentiment (positive / negative) forms a time series, which can indirectly
# show whether an enterprise is gradually getting better or worse.
# Feature ideas: 1. number of news items per enterprise
#                2. trend of the news sentiment per enterprise over time

### 计算新闻总数量

In [133]:
# Non-null sentiment entries per enterprise id, as a one-column frame indexed by id.
news_per_id = news_info.groupby('id')['positive_negtive'].count()
df = news_per_id.to_frame()

In [134]:
# Total news count per enterprise. `id` intentionally stays in the index: the
# previous `df_sum = df.reset_index()` was a dead store (overwritten on the next
# line from `df`), and the later merge on 'id' and the final to_csv both rely on
# the id-indexed layout.
df_sum = df.rename(columns={'positive_negtive': 'news_sum'})

### 计算积极新闻数量

In [142]:
# Keep only the news rows whose sentiment is '积极' (positive).
is_positive = news_info['positive_negtive'] == '积极'
df = news_info[is_positive]

In [143]:
# Rows per enterprise id in the filtered frame, as a one-column frame indexed by id.
df = df.groupby('id')['positive_negtive'].count().to_frame()

In [144]:
# Positive news count per enterprise. `id` intentionally stays in the index: the
# previous `df_pos = df.reset_index()` was a dead store (overwritten on the next
# line from `df`), and the later merge on 'id' relies on the id-indexed layout.
df_pos = df.rename(columns={'positive_negtive': 'news_pos_sum'})

### 计算中立新闻数量

In [150]:
# Keep only the news rows whose sentiment is '中立' (neutral).
is_neutral = news_info['positive_negtive'] == '中立'
df = news_info[is_neutral]

In [151]:
# Rows per enterprise id in the filtered frame, as a one-column frame indexed by id.
df = df.groupby('id')['positive_negtive'].count().to_frame()

In [153]:
# Neutral news count per enterprise. `id` intentionally stays in the index: the
# previous `df_mid = df.reset_index()` was a dead store (overwritten on the next
# line from `df`), and the later merge on 'id' relies on the id-indexed layout.
df_mid = df.rename(columns={'positive_negtive': 'news_mid_sum'})

### 计算消极新闻数量

In [159]:
# Keep only the news rows whose sentiment is '消极' (negative).
is_negative = news_info['positive_negtive'] == '消极'
df = news_info[is_negative]

In [160]:
# Rows per enterprise id in the filtered frame, as a one-column frame indexed by id.
df = df.groupby('id')['positive_negtive'].count().to_frame()

In [161]:
# Negative news count per enterprise. `id` intentionally stays in the index: the
# previous `df_neg = df.reset_index()` was a dead store (overwritten on the next
# line from `df`), and the later merge on 'id' relies on the id-indexed layout.
df_neg = df.rename(columns={'positive_negtive': 'news_neg_sum'})

### 合并新闻数量特征

In [170]:
from functools import reduce


def _left_merge_on_id(left, right):
    """Left-join `right` onto `left` using the shared `id` key."""
    return pd.merge(left, right, on=['id'], how='left')


# df_sum goes first: as the left-most frame of a chain of left joins, its ids
# (every enterprise with at least one counted news row) define the result rows.
dfs = [df_sum, df_pos, df_mid, df_neg]
df_new_info = reduce(_left_merge_on_id, dfs)

In [171]:
# Ids without any news of a given sentiment get NaN from the left joins; count those as 0.
df_new_info = df_new_info.fillna(0)

In [175]:
# Preview only: show `id` as a regular column for the first rows. The result is
# not assigned, so df_new_info itself keeps `id` in its index.
df_new_info.reset_index().head(10)

Unnamed: 0,id,news_sum,news_pos_sum,news_mid_sum,news_neg_sum
0,09912c34159b1720558a419983a989f1dd2e0ed69a044ca3,6,0.0,6.0,0.0
1,175ebe5f059ec050afbd65251ecdd3b512bfbe5e62d041b0,7,4.0,3.0,0.0
2,216bd2aaf4d079240c3ac0b76f0ef4aa355d443880ba78db,3,2.0,1.0,0.0
3,216bd2aaf4d079240f5823e63d24b44dd2c58e3281b822f6,2,0.0,2.0,0.0
4,216bd2aaf4d0792410725ba5e7ca1dc32ce55767372f2030,1,0.0,0.0,1.0
5,216bd2aaf4d0792441e3b7277b9cefa0291c793d81a3c0d6,1,0.0,0.0,1.0
6,216bd2aaf4d07924429c35aadfa374ad5c993210e02e7b7e,1,0.0,1.0,0.0
7,216bd2aaf4d0792458ca0011e529060363ac177eae61e131,1,0.0,1.0,0.0
8,216bd2aaf4d07924595c1b1041acae96b2a283d5c4acc607,1,1.0,0.0,0.0
9,216bd2aaf4d079245e336a1cb6e6c79784b82025f2256136,1,1.0,0.0,0.0


In [182]:
# Column check: `id` is not listed because it is held in the index, not as a column.
df_new_info.columns

Index(['news_sum', 'news_pos_sum', 'news_mid_sum', 'news_neg_sum'], dtype='object')

In [183]:
# Inspect dtypes before the integer cast in the next cell.
df_new_info.dtypes

news_sum          int64
news_pos_sum    float64
news_mid_sum    float64
news_neg_sum    float64
dtype: object

In [186]:
# Cast all four count columns to integers in a single astype call.
col_list = ['news_sum', 'news_pos_sum', 'news_mid_sum', 'news_neg_sum']
df_new_info = df_new_info.astype({name: 'int' for name in col_list})

In [187]:
# Confirm that all count columns are now integer-typed.
df_new_info.dtypes

news_sum        int32
news_pos_sum    int32
news_mid_sum    int32
news_neg_sum    int32
dtype: object

In [188]:
# The index is written on purpose: `id` is held in the index of df_new_info, so
# dropping the index here (as the other exports do) would lose the id column.
df_new_info.to_csv('../code/fea_explore/news_info.csv', index=True)