In [1]:
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import Normalizer, MinMaxScaler
%matplotlib inline

In [2]:
# Load data
train_base_sum = pd.read_csv("./data/train/base_train_sum.csv", encoding="gbk")
train_knowledge_sum = pd.read_csv("./data/train/knowledge_train_sum.csv", encoding="gbk")
train_money_report_sum = pd.read_csv("./data/train/money_report_train_sum.csv", encoding="gbk")
train_year_report_sum = pd.read_csv("./data/train/year_report_train_sum.csv", encoding="gbk")
verify_base = pd.read_csv("./data/verify/base_verify1.csv", encoding="gbk")
verify_money_info = pd.read_csv("./data/verify/money_information_verify1.csv", encoding="gbk")
verify_paient_info = pd.read_csv("./data/verify/paient_information_verify1.csv", encoding="gbk")
verify_year_report = pd.read_csv("./data/verify/year_report_verify1.csv", encoding="gbk")

In [3]:
def merge_base_knowledge(base, knowledge):
    # flag = 1 暂定表示为僵尸企业
    # flag除了1之外都是缺失值，使用0补齐，表示为非僵尸企业
    df = pd.merge(base, knowledge, on='ID')
    return df

train_base_knowledge = merge_base_knowledge(train_base_sum, train_knowledge_sum)
verify_base_knowledge = merge_base_knowledge(verify_base, verify_paient_info)

In [4]:
verify_base_knowledge.drop('控制人ID', axis=1, inplace=True)

In [5]:
verify_base_knowledge

Unnamed: 0,ID,注册时间,注册资本,行业,区域,企业类型,控制人类型,控制人持股比例,flag,专利,商标,著作权
0,1500001,2010.0,6680.0,商业服务业,山东,股份有限公司,自然人,0.89,1.0,0.0,0.0,0.0
1,1500321,2001.0,9330.0,商业服务业,广东,股份有限公司,企业法人,0.72,1.0,0.0,0.0,0.0
2,1500395,2003.0,8670.0,交通运输业,广西,农民专业合作社,企业法人,0.60,1.0,1.0,1.0,0.0
3,1500614,2001.0,7730.0,工业,江西,集体所有制企业,企业法人,0.58,1.0,0.0,0.0,0.0
4,1501057,2002.0,6840.0,工业,福建,农民专业合作社,自然人,0.84,1.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
30879,5999996,2011.0,2170.0,零售业,湖北,农民专业合作社,自然人,0.93,0.0,1.0,1.0,0.0
30880,5999997,2013.0,9030.0,交通运输业,福建,集体所有制企业,企业法人,0.73,0.0,1.0,1.0,0.0
30881,5999998,2014.0,4510.0,服务业,湖南,股份有限公司,自然人,0.64,0.0,0.0,0.0,0.0
30882,5999999,2014.0,9130.0,交通运输业,福建,股份有限公司,自然人,0.80,0.0,1.0,1.0,1.0


In [6]:
# train_base_knowledge['注册时间'].fillna(int(train_base_knowledge['注册时间'].mean()), inplace=True)
# train_base_knowledge['注册资本'].fillna(int(train_base_knowledge['注册资本'].mean()), inplace=True)
# train_base_knowledge['控制人持股比例'].fillna(int(train_base_knowledge['控制人持股比例'].mean()), inplace=True)
values = {'注册时间': int(train_base_knowledge['注册时间'].mean()), 
          '注册资本': int(train_base_knowledge['注册资本'].mean()), 
          '控制人持股比例': train_base_knowledge['控制人持股比例'].mean(),
          '行业': 'other',
          '区域': 'other',
          '企业类型': 'other',
          '控制人类型': 'other',
          '专利': 0,
          '商标': 0,
          'flag': 0,
          '著作权': 0,}
values2 = {'注册时间': int(verify_base_knowledge['注册时间'].mean()), 
          '注册资本': int(verify_base_knowledge['注册资本'].mean()), 
          '控制人持股比例': verify_base_knowledge['控制人持股比例'].mean(),
          '行业': 'other',
          '区域': 'other',
          '企业类型': 'other',
          '控制人类型': 'other',
          '专利': 0,
          '商标': 0,
          'flag': 0,
          '著作权': 0,}
train_base_knowledge.fillna(value=values, inplace=True)
verify_base_knowledge.fillna(value=values, inplace=True)

In [7]:
encoder = OneHotEncoder(sparse = False)
X_train = train_base_knowledge.to_numpy()
X_test = verify_base_knowledge.to_numpy()

In [8]:
X_train[0]

array([28, 2007.0, 2050.0, '交通运输业', '福建', '农民专业合作社', '企业法人',
       0.7555609952384151, 1.0, 0.0, 1.0, 1.0], dtype=object)

In [9]:
zh_train = X_train[:, 3:7]
X_train = np.delete(X_train, range(3, 7), axis = 1)
# X_train = X_train[:,19:]
zh_test = X_test[:, 3:7]
X_test = np.delete(X_test, range(3, 7), axis = 1)
# X_test = X_test[:,19:]

In [10]:
X_train

array([[28, 2007.0, 2050.0, ..., 0.0, 1.0, 1.0],
       [230, 2008.0, 3360.0, ..., 0.0, 0.0, 0.0],
       [429, 2005.0, 9670.0, ..., 1.0, 0.0, 0.0],
       ...,
       [5978031, 2008.0, 3290.0, ..., 0.0, 1.0, 1.0],
       [5978032, 2009.0, 6060.0, ..., 1.0, 0.0, 1.0],
       [5978033, 2002.0, 6960.0, ..., 0.0, 1.0, 1.0]], dtype=object)

In [11]:
ans_train = encoder.fit_transform(zh_train)
ans_test = encoder.fit_transform(zh_test)

In [12]:
ans_test

array([[0., 0., 1., ..., 0., 0., 1.],
       [0., 0., 1., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 1., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 1., 0., ..., 0., 0., 1.],
       [0., 1., 0., ..., 0., 1., 0.]])

In [13]:
X_train = np.hstack((X_train,ans_train)).astype('float32')
X_test = np.hstack((X_test,ans_test)).astype('float32')

In [14]:
X_train = pd.DataFrame(X_train).rename(columns = {0: 'ID', 4: 'flag'})
X_test = pd.DataFrame(X_test).rename(columns = {0: 'ID', 4: 'flag'})

In [15]:
X_train

Unnamed: 0,ID,1,2,3,flag,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,28.0,2007.0,2050.0,0.755561,1.0,0.0,1.0,1.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,230.0,2008.0,3360.0,1.000000,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,429.0,2005.0,9670.0,0.750000,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,693.0,2011.0,8360.0,0.980000,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,727.0,2001.0,8720.0,0.540000,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15045,5978029.0,2014.0,460.0,0.710000,0.0,1.0,1.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
15046,5978030.0,2004.0,1140.0,0.950000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
15047,5978031.0,2008.0,3290.0,0.630000,0.0,0.0,1.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
15048,5978032.0,2009.0,6060.0,0.510000,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [16]:
def merge_money_year(money, year): 
    money.loc[:,'year'] = money['year'].fillna(method='bfill')
    year.loc[:,'year'] = year['year'].fillna(method='bfill')
    df = pd.merge(money, year, on=['ID', 'year'])
    return df
train_money_year = merge_money_year(train_money_report_sum, train_year_report_sum)
verify_money_year  =merge_money_year(verify_money_info, verify_year_report)

In [17]:
train_money_year = train_money_year.sort_values(['ID', 'year'])
verify_money_year = verify_money_year.sort_values(['ID', 'year'])

In [18]:
for column in list(train_money_year.columns[train_money_year.isnull().sum() > 0]):
    mean_val = train_money_year[column].mean()
    train_money_year[column].fillna(mean_val, inplace=True)
for column in list(verify_money_year.columns[verify_money_year.isnull().sum() > 0]):
    mean_val = verify_money_year[column].mean()
    verify_money_year[column].fillna(mean_val, inplace=True)

In [19]:
x_train = pd.merge(train_money_year, X_train, on='ID')
x_verify = pd.merge(verify_money_year, X_test, on='ID')

In [20]:
x_verify

Unnamed: 0,ID,year,债权融资额度,债权融资成本,股权融资额度,股权融资成本,内部融资和贸易融资额度,内部融资和贸易融资成本,项目融资和政策融资额度,项目融资和政策融资成本,...,22,23,24,25,26,27,28,29,30,31
0,1500001,2015.0,0.0,0.00,0.0,0.000,0.0,0.00000,334.0,20.040,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,1500001,2016.0,0.0,0.00,0.0,0.000,84168.0,5050.08000,0.0,0.000,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,1500001,2017.0,0.0,0.00,0.0,0.000,91182.0,5470.92000,0.0,0.000,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,1500321,2015.0,0.0,0.00,0.0,0.000,721022.4,43261.34400,0.0,0.000,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,1500321,2016.0,0.0,0.00,0.0,0.000,0.0,0.00000,6997.5,419.850,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92644,5999998,2016.0,0.0,0.00,8659.2,346.368,0.0,1568.00667,0.0,0.000,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
92645,5999998,2017.0,0.0,0.00,0.0,0.000,18265.5,1095.93000,0.0,0.000,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
92646,5999999,2015.0,0.0,0.00,6025.8,241.032,0.0,0.00000,0.0,0.000,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
92647,5999999,2016.0,7304.0,584.32,0.0,0.000,0.0,0.00000,0.0,0.000,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [21]:
y_train = []
y_verify = []
i = 0
while i < len(x_train):
    y_train.append(x_train['flag'][i])
    i += 3

i = 0
while i < len(x_verify):
    y_verify.append(x_verify['flag'][i])
    i += 3

In [22]:
y_train = np.array(y_train).reshape(-1, 1)
y_verify = np.array(y_verify).reshape(-1, 1)

In [23]:
x_train.drop('flag', axis=1, inplace=True)
x_verify.drop('flag', axis=1, inplace=True)

In [24]:
x_train_np = x_train.values
x_verify_np = x_verify.values

In [25]:
x_train

Unnamed: 0,ID,year,债权融资额度,债权融资成本,股权融资额度,股权融资成本,内部融资和贸易融资额度,内部融资和贸易融资成本,项目融资和政策融资额度,项目融资和政策融资成本,...,22,23,24,25,26,27,28,29,30,31
0,28,2015.0,0.0,0.00,0.0,0.00,21648.0,1298.880,0.0,0.000,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,28,2016.0,0.0,0.00,34686.0,1387.44,0.0,0.000,0.0,0.000,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,28,2017.0,0.0,0.00,3444.0,137.76,0.0,0.000,0.0,0.000,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,230,2015.0,0.0,0.00,0.0,0.00,0.0,0.000,470.4,28.224,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,230,2016.0,0.0,0.00,0.0,0.00,46771.2,2806.272,0.0,0.000,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45139,5978032,2016.0,12726.0,1018.08,0.0,0.00,0.0,0.000,0.0,0.000,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
45140,5978032,2017.0,0.0,0.00,0.0,0.00,0.0,0.000,2908.8,174.528,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
45141,5978033,2015.0,18096.0,1447.68,0.0,0.00,0.0,0.000,0.0,0.000,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
45142,5978033,2016.0,0.0,0.00,0.0,0.00,254318.4,15259.104,0.0,0.000,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [26]:
x_train_np  = Normalizer().fit_transform(x_train_np)
x_verify_np = Normalizer().fit_transform(x_verify_np)

In [27]:
x_train_np.shape

(45144, 49)

In [28]:
train = x_train_np.reshape(-1, 3, 49)
verify = x_verify_np.reshape(-1, 3, 49)

In [39]:
print(train.shape, y_train.shape)
y_verify

(15048, 3, 49) (15048, 1)


array([[1.],
       [1.],
       [1.],
       ...,
       [0.],
       [0.],
       [0.]], dtype=float32)

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense, LSTM, Dropout, ConvLSTM2D

In [42]:

def create_model(load=False):
    model = keras.models.Sequential([
        LSTM(500, input_shape=(3, 49), return_sequences=True),
        Dropout(0.25),
        LSTM(200, activation='relu'),
        Dropout(0.25),
        Dense(1)
    ])
    model.compile(optimizer='adam',loss='mse',metrics=['accuracy'])
    if load:
        model.load_weights('./weights')
    return model

In [40]:
def create_model1(load=False):
    model = keras.models.Sequential([
        ConvLSTM2D(64, kernel_size=(3, 3), activation='relu', padding='same',input_shape=(3, 49),return_sequences=True),
        Dropout(0.25),
        LSTM(32, activation='relu'),
        Dropout(0.25),
        Dense(1)
    ])
    model.compile(optimizer='adam',loss='mse',metrics=['accuracy'])
    if load:
        model.load_weights('./weights')
    return model

In [41]:
model = create_model1()

NameError: name 'ConvLSTM2D' is not defined

In [38]:
EPOCHS = 100

model.fit(train, y_train, epochs=EPOCHS, batch_size=100,  validation_data=(verify, y_verify), )

Train on 15048 samples, validate on 30883 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100


Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x193b4ff7b88>

In [None]:
predict_res = model.predict(verify)

In [None]:
for