In [1]:
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, Normalizer, MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn import svm, metrics
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold
import time
%matplotlib inline

In [2]:
# Load data
train_base = pd.read_csv("./data/train/base_train_sum.csv", encoding="gbk")
train_knowledge = pd.read_csv("./data/train/knowledge_train_sum.csv", encoding="gbk")
train_money = pd.read_csv("./data/train/money_report_train_sum.csv", encoding="gbk")
train_year = pd.read_csv("./data/train/year_report_train_sum.csv", encoding="gbk")
verify_base = pd.read_csv("./data/verify/base_verify1.csv", encoding="gbk")
verify_money = pd.read_csv("./data/verify/money_information_verify1.csv", encoding="gbk")
verify_knowledge = pd.read_csv("./data/verify/paient_information_verify1.csv", encoding="gbk")
verify_year = pd.read_csv("./data/verify/year_report_verify1.csv", encoding="gbk")

In [42]:
test_base = pd.read_csv("./data/test/base_test_sum.csv", encoding="gbk")
test_money = pd.read_csv("./data/test/money_report_test_sum.csv", encoding="gbk")
test_knowledge = pd.read_csv("./data/test/knowledge_test_sum.csv", encoding="gbk")
test_year = pd.read_csv("./data/test/year_report_test_sum.csv", encoding="gbk")

### 可以看到测试集里面都没有缺失值

In [51]:
print(test_base.isnull().any())
print(test_money.isnull().any())
print(test_knowledge.isnull().any())
print(test_year.isnull().any())

ID         False
注册时间       False
注册资本       False
行业         False
区域         False
企业类型       False
控制人类型      False
控制人持股比例    False
dtype: bool
ID             False
year           False
债权融资额度         False
债权融资成本         False
股权融资额度         False
股权融资成本         False
内部融资和贸易融资额度    False
内部融资和贸易融资成本    False
项目融资和政策融资额度    False
项目融资和政策融资成本    False
dtype: bool
ID     False
专利     False
商标     False
著作权    False
dtype: bool
ID         False
year       False
从业人数       False
资产总额       False
负债总额       False
营业总收入      False
主营业务收入     False
利润总额       False
净利润        False
纳税总额       False
所有者权益合计    False
dtype: bool


In [4]:
def merge_base_knowledge():
    df_train = pd.merge(train_knowledge, train_base, on='ID')
    df_verify = pd.merge(verify_knowledge, verify_base, on='ID')
    return df_train, df_verify
train_base_knowledge, verify_base_knowledge = merge_base_knowledge()

In [53]:
test_base_knowledge = pd.merge(test_knowledge, test_base, on='ID')

In [54]:
print(test_base_knowledge.shape)
test_base_knowledge

(10000, 11)


Unnamed: 0,ID,专利,商标,著作权,注册时间,注册资本,行业,区域,企业类型,控制人类型,控制人持股比例
0,8024137,1,0,1,2008,6410,工业,江西,合伙企业,企业法人,0.83
1,8017684,0,0,0,2009,8460,服务业,湖北,有限责任公司,企业法人,0.84
2,8018629,0,0,0,2004,8360,服务业,福建,股份有限公司,企业法人,0.95
3,8011517,1,1,1,2004,5720,社区服务,广西,农民专业合作社,自然人,0.73
4,8013317,0,0,0,2000,5790,商业服务业,广东,农民专业合作社,自然人,0.72
...,...,...,...,...,...,...,...,...,...,...,...
9995,8017648,1,1,0,2002,1400,零售业,江西,股份有限公司,自然人,0.60
9996,8023821,0,0,0,2001,3780,零售业,江西,农民专业合作社,企业法人,0.76
9997,8016811,0,0,0,2011,2520,商业服务业,广东,有限责任公司,企业法人,0.77
9998,8016377,1,1,0,2006,1710,商业服务业,山东,集体所有制企业,自然人,0.61


In [5]:
print(train_base_knowledge.shape)
train_base_knowledge

(15050, 12)


Unnamed: 0,ID,专利,商标,著作权,注册时间,注册资本,行业,区域,企业类型,控制人类型,控制人持股比例,flag
0,28,0.0,1.0,1.0,2007.0,2050.0,交通运输业,福建,农民专业合作社,企业法人,,1.0
1,230,0.0,0.0,0.0,2008.0,3360.0,服务业,广东,农民专业合作社,企业法人,1.00,1.0
2,429,1.0,0.0,0.0,2005.0,9670.0,工业,江西,集体所有制企业,自然人,0.75,1.0
3,693,0.0,0.0,0.0,2011.0,8360.0,社区服务,山东,股份有限公司,企业法人,0.98,1.0
4,727,0.0,0.0,0.0,2001.0,8720.0,零售业,江西,股份有限公司,企业法人,0.54,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
15045,5978029,1.0,1.0,1.0,2014.0,460.0,交通运输业,广西,农民专业合作社,企业法人,0.71,
15046,5978030,0.0,0.0,0.0,2004.0,1140.0,商业服务业,湖南,有限责任公司,自然人,0.95,
15047,5978031,0.0,1.0,1.0,2008.0,3290.0,交通运输业,湖南,集体所有制企业,自然人,0.63,
15048,5978032,1.0,0.0,1.0,2009.0,6060.0,工业,福建,农民专业合作社,企业法人,0.51,


### 观察发现验证集中多了个***控制人ID***标签，没什么用，删除之 

In [6]:
print(verify_base_knowledge.shape)
verify_base_knowledge.drop('控制人ID', axis=1, inplace=True)
verify_base_knowledge

(30884, 13)


Unnamed: 0,ID,专利,商标,著作权,注册时间,注册资本,行业,区域,企业类型,控制人类型,控制人持股比例,flag
0,1500001.0,0.0,0.0,0.0,2010.0,6680.0,商业服务业,山东,股份有限公司,自然人,0.89,1.0
1,1500321.0,0.0,0.0,0.0,2001.0,9330.0,商业服务业,广东,股份有限公司,企业法人,0.72,1.0
2,1500395.0,1.0,1.0,0.0,2003.0,8670.0,交通运输业,广西,农民专业合作社,企业法人,0.60,1.0
3,1500614.0,0.0,0.0,0.0,2001.0,7730.0,工业,江西,集体所有制企业,企业法人,0.58,1.0
4,1501057.0,0.0,1.0,1.0,2002.0,6840.0,工业,福建,农民专业合作社,自然人,0.84,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
30879,5999996.0,1.0,1.0,0.0,2011.0,2170.0,零售业,湖北,农民专业合作社,自然人,0.93,0.0
30880,5999997.0,1.0,1.0,0.0,2013.0,9030.0,交通运输业,福建,集体所有制企业,企业法人,0.73,0.0
30881,5999998.0,0.0,0.0,0.0,2014.0,4510.0,服务业,湖南,股份有限公司,自然人,0.64,0.0
30882,5999999.0,1.0,1.0,1.0,2014.0,9130.0,交通运输业,福建,股份有限公司,自然人,0.80,0.0


### 观察发现money和year的数据中，有一些year数据缺失，但是这些数据都是一大堆一起出现的，使用bfill填充之

In [7]:
def merge_money_year(): 
    train_money.loc[:,'year'] = train_money['year'].fillna(method='bfill')
    train_year.loc[:,'year'] = train_year['year'].fillna(method='bfill')
    df_train = pd.merge(train_money, train_year, on=['ID', 'year'])
    
    verify_money.loc[:,'year'] = verify_money['year'].fillna(method='bfill')
    verify_year.loc[:,'year'] = verify_year['year'].fillna(method='bfill')
    df_verify = pd.merge(verify_money, verify_year, on=['ID', 'year'])
    return df_train, df_verify
train_money_year, verify_money_year = merge_money_year()
train_money_year

Unnamed: 0,ID,year,债权融资额度,债权融资成本,股权融资额度,股权融资成本,内部融资和贸易融资额度,内部融资和贸易融资成本,项目融资和政策融资额度,项目融资和政策融资成本,从业人数,资产总额,负债总额,营业总收入,主营业务收入,利润总额,净利润,纳税总额,所有者权益合计
0,28,2015.0,0.0,0.0,0.00,0.0000,21648.0,1298.880,0.0,0.000,794.0,16400.0,28700.0,72160.0,28864.0,7216.0,-7216.0,0.0,-12300.0
1,230,2015.0,0.0,0.0,0.00,0.0000,0.0,0.000,470.4,28.224,485.0,23520.0,10080.0,115248.0,57624.0,57624.0,-11524.8,0.0,13440.0
2,429,2015.0,19340.0,1547.2,0.00,0.0000,0.0,0.000,0.0,0.000,136.0,193400.0,183730.0,502840.0,351988.0,,-50284.0,0.0,9670.0
3,693,2015.0,0.0,0.0,0.00,0.0000,0.0,0.000,5350.4,321.024,534.0,133760.0,125400.0,655424.0,262169.6,196627.2,-65542.4,0.0,8360.0
4,727,2015.0,0.0,0.0,32229.12,1289.1648,0.0,0.000,0.0,0.000,375.0,366240.0,536280.0,402864.0,282004.8,161145.6,-40286.4,0.0,-170040.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45145,5978029,2017.0,0.0,0.0,0.00,0.0000,1117.8,67.068,0.0,0.000,265.0,1380.0,1840.0,3726.0,1863.0,745.2,372.6,2235.6,-460.0
45146,5978030,2017.0,0.0,0.0,0.00,0.0000,0.0,0.000,2371.2,142.272,1000.0,59280.0,58140.0,260832.0,208665.6,78249.6,-26083.2,0.0,1140.0
45147,5978031,2017.0,0.0,0.0,0.00,0.0000,0.0,0.000,2664.9,159.894,357.0,88830.0,171080.0,106596.0,42638.4,10659.6,-10659.6,0.0,-82250.0
45148,5978032,2017.0,0.0,0.0,0.00,0.0000,0.0,0.000,2908.8,174.528,572.0,145440.0,209070.0,421776.0,253065.6,168710.4,-42177.6,0.0,-63630.0


In [58]:
test_money_year = pd.merge(test_money, test_year, on=['ID', 'year'])
test_money_year

Unnamed: 0,ID,year,债权融资额度,债权融资成本,股权融资额度,股权融资成本,内部融资和贸易融资额度,内部融资和贸易融资成本,项目融资和政策融资额度,项目融资和政策融资成本,从业人数,资产总额,负债总额,营业总收入,主营业务收入,利润总额,净利润,纳税总额,所有者权益合计
0,8024137,2015,0,0.0,0.0,0.0,29614.2,1776.852,0.0,0.000,207,89740.0,166660.0,98714.0,49357.0,29614.2,-9970.114,0.0,-76920.0
1,8017684,2015,25380,2030.4,0.0,0.0,0.0,0.000,0.0,0.000,840,253800.0,368010.0,1015200.0,812160.0,101520.0,-102535.200,0.0,-114210.0
2,8018629,2015,0,0.0,0.0,0.0,231739.2,13904.352,0.0,0.000,331,275880.0,535040.0,772464.0,540724.8,231739.2,0.000,154492.8,-259160.0
3,8011517,2015,0,0.0,0.0,0.0,67953.6,4077.216,0.0,0.000,74,125840.0,120120.0,226512.0,135907.2,67953.6,-22877.712,0.0,5720.0
4,8013317,2015,0,0.0,0.0,0.0,312660.0,18759.600,0.0,0.000,881,434250.0,214230.0,1042200.0,729540.0,104220.0,0.000,625320.0,220020.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,8017648,2017,6300,504.0,0.0,0.0,0.0,0.000,0.0,0.000,800,63000.0,92400.0,270900.0,189630.0,135450.0,0.000,54180.0,-29400.0
29996,8023821,2017,0,0.0,0.0,0.0,0.0,0.000,3628.8,217.728,355,120960.0,234360.0,72576.0,36288.0,29030.4,21990.528,43545.6,-113400.0
29997,8016811,2017,0,0.0,0.0,0.0,56246.4,3374.784,0.0,0.000,269,60480.0,115920.0,187488.0,131241.6,56246.4,75745.152,37497.6,-55440.0
29998,8016377,2017,9405,752.4,0.0,0.0,0.0,0.000,0.0,0.000,617,94050.0,92340.0,347985.0,278388.0,69597.0,35146.485,208791.0,1710.0


### 对于money和year两个csv中的其他缺失数据，感觉使用均值填充较为合理

In [8]:
for column in list(train_money_year.columns[train_money_year.isnull().sum() > 0]):
    mean_val = train_money_year[column].mean()
    train_money_year[column].fillna(mean_val, inplace=True)
for column in list(verify_money_year.columns[verify_money_year.isnull().sum() > 0]):
    mean_val = verify_money_year[column].mean()
    verify_money_year[column].fillna(mean_val, inplace=True)

### 按ID排序，方便下面汇总某一个ID3年的平均数据

In [9]:
train_money_year.sort_values('ID', inplace = True)
verify_money_year.sort_values('ID', inplace = True)

In [59]:
test_money_year.sort_values('ID', inplace=True)

In [60]:
test_money_year.isna().any()

ID             False
year           False
债权融资额度         False
债权融资成本         False
股权融资额度         False
股权融资成本         False
内部融资和贸易融资额度    False
内部融资和贸易融资成本    False
项目融资和政策融资额度    False
项目融资和政策融资成本    False
从业人数           False
资产总额           False
负债总额           False
营业总收入          False
主营业务收入         False
利润总额           False
净利润            False
纳税总额           False
所有者权益合计        False
dtype: bool

In [61]:
test_money_year

Unnamed: 0,ID,year,债权融资额度,债权融资成本,股权融资额度,股权融资成本,内部融资和贸易融资额度,内部融资和贸易融资成本,项目融资和政策融资额度,项目融资和政策融资成本,从业人数,资产总额,负债总额,营业总收入,主营业务收入,利润总额,净利润,纳税总额,所有者权益合计
22905,8010000,2016,5586,446.88,0.0,0.000,0.0,0.00,0.0,0.000,869,55860.0,53200.0,16758.0,10054.8,8379.0,-1692.558,0.0,2660.0
27357,8010000,2017,2128,170.24,0.0,0.000,0.0,0.00,0.0,0.000,311,21280.0,37240.0,21280.0,17024.0,8512.0,2149.280,8512.0,-15960.0
18453,8010000,2015,1596,127.68,0.0,0.000,0.0,0.00,0.0,0.000,856,15960.0,26600.0,22344.0,13406.4,4468.8,-2256.744,0.0,-10640.0
3232,8010006,2015,0,0.00,0.0,0.000,0.0,0.00,333.0,19.980,266,6660.0,6290.0,20646.0,8258.4,4129.2,-2085.246,0.0,370.0
3420,8010006,2017,0,0.00,0.0,0.000,0.0,0.00,407.0,24.420,322,8140.0,11655.0,11396.0,6837.6,2279.2,-1150.996,0.0,-3515.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26215,8024998,2017,0,0.00,0.0,0.000,0.0,0.00,5852.7,351.162,229,195090.0,278700.0,565761.0,226304.4,226304.4,0.000,113152.2,-83610.0
21763,8024998,2016,0,0.00,13377.6,535.104,0.0,0.00,0.0,0.000,882,55740.0,92900.0,222960.0,156072.0,66888.0,22518.960,133776.0,-37160.0
12710,8024999,2015,0,0.00,0.0,0.000,68172.0,4090.32,0.0,0.000,977,174800.0,85215.0,227240.0,181792.0,113620.0,-22951.240,0.0,89585.0
12802,8024999,2016,0,0.00,0.0,0.000,0.0,0.00,5899.5,353.970,421,196650.0,192280.0,98325.0,78660.0,39330.0,-9930.825,0.0,4370.0


### ~~下面这个做均值的函数运行很耗时间，不知道有什么方法能改进不~~
### 现在已经改进了，从100+秒变成只要0.5秒

In [12]:
train_mean = train_money_year.groupby('ID').mean().reset_index()
verify_mean = verify_money_year.groupby('ID').mean().reset_index()

In [62]:
test_mean = test_money_year.groupby('ID').mean().reset_index()

In [13]:
def merge_all():
    df_train = pd.merge(train_mean, train_base_knowledge, on='ID')
    df_verify = pd.merge(verify_mean, verify_base_knowledge, on='ID')
    return df_train, df_verify

In [64]:
df_test = pd.merge(test_mean, test_base_knowledge, on='ID')

In [14]:
df_train, df_verify = merge_all()

### 合并之后shape从15050变成了15048，说明有两个ID不匹配被丢弃了

In [68]:
print(df_test.shape)
df_test.isna().any()

(10000, 29)


ID             False
year           False
债权融资额度         False
债权融资成本         False
股权融资额度         False
股权融资成本         False
内部融资和贸易融资额度    False
内部融资和贸易融资成本    False
项目融资和政策融资额度    False
项目融资和政策融资成本    False
从业人数           False
资产总额           False
负债总额           False
营业总收入          False
主营业务收入         False
利润总额           False
净利润            False
纳税总额           False
所有者权益合计        False
专利             False
商标             False
著作权            False
注册时间           False
注册资本           False
行业             False
区域             False
企业类型           False
控制人类型          False
控制人持股比例        False
dtype: bool

### 把数据除了flag之外的na都填满

In [16]:
values = {'注册时间': int(df_train['注册时间'].mean()), 
          '注册资本': int(df_train['注册资本'].mean()), 
          '控制人持股比例': df_train['控制人持股比例'].mean(),
          '行业': 'other',
          '区域': 'other',
          '企业类型': 'other',
          '控制人类型': 'other',
          '专利': 0,
          '商标': 0,
          '著作权': 0,}
values2 = {'注册时间': int(df_verify['注册时间'].mean()), 
          '注册资本': int(df_verify['注册资本'].mean()), 
          '控制人持股比例': df_verify['控制人持股比例'].mean(),
          '行业': 'other',
          '区域': 'other',
          '企业类型': 'other',
          '控制人类型': 'other',
          '专利': 0,
          '商标': 0,
          '著作权': 0,}
df_train.fillna(value=values, inplace=True)
df_verify.fillna(value=values2, inplace=True)

In [17]:
df_train.isnull().sum()

ID                0
year              0
债权融资额度            0
债权融资成本            0
股权融资额度            0
股权融资成本            0
内部融资和贸易融资额度       0
内部融资和贸易融资成本       0
项目融资和政策融资额度       0
项目融资和政策融资成本       0
从业人数              0
资产总额              0
负债总额              0
营业总收入             0
主营业务收入            0
利润总额              0
净利润               0
纳税总额              0
所有者权益合计           0
专利                0
商标                0
著作权               0
注册时间              0
注册资本              0
行业                0
区域                0
企业类型              0
控制人类型             0
控制人持股比例           0
flag           9977
dtype: int64

### 这里把verify的na都填满了（除flag），发现df_verify.flag也有na, 直接删除df_verify.flag=na的数据

In [18]:
df_verify.isnull().sum()

ID               0
year             0
债权融资额度           0
债权融资成本           0
股权融资额度           0
股权融资成本           0
内部融资和贸易融资额度      0
内部融资和贸易融资成本      0
项目融资和政策融资额度      0
项目融资和政策融资成本      0
从业人数             0
资产总额             0
负债总额             0
营业总收入            0
主营业务收入           0
利润总额             0
净利润              0
纳税总额             0
所有者权益合计          0
专利               0
商标               0
著作权              0
注册时间             0
注册资本             0
行业               0
区域               0
企业类型             0
控制人类型            0
控制人持股比例          0
flag           306
dtype: int64

In [19]:
df_verify.dropna(inplace=True)

In [20]:
# 从30884变成了30577
df_verify.shape

(30577, 30)

In [71]:
df_test

Unnamed: 0,ID,year,债权融资额度,债权融资成本,股权融资额度,股权融资成本,内部融资和贸易融资额度,内部融资和贸易融资成本,项目融资和政策融资额度,项目融资和政策融资成本,...,专利,商标,著作权,注册时间,注册资本,行业,区域,企业类型,控制人类型,控制人持股比例
0,8010000,2016.0,3103.333333,248.266667,0.0,0.000,0.0,0.000,0.000000,0.000,...,0,1,0,2009,2660,服务业,江西,合伙企业,企业法人,0.57
1,8010006,2016.0,616.666667,49.333333,0.0,0.000,0.0,0.000,246.666667,14.800,...,0,0,0,2006,370,交通运输业,江西,集体所有制企业,自然人,0.80
2,8010007,2016.0,626.666667,50.133333,150.4,6.016,0.0,0.000,846.000000,50.760,...,1,0,0,2007,1880,零售业,山东,集体所有制企业,企业法人,0.91
3,8010009,2016.0,0.000000,0.000000,0.0,0.000,0.0,0.000,9568.000000,574.080,...,0,0,0,2002,9600,交通运输业,广西,有限责任公司,企业法人,0.74
4,8010012,2016.0,3192.000000,255.360000,0.0,0.000,38532.0,2311.920,1140.000000,68.400,...,0,1,1,2002,2280,交通运输业,福建,有限责任公司,自然人,0.88
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,8024994,2016.0,6392.000000,511.360000,159.8,6.392,11745.3,704.718,0.000000,0.000,...,1,0,0,2010,7990,商业服务业,湖南,有限责任公司,自然人,0.88
9996,8024996,2016.0,277.333333,22.186667,0.0,0.000,0.0,0.000,346.666667,20.800,...,0,0,0,2014,2080,商业服务业,湖南,农民专业合作社,自然人,0.77
9997,8024997,2016.0,0.000000,0.000000,5054.4,202.176,95612.4,5736.744,0.000000,0.000,...,0,0,0,2003,7020,工业,广西,有限责任公司,企业法人,0.60
9998,8024998,2016.0,0.000000,0.000000,13006.0,520.240,0.0,0.000,1950.900000,117.054,...,0,1,0,2010,9290,社区服务,广西,有限责任公司,自然人,0.62


### 接下来处理中文数据问题

In [21]:
encoder = OneHotEncoder(sparse = False)
X_train = df_train.to_numpy()
X_verify = df_verify.to_numpy()

In [72]:
X_test = df_test.to_numpy()

In [22]:
Y_train = X_train[:, -1]
X_train = np.delete(X_train, -1, axis = 1)
Y_verify = X_verify[:, -1]
X_verify = np.delete(X_verify, -1, axis = 1)

In [73]:
X_test[0]

array([8010000, 2016.0, 3103.3333333333335, 248.26666666666665, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 678.6666666666666, 31033.333333333332,
       39013.333333333336, 20127.333333333332, 13495.066666666666,
       7119.933333333333, -600.0073333333333, 2837.3333333333335, -7980.0,
       0, 1, 0, 2009, 2660, '服务业', '江西', '合伙企业', '企业法人', 0.57],
      dtype=object)

In [24]:
zh_train = X_train[:, -5:-1]
X_train = np.hstack((X_train[:, :-5], X_train[:, -1].reshape(-1, 1))).astype('float32')
zh_verify = X_verify[:, -5:-1]
X_verify = np.hstack((X_verify[:, :-5], X_verify[:, -1].reshape(-1, 1))).astype('float32')

In [74]:
zh_test = X_test[:, -5:-1]
X_test = np.hstack((X_test[:, :-5], X_test[:, -1].reshape(-1, 1))).astype('float32')

In [25]:
encoder.fit(zh_train)
ans_train = encoder.transform(zh_train)
ans_verify = encoder.transform(zh_verify)

In [75]:
ans_test = encoder.transform(zh_test)

In [76]:
print(X_test.shape,ans_test.shape )

(10000, 25) (10000, 24)


In [27]:
X_train = np.hstack((X_train, ans_train)).astype('float32')
X_verify = np.hstack((X_verify, ans_verify)).astype('float32')

In [77]:
X_test = np.hstack((X_test, ans_test)).astype('float32')

In [78]:
X_test.shape

(10000, 49)

### 这里位置已经把所有表都合并了，并且除了flag之外都没有na了，接下来先做正则化在根据方差选择lable

### 我发现先做聚类，比先做normalizer效果好。先做normalizer会把有lable的数据都分成不同的类 

In [29]:
# 聚类
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=2, random_state=0)
kmeans.fit(X_train)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=2, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=0, tol=0.0001, verbose=0)

In [30]:
for i in range(len(kmeans.labels_)):
    if kmeans.labels_[i] == 1:
        Y_train[i] = 0
    else:
        Y_train[i] = 1

In [31]:
Y_train

array([1, 1, 1, ..., 0, 0, 0], dtype=object)

In [32]:
# trainx = Normalizer().fit_transform(X_train)
# verifyx = Normalizer().fit_transform(X_verify)
trainx = MinMaxScaler().fit_transform(X_train)
verifyx = MinMaxScaler().fit_transform(X_verify)

In [79]:
testX = MinMaxScaler().fit_transform(X_test)

In [81]:
print(trainx.shape)
trainx[0]

(15048, 49)


array([0.        , 0.666626  , 0.        , 0.        , 0.09207726,
       0.09207726, 0.01457519, 0.01457519, 0.        , 0.        ,
       0.52021664, 0.07686406, 0.07385634, 0.06909325, 0.05371942,
       0.05477189, 0.23205402, 0.        , 0.58604234, 0.        ,
       1.        , 1.        , 0.5       , 0.1969697 , 0.5011382 ,
       0.        , 1.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 1.        ,
       0.        , 1.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 1.        , 0.        ], dtype=float32)

In [34]:
temp = pd.DataFrame(Y_train)
temp.shape

(15048, 1)

## 先试试这样直接用SVM，因为之前使用VarianceThreshold挑选特征的时候train和verify两个上面挑选出来的特征不一样，这样就不太好搞啊。在train上挑选出来有6个特征，verify上挑选出来只有1个特征

In [35]:
def Linear_SVM(data_train, data_test, lable_train, lable_test):
    # 用网格搜索法来获取最优的C
    # 搜索最优的C值
    param_grid = {'C': [0.1, 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['scale', 'auto']}
    clf = GridSearchCV(svm.SVC(degree=5, max_iter=10000), cv=3, param_grid=param_grid, refit=True, )
    clf.fit(data_train, lable_train)
    # 预测数据
    predict = clf.predict(data_test)
    # 生成准确率
    accuracy_rate = metrics.accuracy_score(lable_test, predict)
    print('精度为%s' % accuracy_rate)

In [36]:
Y_train = Y_train.astype('int')
Y_verify = Y_verify.astype('int')

In [37]:
Linear_SVM(trainx, verifyx, Y_train, Y_verify)



精度为1.0


In [38]:
print(trainx.shape, verifyx.shape)

(15048, 49) (30577, 49)


In [39]:
pca = PCA(n_components=10)
pca.fit(trainx)
trainx_t1 = pca.transform(trainx)
verifyx_t1 = pca.transform(verifyx)

In [83]:
test_t1 = pca.transform(testX)

In [40]:
Linear_SVM(trainx_t1, verifyx_t1, Y_train, Y_verify)

精度为1.0


In [82]:
param_grid = {'C': [0.1, 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['scale', 'auto']}
clf = GridSearchCV(svm.SVC(degree=5, max_iter=10000), cv=3, param_grid=param_grid, refit=True, )
clf.fit(trainx_t1, Y_train)
# 预测数据
predict = clf.predict(verifyx_t1)
# 生成准确率
accuracy_rate = metrics.accuracy_score(Y_verify, predict)
print('精度为%s' % accuracy_rate)

精度为1.0


In [84]:
res1 =  clf.predict(test_t1)

In [116]:
tt = X_test[:, 0]

In [123]:
tt = tt.astype('int')

In [124]:
res_df = pd.DataFrame(res1, tt).reset_index()

In [127]:
res_df.columns = ['企业ID','分类结果']

In [129]:
res_df

Unnamed: 0,企业ID,分类结果
0,8010000,1
1,8010006,1
2,8010007,1
3,8010009,1
4,8010012,1
...,...,...
9995,8024994,0
9996,8024996,0
9997,8024997,0
9998,8024998,0


In [130]:
res_df.to_csv('./data/res.csv')