In [1]:
import os
import pandas as pd
import warnings
from itertools import combinations
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder,OneHotEncoder, MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
from catboost import CatBoostClassifier

%matplotlib inline
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns',None)
plt.rcParams['font.family'] = ['sans-serif']
plt.rcParams['font.sans-serif'] = ['SimHei']

In [2]:
path = '/home/kesci/data/competition_A/'
train_df = pd.read_csv(path+'train_set.csv') 
test_df  = pd.read_csv(path+'test_set.csv') 
submission  =  pd.read_csv(path+'submission_example.csv') 
print('Train Shape:{}\nTest Shape:{}'.format(train_df.shape,test_df.shape))
train_df.head()

Train Shape:(6000, 31)
Test Shape:(2785, 30)


Unnamed: 0,年龄,性别,区域,体重,身高,体重指数,肥胖腰围,腰围,最高血压,最低血压,好胆固醇,坏胆固醇,总胆固醇,血脂异常,PVD,体育活动,教育,未婚,收入,护理来源,视力不佳,饮酒,高血压,家庭高血压,糖尿病,家族糖尿病,肝炎,家族肝炎,慢性疲劳,ALF,ID
0,58,F,east,75.6,174.9,24.71,0.0,94.8,100.0,52.0,35.0,95.0,130.0,0,0,1.0,0.0,0.0,0.0,Private Hospital,0.0,0,1.0,1,0.0,1,1.0,0.0,1.0,0.0,4379
1,85,F,east,66.3,166.1,24.03,0.0,89.6,134.0,84.0,59.0,153.0,212.0,0,0,2.0,0.0,0.0,0.0,Private Hospital,0.0,1,0.0,0,0.0,0,0.0,0.0,0.0,,7623
2,32,F,east,109.9,173.2,36.64,1.0,111.7,124.0,84.0,39.0,133.0,172.0,0,0,2.0,1.0,0.0,1.0,Private Hospital,0.0,0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,1764
3,22,M,east,58.7,171.3,20.0,0.0,78.0,104.0,56.0,48.0,98.0,146.0,0,0,2.0,1.0,1.0,0.0,Never Counsulted,0.0,0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,5450
4,44,F,west,79.7,172.1,26.91,0.0,93.8,114.0,60.0,34.0,195.0,229.0,0,0,2.0,0.0,0.0,0.0,Private Hospital,0.0,0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,5196


In [3]:
train_df.describe()

Unnamed: 0,年龄,体重,身高,体重指数,肥胖腰围,腰围,最高血压,最低血压,好胆固醇,坏胆固醇,总胆固醇,血脂异常,PVD,体育活动,教育,未婚,收入,视力不佳,饮酒,高血压,家庭高血压,糖尿病,家族糖尿病,肝炎,家族肝炎,慢性疲劳,ALF,ID
count,6000.0,5860.0,5864.0,5791.0,5791.0,5776.0,5781.0,5727.0,5994.0,5993.0,5993.0,6000.0,6000.0,5992.0,5986.0,5692.0,5237.0,5598.0,6000.0,5951.0,6000.0,6000.0,6000.0,5984.0,5995.0,5974.0,4119.0,6000.0
mean,49.331,79.140884,167.037142,28.306374,0.317044,96.837846,125.582944,71.469356,51.627794,152.999166,204.628066,0.105833,0.0415,2.01719,0.434347,0.366831,0.418178,0.065916,0.301,0.401613,0.2395,0.110333,0.3145,0.067179,0.020684,0.031637,0.074775,4362.312833
std,18.784868,19.293747,10.124373,6.136871,0.465365,14.92833,21.004949,12.663285,15.518744,42.745425,42.70727,0.30765,0.19946,0.812113,0.495712,0.481982,0.493307,0.248158,0.458731,0.490266,0.426814,0.313331,0.464355,0.250353,0.142336,0.175047,0.263061,2538.515901
min,20.0,33.7,130.4,14.42,0.0,58.6,72.0,10.0,12.0,27.0,72.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,33.0,65.6,159.7,24.08,0.0,86.3,111.0,64.0,41.0,124.0,176.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2142.75
50%,47.0,76.7,166.6,27.39,0.0,96.3,122.0,72.0,49.0,150.0,201.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4363.5
75%,65.0,89.525,174.2,31.39,1.0,106.1,136.0,79.0,60.0,178.0,230.0,0.0,0.0,2.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,6558.25
max,85.0,191.1,200.1,66.44,1.0,166.0,233.0,132.0,160.0,684.0,727.0,1.0,1.0,4.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,8784.0


In [4]:
train_df.columns

Index(['年龄', '性别', '区域', '体重', '身高', '体重指数', '肥胖腰围', '腰围', '最高血压', '最低血压',
       '好胆固醇', '坏胆固醇', '总胆固醇', '血脂异常', 'PVD', '体育活动', '教育', '未婚', '收入', '护理来源',
       '视力不佳', '饮酒', '高血压', '家庭高血压', '糖尿病', '家族糖尿病', '肝炎', '家族肝炎', '慢性疲劳',
       'ALF', 'ID'],
      dtype='object')

In [5]:
num_columns = ['年龄','体重','身高','体重指数', '腰围', '最高血压', '最低血压',
                '好胆固醇', '坏胆固醇', '总胆固醇','收入']
zero_to_one_columns = ['肥胖腰围','血脂异常','PVD']
str_columns = ['性别','区域','体育活动','教育','未婚','护理来源','视力不佳','饮酒','高血压',
                '家庭高血压', '糖尿病', '家族糖尿病','家族肝炎', '慢性疲劳','ALF']

In [6]:
g = sns.FacetGrid(train_df, col='肝炎')
g.map(plt.hist, '年龄', bins=20)

findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.


<seaborn.axisgrid.FacetGrid at 0x7fe071049da0>

In [7]:
# 空值填充
train_df.fillna(0,inplace=True)
test_df.fillna(0,inplace=True)

In [8]:
# 字符编码
for i in tqdm(str_columns):
    lbl = LabelEncoder()
    train_df[i] = lbl.fit_transform(train_df[i].astype(str))
    test_df[i]  = lbl.fit_transform(test_df[i].astype(str))

100%|██████████| 15/15 [00:00<00:00, 210.24it/s]


In [9]:
# 数值归一化
train_df[num_columns] = MinMaxScaler().fit_transform(train_df[num_columns])
test_df[num_columns]  = MinMaxScaler().fit_transform(test_df[num_columns])

In [10]:
all_columns = [i for i in train_df.columns if i not in ['肝炎','ID']]

train_x,train_y = train_df[all_columns].values,train_df['肝炎'].values
test_x  = test_df[all_columns].values
submission['hepatitis'] = 0

In [11]:
kfold = StratifiedKFold(n_splits=5, shuffle=False)
model = CatBoostClassifier(
    iterations=200,
    learning_rate=0.1,
    loss_function='Logloss'
    )
for train, valid in kfold.split(train_x, train_y):
    X_train, Y_train = train_x[train], train_y[train]
    X_valid, Y_valid = train_x[valid], train_y[valid]
    model.fit(X_train,Y_train, eval_set=(X_valid, Y_valid),use_best_model=True)
    Y_valid_pred_prob = model.predict_proba(X_valid)
    submission['hepatitis'] += model.predict_proba(test_x)[:,1] / 5

0:	learn: 0.5476648	test: 0.5450950	best: 0.5450950 (0)	total: 1.2s	remaining: 3m 59s
1:	learn: 0.4542235	test: 0.4506240	best: 0.4506240 (1)	total: 2.9s	remaining: 4m 47s
2:	learn: 0.3913477	test: 0.3892482	best: 0.3892482 (2)	total: 4.7s	remaining: 5m 8s
3:	learn: 0.3462174	test: 0.3450788	best: 0.3450788 (3)	total: 6.1s	remaining: 4m 58s
4:	learn: 0.3067070	test: 0.3065037	best: 0.3065037 (4)	total: 7.61s	remaining: 4m 56s
5:	learn: 0.2819000	test: 0.2817510	best: 0.2817510 (5)	total: 9.3s	remaining: 5m
6:	learn: 0.2628941	test: 0.2631047	best: 0.2631047 (6)	total: 11.1s	remaining: 5m 6s
7:	learn: 0.2496631	test: 0.2502320	best: 0.2502320 (7)	total: 12.8s	remaining: 5m 7s
8:	learn: 0.2325814	test: 0.2338527	best: 0.2338527 (8)	total: 14.3s	remaining: 5m 3s
9:	learn: 0.2206442	test: 0.2229252	best: 0.2229252 (9)	total: 15.7s	remaining: 4m 58s
10:	learn: 0.2115983	test: 0.2144001	best: 0.2144001 (10)	total: 17.2s	remaining: 4m 55s
11:	learn: 0.2074506	test: 0.2107228	best: 0.2107228 (

In [None]:
!wget -nv -O kesci_submit https://cdn.kesci.com/submit_tool/v4/kesci_submit&&chmod +x kesci_submit
submission.to_csv('submission.csv',index=False)
!./kesci_submit -token '你的队伍Token' -file '/home/kesci/work/submission.csv'