# CatBoost


## 1.导入相关库

In [3]:
import numpy as np
import pandas as pd

#preprocessing
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from pandas.plotting import scatter_matrix
from tqdm import tqdm

#visualizations
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

# Matplotlib: select the SimHei font for sans-serif text and turn off the
# Unicode minus glyph (presumably so Chinese labels and negative tick
# values render correctly -- depends on SimHei being installed).
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

# Pandas: remove the row/column display limits so frames are shown in full.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

#algorithms
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

#score metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score



## 2.读取数据

In [4]:

# Directory that holds the competition CSV files.
# Alternative location used on the Kesci platform:
# path = '/home/kesci/data/competition_A/'
path = './data/competition_A/'

# Build the three file locations up front, then load them in the same order.
train_csv = path + 'train_set.csv'
test_csv = path + 'test_set.csv'
submission_csv = path + 'submission_example.csv'

train_df = pd.read_csv(train_csv)
test_df = pd.read_csv(test_csv)
submission = pd.read_csv(submission_csv)

# Report the (rows, columns) shape of both tables, then preview the first rows.
shape_report = 'Train Shape:{}\nTest Shape:{}'.format(train_df.shape, test_df.shape)
print(shape_report)
train_df.head(5)


FileNotFoundError: [Errno 2] No such file or directory: './data/competition_A/train_set.csv'

## 3.数据EDA

In [None]:
# Summary statistics of the training set (displayed as the cell output).
train_df.describe()

# Optional check, left disabled: number of missing values per column.
# total_missingvalues = train_df.isnull().sum()
# total_missingvalues

### 3.1 根据训练集的列可以得到大致如下三种特征：数字列、二值列（0或1）、字符列

In [None]:
# Feature groups used by the preprocessing step below.

# Numeric columns (standardised later).
num_columns = [
    '年龄', '体重', '身高', '体重指数',
    '腰围',
    '最高血压', '最低血压',
    '好胆固醇', '坏胆固醇', '总胆固醇',
    '收入',
]

# Binary columns (values are 0 or 1).
zero_to_one_columns = [
    '肥胖腰围',
    '血脂异常',
    'PVD',
]

# String-valued columns (label-encoded later).
str_columns = [
    '性别', '区域', '体育活动', '教育', '未婚',
    '护理来源', '视力不佳', '饮酒',
    '高血压', '家庭高血压',
    '糖尿病', '家族糖尿病',
    '家族肝炎', '慢性疲劳', 'ALF',
]


#### 3.1.1 肝炎与年龄

In [None]:
# One facet (column) per value of the target column '肝炎' (hepatitis) ...
g = sns.FacetGrid(train_df, col='肝炎')
# ... each showing a 20-bin histogram of '年龄' (age).
g.map(plt.hist, '年龄', bins=20)

### 3.2 特征工程

In [None]:
# Label-encode the string-valued columns.
# The encoder is fitted once on the union of the train and test values so
# that a given category receives the same integer code in both frames.
# Fitting separately on each frame can assign different codes to the same
# category whenever the two frames do not contain identical category sets.
for col in tqdm(str_columns):
    train_values = train_df[col].astype(str)
    test_values = test_df[col].astype(str)
    encoder = LabelEncoder().fit(pd.concat([train_values, test_values]))
    train_df[col] = encoder.transform(train_values)
    test_df[col] = encoder.transform(test_values)

# Standardise the numeric columns.
# The scaler is fitted on the training set only and then re-used for the test
# set, so both frames are scaled with the same mean/variance (the statistics
# the model is actually trained on).
scaler = StandardScaler().fit(train_df[num_columns])
train_df[num_columns] = scaler.transform(train_df[num_columns])
test_df[num_columns] = scaler.transform(test_df[num_columns])

# Replace any remaining missing values with 0 (for the standardised numeric
# columns this corresponds to the training mean).
train_df.fillna(0, inplace=True)
test_df.fillna(0, inplace=True)


## 4.模型

### 4.1 准备数据集

In [None]:
# Feature columns: every training column except the target and the row ID.
excluded_columns = ['肝炎', 'ID']
all_columns = [
    column for column in train_df.columns if column not in excluded_columns
]

# Plain NumPy arrays for the model: features/target for training,
# features only for the test set.
train_x = train_df[all_columns].values
train_y = train_df['肝炎'].values
test_x = test_df[all_columns].values

# Start the prediction column at zero; the training loop accumulates into it.
submission['hepatitis'] = 0

### 4.2 训练模型

In [None]:
# 5-fold stratified cross-validation with CatBoost.
kfold = StratifiedKFold(n_splits=5, shuffle=False)
# Single source of truth for the number of folds used when averaging below.
n_folds = kfold.get_n_splits()

model = CatBoostClassifier(
    iterations=200,
    learning_rate=0.1,
    loss_function='Logloss'
    )

# Validation ROC-AUC of each fold.
result = []

# Reset the accumulator so that re-running this cell does not add a second
# set of fold predictions on top of the previous run.
submission['hepatitis'] = 0.0

for train, valid in kfold.split(train_x, train_y):
    X_train, Y_train = train_x[train], train_y[train]
    X_valid, Y_valid = train_x[valid], train_y[valid]

    # NOTE: the validation fold is also used to pick the best iteration
    # (use_best_model=True), so the AUC below is somewhat optimistic.
    model.fit(X_train, Y_train, eval_set=(X_valid, Y_valid), use_best_model=True)

    # predict_proba returns one column per class; binary ROC-AUC needs the
    # 1-D score of the positive class (column 1), not the full 2-D array.
    Y_valid_pred_prob = model.predict_proba(X_valid)[:, 1]
    result.append(roc_auc_score(Y_valid, Y_valid_pred_prob))

    # Average the positive-class test probabilities over all folds.
    submission['hepatitis'] += model.predict_proba(test_x)[:, 1] / n_folds


print(result)

In [None]:
# Write the averaged predictions to 'submission.csv' without the index column.
submission.to_csv('submission.csv',index=False)
# Disabled: download the Kesci submit tool and upload the file.
# NOTE(review): the command below embeds an API token; it looks like a
# personal credential -- consider revoking it and keeping it out of the notebook.
# !wget -nv -O kesci_submit https://cdn.kesci.com/submit_tool/v4/kesci_submit&&chmod +x kesci_submit
# !./kesci_submit -token f3de41fac72af731 -file '/home/kesci/work/submission.csv'
