# LightGBM


## 1.包加载

In [1]:
import numpy as np
import pandas as pd
import csv

#preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from pandas.plotting import scatter_matrix

#visualizations
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

plt.rcParams['font.sans-serif']=['SimHei']
plt.rcParams['axes.unicode_minus']=False

#algorithms
import lightgbm as lgb

#score metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

OSError: dlopen(/Users/crius/opt/anaconda3/envs/python36/lib/python3.6/site-packages/lightgbm/lib_lightgbm.so, 6): Library not loaded: /usr/local/opt/libomp/lib/libomp.dylib
  Referenced from: /Users/crius/opt/anaconda3/envs/python36/lib/python3.6/site-packages/lightgbm/lib_lightgbm.so
  Reason: image not found

## 2.读取数据

In [None]:
# Load the training set from disk.
train = pd.read_csv('./train/train.csv')
# Use the fully qualified option name: the bare 'max_columns' pattern matches
# more than one option on newer pandas releases and raises OptionError.
pd.set_option('display.max_columns', 35)
# Preview the first five rows.
train.head(5)

In [None]:
# Show the (rows, columns) dimensions of the training frame.
train.shape

## 3.数据总览

In [None]:
# Print the per-column summary produced by DataFrame.info().
train.info()

## 4.查找缺失值

In [None]:
# Count the null entries in every column of the training frame and show
# the resulting per-column totals.
total_missingvalues = train.isna().sum()
total_missingvalues

## 5.处理缺失值

In [None]:
# Remove the four columns that are not passed on to the model, in a single
# drop call, then preview the remaining columns.
train = train.drop(columns=['护理来源', 'ID', '区域', '性别'])
train.head(5)

In [None]:
# Fill each missing value with the previous row's value in the same column.
# DataFrame.ffill replaces fillna(method='ffill'), which is deprecated in
# recent pandas releases.
train.ffill(inplace=True)
train.head(5)


## 分离特征与标签

In [None]:
# Take the column at positional index 23 of the training frame as the target.
# NOTE(review): presumably this is the '肝炎' column that is dropped from the
# features further down — confirm the position still matches after the
# column drops above.
y = train.iloc[:, 23]
print(y)

In [None]:
# Build the feature frame by removing the '肝炎' column from the training data.
X = train.drop(columns='肝炎')
X.shape
# Preview the first five feature rows.
X.head(5)

In [None]:
# Recount the null entries per column of the training frame.
total_missingvalues = train.isna().sum()
total_missingvalues

## 6.计算相关矩阵

In [None]:
# Pairwise correlation between the training columns, drawn as a heatmap
# with the column names as tick labels on both axes.
train_corr = train.corr()
sns.heatmap(
    data=train_corr,
    xticklabels=train_corr.columns,
    yticklabels=train_corr.columns,
)

In [None]:
# Display the correlation matrix itself. Calling .corr() on it again would
# correlate the correlation coefficients, not the training columns.
train_corr


## 7.分割数据集

In [None]:
# Plot a count histogram of the target values. The series is passed by
# keyword because newer seaborn releases no longer accept the plotted
# variable as the first positional argument.
sns.countplot(x=y)
plt.show()

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature column into the range [-1, 1]; fit and transform are
# done in one step on the full feature frame.
# NOTE(review): X_scaled is not referenced by the split/training code visible
# in this notebook, which uses the unscaled X — confirm whether that is intended.
scaler = MinMaxScaler(feature_range=(-1, 1))
X_scaled = scaler.fit_transform(X)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 5% of the rows, stratified on the target so both parts keep the
# same class proportions; the seed is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.05,
    random_state=1,
    stratify=y,
)


In [None]:
# Wrap the two splits in LightGBM datasets; the held-out set is tied to the
# training set through `reference`.
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

# Training configuration: gradient-boosted trees, 9-class softmax objective,
# evaluated with the multi-class error rate.
params = {
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'num_class': 9,
    'metric': 'multi_error',
    'num_leaves': 300,
    'min_data_in_leaf': 100,
    'learning_rate': 0.01,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'lambda_l1': 0.4,
    'lambda_l2': 0.5,
    'min_gain_to_split': 0.2,
    'verbose': 5,
    # NOTE(review): the LightGBM docs describe is_unbalance as applying to
    # binary / multiclassova objectives — it probably has no effect with
    # 'multiclass'; confirm before relying on it.
    'is_unbalance': True,
}

# Train for up to 10000 rounds, stopping once the validation metric has not
# improved for 500 consecutive rounds. Early stopping is requested through
# the callback API because newer LightGBM releases no longer accept the
# `early_stopping_rounds` keyword on lgb.train().
print('Start training...')
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=10000,
    valid_sets=[lgb_eval],
    callbacks=[lgb.early_stopping(stopping_rounds=500)],
)

print('Start predicting...')






In [None]:

# Load the test set from disk.
test = pd.read_csv('./test/test.csv')
# Use the fully qualified option name: the bare 'max_columns' pattern matches
# more than one option on newer pandas releases and raises OptionError.
pd.set_option('display.max_columns', 35)
# Preview the first five rows.
test.head(5)

In [None]:
# Keep the ID column aside before it is dropped from the test frame; it is
# written next to the predictions when the result file is produced.
ID = test.loc[:, 'ID']
print(ID)


In [None]:
# Remove the same four columns that were removed from the training frame,
# in a single drop call, then preview the remaining columns.
test = test.drop(columns=['护理来源', 'ID', '区域', '性别'])
test.head(5)

In [None]:
# Count the null entries in every column of the test frame.
total_missingvalues = test.isna().sum()
total_missingvalues

In [None]:
# Replace every missing value in the test frame with 0.0.
# NOTE(review): the training frame is forward-filled instead — confirm the
# different imputation for the test set is intended.
test = test.fillna(0.0)
test.head(5)

In [None]:
# Show the (rows, columns) dimensions of the prepared test frame.
test.shape


In [None]:
# The test frame is passed to the model unscaled, matching the unscaled
# features used for the train/validation split.
# Predict with the best iteration found during training.
preds = gbm.predict(test, num_iteration=gbm.best_iteration)

# The prediction output is a row of per-class probabilities for each sample;
# keep the index of the largest entry as the predicted class.
result = [int(np.argmax(class_scores)) for class_scores in preds]

In [None]:
# Write one (ID, predicted class) row per test sample under a header row.
# The context manager closes the file even if a write fails, which the
# previous manual open()/close() pair did not guarantee.
# NOTE(review): the file name mentions XGBClassifier although the model in
# this notebook is LightGBM — path left unchanged; confirm the intended name.
with open('./result/20201008-01-Result-XGBClassifier-1.csv', 'wt', encoding="UTF8", newline='') as csvFile:
    myWriter = csv.writer(csvFile, delimiter=",")
    myWriter.writerow(['ID', 'hepatitis'])
    myWriter.writerows(zip(ID, result))