In [1]:
from pathlib import Path
import requests

# Local cache location for the MNIST archive: data/mnist/
DATA_PATH = Path("data")
PATH = DATA_PATH / "mnist"

# Create the cache directory (and any missing parents); no-op if it exists.
PATH.mkdir(parents=True, exist_ok=True)

URL = "http://deeplearning.net/data/mnist/"
FILENAME = "mnist.pkl.gz"

# Download only when the archive is not cached yet, so re-running this cell
# does not hit the network again.
if not (PATH / FILENAME).exists():
    # A timeout keeps the cell from hanging indefinitely on an unresponsive host.
    response = requests.get(URL + FILENAME, timeout=60)
    # Fail loudly on HTTP errors: otherwise an error page would be stored under
    # FILENAME and the exists() check above would never trigger a re-download.
    response.raise_for_status()
    content = response.content
    # write_bytes opens, writes and closes the file in one call, so no file
    # handle is left dangling.
    (PATH / FILENAME).write_bytes(content)

In [2]:
import pickle
import gzip

# Read the gzipped pickle and unpack the train and validation splits; the
# third element of the pickled tuple is discarded.
# NOTE(review): pickle.load can execute arbitrary code from the file it reads,
# and this archive was downloaded over plain HTTP -- only load trusted copies.
mnist_archive = (PATH / FILENAME).as_posix()
with gzip.open(mnist_archive, "rb") as archive:
    mnist_splits = pickle.load(archive, encoding="latin-1")
(x_train, y_train), (x_valid, y_valid), _ = mnist_splits

In [3]:
import pandas as pd
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn import datasets

In [4]:
import time

In [7]:
tt = time.time()
# Wrap the arrays in LightGBM's Dataset format.
train_data = lgb.Dataset(x_train, label=y_train)
validation_data = lgb.Dataset(x_valid, label=y_valid)

# Training parameters.
params = {
    'learning_rate': 0.1,
    'lambda_l1': 0.1,
    'lambda_l2': 0.2,
    'max_depth': 4,
    'objective': 'multiclass',  # objective function
    'num_class': 10,
}
# Train the model; the validation set's multi_logloss is reported per round.
gbm = lgb.train(params, train_data, valid_sets=[validation_data])

# Predict: each row of the result holds one score per class.
y_prob = gbm.predict(x_valid)
# Pick the highest-scoring class per row. np.argmax returns the first maximum,
# the same tie-breaking as list(x).index(max(x)), but vectorised.
# y_pred is now a NumPy integer array rather than a Python list.
y_pred = np.argmax(y_prob, axis=1)
# Show only a small sample instead of dumping every prediction into the output.
print(y_pred[:20])

# Evaluate accuracy on the validation split.
print(accuracy_score(y_valid, y_pred))
print('Time used: {} sec'.format(time.time()-tt))

[1]	valid_0's multi_logloss: 2.02436
[2]	valid_0's multi_logloss: 1.82194
[3]	valid_0's multi_logloss: 1.65978
[4]	valid_0's multi_logloss: 1.52579
[5]	valid_0's multi_logloss: 1.41207
[6]	valid_0's multi_logloss: 1.31297
[7]	valid_0's multi_logloss: 1.22754
[8]	valid_0's multi_logloss: 1.14996
[9]	valid_0's multi_logloss: 1.08022
[10]	valid_0's multi_logloss: 1.01926
[11]	valid_0's multi_logloss: 0.961815
[12]	valid_0's multi_logloss: 0.912
[13]	valid_0's multi_logloss: 0.864139
[14]	valid_0's multi_logloss: 0.822687
[15]	valid_0's multi_logloss: 0.784225
[16]	valid_0's multi_logloss: 0.748474
[17]	valid_0's multi_logloss: 0.71566
[18]	valid_0's multi_logloss: 0.68534
[19]	valid_0's multi_logloss: 0.656782
[20]	valid_0's multi_logloss: 0.631019
[21]	valid_0's multi_logloss: 0.60703
[22]	valid_0's multi_logloss: 0.584541
[23]	valid_0's multi_logloss: 0.562499
[24]	valid_0's multi_logloss: 0.542996
[25]	valid_0's multi_logloss: 0.524052
[26]	valid_0's multi_logloss: 0.506888
[27]	valid_

In [8]:
import xgboost as xgb

In [10]:
tt = time.time()
# DMatrix is XGBoost's own data format; it runs faster and performs better
# inside XGBoost.
dtrain = xgb.DMatrix(x_train, label=y_train)
# Built from the validation split (x_valid / y_valid), despite the name.
dtest = xgb.DMatrix(x_valid, label=y_valid)

xgb_params = {
    'seed': 0,
    'eta': 0.1,
    'colsample_bytree': 0.5,
    # NOTE(review): newer XGBoost releases replace 'silent' with 'verbosity';
    # left unchanged here because the installed version is not known -- confirm.
    'silent': 1,
    'subsample': 0.5,
    'objective': 'multi:softmax',
    'num_class':10,
    'max_depth': 4,
    'min_child_weight': 3
}

# Train for 100 boosting rounds, reporting the error on both sets each round.
bst = xgb.train(
    xgb_params,
    dtrain,
    num_boost_round=100,
    evals=[(dtrain, 'train'), (dtest, 'test')],
)

# 'multi:softmax' predicts class labels, returned as floats; cast them to int
# so they have the same kind of values as the integer labels in y_valid.
preds = bst.predict(dtest).astype(int)
xgb_accuracy = accuracy_score(y_valid, preds)
# Legacy name kept in case later cells read it; the value is classification
# accuracy, not ROC AUC.
auc = xgb_accuracy

print(xgb_accuracy)
print('Time used: {} sec'.format(time.time()-tt))

[0]	train-merror:0.24184	test-merror:0.2277
[1]	train-merror:0.18706	test-merror:0.174
[2]	train-merror:0.16302	test-merror:0.1559
[3]	train-merror:0.14826	test-merror:0.1422
[4]	train-merror:0.13994	test-merror:0.133
[5]	train-merror:0.13582	test-merror:0.1286
[6]	train-merror:0.12844	test-merror:0.1213
[7]	train-merror:0.12242	test-merror:0.118
[8]	train-merror:0.11972	test-merror:0.1146
[9]	train-merror:0.11554	test-merror:0.1126
[10]	train-merror:0.11258	test-merror:0.1094
[11]	train-merror:0.11028	test-merror:0.1058
[12]	train-merror:0.1083	test-merror:0.1043
[13]	train-merror:0.10516	test-merror:0.0999
[14]	train-merror:0.10314	test-merror:0.0983
[15]	train-merror:0.1008	test-merror:0.0964
[16]	train-merror:0.10018	test-merror:0.0964
[17]	train-merror:0.09894	test-merror:0.0953
[18]	train-merror:0.09682	test-merror:0.0951
[19]	train-merror:0.09502	test-merror:0.0934
[20]	train-merror:0.09294	test-merror:0.0917
[21]	train-merror:0.09166	test-merror:0.09
[22]	train-merror:0.08956	t