# 导入数据

In [1]:
# 使用sklearn的函数来获取MNIST数据集
from sklearn.datasets import fetch_openml
import numpy as np
import os

In [2]:
# to make this notebook's output stable across runs
np.random.seed(42)

In [3]:
import ssl
# 全局取消证书验证
ssl._create_default_https_context = ssl._create_unverified_context

In [4]:
import time
time_begin=time.time()
mnist=fetch_openml('mnist_784')
time_end=time.time()
time_data=time_end-time_begin
print('加载数据集所用时间:',time_data)

加载数据集所用时间: 46.283408641815186


In [5]:
X,y=mnist['data'],mnist['target']
X.shape #数据X共有7万张图片，每张图片有784个特征。因为图片是28×28像素，每个特征代表了一个像素点的强度，从0（白色）到255（黑色），

(70000, 784)

In [6]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)

In [7]:
X_train.shape

(49000, 784)

# 基分类器1

In [8]:
import time
import os

In [9]:
from sklearn.tree import DecisionTreeClassifier
start_time_base1=time.time()
classifier1=DecisionTreeClassifier(criterion='gini',splitter='best',max_depth=18,min_samples_split=5,random_state=42)#第一次测试最优
classifier1.fit(X_train,y_train)
score1=classifier1.score(X_test,y_test)
print(score1)
end_time_base1=time.time()

0.8696666666666667


In [10]:
base_time1=end_time_base1-start_time_base1
print('基分类器所用时间：',base_time1)

基分类器所用时间： 27.306578397750854


In [11]:
import joblib

#保存Model
joblib.dump(classifier1,'model_mnist/base1_learner_mnist.pkl')

['model_mnist/base1_learner_mnist.pkl']

In [12]:
print(os.path.getsize('model_mnist/base1_learner_mnist.pkl')/1024/1024) #MB

0.6030960083007812


# Bagging

In [13]:
def get_dir_size(target_dir):
    pkl_size=[] #MB
    dir_list=os.listdir(target_dir)
    print(dir_list)
    #计算每个文件的大小
    for file in dir_list:
        file = os.path.join(target_dir, file)
        #如果是文件，直接通过getsize计算大小并加到size中
        if os.path.isfile(file):
            pkl_size.append(os.path.getsize(file)/1024/1024) #MB
    return pkl_size
#没按理想顺序排列

In [34]:
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score
from tqdm import tqdm

def bagging_clf(base_learner,target_dir,n):
    #性能指标
    time_bagging=[] #s
    top1_bagging=[]
    
#     n= [20,40,60,80,100,120,140,170,300,400,500]

    
    for i in tqdm(n):
    
        start_time_bagging=time.time()

        # 创建Bagging集成学习器
        bagging_clf = BaggingClassifier(base_estimator=base_learner, n_estimators=i, random_state=42,n_jobs=-1,bootstrap=True)
        bagging_clf.fit(X_train, y_train)
        y_pred_bagging = bagging_clf.predict(X_test)

        # 评估性能
    #     print("Bagging Accuracy:", accuracy_score(y_test, y_pred_bagging))
        top1_bagging.append(accuracy_score(y_test, y_pred_bagging))

        end_time_bagging=time.time()
        bagging_time=end_time_bagging-start_time_bagging
    #     print('Bagging所用时间：',bagging_time)
        time_bagging.append(bagging_time)

        joblib.dump(bagging_clf,f'{target_dir}/bagging_mnist_{i}.pkl')
    
    return time_bagging,top1_bagging

In [15]:
time_bagging1,top1_bagging1=bagging_clf(classifier1,'model_mnist/model1_bagging')

100%|███████████████████████████████████████████████████████████████████████████████| 11/11 [1:08:03<00:00, 371.27s/it]


In [16]:
pkl_size_bagging1=get_dir_size('model_mnist/model1_bagging')

['bagging_mnist_100.pkl', 'bagging_mnist_120.pkl', 'bagging_mnist_140.pkl', 'bagging_mnist_170.pkl', 'bagging_mnist_20.pkl', 'bagging_mnist_300.pkl', 'bagging_mnist_40.pkl', 'bagging_mnist_400.pkl', 'bagging_mnist_500.pkl', 'bagging_mnist_60.pkl', 'bagging_mnist_80.pkl']


In [17]:
#保存结果
n= [20,40,60,80,100,120,140,170,300,400,500]
import pandas as pd
c1={"基分类器个数" : n,
   "performance" : top1_bagging1,
  "时间":time_bagging1,
  "pkl_size_bagging1":pkl_size_bagging1}#将列表a，b转换成字典
bagging_result1=pd.DataFrame(c1)#将字典转换成为数据框
print(bagging_result1)
bagging_result1.to_csv('model_mnist/bagging_MNIST_result1.csv')

    基分类器个数  performance           时间  pkl_size_bagging1
0       20     0.950667    45.250962          45.023484
1       40     0.951381    78.520336          53.945835
2       60     0.952714   110.638683          62.865065
3       80     0.953429   181.730240          76.296826
4      100     0.951667   219.069227           9.492064
5      120     0.952619   240.434104         134.295970
6      140     0.953143   304.146207          18.390792
7      170     0.953095   369.168261         178.836874
8      300     0.954476   609.535818         223.488550
9      400     0.954762   879.549972          27.278916
10     500     0.954524  1040.362046          36.135383


# Boosting

In [35]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
from tqdm import tqdm

def boosting_clf(base_learner,target_dir,n):

    #性能指标
    time_boosting=[] #s
    top1_boosting=[]

#     n= [20,40,60,80,100,120,140,170,300,400,500]
#     n= [20,40,60,80,100,120,140,170,200,300]

    for i in tqdm(n):

        start_time_boosting=time.time()

        # 创建Bagging集成学习器
        boosting_clf = AdaBoostClassifier(base_estimator=base_learner, n_estimators=i, random_state=42)
        boosting_clf.fit(X_train, y_train)
        y_pred_boosting = boosting_clf.predict(X_test)

        # 评估性能
        top1_boosting.append(accuracy_score(y_test, y_pred_boosting))

        end_time_boosting=time.time()
        boosting_time=end_time_boosting-start_time_boosting
        time_boosting.append(boosting_time)
        
        joblib.dump(boosting_clf,f'{target_dir}/boosting_mnist_{i}.pkl')    
    
    return time_boosting,top1_boosting

In [36]:
time_boosting1,top1_boosting1=boosting_clf(classifier1,'model_mnist/model1_boosting')

  0%|                                                                                           | 0/11 [01:01<?, ?it/s]


KeyboardInterrupt: 

看错了，导致点错了，还好保存了csv，下次注释写清楚点

In [None]:
pkl_size_boosting1=get_dir_size('model_mnist/model1_boosting')

In [None]:
#保存结果
import pandas as pd
cb1={"基分类器个数" : n,
   "performance" : top1_boosting1,
  "时间":time_boosting1,
  "pkl_size_boosting1":pkl_size_boosting1}#将列表a，b转换成字典
boost_result1=pd.DataFrame(cb1)#将字典转换成为数据框
print(boost_result1)
boost_result1.to_csv('model_mnist/boosting_MNIST_result1.csv')

# 基分类器2

In [22]:
start_time_base2=time.time()
classifier2=DecisionTreeClassifier(criterion='gini',splitter='best',max_depth=10,min_samples_split=5,random_state=42)#第一次测试最优
classifier2.fit(X_train,y_train)
score2=classifier2.score(X_test,y_test)
print(score2)
end_time_base2=time.time()

0.8536666666666667


In [23]:
base_time2=end_time_base2-start_time_base2
print('基分类器所用时间：',base_time2)

基分类器所用时间： 18.06419563293457


In [24]:
import joblib

#保存Model
joblib.dump(classifier2,'model_mnist/base2_learner_mnist.pkl')

['model_mnist/base2_learner_mnist.pkl']

In [25]:
print(os.path.getsize('model_mnist/base2_learner_mnist.pkl')/1024/1024)

0.21736907958984375


# Bagging

In [26]:
time_bagging2,top1_bagging2=bagging_clf(classifier2,'model_mnist/model2_bagging')

100%|█████████████████████████████████████████████████████████████████████████████████| 11/11 [41:03<00:00, 223.95s/it]


In [27]:
pkl_size_bagging2=get_dir_size('model_mnist/model2_bagging')

['bagging_mnist_100.pkl', 'bagging_mnist_120.pkl', 'bagging_mnist_140.pkl', 'bagging_mnist_170.pkl', 'bagging_mnist_20.pkl', 'bagging_mnist_300.pkl', 'bagging_mnist_40.pkl', 'bagging_mnist_400.pkl', 'bagging_mnist_500.pkl', 'bagging_mnist_60.pkl', 'bagging_mnist_80.pkl']


In [28]:
#保存结果
import pandas as pd
c2={"基分类器个数" : n,
   "performance" : top1_bagging2,
  "时间":time_bagging2,
  "pkl_size_bagging":pkl_size_bagging2}#将列表a，b转换成字典
bagging_result2=pd.DataFrame(c2)#将字典转换成为数据框
print(bagging_result2)
bagging_result2.to_csv('model_mnist/bagging_MNIST_result2.csv')

    基分类器个数  performance          时间  pkl_size_bagging
0       20     0.931571   29.990106         19.370698
1       40     0.932571   54.064914         23.189106
2       60     0.932476   80.307544         26.994276
3       80     0.932619  101.904251         32.805402
4      100     0.931762  132.199790          4.008879
5      120     0.931571  153.232098         57.668086
6      140     0.931714  179.325649          7.878890
7      170     0.932238  219.361088         76.696020
8      300     0.932476  378.843054         95.927073
9      400     0.933095  504.451967         11.713136
10     500     0.933048  626.647793         15.529990


# Boosting

In [37]:
n= [20,40,60,80,100,120,140,170,300,400,500]
time_boosting2,top1_boosting2=boosting_clf(classifier2,'model_mnist/model2_boosting',n)

100%|██████████████████████████████████████████████████████████████████████████████| 11/11 [9:35:21<00:00, 3138.36s/it]


In [38]:
# print(time_boosting,top1_boosting)

In [39]:
pkl_size_boosting2=get_dir_size('model_mnist/model2_boosting')

['boosting_mnist_100.pkl', 'boosting_mnist_120.pkl', 'boosting_mnist_140.pkl', 'boosting_mnist_170.pkl', 'boosting_mnist_20.pkl', 'boosting_mnist_300.pkl', 'boosting_mnist_40.pkl', 'boosting_mnist_400.pkl', 'boosting_mnist_500.pkl', 'boosting_mnist_60.pkl', 'boosting_mnist_80.pkl']


In [40]:
#保存结果
import pandas as pd
cb2={"基分类器个数" : n,
   "performance" : top1_boosting2,
  "时间":time_boosting2,
  "pkl_size_boosting1":pkl_size_boosting2}#将列表a，b转换成字典
boost_result2=pd.DataFrame(cb2)#将字典转换成为数据框
print(boost_result2)
boost_result2.to_csv('model_mnist/boosting_MNIST_result2.csv')

    基分类器个数  performance           时间  pkl_size_boosting1
0       20     0.911619   416.166608           12.620796
1       40     0.932810   792.731243           14.685902
2       60     0.941952  1170.273323           17.057098
3       80     0.945667  1564.946035           20.457346
4      100     0.950286  1955.616270            3.095284
5      120     0.953238  2350.902061           36.451831
6      140     0.954095  2733.736177            5.558567
7      170     0.956905  3277.244351           48.188750
8      300     0.960048  5514.028136           59.978068
9      400     0.962381  6802.416414            8.043121
10     500     0.963619  7941.578051           10.311337


跑完中断服务再运行函数 

重跑bagging/boosting函数后再运行下列实验

# 基分类器3

In [41]:
start_time_base3=time.time()
classifier3=DecisionTreeClassifier(criterion='gini',splitter='best',max_depth=8,min_samples_split=5,random_state=42)#第一次测试最优
classifier3.fit(X_train,y_train)
score3=classifier3.score(X_test,y_test)
print(score3)
end_time_base3=time.time()

0.8048095238095239


In [42]:
base_time3=end_time_base3-start_time_base3
print('基分类器所用时间：',base_time3)

基分类器所用时间： 16.20579433441162


In [43]:
import joblib

#保存Model
joblib.dump(classifier3,'model_mnist/base3_learner_mnist.pkl')

['model_mnist/base3_learner_mnist.pkl']

In [44]:
print(os.path.getsize('model_mnist/base3_learner_mnist.pkl')/1024/1024)

0.06743621826171875


# Bagging

In [45]:
n= [20,40,60,80,100,120,140,170,200,300]
time_bagging3,top1_bagging3=bagging_clf(classifier3,'model_mnist/model3_bagging',n)

100%|█████████████████████████████████████████████████████████████████████████████████| 10/10 [17:32<00:00, 105.25s/it]


In [46]:
pkl_size_bagging3=get_dir_size('model_mnist/model3_bagging')

['bagging_mnist_100.pkl', 'bagging_mnist_120.pkl', 'bagging_mnist_140.pkl', 'bagging_mnist_170.pkl', 'bagging_mnist_20.pkl', 'bagging_mnist_200.pkl', 'bagging_mnist_300.pkl', 'bagging_mnist_40.pkl', 'bagging_mnist_60.pkl', 'bagging_mnist_80.pkl']


In [47]:
#保存结果
import pandas as pd
n= [20,40,60,80,100,120,140,170,200,300]
c3={"基分类器个数" : n,
   "performance" : top1_bagging3,
  "时间":time_bagging3,
  "pkl_size_bagging":pkl_size_bagging3}#将列表a，b转换成字典
bagging_result3=pd.DataFrame(c3)#将字典转换成为数据框
print(bagging_result3)
bagging_result3.to_csv('model_mnist/bagging_MNIST_result3.csv')

   基分类器个数  performance          时间  pkl_size_bagging
0      20     0.896190   22.717244          6.870759
1      40     0.893905   39.213464          8.224613
2      60     0.892524   50.972113          9.571714
3      80     0.892286   69.572621         11.620252
4     100     0.890476   86.770426          1.415663
5     120     0.890619  101.528722         13.646489
6     140     0.891333  117.682910         20.432857
7     170     0.890762  143.290583          2.791030
8     200     0.891048  169.517843          4.146714
9     300     0.892238  249.688899          5.504981


# Boosting

In [48]:
n= [20,40,60,80,100,120,140,170,200,300]
time_boosting3,top1_boosting3=boosting_clf(classifier3,'model_mnist/model3_boosting',n)

100%|██████████████████████████████████████████████████████████████████████████████| 10/10 [5:40:27<00:00, 2042.76s/it]


In [49]:
pkl_size_boosting3=get_dir_size('model_mnist/model3_boosting')

['boosting_mnist_100.pkl', 'boosting_mnist_120.pkl', 'boosting_mnist_140.pkl', 'boosting_mnist_170.pkl', 'boosting_mnist_20.pkl', 'boosting_mnist_200.pkl', 'boosting_mnist_300.pkl', 'boosting_mnist_40.pkl', 'boosting_mnist_60.pkl', 'boosting_mnist_80.pkl']


In [50]:
#保存结果
import pandas as pd
cb3={"基分类器个数" : n,
   "performance" : top1_boosting3,
  "时间":time_boosting3,
  "pkl_size_boosting1":pkl_size_boosting3}#将列表a，b转换成字典
boost_result3=pd.DataFrame(cb3)#将字典转换成为数据框8
print(boost_result3)
boost_result3.to_csv('model_mnist/boosting_MNIST_result3.csv')

   基分类器个数  performance           时间  pkl_size_boosting1
0      20     0.878095   340.244428            4.807401
1      40     0.902619   681.822877            5.644752
2      60     0.917048  1022.027747            6.473804
3      80     0.925714  1359.982244            7.853319
4     100     0.934143  1698.923945            1.170280
5     120     0.935524  2038.498396            9.206897
6     140     0.940048  2370.618853           13.435717
7     170     0.944000  2850.202939            2.081050
8     200     0.945810  3327.834531            3.003239
9     300     0.950810  4736.178408            3.823991
