# 导入数据

In [1]:
import numpy as np
import os
import time
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from tqdm import tqdm
from sklearn.datasets import load_files

In [2]:
import  pickle
# 加载数据集
def load_cifar10_batch(filename):
  
    with open(filename,'rb') as f:
        datadict=pickle.load(f,encoding='latin1')
        X=datadict['data']
        Y=datadict['labels']
        Y=np.array(Y)
    return X,Y

In [3]:
def load_cifar10(root):
    xs=[]
    ys=[]
    for b in range(1,6):
        f=os.path.join(root,'data_batch_%d'%(b,))
        X,Y=load_cifar10_batch(f)
        xs.append(X)
        ys.append(Y)
    Xtr= np.vstack(xs)
    Ytr=np.hstack(ys)
    del X,Y
    Xte,Yte=load_cifar10_batch(os.path.join(root,'test_batch'))
    return Xtr,Ytr,Xte,Yte

In [4]:
Xtr,Ytr,Xte,Yte=load_cifar10("./cifar10/")

In [5]:
print(Xtr.shape, Ytr.shape,Xte.shape,Yte.shape)

(50000, 3072) (50000,) (10000, 3072) (10000,)


In [6]:
X = np.vstack((Xtr,Xte))

In [7]:
y= np.concatenate((Ytr, Yte))

In [8]:
print(X.shape,y.shape)

(60000, 3072) (60000,)


In [9]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)

In [10]:
X_train.shape

(42000, 3072)

# 基分类器1

In [11]:
import time
import os

In [12]:
from sklearn.tree import DecisionTreeClassifier
start_time_base1=time.time()
classifier1=DecisionTreeClassifier(criterion='gini',splitter='best',max_depth=18,min_samples_split=5,random_state=42)#第一次测试最优
classifier1.fit(X_train,y_train)
score1=classifier1.score(X_test,y_test)
print(score1)
end_time_base1=time.time()

0.26761111111111113


In [13]:
base_time1=end_time_base1-start_time_base1
print('基分类器所用时间：',base_time1)

基分类器所用时间： 195.43886256217957


In [14]:
import joblib

#保存Model
joblib.dump(classifier1,'model_cifar10/base1_learner_cifar10.pkl')

['model_cifar10/base1_learner_cifar10.pkl']

In [15]:
print(os.path.getsize('model_cifar10/base1_learner_cifar10.pkl')/1024/1024)

1.6970195770263672


# Bagging

In [16]:
def get_dir_size(target_dir):
    pkl_size=[] #MB
    dir_list=os.listdir(target_dir)
    print(dir_list)
    #计算每个文件的大小
    for file in dir_list:
        file = os.path.join(target_dir, file)
        #如果是文件，直接通过getsize计算大小并加到size中
        if os.path.isfile(file):
            pkl_size.append(os.path.getsize(file)/1024/1024) #MB
    return pkl_size

In [37]:
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score
from tqdm import tqdm

def bagging_clf(base_learner,target_dir,n):
    #性能指标
    time_bagging=[] #s
    top1_bagging=[]
    
#     n= [20,40,60,80,100,120,140,170,300,400,500]
    
    for i in tqdm(n):
    
        start_time_bagging=time.time()

        # 创建Bagging集成学习器
        bagging_clf = BaggingClassifier(base_estimator=base_learner, n_estimators=i, random_state=42,n_jobs=-1,bootstrap=True)
        bagging_clf.fit(X_train, y_train)
        y_pred_bagging = bagging_clf.predict(X_test)

        # 评估性能
    #     print("Bagging Accuracy:", accuracy_score(y_test, y_pred_bagging))
        top1_bagging.append(accuracy_score(y_test, y_pred_bagging))

        end_time_bagging=time.time()
        bagging_time=end_time_bagging-start_time_bagging
    #     print('Bagging所用时间：',bagging_time)
        time_bagging.append(bagging_time)

        joblib.dump(bagging_clf,f'{target_dir}/bagging_cifar10_{i}.pkl')
    
    return time_bagging,top1_bagging

In [18]:
time_bagging1,top1_bagging1=bagging_clf(classifier1,'model_cifar10/model1_bagging')

100%|██████████████████████████████████████████████████████████████████████████████| 11/11 [6:06:45<00:00, 2000.50s/it]


In [19]:
pkl_size_bagging1=get_dir_size('model_cifar10/model1_bagging')

['bagging_cifar10_100.pkl', 'bagging_cifar10_120.pkl', 'bagging_cifar10_140.pkl', 'bagging_cifar10_170.pkl', 'bagging_cifar10_20.pkl', 'bagging_cifar10_300.pkl', 'bagging_cifar10_40.pkl', 'bagging_cifar10_400.pkl', 'bagging_cifar10_500.pkl', 'bagging_cifar10_60.pkl', 'bagging_cifar10_80.pkl']


In [20]:
#保存结果
import pandas as pd
n= [20,40,60,80,100,120,140,170,300,400,500]
c1={"基分类器个数" : n,
   "performance" : top1_bagging1,
  "时间":time_bagging1,
  "pkl_size_bagging1":pkl_size_bagging1}#将列表a，b转换成字典
bagging_result1=pd.DataFrame(c1)#将字典转换成为数据框
print(bagging_result1)
bagging_result1.to_csv('model_cifar10/bagging_cifar10_result1.csv')

    基分类器个数  performance           时间  pkl_size_bagging1
0       20     0.411222   288.130097         123.077250
1       40     0.435889   539.409804         147.316343
2       60     0.444333   722.692421         171.655815
3       80     0.452333   997.036239         208.376822
4      100     0.457722  1210.448237          26.048515
5      120     0.461444  1403.024829         365.652562
6      140     0.464056  1670.265264          50.342065
7      170     0.467111  2067.906025         486.981323
8      300     0.469667  3382.287407         608.567158
9      400     0.471944  4187.489450          74.638499
10     500     0.470833  5526.038850          98.882264


# Boosting

In [38]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
from tqdm import tqdm

def boosting_clf(base_learner,target_dir,n):

    #性能指标
    time_boosting=[] #s
    top1_boosting=[]

#     n= [20,40,60,80,100,120,140,170,300,400,500]

    for i in tqdm(n):

        start_time_boosting=time.time()

        # 创建Bagging集成学习器
        boosting_clf = AdaBoostClassifier(base_estimator=base_learner, n_estimators=i, random_state=42)
        boosting_clf.fit(X_train, y_train)
        y_pred_boosting = boosting_clf.predict(X_test)

        # 评估性能
        top1_boosting.append(accuracy_score(y_test, y_pred_boosting))

        end_time_boosting=time.time()
        boosting_time=end_time_boosting-start_time_boosting
        time_boosting.append(boosting_time)
        
        joblib.dump(boosting_clf,f'{target_dir}/boosting_cifar10_{i}.pkl')    
    
    return time_boosting,top1_boosting

In [22]:
time_boosting1,top1_boosting1=boosting_clf(classifier1,'model_cifar10/model1_boosting')

 36%|██████████████████████████▉                                               | 4/11 [13:00:16<22:45:29, 11704.21s/it]


KeyboardInterrupt: 

In [36]:
print(time_boosting1,top1_boosting1) #中间结果打印不出来 函数中断没有return成功

NameError: name 'time_boosting1' is not defined

In [23]:
pkl_size_boosting1=get_dir_size('model_cifar10/model1_boosting')

['boosting_cifar10_20.pkl', 'boosting_cifar10_40.pkl', 'boosting_cifar10_60.pkl', 'boosting_cifar10_80.pkl']


In [31]:
print(pkl_size_boosting1)

[36.617488861083984, 73.30146789550781, 112.44429397583008, 148.91400909423828]


In [40]:
#保存结果
n= [20,40,60,80]
import pandas as pd
cb1={"基分类器个数" : n,
   "performance" : top1_boosting1,
  "时间":time_boosting1,
  "pkl_size_boosting":pkl_size_boosting1}#将列表a，b转换成字典
boost_result1=pd.DataFrame(cb1)#将字典转换成为数据框
print(boost_result1)
boost_result1.to_csv('model_cifar10/boosting_cifar10_result1.csv')

NameError: name 'top1_boosting1' is not defined

继续跑剩下的结果

In [43]:
n= [20,40,60,80,100,120,140,170,200,300]
time_boosting11,top1_boosting11=boosting_clf(classifier1,'model_cifar10/model1_boosting',n)

100%|████████████████████████████████████████████████████████████████████████████| 10/10 [78:55:48<00:00, 28414.89s/it]


In [44]:
pkl_size_boosting11=get_dir_size('model_cifar10/model1_boosting')

['boosting_cifar10_100.pkl', 'boosting_cifar10_120.pkl', 'boosting_cifar10_140.pkl', 'boosting_cifar10_170.pkl', 'boosting_cifar10_20.pkl', 'boosting_cifar10_200.pkl', 'boosting_cifar10_300.pkl', 'boosting_cifar10_40.pkl', 'boosting_cifar10_60.pkl', 'boosting_cifar10_80.pkl']


In [45]:
#保存结果
import pandas as pd
cb11={"基分类器个数" : n,
   "performance" : top1_boosting11,
  "时间":time_boosting11,
  "pkl_size_boosting":pkl_size_boosting11}#将列表a，b转换成字典
boost_result11=pd.DataFrame(cb11)#将字典转换成为数据框
print(boost_result11)
boost_result11.to_csv('model_cifar10/boosting_cifar10_result1_1.csv')

   基分类器个数  performance            时间  pkl_size_boosting
0      20     0.340056   4498.058643         183.643673
1      40     0.382333   8081.870675         219.865662
2      60     0.401111  13839.976938         256.570911
3      80     0.415111  17208.776704         312.480135
4     100     0.424278  21510.124742          36.617489
5     120     0.430833  25002.603672         365.217422
6     140     0.434000  29890.642432         547.779746
7     170     0.439889  35952.013403          73.301468
8     200     0.445333  46887.961143         112.444294
9     300     0.457389  81268.161003         148.914009


运行结束之后暂停  重跑下列2组实验 已完成

# 基分类器2

In [46]:
start_time_base2=time.time()
classifier2=DecisionTreeClassifier(criterion='gini',splitter='best',max_depth=10,min_samples_split=5,random_state=42)#第一次测试最优
classifier2.fit(X_train,y_train)
score2=classifier2.score(X_test,y_test)
print(score2)
end_time_base2=time.time()

0.29588888888888887


In [47]:
base_time2=end_time_base2-start_time_base2
print('基分类器所用时间：',base_time2)

基分类器所用时间： 175.94180989265442


In [48]:
import joblib

#保存Model
joblib.dump(classifier2,'model_cifar10/base2_learner_cifar10.pkl')

['model_cifar10/base2_learner_cifar10.pkl']

In [49]:
print(os.path.getsize('model_cifar10/base2_learner_cifar10.pkl')/1024/1024)

0.2199993133544922


# Bagging

In [50]:
n= [20,40,60,80,100,120,140,170,200,300]
time_bagging2,top1_bagging2=bagging_clf(classifier2,'model_cifar10/model2_bagging',n)

100%|██████████████████████████████████████████████████████████████████████████████| 10/10 [4:05:00<00:00, 1470.06s/it]


In [51]:
pkl_size_bagging2=get_dir_size('model_cifar10/model2_bagging')

['bagging_cifar10_100.pkl', 'bagging_cifar10_120.pkl', 'bagging_cifar10_140.pkl', 'bagging_cifar10_170.pkl', 'bagging_cifar10_20.pkl', 'bagging_cifar10_200.pkl', 'bagging_cifar10_300.pkl', 'bagging_cifar10_40.pkl', 'bagging_cifar10_60.pkl', 'bagging_cifar10_80.pkl']


In [52]:
#保存结果
import pandas as pd
c2={"基分类器个数" : n,
   "performance" : top1_bagging2,
  "时间":time_bagging2,
  "pkl_size_bagging":pkl_size_bagging2}#将列表a，b转换成字典
bagging_result2=pd.DataFrame(c2)#将字典转换成为数据框
print(bagging_result2)
bagging_result2.to_csv('model_cifar10/bagging_cifar10_result2.csv')

   基分类器个数  performance           时间  pkl_size_bagging
0      20     0.399056   260.158357         21.516673
1      40     0.412444   503.041526         25.735838
2      60     0.418000   709.394526         30.015175
3      80     0.418111   972.865941         36.441184
4     100     0.421278  1216.894387          4.503319
5     120     0.424389  1426.272881         42.799499
6     140     0.423889  1678.214157         64.020848
7     170     0.424833  2026.910001          8.728173
8     200     0.425167  2386.292365         13.001307
9     300     0.424056  3516.926649         17.237596


# Boosting

In [53]:
n= [20,40,60,80,100,120,140,170,200,300]
time_boosting2,top1_boosting2=boosting_clf(classifier2,'model_cifar10/model2_boosting',n)

100%|████████████████████████████████████████████████████████████████████████████| 10/10 [59:47:17<00:00, 21523.76s/it]


In [54]:
pkl_size_boosting2=get_dir_size('model_cifar10/model2_boosting')

['boosting_cifar10_100.pkl', 'boosting_cifar10_120.pkl', 'boosting_cifar10_140.pkl', 'boosting_cifar10_170.pkl', 'boosting_cifar10_20.pkl', 'boosting_cifar10_200.pkl', 'boosting_cifar10_300.pkl', 'boosting_cifar10_40.pkl', 'boosting_cifar10_60.pkl', 'boosting_cifar10_80.pkl']


In [55]:
#保存结果
import pandas as pd
cb2={"基分类器个数" : n,
   "performance" : top1_boosting2,
  "时间":time_boosting2,
  "pkl_size_boosting":pkl_size_boosting2}#将列表a，b转换成字典
boost_result2=pd.DataFrame(cb2)#将字典转换成为数据框
print(boost_result2)
boost_result2.to_csv('model_cifar10/boosting_cifar10_result2.csv')

   基分类器个数  performance            时间  pkl_size_boosting
0      20     0.265056   4241.890586          17.523251
1      40     0.294000   8249.265976          21.099564
2      60     0.314278  11924.379053          24.692219
3      80     0.333389  15817.183200          29.969011
4     100     0.342111  19972.752807           3.545620
5     120     0.354611  23654.145258          35.483932
6     140     0.365222  26287.187249          53.382102
7     170     0.372833  25946.995061           6.867722
8     200     0.382556  31102.209241          10.350391
9     300     0.398944  48039.403347          13.965874


# 基分类器3

In [56]:
start_time_base3=time.time()
classifier3=DecisionTreeClassifier(criterion='gini',splitter='best',max_depth=8,min_samples_split=5,random_state=42)#第一次测试最优
classifier3.fit(X_train,y_train)
score3=classifier3.score(X_test,y_test)
print(score3)
end_time_base3=time.time()

0.2876666666666667


In [57]:
base_time3=end_time_base3-start_time_base3
print('基分类器所用时间：',base_time3)

基分类器所用时间： 130.3557469844818


In [58]:
import joblib

#保存Model
joblib.dump(classifier3,'model_cifar10/base3_learner_cifar10.pkl')

['model_cifar10/base3_learner_cifar10.pkl']

In [59]:
print(os.path.getsize('model_cifar10/base3_learner_cifar10.pkl')/1024/1024)

0.06669425964355469


# Bagging

In [60]:
n= [20,40,60,80,100,120,140,170,200,300]
time_bagging3,top1_bagging3=bagging_clf(classifier3,'model_cifar10/model3_bagging',n)

100%|███████████████████████████████████████████████████████████████████████████████| 10/10 [2:25:45<00:00, 874.58s/it]


In [61]:
pkl_size_bagging3=get_dir_size('model_cifar10/model3_bagging')

['bagging_cifar10_100.pkl', 'bagging_cifar10_120.pkl', 'bagging_cifar10_140.pkl', 'bagging_cifar10_170.pkl', 'bagging_cifar10_20.pkl', 'bagging_cifar10_200.pkl', 'bagging_cifar10_300.pkl', 'bagging_cifar10_40.pkl', 'bagging_cifar10_60.pkl', 'bagging_cifar10_80.pkl']


In [62]:
#保存结果
import pandas as pd
c3={"基分类器个数" : n,
   "performance" : top1_bagging3,
  "时间":time_bagging3,
  "pkl_size_bagging":pkl_size_bagging3}#将列表a，b转换成字典
bagging_result3=pd.DataFrame(c3)#将字典转换成为数据框
print(bagging_result3)
bagging_result3.to_csv('model_cifar10/bagging_cifar10_result3.csv')

   基分类器个数  performance           时间  pkl_size_bagging
0      20     0.378444   175.276000          7.702617
1      40     0.383556   379.606413          9.226622
2      60     0.386556   480.526399         10.753731
3      80     0.388278   566.732618         13.046988
4     100     0.389500   862.045287          1.600639
5     120     0.389889   979.400436         15.330916
6     140     0.389222  1058.413802         22.967259
7     170     0.388500  1076.414617          3.122811
8     200     0.388611  1309.487308          4.652019
9     300     0.388889  1855.717403          6.172914


# Boosting

In [63]:
n= [20,40,60,80,100,120,140,170,200,300]
time_boosting3,top1_boosting3=boosting_clf(classifier3,'model_cifar10/model3_boosting',n)

100%|████████████████████████████████████████████████████████████████████████████| 10/10 [45:00:43<00:00, 16204.40s/it]


In [64]:
pkl_size_boosting3=get_dir_size('model_cifar10/model3_boosting')

['boosting_cifar10_100.pkl', 'boosting_cifar10_120.pkl', 'boosting_cifar10_140.pkl', 'boosting_cifar10_170.pkl', 'boosting_cifar10_20.pkl', 'boosting_cifar10_200.pkl', 'boosting_cifar10_300.pkl', 'boosting_cifar10_40.pkl', 'boosting_cifar10_60.pkl', 'boosting_cifar10_80.pkl']


In [65]:
#保存结果
import pandas as pd
cb3={"基分类器个数" : n,
   "performance" : top1_boosting3,
  "时间":time_boosting3,
  "pkl_size_boosting":pkl_size_boosting3}#将列表a，b转换成字典
boost_result3=pd.DataFrame(cb3)#将字典转换成为数据框
print(boost_result3)
boost_result3.to_csv('model_cifar10/boosting_cifar10_result3.csv')

   基分类器个数  performance            时间  pkl_size_boosting
0      20     0.277222   2438.547966           6.080624
1      40     0.270889   4809.793720           7.300812
2      60     0.290556   7272.237727           8.518406
3      80     0.309278   9808.519388          10.318987
4     100     0.321056  12433.583216           1.261089
5     120     0.331278  15291.972367          12.167816
6     140     0.341667  18069.785604          18.170188
7     170     0.348778  23324.323385           2.430954
8     200     0.355722  29081.418851           3.637653
9     300     0.375111  39512.265176           4.859398
