# 导入数据

In [1]:
import numpy as np
import os
import time
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from tqdm import tqdm
from sklearn.datasets import load_files

In [2]:
import  pickle
# 加载数据集
def load_cifar100_batch(filename):

    with open(filename,'rb') as f:
        datadict=pickle.load(f,encoding='latin1')
        X=datadict['data']
        Y=datadict['fine_labels']
        Y=np.array(Y)
    return X,Y

In [3]:
def load_cifar100(root):
    Xtr,Ytr=load_cifar100_batch(os.path.join(root,'train'))
    Xte,Yte=load_cifar100_batch(os.path.join(root,'test'))
    return Xtr,Ytr,Xte,Yte

In [4]:
Xtr,Ytr,Xte,Yte=load_cifar100("./cifar100/")

In [5]:
print(Xtr.shape, Ytr.shape,Xte.shape,Yte.shape)

(50000, 3072) (50000,) (10000, 3072) (10000,)


In [6]:
X = np.vstack((Xtr,Xte))

In [7]:
y= np.concatenate((Ytr, Yte))

In [8]:
print(X.shape,y.shape)

(60000, 3072) (60000,)


In [9]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)

In [10]:
X_train.shape

(42000, 3072)

# 寻找最优决策树深度

In [24]:
from sklearn.tree import DecisionTreeClassifier
classifier=DecisionTreeClassifier(criterion='gini',splitter='best',max_depth=10,min_samples_split=5,random_state=42)#第一次测试最优
classifier.fit(X_train,y_train)
score=classifier.score(X_test,y_test)
print(score)

0.08594444444444445


In [25]:
from sklearn.tree import DecisionTreeClassifier
classifier=DecisionTreeClassifier(criterion='gini',splitter='best',max_depth=50,min_samples_split=5,random_state=42)#第一次测试最优
classifier.fit(X_train,y_train)
score=classifier.score(X_test,y_test)
print(score)

0.0805


In [26]:
from sklearn.tree import DecisionTreeClassifier
classifier=DecisionTreeClassifier(criterion='gini',splitter='best',max_depth=30,min_samples_split=5,random_state=42)#第一次测试最优
classifier.fit(X_train,y_train)
score=classifier.score(X_test,y_test)
print(score)

0.07922222222222222


In [27]:
from sklearn.tree import DecisionTreeClassifier
classifier=DecisionTreeClassifier(criterion='gini',splitter='best',max_depth=80,min_samples_split=5,random_state=42)#第一次测试最优
classifier.fit(X_train,y_train)
score=classifier.score(X_test,y_test)
print(score)

0.07927777777777778


In [28]:
from sklearn.tree import DecisionTreeClassifier
classifier=DecisionTreeClassifier(criterion='gini',splitter='best',max_depth=100,min_samples_split=5,random_state=42)#第一次测试最优
classifier.fit(X_train,y_train)
score=classifier.score(X_test,y_test)
print(score)

0.08016666666666666


In [29]:
from sklearn.tree import DecisionTreeClassifier
classifier=DecisionTreeClassifier(criterion='gini',splitter='best',max_depth=8,min_samples_split=5,random_state=42)#第一次测试最优
classifier.fit(X_train,y_train)
score=classifier.score(X_test,y_test)
print(score)

0.08172222222222222


In [30]:
from sklearn.tree import DecisionTreeClassifier
classifier=DecisionTreeClassifier(criterion='gini',splitter='best',max_depth=5,min_samples_split=5,random_state=42)#第一次测试最优
classifier.fit(X_train,y_train)
score=classifier.score(X_test,y_test)
print(score)

0.06222222222222222


In [31]:
from sklearn.tree import DecisionTreeClassifier
classifier=DecisionTreeClassifier(criterion='gini',splitter='best',max_depth=15,min_samples_split=5,random_state=42)#第一次测试最优
classifier.fit(X_train,y_train)
score=classifier.score(X_test,y_test)
print(score)

0.08427777777777777


# 基分类器1

In [11]:
import time
import os

In [12]:
from sklearn.tree import DecisionTreeClassifier
start_time_base1=time.time()
classifier1=DecisionTreeClassifier(criterion='gini',splitter='best',max_depth=18,min_samples_split=5,random_state=42)#第一次测试最优
classifier1.fit(X_train,y_train)
score1=classifier1.score(X_test,y_test)
print(score1)
end_time_base1=time.time()

0.08222222222222222


In [13]:
base_time1=end_time_base1-start_time_base1
print('基分类器所用时间：',base_time1)

基分类器所用时间： 421.55856370925903


In [14]:
import joblib

#保存Model
joblib.dump(classifier1,'model_cifar100/base1_learner_cifar100.pkl')

['model_cifar100/base1_learner_cifar100.pkl']

In [15]:
print(os.path.getsize('model_cifar100/base1_learner_cifar100.pkl')/1024/1024)

8.120092391967773


# Bagging

In [12]:
def get_dir_size(target_dir):
    pkl_size=[] #MB
    dir_list=os.listdir(target_dir)
    print(dir_list)
    #计算每个文件的大小
    for file in dir_list:
        file = os.path.join(target_dir, file)
        #如果是文件，直接通过getsize计算大小并加到size中
        if os.path.isfile(file):
            pkl_size.append(os.path.getsize(file)/1024/1024) #MB
    return pkl_size

重跑这个函数

In [13]:
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score
from tqdm import tqdm

def bagging_clf(base_learner,target_dir):
    #性能指标
    time_bagging=[] #s
    top1_bagging=[]
    
#     n= [20,40,60,80,100,120,140,170,300,400,500]
    n= [20,40,60,80,100,120,140,170,200,300]
    
    for i in tqdm(n):
    
        start_time_bagging=time.time()

        # 创建Bagging集成学习器
        bagging_clf = BaggingClassifier(base_estimator=base_learner, n_estimators=i, random_state=42,n_jobs=-1,bootstrap=True)
        bagging_clf.fit(X_train, y_train)
        y_pred_bagging = bagging_clf.predict(X_test)

        # 评估性能
    #     print("Bagging Accuracy:", accuracy_score(y_test, y_pred_bagging))
        top1_bagging.append(accuracy_score(y_test, y_pred_bagging))

        end_time_bagging=time.time()
        bagging_time=end_time_bagging-start_time_bagging
    #     print('Bagging所用时间：',bagging_time)
        time_bagging.append(bagging_time)

        joblib.dump(bagging_clf,f'{target_dir}/bagging_cifar100_{i}.pkl')
    
    return time_bagging,top1_bagging

In [18]:
time_bagging1,top1_bagging1=bagging_clf(classifier1,'model_cifar100/model1_bagging')

100%|█████████████████████████████████████████████████████████████████████████████| 11/11 [16:26:16<00:00, 5379.64s/it]


In [19]:
pkl_size_bagging1=get_dir_size('model_cifar100/model1_bagging')

['bagging_cifar100_100.pkl', 'bagging_cifar100_120.pkl', 'bagging_cifar100_140.pkl', 'bagging_cifar100_170.pkl', 'bagging_cifar100_20.pkl', 'bagging_cifar100_300.pkl', 'bagging_cifar100_40.pkl', 'bagging_cifar100_400.pkl', 'bagging_cifar100_500.pkl', 'bagging_cifar100_60.pkl', 'bagging_cifar100_80.pkl']


In [20]:
#保存结果
import pandas as pd
n= [20,40,60,80,100,120,140,170,300,400,500]
c1={"基分类器个数" : n,
   "performance" : top1_bagging1,
  "时间":time_bagging1,
  "pkl_size_bagging1":pkl_size_bagging1}#将列表a，b转换成字典
bagging_result1=pd.DataFrame(c1)#将字典转换成为数据框
print(bagging_result1)
bagging_result1.to_csv('model_cifar100/bagging_cifar100_result1.csv')

    基分类器个数  performance            时间  pkl_size_bagging1
0       20     0.157056    627.943440         484.405025
1       40     0.175944   1236.205108         579.197813
2       60     0.184333   1590.830074         674.767754
3       80     0.191778   2083.329652         816.479362
4      100     0.194389   2612.214642         102.549736
5      120     0.197778   3032.314652        1433.157369
6      140     0.201389   3728.886504         197.941705
7      170     0.202667   4777.291127        1908.936829
8      300     0.207556   8821.450423        2383.049320
9      400     0.207333  11460.187042         292.945125
10     500     0.208333  19154.862211         389.308564


# Boosting

In [14]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
from tqdm import tqdm

def boosting_clf(base_learner,target_dir):

    #性能指标
    time_boosting=[] #s
    top1_boosting=[]

    n= [20,40,60,80,100,120,140,170,200,300]

    for i in tqdm(n):

        start_time_boosting=time.time()

        # 创建Bagging集成学习器
        boosting_clf = AdaBoostClassifier(base_estimator=base_learner, n_estimators=i, random_state=42)
        boosting_clf.fit(X_train, y_train)
        y_pred_boosting = boosting_clf.predict(X_test)

        # 评估性能
        top1_boosting.append(accuracy_score(y_test, y_pred_boosting))

        end_time_boosting=time.time()
        boosting_time=end_time_boosting-start_time_boosting
        time_boosting.append(boosting_time)
        
        joblib.dump(boosting_clf,f'{target_dir}/boosting_cifar100_{i}.pkl')    
    
    return time_boosting,top1_boosting

In [35]:
time_boosting1,top1_boosting1=boosting_clf(classifier1,'model_cifar100/model1_boosting')

100%|████████████████████████████████████████████████████████████████████████████| 10/10 [87:05:10<00:00, 31351.04s/it]


In [36]:
pkl_size_boosting1=get_dir_size('model_cifar100/model1_boosting')

['boosting_cifar100_100.pkl', 'boosting_cifar100_120.pkl', 'boosting_cifar100_140.pkl', 'boosting_cifar100_170.pkl', 'boosting_cifar100_20.pkl', 'boosting_cifar100_200.pkl', 'boosting_cifar100_300.pkl', 'boosting_cifar100_40.pkl', 'boosting_cifar100_60.pkl', 'boosting_cifar100_80.pkl']


In [37]:
#保存结果
n= [20,40,60,80,100,120,140,170,200,300]
import pandas as pd
cb1={"基分类器个数" : n,
   "performance" : top1_boosting1,
  "时间":time_boosting1,
  "pkl_size_boosting":pkl_size_boosting1}#将列表a，b转换成字典
boost_result1=pd.DataFrame(cb1)#将字典转换成为数据框
print(boost_result1)
boost_result1.to_csv('model_cifar100/boosting_cifar100_result1.csv')

   基分类器个数  performance            时间  pkl_size_boosting
0      20     0.064667   7210.759424          93.361950
1      40     0.067111  12062.219180         108.047668
2      60     0.064556  19304.842715         124.650166
3      80     0.064278  21464.824920         149.749834
4     100     0.062778  25223.121325          40.142467
5     120     0.065056  29111.428207         175.236450
6     140     0.068111  31685.369305         266.962607
7     170     0.073944  39697.124073          56.431488
8     200     0.079944  53890.372960          68.336735
9     300     0.091167  73854.381379          81.231392


调整m个数重新运行

调整max_depth

In [15]:
n= [20,40,60,80,100,120,140,170,200,300]

# 基分类器2

In [16]:
start_time_base2=time.time()
classifier2=DecisionTreeClassifier(criterion='gini',splitter='best',max_depth=10,min_samples_split=5,random_state=42)#第一次测试最优
classifier2.fit(X_train,y_train)
score2=classifier2.score(X_test,y_test)
print(score2)
end_time_base2=time.time()

0.08594444444444445


In [17]:
base_time2=end_time_base2-start_time_base2
print('基分类器所用时间：',base_time2)

基分类器所用时间： 189.77275609970093


In [18]:
import joblib

#保存Model
joblib.dump(classifier2,'model_cifar100/base2_learner_cifar100.pkl')

['model_cifar100/base2_learner_cifar100.pkl']

In [19]:
print(os.path.getsize('model_cifar100/base2_learner_cifar100.pkl')/1024/1024)

1.2007503509521484


# Bagging

In [20]:
time_bagging2,top1_bagging2=bagging_clf(classifier2,'model_cifar100/model2_bagging')

100%|██████████████████████████████████████████████████████████████████████████████| 10/10 [4:55:47<00:00, 1774.78s/it]


In [21]:
pkl_size_bagging2=get_dir_size('model_cifar100/model2_bagging')

['bagging_cifar100_100.pkl', 'bagging_cifar100_120.pkl', 'bagging_cifar100_140.pkl', 'bagging_cifar100_170.pkl', 'bagging_cifar100_20.pkl', 'bagging_cifar100_200.pkl', 'bagging_cifar100_300.pkl', 'bagging_cifar100_40.pkl', 'bagging_cifar100_60.pkl', 'bagging_cifar100_80.pkl']


In [22]:
#保存结果
import pandas as pd
n= [20,40,60,80,100,120,140,170,200,300]
c2={"基分类器个数" : n,
   "performance" : top1_bagging2,
  "时间":time_bagging2,
  "pkl_size_bagging":pkl_size_bagging2}#将列表a，b转换成字典
bagging_result2=pd.DataFrame(c2)#将字典转换成为数据框
print(bagging_result2)
bagging_result2.to_csv('model_cifar100/bagging_cifar100_result2.csv')

   基分类器个数  performance           时间  pkl_size_bagging
0      20     0.144778   300.192063         97.556987
1      40     0.154611   613.908269        116.610899
2      60     0.160444   872.239505        135.898277
3      80     0.162444  1092.600025        164.715873
4     100     0.164722  1382.051738         20.041725
5     120     0.166556  1559.851910        193.399596
6     140     0.167611  1866.024740        289.172002
7     170     0.168611  2238.049685         39.293176
8     200     0.168889  2541.762230         58.706294
9     300     0.170389  5272.253625         78.165116


# Boosting

In [23]:
time_boosting2,top1_boosting2=boosting_clf(classifier2,'model_cifar100/model2_boosting')

100%|████████████████████████████████████████████████████████████████████████████| 10/10 [85:01:26<00:00, 30608.62s/it]


In [24]:
pkl_size_boosting2=get_dir_size('model_cifar100/model2_boosting')

['boosting_cifar100_100.pkl', 'boosting_cifar100_120.pkl', 'boosting_cifar100_140.pkl', 'boosting_cifar100_170.pkl', 'boosting_cifar100_20.pkl', 'boosting_cifar100_200.pkl', 'boosting_cifar100_300.pkl', 'boosting_cifar100_40.pkl', 'boosting_cifar100_60.pkl', 'boosting_cifar100_80.pkl']


In [25]:
#保存结果
import pandas as pd
cb2={"基分类器个数" : n,
   "performance" : top1_boosting2,
  "时间":time_boosting2,
  "pkl_size_boosting":pkl_size_boosting2}#将列表a，b转换成字典
boost_result2=pd.DataFrame(cb2)#将字典转换成为数据框
print(boost_result2)
boost_result2.to_csv('model_cifar100/boosting_cifar100_result2.csv')

   基分类器个数  performance            时间  pkl_size_boosting
0      20     0.068222   3434.388138          24.476971
1      40     0.060000   7307.279373          27.573801
2      60     0.062667  13853.059575          30.373479
3      80     0.063889  19792.524408          34.027673
4     100     0.064500  25413.495997           7.808658
5     120     0.062000  30840.283103          37.921875
6     140     0.058111  32464.849141          50.868814
7     170     0.056056  46044.769627          12.541475
8     200     0.054889  45727.210762          16.810600
9     300     0.054556  81204.494751          20.818486


# 基分类器3

In [26]:
start_time_base3=time.time()
classifier3=DecisionTreeClassifier(criterion='gini',splitter='best',max_depth=8,min_samples_split=5,random_state=42)#第一次测试最优
classifier3.fit(X_train,y_train)
score3=classifier3.score(X_test,y_test)
print(score3)
end_time_base3=time.time()

0.08172222222222222


In [27]:
base_time3=end_time_base3-start_time_base3
print('基分类器所用时间：',base_time3)

基分类器所用时间： 190.1080286502838


In [28]:
import joblib

#保存Model
joblib.dump(classifier3,'model_cifar100/base3_learner_cifar100.pkl')

['model_cifar100/base3_learner_cifar100.pkl']

In [29]:
print(os.path.getsize('model_cifar100/base3_learner_cifar100.pkl')/1024/1024)

0.39746665954589844


# Bagging

In [30]:
time_bagging3,top1_bagging3=bagging_clf(classifier3,'model_cifar100/model3_bagging')

100%|██████████████████████████████████████████████████████████████████████████████| 10/10 [3:39:02<00:00, 1314.30s/it]


In [31]:
pkl_size_bagging3=get_dir_size('model_cifar100/model3_bagging')

['bagging_cifar100_100.pkl', 'bagging_cifar100_120.pkl', 'bagging_cifar100_140.pkl', 'bagging_cifar100_170.pkl', 'bagging_cifar100_20.pkl', 'bagging_cifar100_200.pkl', 'bagging_cifar100_300.pkl', 'bagging_cifar100_40.pkl', 'bagging_cifar100_60.pkl', 'bagging_cifar100_80.pkl']


In [32]:
#保存结果
import pandas as pd
c3={"基分类器个数" : n,
   "performance" : top1_bagging3,
  "时间":time_bagging3,
  "pkl_size_bagging":pkl_size_bagging3}#将列表a，b转换成字典
bagging_result3=pd.DataFrame(c3)#将字典转换成为数据框
print(bagging_result3)
bagging_result3.to_csv('model_cifar100/bagging_cifar100_result3.csv')

   基分类器个数  performance           时间  pkl_size_bagging
0      20     0.131556   274.702234         38.491145
1      40     0.137500   483.918506         46.073774
2      60     0.140889   669.602207         53.689049
3      80     0.143111   966.434433         65.123389
4     100     0.145556  1065.712024          7.897774
5     120     0.147056  1274.405816         76.409163
6     140     0.147556  1485.410996        114.256978
7     170     0.147667  1741.001485         15.552224
8     200     0.148389  2015.369994         23.206706
9     300     0.150278  3160.239135         30.831788


# Boosting

In [33]:
time_boosting3,top1_boosting3=boosting_clf(classifier3,'model_cifar100/model3_boosting')

100%|████████████████████████████████████████████████████████████████████████████| 10/10 [45:39:30<00:00, 16437.02s/it]


In [34]:
pkl_size_boosting3=get_dir_size('model_cifar100/model3_boosting')

['boosting_cifar100_100.pkl', 'boosting_cifar100_120.pkl', 'boosting_cifar100_140.pkl', 'boosting_cifar100_170.pkl', 'boosting_cifar100_20.pkl', 'boosting_cifar100_200.pkl', 'boosting_cifar100_300.pkl', 'boosting_cifar100_40.pkl', 'boosting_cifar100_60.pkl', 'boosting_cifar100_80.pkl']


In [35]:
#保存结果
import pandas as pd
cb3={"基分类器个数" : n,
   "performance" : top1_boosting3,
  "时间":time_boosting3,
  "pkl_size_boosting":pkl_size_boosting3}#将列表a，b转换成字典
boost_result3=pd.DataFrame(cb3)#将字典转换成为数据框
print(boost_result3)
boost_result3.to_csv('model_cifar100/boosting_cifar100_result3.csv')

   基分类器个数  performance            时间  pkl_size_boosting
0      20     0.068278   3319.988853          14.403131
1      40     0.064167   6386.744726          16.337480
2      60     0.063333   9344.196559          18.229379
3      80     0.061944  11856.188519          21.080289
4     100     0.062556  12887.429691           3.893426
5     120     0.061444  15784.988125          23.640580
6     140     0.059833  18418.236914          31.305752
7     170     0.057667  22376.350152           6.970658
8     200     0.056778  25695.599679           9.739312
9     300     0.055278  38298.425575          12.202653
