In [1]:
from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_blobs
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV

In [3]:
# Synthetic data set: 20000 samples, 100 features, generated around 200 blob centers.
X, y = make_blobs(
    n_samples=20000,
    n_features=100,
    centers=200,
    random_state=0,
)
# 80% of the rows go to the training split; the fixed seed keeps the split repeatable.
# NOTE(review): the car-evaluation loading cell further down rebinds
# x_train/x_test/y_train/y_test, so whichever of the two cells ran last decides
# which data the model cells see.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=1
)



In [2]:
# Load the carEvaluation data set (the file is read without a header row).
# NOTE(review): absolute, machine-specific path; point this at a configurable
# data directory before sharing the notebook.
path = u'C:/Users/jxjsj/Desktop/JupyterHome/Data/carEvaluation.txt'
dataSet = pd.read_csv(path, header = None)
# Columns 0-5 are the attributes and column 6 is the class label (assumes the
# file has exactly seven columns; the single-label reports printed by the
# model cells are consistent with that).
# Positional slicing replaces np.split on a DataFrame, which goes through the
# deprecated DataFrame.swapaxes. Taking the label as a 1-D Series instead of an
# (n, 1) frame also stops scikit-learn's column_or_1d DataConversionWarning.
x_temp = dataSet.iloc[:, :6]
y_temp = dataSet.iloc[:, 6]
# 80/20 split with a fixed seed so the partition is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x_temp, y_temp, random_state=1, train_size=0.8)



In [3]:
# One-hot encode the categorical attributes.
# to_dict needs the full orient name 'records'; the abbreviated 'record' was
# deprecated and is rejected by pandas >= 2.0.
# NOTE: x_train/x_test are rebound from DataFrames to arrays here, so this cell
# cannot be re-run without first re-running the loading cell.
vec = DictVectorizer(sparse=False)
x_train = vec.fit_transform(x_train.to_dict(orient='records'))   # learn the encoding from the training rows
x_test = vec.transform(x_test.to_dict(orient='records'))         # reuse that encoding for the test rows
vec.feature_names_

['0=high',
 '0=low',
 '0=med',
 '0=vhigh',
 '1=high',
 '1=low',
 '1=med',
 '1=vhigh',
 '2=2',
 '2=3',
 '2=4',
 '2=5more',
 '3=2',
 '3=4',
 '3=more',
 '4=big',
 '4=med',
 '4=small',
 '5=high',
 '5=low',
 '5=med']

In [7]:
# Train a single decision tree (entropy criterion, balanced class weights) and
# report accuracy plus per-class metrics on both splits.
# NOTE(review): no random_state is passed, so repeated runs are not guaranteed
# to reproduce the same tree.
blobEvaluation = DecisionTreeClassifier(
    criterion='entropy',
    class_weight="balanced",
)
blobEvaluation.fit(x_train, y_train)  # fit() returns the estimator itself, so no rebinding is needed

# Training split
y_train_predict = blobEvaluation.predict(x_train)
print('trainAccracy:', blobEvaluation.score(x_train, y_train))
print(classification_report(y_train, y_train_predict))

# Held-out split
y_test_predict = blobEvaluation.predict(x_test)
print('testAccracy:', blobEvaluation.score(x_test, y_test))
print(classification_report(y_test, y_test_predict))

trainAccracy: 1.0
              precision    recall  f1-score   support

         acc       1.00      1.00      1.00       325
        good       1.00      1.00      1.00        55
       unacc       1.00      1.00      1.00       950
       vgood       1.00      1.00      1.00        52

   micro avg       1.00      1.00      1.00      1382
   macro avg       1.00      1.00      1.00      1382
weighted avg       1.00      1.00      1.00      1382

testAccracy: 0.9739884393063584
              precision    recall  f1-score   support

         acc       0.90      0.97      0.93        59
        good       1.00      0.86      0.92        14
       unacc       0.99      0.98      0.98       260
       vgood       1.00      1.00      1.00        13

   micro avg       0.97      0.97      0.97       346
   macro avg       0.97      0.95      0.96       346
weighted avg       0.98      0.97      0.97       346



In [22]:
# Train a random forest of 26 unbounded-depth trees. max_features='log2' makes
# each split consider log2(d) candidate features, where d is the feature count.
# NOTE(review): no random_state is passed, so the scores can differ between runs.
blobEvaluation = RandomForestClassifier(
    n_estimators=26,
    criterion='entropy',
    max_depth=None,
    max_features='log2',
    class_weight="balanced",
)
blobEvaluation.fit(x_train, y_train)

# Training split
y_train_predict = blobEvaluation.predict(x_train)
print('trainAccracy:', blobEvaluation.score(x_train, y_train))
print(classification_report(y_train, y_train_predict))

# Held-out split
y_test_predict = blobEvaluation.predict(x_test)
print('testAccracy:', blobEvaluation.score(x_test, y_test))
print(classification_report(y_test, y_test_predict))

trainAccracy: 1.0
              precision    recall  f1-score   support

         acc       1.00      1.00      1.00       325
        good       1.00      1.00      1.00        55
       unacc       1.00      1.00      1.00       950
       vgood       1.00      1.00      1.00        52

   micro avg       1.00      1.00      1.00      1382
   macro avg       1.00      1.00      1.00      1382
weighted avg       1.00      1.00      1.00      1382

testAccracy: 0.9508670520231214
              precision    recall  f1-score   support

         acc       0.78      0.98      0.87        59
        good       1.00      0.79      0.88        14
       unacc       1.00      0.96      0.98       260
       vgood       0.92      0.85      0.88        13

   micro avg       0.95      0.95      0.95       346
   macro avg       0.93      0.89      0.90       346
weighted avg       0.96      0.95      0.95       346



  import sys


In [51]:
# Train a bagging ensemble whose base learner is a decision tree.
# The base learner is passed positionally: the `base_estimator` keyword was
# renamed to `estimator` in newer scikit-learn releases and later removed,
# while the first positional slot works on both old and new versions (and
# matches how the MLP bagging cell builds its ensemble).
blobEvaluation = BaggingClassifier( DecisionTreeClassifier(class_weight = "balanced",
                                                           criterion='entropy'),
                                    n_estimators=46,
                                    max_samples=0.9,       # each tree is fitted on 90% of the rows...
                                    max_features=0.9,      # ...and 90% of the encoded columns
                                    bootstrap=True,
                                    bootstrap_features=False,
                                    oob_score=False,
                                    warm_start=False,
                                    n_jobs=None,
                                    random_state=1,        # fixed seed for the resampling
                                    verbose=0)
blobEvaluation.fit(x_train, y_train)
y_train_predict = blobEvaluation.predict(x_train)
y_test_predict = blobEvaluation.predict(x_test)
# Accuracy and per-class metrics on the training split, then on the held-out split.
print('trainAccracy:',blobEvaluation.score(x_train,y_train))
print(classification_report(y_train,y_train_predict))
print('testAccracy:',blobEvaluation.score(x_test,y_test))
print(classification_report(y_test,y_test_predict))

  y = column_or_1d(y, warn=True)


trainAccracy: 0.9992764109985528
              precision    recall  f1-score   support

         acc       1.00      1.00      1.00       325
        good       1.00      1.00      1.00        55
       unacc       1.00      1.00      1.00       950
       vgood       1.00      1.00      1.00        52

   micro avg       1.00      1.00      1.00      1382
   macro avg       1.00      1.00      1.00      1382
weighted avg       1.00      1.00      1.00      1382

testAccracy: 0.9710982658959537
              precision    recall  f1-score   support

         acc       0.87      0.98      0.92        59
        good       0.93      1.00      0.97        14
       unacc       1.00      0.97      0.98       260
       vgood       1.00      1.00      1.00        13

   micro avg       0.97      0.97      0.97       346
   macro avg       0.95      0.99      0.97       346
weighted avg       0.97      0.97      0.97       346



In [5]:
# Train a bagging ensemble whose base learner is a neural network (MLP).
# NOTE(review): len(x_train) is the number of training rows, so the hidden
# layer gets 2 * rows + 1 units. If the "2n + 1" rule of thumb over the input
# features was intended, x_train.shape[1] would be the value to use (confirm).
mlp_base_learner = MLPClassifier(
    activation='tanh',
    solver='lbfgs',
    alpha=1e-5,
    hidden_layer_sizes=(len(x_train)*2+1,),
    random_state=1,
)
# The networks are seeded, but the ensemble's resampling is not (random_state=None).
blobEvaluation = BaggingClassifier(
    mlp_base_learner,
    n_estimators=15,
    max_samples=0.9,
    max_features=0.9,
    bootstrap=True,
    bootstrap_features=False,
    oob_score=False,
    warm_start=False,
    n_jobs=None,
    random_state=None,
    verbose=0,
)
blobEvaluation.fit(x_train, y_train)

# Training split
y_train_predict = blobEvaluation.predict(x_train)
print('trainAccracy:', blobEvaluation.score(x_train, y_train))
print(classification_report(y_train, y_train_predict))

# Held-out split
y_test_predict = blobEvaluation.predict(x_test)
print('testAccracy:', blobEvaluation.score(x_test, y_test))
print(classification_report(y_test, y_test_predict))

  y = column_or_1d(y, warn=True)


trainAccracy: 1.0
              precision    recall  f1-score   support

         acc       1.00      1.00      1.00       325
        good       1.00      1.00      1.00        55
       unacc       1.00      1.00      1.00       950
       vgood       1.00      1.00      1.00        52

   micro avg       1.00      1.00      1.00      1382
   macro avg       1.00      1.00      1.00      1382
weighted avg       1.00      1.00      1.00      1382

testAccracy: 0.9739884393063584
              precision    recall  f1-score   support

         acc       0.92      0.93      0.92        59
        good       0.87      0.93      0.90        14
       unacc       0.99      0.98      0.99       260
       vgood       1.00      1.00      1.00        13

   micro avg       0.97      0.97      0.97       346
   macro avg       0.94      0.96      0.95       346
weighted avg       0.97      0.97      0.97       346



In [59]:
# Train a single MLP (tanh activation, L-BFGS solver) as a baseline for the
# bagged version.
# NOTE(review): no random_state is passed, so the weight initialisation (and
# therefore the scores) can differ between runs.
# NOTE(review): len(x_train) counts training rows, not input features; confirm
# that a hidden layer of 2 * rows + 1 units is intended.
blobEvaluation = MLPClassifier(
    activation='tanh',
    solver='lbfgs',
    alpha=1e-5,
    hidden_layer_sizes=(len(x_train)*2+1,),
)
blobEvaluation.fit(x_train, y_train)

# Training split
y_train_predict = blobEvaluation.predict(x_train)
print('trainAccracy:', blobEvaluation.score(x_train, y_train))
print(classification_report(y_train, y_train_predict))

# Held-out split
y_test_predict = blobEvaluation.predict(x_test)
print('testAccracy:', blobEvaluation.score(x_test, y_test))
print(classification_report(y_test, y_test_predict))

  y = column_or_1d(y, warn=True)


trainAccracy: 1.0
              precision    recall  f1-score   support

         acc       1.00      1.00      1.00       325
        good       1.00      1.00      1.00        55
       unacc       1.00      1.00      1.00       950
       vgood       1.00      1.00      1.00        52

   micro avg       1.00      1.00      1.00      1382
   macro avg       1.00      1.00      1.00      1382
weighted avg       1.00      1.00      1.00      1382

testAccracy: 0.9653179190751445
              precision    recall  f1-score   support

         acc       0.87      0.93      0.90        59
        good       0.87      0.93      0.90        14
       unacc       0.99      0.98      0.98       260
       vgood       1.00      0.92      0.96        13

   micro avg       0.97      0.97      0.97       346
   macro avg       0.93      0.94      0.94       346
weighted avg       0.97      0.97      0.97       346



In [None]:
# Search n_estimators over 7..24 for a bagged MLP and keep the value with the
# highest accuracy on (x_test, y_test). The cell's output is the winning n.
# NOTE(review): selecting a hyper-parameter on the test split leaks that split
# into model selection; cross-validating on the training data would avoid it.
# NOTE(review): the ensemble's resampling is unseeded (random_state=None), so
# the winning n can change between runs.
max_n = 0
max_p = 0
for n in range(7,25):
    blobEvaluation = BaggingClassifier( MLPClassifier(activation='tanh',
                                                  solver='adam',
                                                  alpha=1e-5,
                                                  hidden_layer_sizes=(len(x_train)*2+1,),
                                                  random_state=1),
                                    n_estimators=n,
                                    max_samples=0.9,
                                    max_features=0.9,
                                    bootstrap=True,
                                    bootstrap_features=False,
                                    oob_score=False,
                                    warm_start=False,
                                    n_jobs=None,
                                    random_state=None,
                                    verbose=0)
    blobEvaluation.fit(x_train, y_train)
    # Score each candidate once and reuse the value for both the comparison
    # and the update; scoring is a full prediction pass over the test split.
    test_score = blobEvaluation.score(x_test, y_test)
    # Strict '<' keeps the smallest n when several candidates tie.
    if max_p < test_score:
        max_n = n
        max_p = test_score
# The per-iteration predictions were never read inside the loop, so they are
# computed once here from the last fitted ensemble (n = 24). This leaves
# y_train_predict / y_test_predict with the same final values as before.
y_train_predict = blobEvaluation.predict(x_train)
y_test_predict = blobEvaluation.predict(x_test)
max_n