In [1]:
import cv2
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
def load_image():
    T = []
    for i in range(1, 8001):
        if i < 10:
            location = '000' + str(i)
        elif i < 100:
            location = '00' + str(i)
        elif i < 1000:
            location = '0' + str(i)
        else:
            location = str(i)
        image = cv2.imread('./Black_data/0' + location + '.jpg', cv2.IMREAD_GRAYSCALE)
        T.append(image)
    return T
data = load_image()
#data #8000*32*32

In [3]:
info = []
for i in range(0, 8000):
    tmp = []
    for j in range(0, 32):
        for k in range(0, 32):
            tmp.append(float(data[i][j][k]))
    info.append(tmp)

In [4]:
csv_target = pd.read_csv('annotations.csv')
target = csv_target['class'][:8000]

In [5]:
x1_train, x1_test, y1_train, y1_test = train_test_split(info, target, test_size=0.2)

In [11]:
# 决策树算法
dt = DecisionTreeClassifier()
dt.fit(x1_train, y1_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [12]:
s_train = dt.score(x1_train, y1_train)
print("训练集准确率：{}%".format(int(s_train*10000)/100))
s_test = dt.score(x1_test, y1_test)
print("测试集准确率：{}%".format(int(s_test*10000)/100))

训练集准确率：99.89%
测试集准确率：1.12%


In [13]:
# 随机森林算法
rf = RandomForestClassifier(n_estimators = 10, oob_score = True)
rf.fit(x1_train, y1_train)

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=True, random_state=None, verbose=0, warm_start=False)

In [14]:
s_train = rf.score(x1_train, y1_train)
print("训练集准确率：{}%".format(int(s_train*10000)/100))
s_test = rf.score(x1_test, y1_test)
print("测试集准确率：{}%".format(int(s_test*10000)/100))

训练集准确率：99.71%
测试集准确率：1.06%


In [15]:
#求最高准确率
s1_rf_ = []
s2_rf_ = []
for i in range(0, 10):
    rf = RandomForestClassifier(n_estimators = 20)
    rf.fit(x1_train, y1_train)
    s1_rf = rf.score(x1_train, y1_train)
    s1_rf_.append(s1_rf)
    s2_rf = rf.score(x1_test, y1_test)
    s2_rf_.append(s2_rf)
print("训练集最高准确率：{}%".format(int(max(s1_rf_)*10000)/100))
print("测试集最高准确率：{}%".format(int(max(s2_rf_)*10000)/100))

训练集最高准确率：99.89%
测试集最高准确率：1.5%


In [16]:
# SVM算法
# rbf核函数，设置数据权重
svc = SVC(kernel='rbf', class_weight='balanced')
# 训练模型
clf = svc.fit(x1_train, y1_train)



In [17]:
s_train = svc.score(x1_train, y1_train)
print("训练集准确率：{}%".format(int(s_train*10000)/100))
s_test = svc.score(x1_test, y1_test)
print("测试集准确率：{}%".format(int(s_test*10000)/100))

训练集准确率：99.89%
测试集准确率：0.56%


In [6]:
# 调参优化
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
svc = SVC(kernel='rbf', class_weight='balanced',)
c_range = np.logspace(-5, 15, 11, base=2)
gamma_range = np.logspace(-9, 3, 13, base=2)
# 网格搜索交叉验证的参数范围，cv=3,3折交叉
param_grid = [{'kernel': ['rbf'], 'C': c_range, 'gamma': gamma_range}]# 参数列表，表明参数范围
grid = GridSearchCV(svc, param_grid, cv=3, n_jobs=-1)# 寻找最优参数，传递给模型
# 训练模型
clf = grid.fit(x1_train, y1_train)
s_train = grid.score(x1_train, y1_train)
print("训练集准确率：{}%".format(int(s_train*10000)/100))
s_test = grid.score(x1_test, y1_test)
print("测试集准确率：{}%".format(int(s_test*10000)/100))

训练集准确率：99.89%
测试集准确率：0.75%
