In [24]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import normalize
import joblib
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
import sklearn.metrics as metrics
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt

In [25]:
# Load the feature tables: the 83-sample test set and the training set.
# Forward-slash relative paths resolve on Windows as well as on Linux/macOS,
# whereas the former backslash paths only worked on Windows.
test_path = "./input/83_sound_features.xlsx"
# Alternative test set kept by the author: "./input/44_sound_features.xlsx"
train_path = "./input/6747_sound_features.xls"
test_data = pd.read_excel(test_path)
train_data = pd.read_excel(train_path)
test_data

Unnamed: 0,index,score,Leq_mean,Leq_var,Leq_std,Leq_max,Leq_10,Leq_25,Leq_median,Leq_75,...,Tonality_10,Tonality_25,Tonality_median,Tonality_75,Tonality_90,Tonality_10-Tonality_90,pctn,pctu,pctm,level
0,1,,49.88000,1.93000,1.39000,54.25000,48.18000,48.84000,49.75000,50.67000,...,0.00,0.03000,0.06000,0.09000,0.12000,-0.12000,0.615385,0.384615,0.000000,1
1,2,,63.58000,11.19000,3.35000,68.46000,58.55000,62.75000,64.45000,65.59000,...,0.02,0.04000,0.07000,0.11000,0.18000,-0.16000,0.25,0.083333,0.666667,3
2,3,,70.97952,8.67463,2.94642,81.49453,67.78887,68.49439,70.49512,72.93534,...,0.00,0.00000,0.03255,0.05599,0.07266,0.07266,,,,4
3,4,,61.93000,7.06000,2.66000,70.05000,58.76000,60.00000,61.72000,63.38000,...,0.06,0.11000,0.18000,0.26000,0.33000,-0.27000,0,1.000000,0.000000,3
4,5,,59.44000,8.86000,2.98000,65.99000,55.56000,56.92000,59.31000,61.63000,...,0.00,0.00000,0.02000,0.05000,0.07000,-0.07000,0.8,0.200000,0.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78,79,,57.16000,26.06000,5.10000,64.71000,50.22000,51.23000,59.83000,61.54000,...,0.01,0.03000,0.06000,0.08000,0.11000,-0.10000,0,0.333333,0.666667,4
79,80,,47.11975,18.50206,4.30255,57.48161,41.91348,43.71102,46.42073,50.37455,...,0.00,0.00000,0.01631,0.04248,0.06278,0.06278,,,,2
80,81,,56.13524,2.47095,1.57224,60.88622,54.39929,55.00396,55.72117,57.32652,...,0.00,0.00000,0.00000,0.00000,0.01136,0.01136,,,,3
81,82,,53.80151,5.21296,2.28365,59.98065,50.61944,51.91705,54.02594,55.43391,...,0.00,0.03333,0.07310,0.12003,0.15806,0.15806,,,,3


In [26]:
# Sanity check: dimensions of the test table as (rows, columns).
test_data.shape

(83, 66)

In [27]:
# 36 feature columns followed by the 'level' label column (37 names in total).
# (The original note remarked that 'level' was not available yet at the time.)
# Every acoustic metric contributes the same six statistics, in this order:
# mean, std, 25th percentile, median, 75th percentile, and the 10-90 spread.
column_36 = [
    column
    for metric in ('Leq', 'Loudness', 'Roughness', 'Sharpness', 'Fluct', 'Tonality')
    for column in (
        f'{metric}_mean',
        f'{metric}_std',
        f'{metric}_25',
        f'{metric}_median',
        f'{metric}_75',
        f'{metric}_10-{metric}_90',
    )
] + ['level']
len(column_36)

37

In [28]:
feature_use = column_36
# The last entry of feature_use is the label column; everything before it is
# a model input. Split the test set first, then the training set the same way.
test_x, test_y = test_data[feature_use[:-1]], test_data[feature_use[-1]]
train_x, train_y = train_data[feature_use[:-1]], train_data[feature_use[-1]]

In [29]:
# L2-normalize the data: axis=1 (the library default, spelled out here) rescales
# each sample (row) to unit L2 norm. This is normalization, not regularization.
test_x = normalize(test_x, norm="l2", axis=1)
train_x = normalize(train_x, norm="l2", axis=1)

In [30]:
# Sanity check: the training features and labels must have the same number of rows.
len(train_x), len(train_y)

(6747, 6747)

In [31]:
# Candidate classifiers (rewritten)
def classifier(nb_class):
    """
    Build the list of candidate classifiers.

    The position of each estimator in the returned list matters, because
    callers select a model by index: 0 KNN, 1 decision tree, 2 SVC,
    3 extra trees, 4 random forest, 5 logistic regression.

    :param nb_class: number of classes. NOTE(review): its only use here is as
        ``n_neighbors`` of the KNN model -- confirm that this is intended.
    :return: list of unfitted scikit-learn classifiers
    """
    # The trailing numbers are scores noted by the original author
    # (presumably accuracy -- TODO confirm on which data set).
    clf = [
        KNeighborsClassifier(n_neighbors=nb_class),  # 0.68
        DecisionTreeClassifier(max_depth=40),  # 0.69
        # Class weights are keyed by the labels 1-4.
        SVC(C=10, kernel='rbf', class_weight={1: 8, 2: 4, 3: 1, 4: 10}),  # 0.42
        # max_features='sqrt' replaces the former 'auto': for classifiers the two
        # are equivalent, but 'auto' was deprecated and later removed from
        # scikit-learn, so 'sqrt' keeps this working on current releases.
        ExtraTreesClassifier(n_estimators=80, max_depth=24, min_samples_split=3, max_features='sqrt', random_state=0),
        # 0.825
        # Author's note (presumably tuned parameters for the random forest):
        # 'max_depth': 21, 'min_samples_split': 2, 'n_estimators': 95
        RandomForestClassifier(n_estimators=95, max_depth=21, min_samples_split=2, random_state=0),  # 0.758
        # Author's note for a disabled XGBoost candidate: 'max_depth': 11, 'n_estimators': 110
        # XGBClassifier(n_estimators=110, max_depth=11, learning_rate=0.001, objective='multi:softmax', nb_class=4),
        LogisticRegression(penalty='l2', class_weight='balanced', solver='lbfgs')]  # 0.45

    return clf

In [32]:
# Original author's note: "I don't really understand what this is for either."
# NOTE(review): classifier() uses nb_class as the KNN neighbour count.
nb_class = 4
clf_list = classifier(nb_class)

# Index into clf_list:
# 0: KNN          1: decision tree   2: SVC
# 3: extra trees  4: random forest   5: LogisticRegression
# fit() returns the estimator itself, so clf is the trained model;
# ending the cell with clf displays its repr.
clf = clf_list[0].fit(train_x, train_y)
clf

KNeighborsClassifier(n_neighbors=4)

In [33]:
# Save (dump) the fitted model to disk, then predict labels for the test set.
# (The original comment said "import model", but this call writes the model out.)
# NOTE(review): assumes the ./model directory already exists -- confirm.
joblib.dump(clf, r'./model/model_0516.pkl')
pre = clf.predict(test_x)

In [34]:
# Compute the accuracy; near-misses can optionally count as correct.
def accuracy(test_y, pre, tolerance=0):
    """
    Fraction of predictions that match the true labels.

    Labels are compared by position, so a pandas Series whose index is not
    0..n-1 (filtered, shuffled, re-indexed) is handled correctly.

    :param test_y: true labels (list, NumPy array or pandas Series)
    :param pre: predicted labels, same length as ``test_y``
    :param tolerance: 0 (default) requires an exact match; a positive value
        also counts predictions with ``abs(true - predicted) <= tolerance``
        as correct, e.g. ``tolerance=1`` accepts an error of +/-1 level
    :return: accuracy as a float between 0 and 1
    :raises ValueError: if the inputs are empty or differ in length
    """
    total = len(test_y)
    if total != len(pre):
        raise ValueError(
            "test_y and pre must have the same length, got %d and %d" % (total, len(pre)))
    if total == 0:
        raise ValueError("cannot compute the accuracy of empty inputs")

    # zip() walks the values in order and never looks anything up by index label.
    pairs = zip(test_y, pre)
    if tolerance:
        same = sum(1 for truth, guess in pairs if abs(truth - guess) <= tolerance)
    else:
        # Plain equality keeps non-numeric labels working in the default mode.
        same = sum(1 for truth, guess in pairs if truth == guess)
    return same / total

In [35]:
# Exact-match accuracy of the trained classifier on the test set.
accuracy(test_y, pre)

0.3855421686746988

In [36]:
# Create an empty per-sample analysis table; its columns are filled in by the cells below.
analysis_columns = ["index", "true_level", "predict_level", "difference"]
analysis_83_sounds = pd.DataFrame(columns=analysis_columns)

In [40]:
# Copy the sample identifier and the true level from the test table.
analysis_83_sounds['index'] = test_data['index']
analysis_83_sounds['true_level'] = test_data['level']

In [43]:
# Attach the predictions; pre comes from clf.predict(test_x), so this assumes
# the table rows are in the same order as the test set.
analysis_83_sounds['predict_level'] = pre

In [47]:
# Absolute gap between the true and the predicted level for every sample.
difference = (analysis_83_sounds['true_level'] - analysis_83_sounds['predict_level']).abs()
analysis_83_sounds['difference'] = difference

In [48]:
# Display the per-sample comparison of true and predicted levels.
analysis_83_sounds

Unnamed: 0,index,true_level,predict_level,difference
0,1,1,3,2
1,2,3,3,0
2,3,4,1,3
3,4,3,3,0
4,5,1,3,2
...,...,...,...,...
78,79,4,3,1
79,80,2,2,0
80,81,3,1,2
81,82,3,2,1


In [50]:
# Export the analysis table without the DataFrame index column.
# The forward-slash path works on Windows and POSIX alike; a backslash path on
# POSIX would create a file literally named ".\output\analysis_83_sounds.csv".
# NOTE(review): assumes the ./output directory already exists -- confirm.
analysis_83_sounds.to_csv("./output/analysis_83_sounds.csv", index=False)