## 玻璃分类规律

### 导入第三方库

In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, log_loss
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier

### 处理数据

数据中 X 为 13 个化学成分的含量, y 为其对应的玻璃种类

In [2]:
# Load the first sheet of the merged spreadsheet.
excelpath = "merged_excel.xlsx"
dateGlass = pd.read_excel(excelpath, sheet_name=0)

# Feature matrix X: positional columns 3..15 (the chemical-composition columns).
date_x = np.asarray(dateGlass.iloc[:, 3:16])
# Target vector y: the last column (the glass type of each row).
date_y = np.asarray(dateGlass.iloc[:, -1])

# Hold out 20% of the rows for testing.
# random_state pins the shuffle so the split (and every metric derived from it)
# is identical after Restart & Run All; stratify keeps the class ratio of
# date_y the same in the training and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    date_x,
    date_y,
    test_size=0.2,
    random_state=42,
    stratify=date_y,
)

In [3]:
# Labels of the feature columns (positional columns 3..15 of the sheet).
dateGlass.columns[3:16]

Index(['氧化钠(Na2O)', '氧化钾(K2O)', '氧化钙(CaO)', '氧化镁(MgO)', '氧化铝(Al2O3)',
       '氧化铁(Fe2O3)', '氧化铜(CuO)', '氧化铅(PbO)', '氧化钡(BaO)', '五氧化二磷(P2O5)',
       '氧化锶(SrO)', '氧化锡(SnO2)', '二氧化硫(SO2)'],
      dtype='object')

### 第一种方法: 逻辑回归

In [4]:
# L2-penalised logistic regression (lbfgs solver, up to 1000 iterations).
# fit() returns the estimator itself, so construction and training are chained.
logistic_regression = LogisticRegression(
    C=1,
    penalty='l2',
    solver='lbfgs',
    max_iter=1000,
).fit(X_train, y_train)

# Predict the labels of the test split and report the accuracy.
y_pred = logistic_regression.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 1.00


### 分析分类的统计规律

混淆矩阵

In [5]:
# Confusion matrix of the logistic-regression predictions on the test split
# (scikit-learn convention: rows are true classes, columns are predicted classes).
conf_matrix = confusion_matrix(y_test, y_pred)
# Show the matrix as the cell output instead of only storing it.
conf_matrix

精确率与召回率

In [6]:
# Precision and recall of the logistic-regression predictions on the test
# split, using scikit-learn's default binary averaging.
precision, recall = (
    precision_score(y_true=y_test, y_pred=y_pred),
    recall_score(y_true=y_test, y_pred=y_pred),
)

In [7]:
# Display the current precision and recall values as a (precision, recall) tuple.
precision, recall

(1.0, 1.0)

ROC曲线

In [8]:
# ROC curve and AUC for the logistic-regression model.
# roc_curve needs a continuous score per sample; feeding it hard predictions
# collapses the curve to a single operating point. Use the predicted
# probability of the positive class instead.
# NOTE(review): column 1 of predict_proba is classes_[1] (the larger label);
# this assumes that label is the positive class - confirm against date_y.
y_score = logistic_regression.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)

In [9]:
# Display the area under the ROC curve.
roc_auc

1.0

In [10]:
# Coefficients learned by the fitted logistic-regression model
# (coef_ is 2-D; it is flattened to one value per feature below).
coefficients = logistic_regression.coef_

# Pair every feature column name with its coefficient in one table.
feature_names = dateGlass.iloc[:, 3:16].columns
feature_coefficients = pd.DataFrame(
    {
        'Feature': feature_names,
        'Coefficient': coefficients.flatten(),
    }
)
feature_coefficients

Unnamed: 0,Feature,Coefficient
0,氧化钠(Na2O),-0.029853
1,氧化钾(K2O),0.129016
2,氧化钙(CaO),0.08172
3,氧化镁(MgO),-0.006992
4,氧化铝(Al2O3),-0.156933
5,氧化铁(Fe2O3),0.023518
6,氧化铜(CuO),0.047159
7,氧化铅(PbO),-0.603288
8,氧化钡(BaO),-0.223957
9,五氧化二磷(P2O5),0.004774


交叉验证

In [11]:
# Stratified 5-fold cross-validation of the logistic-regression model.
#
# cross_val_score fits a *clone* of the estimator on each fold, so neither the
# model fitted on the hold-out split nor the X_train / X_test / y_train /
# y_test / y_pred variables are overwritten here. A hand-written loop that
# rebinds those names makes every later cell silently run on the last fold.
skf = StratifiedKFold(n_splits=5)
fold_accuracies = cross_val_score(
    logistic_regression,
    date_x,
    date_y,
    cv=skf,
    scoring="accuracy",
).tolist()

# Print the accuracy of each fold and the mean accuracy.
print("Accuracies for each fold:", fold_accuracies)
print("Average accuracy:", sum(fold_accuracies) / len(fold_accuracies))


Accuracies for each fold: [1.0, 1.0, 1.0, 1.0, 1.0]
Average accuracy: 1.0


###  第二种方法: 决策树

In [12]:
# Decision-tree classifier. random_state pins the tie-breaking between equally
# good splits so the fitted tree is the same on every run.
# NOTE(review): this cell uses whatever X_train / X_test / y_train / y_test are
# currently bound in the kernel - verify no earlier cell has rebound them.
clf = DecisionTreeClassifier(random_state=42)
# Train on the training split.
clf.fit(X_train, y_train)
# Predict the labels of the test split.
y_pred_clf = clf.predict(X_test)
# Report the accuracy.
accuracy = accuracy_score(y_test, y_pred_clf)
print(f"Model accuracy: {accuracy:.2f}")

Model accuracy: 1.00


In [13]:
# Confusion matrix of the decision-tree predictions on the test split
# (rows are true classes, columns are predicted classes); shown as cell output.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred_clf)
conf_matrix

array([[10,  0],
       [ 0,  3]], dtype=int64)

评估模型

In [14]:
# Precision and recall of the decision-tree predictions on the test split,
# using scikit-learn's default binary averaging.
precision, recall = (
    precision_score(y_true=y_test, y_pred=y_pred_clf),
    recall_score(y_true=y_test, y_pred=y_pred_clf),
)

In [15]:
# ROC curve and AUC for the decision tree.
# roc_curve expects a continuous score per sample rather than hard labels, so
# pass the predicted probability of the positive class.
# NOTE(review): column 1 of predict_proba is classes_[1] (the larger label);
# this assumes that label is the positive class - confirm against date_y.
y_score_clf = clf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score_clf)
roc_auc = auc(fpr, tpr)