# Pre-processing

## KNN Imputation

In [None]:
import pandas as pd
from sklearn.impute import KNNImputer

# Load the table and pull the class-label column ('Type') out of it so that
# only the remaining columns take part in the imputation.
data = pd.read_csv(r'xxx.csv')
labels = data.pop('Type')

# K-nearest-neighbours imputer; n_neighbors is the K value and can be tuned.
imputer = KNNImputer(n_neighbors=5)

# fit_transform returns a plain array, so wrap it back into a DataFrame that
# carries the original feature column names.
data_filled = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)

# Re-attach the label column (it ends up as the last column).
data_filled['Type'] = labels

# Write the imputed table to CSV without the index column.
data_filled.to_csv(r'xxx.csv', index=False)

## CLR Transformation

In [None]:
import pandas as pd
import numpy as np

# Load the table and separate the class-label column ('Type') from the
# columns that are transformed.
data = pd.read_csv(r'xxx.csv')
labels = data['Type']
data = data.drop(columns=['Type'])

# Centred log-ratio (CLR) transform, computed row by row:
#   clr(x)_j = log(x_j) - mean_k(log(x_k))
# log1p (= log(1 + x)) is kept from the previous implementation so that zero
# entries do not produce -inf.
# NOTE(review): a strict CLR uses log(x) on strictly positive data; confirm
# whether the +1 offset is acceptable for this dataset.
log_data = np.log1p(data)

# Centring step: subtract each row's mean log value. Without it the result is
# only a log transform, not a centred log-ratio.
data_transformed = log_data.sub(log_data.mean(axis=1), axis=0)

# Re-attach the label column (it ends up as the last column).
data_transformed['Type'] = labels

# Write the transformed table to CSV without the index column.
data_transformed.to_csv(r'xxx.csv', index=False)

# EDA

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the data.
file_path = r'xxx.csv'
data = pd.read_csv(file_path)

# Use Palatino Linotype for all figure text.
plt.rcParams['font.family'] = 'Palatino Linotype'

# Scatter-plot matrix coloured by the 'Type' column.
pairplot = sns.pairplot(data, hue='Type', markers='o', palette='tab10')

# Replacement legend entries.
# NOTE(review): applied positionally, so this assumes the hue levels appear
# in the legend in exactly this order - confirm against the 'Type' encoding.
new_labels = ['Skarn', 'VMS', 'Epithermal', 'Orogenic', 'Carlin', 'Porphyry', 'Magmatic Sulfide']

# pairplot draws its own figure-level legend. Relabel that legend directly
# instead of calling plt.legend(), which would act on the last subplot only.
current_legend = pairplot.legend
if current_legend is not None:
    current_legend.set_title('Type')
    # zip stops at the shorter sequence, so a mismatch between the number of
    # legend entries and new_labels cannot raise IndexError.
    for text, new_label in zip(current_legend.get_texts(), new_labels):
        text.set_text(new_label)

plt.show()

# Correlation Analysis

# PCA

# Silhouette Coefficient

# Hyperparameters Tuning

## SVM 

## RF

# Evaluation

# 决策边界（二维判别图的横纵两轴）

In [None]:
# Fit an RBF SVM and then compare four classifiers (logistic regression,
# linear SVM, two RBF SVMs) on the standardised full sample set.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import pandas as pd

df_wine = pd.read_csv(r'xxx.csv')

# Select the features by dropping the label column by name. Slicing by
# position (columns[1:]) would leak 'Type' into the features whenever the
# label is not the first column of the file.
mydata_data = df_wine.drop(columns=['Type']).values
mydata_target = df_wine['Type'].values

# Standardise the features: fit the scaler, then apply it to the same data.
stdScale1 = StandardScaler().fit(mydata_data)
mydata_trainScaler = stdScale1.transform(mydata_data)
x1 = mydata_trainScaler
y1 = mydata_target

# Reference RBF SVM; report its support vectors and its score on the data it
# was fitted on.
gamma = 50
svc = svm.SVC(kernel='rbf', C=10, gamma=gamma)
svc.fit(x1, y1)
print('SV number:', svc.support_)  # support_ holds the support-vector indices
print('SV set:', svc.support_vectors_)
print('SVC score:', svc.score(x1, y1))
print(40 * '*')

# Candidate classifiers to compare.
logi = LogisticRegression(C=1.0, penalty='l2', solver='sag', max_iter=1000)
svc_linear = svm.SVC(C=1.0, kernel="linear")
svc_rbf1 = svm.SVC(C=1.0, kernel="rbf", gamma=0.5)
svc_rbf2 = svm.SVC(C=1.0, kernel="rbf", gamma=50)
clfs = [logi, svc_linear, svc_rbf1, svc_rbf2]
titles = ["逻辑斯蒂回归", '线性回归函数SVM', 'RBF核函数(gamma=0.5)', 'RBF核函数(gamma=50)']
clr1 = [logi]  # not used in this cell; kept in case another cell reads it

# Fit each classifier on the full sample set and print its score on that same
# set (a resubstitution score, not a held-out estimate).
for title, clf in zip(titles, clfs):
    clf.fit(x1, y1)
    print(title, '在全体样本集上的性能评分：', clf.score(x1, y1))
print(40 * '*')

# SHAP

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# 1. Load the CSV data (replace the path with your own data file).
data = pd.read_csv(r'E:\LW\DATA_NEW_CLR_COPY.csv')

# 2. Split into features (X) and label (y): the first column is the label,
#    every other column is a feature.
X = data.iloc[:, 1:]
y = data.iloc[:, 0]

# 3. Hold out 20% of the samples as the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. Create the random-forest classifier.
rf_classifier = RandomForestClassifier(n_estimators=175, max_depth=20, random_state=42)

# 5. Train it.
rf_classifier.fit(X_train, y_train)

# 6. Predict on the test set; class probabilities are computed once and
#    reused for every ROC curve below.
y_pred = rf_classifier.predict(X_test)
y_proba = rf_classifier.predict_proba(X_test)

# Column i of predict_proba corresponds to classes_[i], so classes_ (not
# range(n)) is the authoritative list of labels and their order.
classes = rf_classifier.classes_

# 7. Confusion matrix, with rows/columns in classes_ order and labelled ticks.
confusion = confusion_matrix(y_test, y_pred, labels=classes)
plt.figure(figsize=(8, 6))
sns.heatmap(confusion, annot=True, fmt="d", cmap="Blues", cbar=False,
            xticklabels=classes, yticklabels=classes)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()

# 8. One-vs-rest ROC curve for each class.
plt.figure(figsize=(8, 6))

fpr = dict()
tpr = dict()
roc_auc = dict()
for i, cls in enumerate(classes):
    # Binary target for this class: 1 where the true label equals cls.
    # Comparing against the actual label keeps this correct when the labels
    # are not the integers 0..n-1.
    y_one_hot = (y_test == cls).astype(int)
    fpr[i], tpr[i], _ = roc_curve(y_one_hot, y_proba[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
    plt.plot(fpr[i], tpr[i], label=f'Class {cls} (AUC = {roc_auc[i]:.4f})')

plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.show()

# Test-set accuracy of the random forest. The *_svm names are kept as they
# were in case other cells read them; they hold random-forest results.
y_test_pred_svm = y_pred
test_accuracy_svm = accuracy_score(y_test, y_test_pred_svm)
print("Test Accuracy (RF):", test_accuracy_svm)

In [None]:
import shap
import matplotlib.pyplot as plt

# Requires rf_classifier and X_test to have been defined by an earlier cell.

# Compute SHAP values for the test set with the tree explainer.
explainer = shap.TreeExplainer(rf_classifier)
shap_values = explainer.shap_values(X_test)

# Multi-class output may arrive either as a list with one
# (n_samples, n_features) array per class, or as a single 3-D array with the
# class on the last axis (reported for recent SHAP releases - verify against
# the installed version). Normalise to the list form for the bar plot.
if not isinstance(shap_values, list) and getattr(shap_values, "ndim", 0) == 3:
    shap_values = [shap_values[:, :, k] for k in range(shap_values.shape[2])]

# Bar summary of mean |SHAP| per feature, stacked by class. Class names come
# from the fitted model so they follow the order of the model's outputs.
shap.summary_plot(
    shap_values,
    X_test,
    feature_names=list(X_test.columns),
    class_names=[str(c) for c in rf_classifier.classes_],
    plot_type="bar",
)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import shap
import matplotlib.pyplot as plt

# Use Palatino Linotype for figure text and render minus signs correctly.
plt.rcParams['font.sans-serif'] = ['Palatino Linotype']
plt.rcParams['axes.unicode_minus'] = False

# 1. Load the CSV data (replace the path with your own data file).
data = pd.read_csv(r'E:\LW\DATA_NEW_CLR_COPY.csv')

# 2. Split into features (X) and label (y): the first column is the label,
#    every other column is a feature.
X = data.iloc[:, 1:]
y = data.iloc[:, 0]

# 3. Hold out 20% of the samples as the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. Create the random-forest classifier.
rf_classifier = RandomForestClassifier(n_estimators=151, max_depth=20, random_state=42)

# 5. Train it.
rf_classifier.fit(X_train, y_train)

# 6. Compute SHAP values for the test set.
explainer = shap.TreeExplainer(rf_classifier)
shap_values = explainer.shap_values(X_test)

# Multi-class output may arrive either as a list with one
# (n_samples, n_features) array per class, or as a single 3-D array with the
# class on the last axis (reported for recent SHAP releases - verify against
# the installed version). Normalise to the list form so the loop below
# iterates over classes rather than over samples.
if not isinstance(shap_values, list) and getattr(shap_values, "ndim", 0) == 3:
    shap_values = [shap_values[:, :, k] for k in range(shap_values.shape[2])]

# One dot (beeswarm) summary plot per model output. show=False stops
# summary_plot from displaying the figure itself, so the title can be added
# before each figure is shown and no empty figures are left behind.
for output_index, class_shap_values in enumerate(shap_values):
    plt.figure()
    shap.summary_plot(
        class_shap_values,
        X_test,
        plot_type="dot",
        feature_names=X_test.columns,
        show=False,
    )
    # NOTE(review): summary_plot's own title argument does not appear to be
    # rendered, so the title is set through matplotlib instead - confirm.
    plt.title(f"Output {output_index}")
    plt.show()
