# Pre-processing

## KNN Imputation

In [None]:
import pandas as pd
from sklearn.impute import KNNImputer

# Read the CSV file
data = pd.read_csv(r'xxx.csv')

# Set the label column ('Type') aside so it is not passed to the imputer
labels = data['Type']
data = data.drop(columns=['Type'])

# Initialise the KNN imputer; n_neighbors is the K value and can be adjusted
imputer = KNNImputer(n_neighbors=5)

# Use KNN to fill the missing values (fit_transform returns a plain array)
data_filled = imputer.fit_transform(data)

# Wrap the filled array back into a DataFrame; reusing the original index
# keeps the rows aligned with `labels` when the column is re-attached below
data_filled = pd.DataFrame(data_filled, columns=data.columns, index=data.index)

# Add the label column back into the data
data_filled['Type'] = labels

# Save the filled data to a new CSV file
data_filled.to_csv(r'xxx.csv', index=False)

## CLR Transformation

In [None]:
import pandas as pd
import numpy as np

# Read the CSV file
data = pd.read_csv(r'xxx.csv')

# Set the label column ('Type') aside so only the feature columns are transformed
labels = data['Type']
data = data.drop(columns=['Type'])

# The centred log-ratio (CLR) is only defined for strictly positive parts,
# so fail loudly instead of silently producing -inf / NaN values.
if (data <= 0).to_numpy().any():
    raise ValueError(
        "CLR transformation requires strictly positive values; "
        "replace zeros/negatives before transforming."
    )

# Perform the centred log-ratio transformation row by row:
# clr(x)_j = ln(x_j) - mean_k(ln(x_k)), i.e. the log of each part divided by
# the geometric mean of its sample (row).
log_data = np.log(data)
data_transformed = log_data.sub(log_data.mean(axis=1), axis=0)

# Add the label column back into the data
data_transformed['Type'] = labels

# Save the transformed data to a new CSV file
data_transformed.to_csv(r'xxx.csv', index=False)

# EDA

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load data
file_path = r'xxx.csv'
data = pd.read_csv(file_path)

# Set the font to Palatino Linotype
plt.rcParams['font.family'] = 'Palatino Linotype'

# Use Seaborn to plot the scatterplot matrix, coloured by 'Type'
pairplot = sns.pairplot(data, hue='Type', markers='o', palette='tab10')

# pairplot already draws one figure-level legend for the hue variable.
# Edit that legend directly: calling plt.legend() here would instead add a
# second legend to whichever subplot happens to be the current axes.
current_legend = getattr(pairplot, '_legend', None)

# Replacement text for the legend entries.
# NOTE(review): assumes the legend entries appear in the same order as this
# list - confirm against the 'Type' codes in the data.
new_labels = ['Skarn', 'VMS', 'Epithermal', 'Orogenic', 'Carlin', 'Porphyry', 'Magmatic Sulfide']

if current_legend is not None:
    current_legend.set_title('Type')
    # zip() stops at the shorter sequence, so a legend with more (or fewer)
    # entries than new_labels no longer raises IndexError.
    for label, new_text in zip(current_legend.texts, new_labels):
        label.set_text(new_text)

plt.show()

# Correlation Analysis

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def _show_spearman_heatmap(matrix, heading):
    """Draw one annotated heatmap of a correlation matrix and display it."""
    plt.figure(figsize=(12, 10))
    sns.heatmap(
        matrix,
        annot=True,
        cmap='Blues',
        fmt=".2f",
        linewidths=.5,
        vmin=-1,
        vmax=1,
        annot_kws={"size": 10},
        cbar=True,
    )
    plt.title(heading, fontsize=16)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    plt.show()


# Load the dataset into a DataFrame named 'data'
data = pd.read_csv(r'xxx.csv')

# Feature-only view of the data (label column removed)
data_without_type = data.drop('Type', axis=1)

# One Spearman matrix per distinct 'Type' value, in order of first appearance
types = data['Type'].unique()

for t in types:
    # Rows belonging to the current type
    subset = data[data['Type'] == t]

    # Listwise deletion: rows with any missing value are dropped before
    # the Spearman correlation is computed on the feature columns
    type_features = subset.drop('Type', axis=1).dropna()
    correlation_matrix = type_features.corr(method='spearman', min_periods=1)

    _show_spearman_heatmap(correlation_matrix, f"Spearman Correlation Matrix - Type {t}")

# Same analysis over the whole dataset, again with listwise deletion
complete_rows = data_without_type.dropna()
correlation_matrix_total = complete_rows.corr(method='spearman', min_periods=1)

_show_spearman_heatmap(correlation_matrix_total, "Total Spearman Correlation Matrix")


# PCA

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Load the CSV file and keep only complete rows
df = pd.read_csv(r'xxxx.csv').dropna()

# First column: category type; remaining columns: features
categories = df.iloc[:, 0]
features = df.iloc[:, 1:]

# Standardise the feature columns before PCA
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Fit a two-component PCA on the standardised features
pca = PCA(n_components=2)
principal_components = pca.fit_transform(scaled_features)

# pca.components_ has one row per component; transpose it so that each
# feature becomes a row with its weight on PC1 and PC2
feature_contributions = pca.components_
df_feature_contributions = pd.DataFrame(
    data=feature_contributions.T,
    index=features.columns,
    columns=['PC1 Contribution', 'PC2 Contribution'],
)

# Bar chart of the per-feature weights on the two components
ax = df_feature_contributions.plot(kind='bar', figsize=(12, 8))
ax.set_xlabel('Feature')
ax.set_ylabel('Contribution to Principal Components')
ax.set_title('Contribution of Features to Principal Components')
ax.legend(loc='upper right')
ax.grid(axis='y')
plt.show()

# SVM

## Hyperparameters Tuning

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.metrics import accuracy_score


# 1. Load CSV data
data = pd.read_csv(r'xxx.csv')

# 2. Segment the dataset into features (X) and labels (y):
#    the first column is the label, every other column is a feature
X = data.iloc[:, 1:]
y = data.iloc[:, 0]

# 3. Segment the dataset into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. Define parameter ranges
param_grid = {'C': [0.01, 0.1, 1, 10, 100, 1000, 10000, 100000],
              'gamma': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10]}

# 5. Create SVM Classifier
svm_classifier = SVC(kernel='rbf', probability=True, random_state=42)

# 6. Create GridSearchCV Objects (10-fold cross-validation, accuracy scoring)
grid_search = GridSearchCV(svm_classifier, param_grid, cv=10, scoring='accuracy')

# 7. Perform Grid Search to find the best parameters
grid_search.fit(X_train, y_train)

# 8. Get the best parameters
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# 9. Heat map visualisation (with values).
#    The grid is enumerated with 'C' as the outer loop and 'gamma' as the
#    inner loop, so reshaping to (len(C), len(gamma)) puts C on the rows
#    and gamma on the columns.
scores = grid_search.cv_results_["mean_test_score"].reshape(len(param_grid['C']), len(param_grid['gamma']))
plt.figure(figsize=(8, 6))
sns.heatmap(scores, annot=True, fmt=".3f", cmap=plt.cm.cividis, cbar=True)
plt.xlabel("gamma")
plt.ylabel("C")
# Heatmap cells are centred at half-integers, hence the +0.5 tick offset
plt.xticks(np.arange(len(param_grid['gamma'])) + 0.5, param_grid['gamma'], rotation=45, ha="right")
plt.yticks(np.arange(len(param_grid['C'])) + 0.5, param_grid['C'])
plt.title("Validation Accuracy")
plt.show()

## Evaluation

### Confusion Matrix

In [None]:
# 10. Refit an RBF SVM with the tuned hyperparameters and predict the test set
best_svm_classifier = SVC(kernel='rbf', probability=True, random_state=42, **best_params)
best_svm_classifier.fit(X_train, y_train)
y_pred_svm = best_svm_classifier.predict(X_test)

# 11. Confusion matrix (rows: actual classes, columns: predicted classes)
confusion_svm = confusion_matrix(y_test, y_pred_svm)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(confusion_svm, annot=True, fmt="d", cmap="Blues", cbar=False, ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("SVM Confusion Matrix")
plt.show()

### Evaluation metrics

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score


# Every class present in the full label column. Passing this list explicitly
# guarantees one score per class, in this order, even when a class is absent
# from both the test split and the predictions (otherwise the result arrays
# are shorter and indexing by class position raises IndexError).
class_labels = np.unique(y)

# Calculate precision, recall, F1 score for each category
precision_svm = precision_score(y_test, y_pred_svm, labels=class_labels, average=None)
recall_svm = recall_score(y_test, y_pred_svm, labels=class_labels, average=None)
f1_svm = f1_score(y_test, y_pred_svm, labels=class_labels, average=None)

# Print the precision, recall, and F1 score for each category,
# identified by its actual class label rather than its position
for class_label, precision, recall, f1 in zip(class_labels, precision_svm, recall_svm, f1_svm):
    print(f"SVM - Type {class_label}:")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall: {recall:.4f}")
    print(f"  F1 score: {f1:.4f}")
    print()

# Calculate overall accuracy
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(f"SVM - Accuracy: {accuracy_svm:.4f}")

# RF

## Hyperparameters Tuning

In [None]:
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import pandas as pd

# Load the dataset (replace with your actual CSV file path)
data = pd.read_csv(r'D:\LW\DATA_NEW_CLR_COPY.csv')

# First column is the label; all remaining columns are features
y = data.iloc[:, 0]
X = data.iloc[:, 1:]

# Hold out 20% of the rows as a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random Forest whose tree count is varied in the loop below
rfc = RandomForestClassifier(n_jobs=-1, random_state=42)

# Candidate tree counts: 1, 11, 21, ..., 191
n_trees = list(range(1, 201, 10))

# Mean 10-fold cross-validated accuracy for each candidate tree count
cv_scores = []
for n in n_trees:
    rfc.n_estimators = n
    scores = cross_val_score(rfc, X_train, y_train, cv=10, scoring='accuracy')
    cv_scores.append(scores.mean())

# Best mean accuracy and the (first) tree count that achieved it
max_cv_accuracy = max(cv_scores)
best_position = cv_scores.index(max_cv_accuracy)
optimal_n_estimators_cv = n_trees[best_position]
print("Optimal number of trees with k-fold cross-validation:", optimal_n_estimators_cv)
print("Maximum cross-validated accuracy:", max_cv_accuracy)

# Accuracy as a function of the number of trees
fig, ax = plt.subplots(figsize=[8, 6])
ax.plot(n_trees, cv_scores)
ax.set_xlabel('Number of Trees')
ax.set_ylabel('Cross-Validated Accuracy')
ax.set_title('Random Forest Classifier Performance with k-fold Cross-Validation')
plt.show()


## Evaluation

### Confusion Matrix

In [None]:
# Fit a Random Forest using the tree count selected by cross-validation
rfc_optimal = RandomForestClassifier(
    n_estimators=optimal_n_estimators_cv,
    n_jobs=-1,
    random_state=42,
)
rfc_optimal.fit(X_train, y_train)

# Predictions for the held-out test set
y_pred_rf = rfc_optimal.predict(X_test)

# Confusion matrix (rows: actual classes, columns: predicted classes)
confusion_rf = confusion_matrix(y_test, y_pred_rf)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(confusion_rf, annot=True, fmt="d", cmap="Blues", cbar=False, ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Random Forest Confusion Matrix")
plt.show()

### Evaluation metrics

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score


# Every class present in the full label column. Passing this list explicitly
# guarantees one score per class, in this order, even when a class is absent
# from both the test split and the predictions (otherwise the result arrays
# are shorter and indexing by class position raises IndexError).
class_labels = np.unique(y)

# Calculate precision, recall, F1 score for each category
precision_rf = precision_score(y_test, y_pred_rf, labels=class_labels, average=None)
recall_rf = recall_score(y_test, y_pred_rf, labels=class_labels, average=None)
f1_rf = f1_score(y_test, y_pred_rf, labels=class_labels, average=None)

# Print precision, recall, F1 score for each category. These are the
# Random Forest results, so the prefix is "RF" (it previously said "SVM").
for class_label, precision, recall, f1 in zip(class_labels, precision_rf, recall_rf, f1_rf):
    print(f"RF - Type {class_label}:")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall: {recall:.4f}")
    print(f"  F1 score: {f1:.4f}")
    print()

# Calculate overall accuracy
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"RF - Accuracy: {accuracy_rf:.4f}")

# Decision boundary (horizontal and vertical axes of a two-dimensional discriminant chart)

In [None]:
# Using the SVM classifier
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import svm
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load the data: every column after the first is a feature, 'Type' is the label
df_wine = pd.read_csv(r'xxx.csv')
mydata_data = df_wine[df_wine.columns[1:]].values
mydata_target = df_wine['Type'].values

# Standardise the features (zero mean, unit variance per column)
stdScale1 = StandardScaler().fit(mydata_data)
mydata_trainScaler = stdScale1.transform(mydata_data)
x1 = mydata_trainScaler
y1 = mydata_target

# Reference RBF SVM fitted on the full sample set
gamma = 50
svc = svm.SVC(kernel='rbf', C=10, gamma=gamma)
svc.fit(x1, y1)
print('SV number:', svc.support_)          # indices of the support vectors
print('SV set:', svc.support_vectors_)     # the support vectors themselves
print('SVC score:', svc.score(x1, y1))     # accuracy on the training data
print(40*'*')

# Classifiers to compare
logi = LogisticRegression(C=1.0, penalty='l2', solver='sag', max_iter=1000)

svc_linear = svm.SVC(C=1.0, kernel="linear")

svc_rbf1 = svm.SVC(C=1.0, kernel="rbf", gamma=0.5)

svc_rbf2 = svm.SVC(C=1.0, kernel="rbf", gamma=50)
clfs = [logi, svc_linear, svc_rbf1, svc_rbf2]
titles = ["Logistic Steele regression", 'Linear regression function SVM', 'RBF kernel function (gamma=0.5)', 'RBF kernel function (gamma=50)']
clr1 = [logi]  # not used within this cell

# Fit each classifier on the full sample set and report its accuracy on it
for clf, title in zip(clfs, titles):
    clf.fit(x1, y1)
    print(title, 'Performance scores on the full sample set：', clf.score(x1, y1))
print(40*'*')

# SHAP

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Load the dataset: first column is the label, the rest are features
data = pd.read_csv(r'xxx.csv')
y = data.iloc[:, 0]
X = data.iloc[:, 1:]

# Hold out 20% of the rows as a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the Random Forest that the SHAP analysis explains
rf_classifier = RandomForestClassifier(
    n_estimators=175,
    max_depth=20,
    random_state=42,
)
rf_classifier.fit(X_train, y_train)

# Predictions on the held-out test set
y_pred = rf_classifier.predict(X_test)


In [None]:
import shap
import matplotlib.pyplot as plt

# Compute SHAP values for the test set with the tree-specific explainer
explainer = shap.TreeExplainer(rf_classifier)
shap_values = explainer.shap_values(X_test)

# Newer SHAP releases return a single (samples, features, classes) array for
# multi-class tree models instead of a list with one array per class.
# Convert to the list form so the multi-class bar plot works in both cases.
if getattr(shap_values, "ndim", None) == 3:
    shap_values = [shap_values[:, :, class_index] for class_index in range(shap_values.shape[2])]

# Bar chart of feature importance, broken down by class
shap.summary_plot(shap_values, X_test, feature_names=data.columns[1:], class_names=np.unique(y), plot_type="bar")

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import shap
import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif'] = ['Palatino Linotype']
plt.rcParams['axes.unicode_minus'] = False

# 1. Load CSV data
data = pd.read_csv(r'xxxx.csv')

# 2. Split the dataset into features (X) and labels (y):
#    the first column is the label, every other column is a feature
X = data.iloc[:, 1:]
y = data.iloc[:, 0]

# 3. Divide the dataset into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. Create a random forest classifier
rf_classifier = RandomForestClassifier(n_estimators=151, max_depth=20, random_state=42)

# 5. Train the classifier
rf_classifier.fit(X_train, y_train)

# 6. Use shap to calculate Shapley values
explainer = shap.TreeExplainer(rf_classifier)
shap_values = explainer.shap_values(X_test)

# Normalise the result to a list with one (samples, features) array per
# model output. Older SHAP releases already return such a list; newer ones
# return a single (samples, features, outputs) array, and a plain 2-D array
# means there is only one output. Iterating a raw array would loop over
# samples instead of outputs.
if not isinstance(shap_values, list):
    if getattr(shap_values, "ndim", None) == 3:
        shap_values = [shap_values[:, :, k] for k in range(shap_values.shape[2])]
    else:
        shap_values = [shap_values]

# 7. One beeswarm ("dot") summary plot per model output
for output_index, output_shap_values in enumerate(shap_values):
    # show=False keeps the figure open so the title can be added before it
    # is displayed; plt.show() then renders and closes this output's figure.
    shap.summary_plot(output_shap_values, X_test, plot_type="dot", feature_names=X_test.columns, show=False)
    plt.title(f"Output {output_index}")
    plt.show()