# Preprocessing

In [None]:
import pandas as pd
import numpy as np

# Read the raw pyrite measurements from Excel into a DataFrame
df = pd.read_excel('/home/alifian/Geology Data Analysis/Pyrite_-21-feb(Final Version) copy.xlsx')

# Element columns that are log-transformed and then standardized
columns_to_transform = ['Co', 'Ni', 'Cu', 'Zn', 'Se', 'Ag', 'Sb', 'Pb', 'Bi', 'As']

# The natural log is only defined for positive values: log(0) is -inf and
# log(<0) is NaN. A single -inf makes the column mean -inf and the std NaN,
# which would turn the entire standardized column into NaN. Treat non-positive
# entries as missing before the transform instead.
raw_values = df[columns_to_transform]
non_positive_count = int((raw_values <= 0).sum().sum())
if non_positive_count:
    print(f'Masked {non_positive_count} non-positive value(s) as NaN before the log transform')

# Apply the natural logarithm to the selected columns (masked entries stay NaN)
df[columns_to_transform] = np.log(raw_values.where(raw_values > 0))

# Per-column mean and sample standard deviation of the log-transformed data
# (pandas skips NaN entries in both)
mean = df[columns_to_transform].mean()
std = df[columns_to_transform].std()

# Standardize (z-score) the log-transformed columns
df[columns_to_transform] = (df[columns_to_transform] - mean) / std

# Round every numeric column of the frame to four decimal places
df = df.round(4)

# Save the result to a new Excel file in the current working directory
df.to_excel('Pyrite_Standarized_data_file.xlsx', index=False)


# Random Forest

In [None]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score
from sklearn.impute import SimpleImputer

# Load the standardized data from Excel
df = pd.read_excel('/Users/asia/Desktop/Sir Amar/Pyrite_Standarized_data_file.xlsx')

# Identify features (X) and the target variable (y)
X = df[['Co', 'Ni', 'Cu', 'Zn', 'Se', 'Ag', 'Sb', 'Pb', 'Bi', 'As']]
y = df['ore-forming fluids Type']

# Split the data into training (60%), validation (20%), and testing sets (20%).
# The split happens BEFORE imputation so that the imputer never sees
# validation/test rows; the same rows land in each split as before because the
# shuffle depends only on the row count and random_state.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Replace NaN values with column means learned from the training set only,
# then apply those same means to the validation and test sets
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_val = imputer.transform(X_val)
X_test = imputer.transform(X_test)

# Initialize the Random Forest classifier with specified parameters
rf_classifier = RandomForestClassifier(n_estimators=400, max_depth=20, random_state=42)

# Train the classifier on the training data
rf_classifier.fit(X_train, y_train)

# Make predictions on the validation set
y_val_pred = rf_classifier.predict(X_val)

# Evaluate the model on the validation set
val_accuracy = accuracy_score(y_val, y_val_pred)
val_conf_matrix = confusion_matrix(y_val, y_val_pred)
val_classification_rep = classification_report(y_val, y_val_pred)

# Output validation set evaluation metrics
print(f'Validation Accuracy: {val_accuracy*100:.2f}%')
print('\nValidation Confusion Matrix:')
print(val_conf_matrix)
print('\nValidation Classification Report:')
print(val_classification_rep)

# Once the model is finalized after validation, evaluate it on the test set
y_test_pred = rf_classifier.predict(X_test)

# Evaluate the model on the test set
test_accuracy = accuracy_score(y_test, y_test_pred)
test_conf_matrix = confusion_matrix(y_test, y_test_pred)
test_classification_rep = classification_report(y_test, y_test_pred)

# Output test set evaluation metrics
print(f'Test Accuracy: {test_accuracy*100:.2f}%')
print('\nTest Confusion Matrix:')
print(test_conf_matrix)
print('\nTest Classification Report:')
print(test_classification_rep)



# Support Vector Machine (SVM)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score
from sklearn.impute import SimpleImputer

# Load the standardized data from Excel
df = pd.read_excel('/Users/asia/Desktop/Sir Amar/Pyrite_Standarized_data_file.xlsx')

# Identify features (X) and the target variable (y)
X = df[['Co', 'Ni', 'Cu', 'Zn', 'Se', 'Ag', 'Sb', 'Pb', 'Bi', 'As']]
y = df['ore-forming fluids Type']

# Split the data into training (60%), validation (20%), and testing sets (20%).
# The split happens BEFORE imputation so that the imputer never sees
# validation/test rows; the same rows land in each split as before because the
# shuffle depends only on the row count and random_state.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Replace NaN values with column means learned from the training set only,
# then apply those same means to the validation and test sets
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_val = imputer.transform(X_val)
X_test = imputer.transform(X_test)

# Initialize the SVM classifier with specified parameters
svm_classifier = SVC(C=100, kernel='rbf', gamma=0.1)

# Train the classifier on the training data
svm_classifier.fit(X_train, y_train)

# Make predictions on the validation set
y_val_pred = svm_classifier.predict(X_val)

# Evaluate the model on the validation set
val_accuracy = accuracy_score(y_val, y_val_pred)
val_conf_matrix = confusion_matrix(y_val, y_val_pred)
val_classification_rep = classification_report(y_val, y_val_pred)

# Output validation set evaluation metrics
print(f'Validation Accuracy: {val_accuracy*100:.2f}%')
print('\nValidation Confusion Matrix:')
print(val_conf_matrix)
print('\nValidation Classification Report:')
print(val_classification_rep)

# Once the model is finalized after validation, evaluate it on the test set
y_test_pred = svm_classifier.predict(X_test)

# Evaluate the model on the test set
test_accuracy = accuracy_score(y_test, y_test_pred)
test_conf_matrix = confusion_matrix(y_test, y_test_pred)
test_classification_rep = classification_report(y_test, y_test_pred)

# Output test set evaluation metrics
print(f'Test Accuracy: {test_accuracy*100:.2f}%')
print('\nTest Confusion Matrix:')
print(test_conf_matrix)
print('\nTest Classification Report:')
print(test_classification_rep)



# Multilayer Perceptron (MLP)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score
from sklearn.impute import SimpleImputer

# Load the standardized data from Excel
df = pd.read_excel('/Users/asia/Desktop/Sir Amar/Pyrite_Standarized_data_file.xlsx')

# Identify features (X) and the target variable (y)
X = df[['Co', 'Ni', 'Cu', 'Zn', 'Se', 'Ag', 'Sb', 'Pb', 'Bi', 'As']]
y = df['ore-forming fluids Type']

# Split the data into training (60%), validation (20%), and testing sets (20%).
# The split happens BEFORE imputation so that the imputer never sees
# validation/test rows; the same rows land in each split as before because the
# shuffle depends only on the row count and random_state.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Replace NaN values with column means learned from the training set only,
# then apply those same means to the validation and test sets
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_val = imputer.transform(X_val)
X_test = imputer.transform(X_test)

# Initialize the MLP classifier with specified parameters.
# random_state fixes the weight initialization and batch shuffling so that the
# reported metrics are reproducible from run to run.
mlp_classifier = MLPClassifier(
    hidden_layer_sizes=(150, 100, 50),
    max_iter=300,
    activation='tanh',
    solver='adam',
    alpha=0.0001,
    learning_rate='constant',
    random_state=42,
)

# Train the classifier on the training data
mlp_classifier.fit(X_train, y_train)

# Make predictions on the validation set
y_val_pred = mlp_classifier.predict(X_val)

# Evaluate the model on the validation set
val_accuracy = accuracy_score(y_val, y_val_pred)
val_conf_matrix = confusion_matrix(y_val, y_val_pred)
val_classification_rep = classification_report(y_val, y_val_pred)

# Output validation set evaluation metrics
print(f'Validation Accuracy: {val_accuracy*100:.2f}%')
print('\nValidation Confusion Matrix:')
print(val_conf_matrix)
print('\nValidation Classification Report:')
print(val_classification_rep)

# Once the model is finalized after validation, evaluate it on the test set
y_test_pred = mlp_classifier.predict(X_test)

# Evaluate the model on the test set
test_accuracy = accuracy_score(y_test, y_test_pred)
test_conf_matrix = confusion_matrix(y_test, y_test_pred)
test_classification_rep = classification_report(y_test, y_test_pred)

# Output test set evaluation metrics
print(f'Test Accuracy: {test_accuracy*100:.2f}%')
print('\nTest Confusion Matrix:')
print(test_conf_matrix)
print('\nTest Classification Report:')
print(test_classification_rep)

# Gradient Boost

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score
from sklearn.impute import SimpleImputer

# Load the standardized data from Excel
df = pd.read_excel('/Users/asia/Desktop/Sir Amar/Pyrite_Standarized_data_file.xlsx')

# Identify features (X) and the target variable (y)
X = df[['Co', 'Ni', 'Cu', 'Zn', 'Se', 'Ag', 'Sb', 'Pb', 'Bi', 'As']]
y = df['ore-forming fluids Type']

# Split the data into training (60%), validation (20%), and testing sets (20%).
# The split happens BEFORE imputation so that the imputer never sees
# validation/test rows; the same rows land in each split as before because the
# shuffle depends only on the row count and random_state.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Replace NaN values with column means learned from the training set only,
# then apply those same means to the validation and test sets
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_val = imputer.transform(X_val)
X_test = imputer.transform(X_test)

# Initialize the Gradient Boosting classifier (library defaults, fixed seed)
gb_classifier = GradientBoostingClassifier(random_state=42)

# Train the classifier on the training data
gb_classifier.fit(X_train, y_train)

# Make predictions on the validation set
y_val_pred = gb_classifier.predict(X_val)

# Evaluate the model on the validation set
val_accuracy = accuracy_score(y_val, y_val_pred)
val_conf_matrix = confusion_matrix(y_val, y_val_pred)
val_classification_rep = classification_report(y_val, y_val_pred)

# Output validation set evaluation metrics
print(f'Validation Accuracy: {val_accuracy*100:.2f}%')
print('\nValidation Confusion Matrix:')
print(val_conf_matrix)
print('\nValidation Classification Report:')
print(val_classification_rep)

# Once the model is finalized after validation, evaluate it on the test set
y_test_pred = gb_classifier.predict(X_test)

# Evaluate the model on the test set
test_accuracy = accuracy_score(y_test, y_test_pred)
test_conf_matrix = confusion_matrix(y_test, y_test_pred)
test_classification_rep = classification_report(y_test, y_test_pred)

# Output test set evaluation metrics
print(f'Test Accuracy: {test_accuracy*100:.2f}%')
print('\nTest Confusion Matrix:')
print(test_conf_matrix)
print('\nTest Classification Report:')
print(test_classification_rep)

# Feature Importance for RF and Gradient Boost 

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score
from sklearn.impute import SimpleImputer

# Load the standardized data from Excel
df = pd.read_excel('/Users/asia/Desktop/Sir Amar/Pyrite_Standarized_data_file.xlsx')

# Identify features (X) and the target variable (y)
features = ['Co', 'Ni', 'Cu', 'Zn', 'Se', 'Ag', 'Sb', 'Pb', 'Bi', 'As']
X = df[features]
y = df['ore-forming fluids Type']

# Split the data into training (60%), validation (20%), and testing sets (20%).
# The split happens BEFORE imputation so that the imputer never sees
# validation/test rows; the same rows land in each split as before because the
# shuffle depends only on the row count and random_state.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Replace NaN values with column means learned from the training set only,
# then apply those same means to the validation and test sets
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_val = imputer.transform(X_val)
X_test = imputer.transform(X_test)

# Initialize the Random Forest classifier with specified parameters
rf_classifier = RandomForestClassifier(n_estimators=400, max_depth=20, random_state=42)

# Train the classifier on the training data
rf_classifier.fit(X_train, y_train)

# Get feature importances (one value per input column, in column order)
importances = rf_classifier.feature_importances_

# Print feature importances paired with their column names
for feature, importance in zip(features, importances):
    print(f'The importance of {feature} is: {importance:.4f}')

# Make predictions on the validation set
y_val_pred = rf_classifier.predict(X_val)

# Evaluate the model on the validation set
val_accuracy = accuracy_score(y_val, y_val_pred)
val_conf_matrix = confusion_matrix(y_val, y_val_pred)
val_classification_rep = classification_report(y_val, y_val_pred)

# Output validation set evaluation metrics
print(f'Validation Accuracy: {val_accuracy*100:.2f}%')
print('\nValidation Confusion Matrix:')
print(val_conf_matrix)
print('\nValidation Classification Report:')
print(val_classification_rep)

# Once the model is finalized after validation, evaluate it on the test set
y_test_pred = rf_classifier.predict(X_test)

# Evaluate the model on the test set
test_accuracy = accuracy_score(y_test, y_test_pred)
test_conf_matrix = confusion_matrix(y_test, y_test_pred)
test_classification_rep = classification_report(y_test, y_test_pred)

# Output test set evaluation metrics
print(f'Test Accuracy: {test_accuracy*100:.2f}%')
print('\nTest Confusion Matrix:')
print(test_conf_matrix)
print('\nTest Classification Report:')
print(test_classification_rep)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score
from sklearn.impute import SimpleImputer

# Load the standardized data from Excel
df = pd.read_excel('/Users/asia/Desktop/Sir Amar/Pyrite_Standarized_data_file.xlsx')

# Identify features (X) and the target variable (y)
features = ['Co', 'Ni', 'Cu', 'Zn', 'Se', 'Ag', 'Sb', 'Pb', 'Bi', 'As']
X = df[features]
y = df['ore-forming fluids Type']

# Split the data into training (60%), validation (20%), and testing sets (20%).
# The split happens BEFORE imputation so that the imputer never sees
# validation/test rows; the same rows land in each split as before because the
# shuffle depends only on the row count and random_state.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Replace NaN values with column means learned from the training set only,
# then apply those same means to the validation and test sets
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_val = imputer.transform(X_val)
X_test = imputer.transform(X_test)

# Initialize the Gradient Boosting classifier with specified parameters
gb_classifier = GradientBoostingClassifier(learning_rate=0.1, max_depth=5, n_estimators=300, random_state=42)

# Train the classifier on the training data
gb_classifier.fit(X_train, y_train)

# Get feature importances (one value per input column, in column order)
importances = gb_classifier.feature_importances_

# Print feature importances paired with their column names
for feature, importance in zip(features, importances):
    print(f'The importance of {feature} is: {importance:.4f}')

# Make predictions on the validation set
y_val_pred = gb_classifier.predict(X_val)

# Evaluate the model on the validation set
val_accuracy = accuracy_score(y_val, y_val_pred)
val_conf_matrix = confusion_matrix(y_val, y_val_pred)
val_classification_rep = classification_report(y_val, y_val_pred)

# Output validation set evaluation metrics
print(f'Validation Accuracy: {val_accuracy*100:.4f}%')
print('\nValidation Confusion Matrix:')
print(val_conf_matrix)
print('\nValidation Classification Report:')
print(val_classification_rep)

# Once the model is finalized after validation, evaluate it on the test set
y_test_pred = gb_classifier.predict(X_test)

# Evaluate the model on the test set
test_accuracy = accuracy_score(y_test, y_test_pred)
test_conf_matrix = confusion_matrix(y_test, y_test_pred)
test_classification_rep = classification_report(y_test, y_test_pred)

# Output test set evaluation metrics
print(f'Test Accuracy: {test_accuracy*100:.2f}%')
print('\nTest Confusion Matrix:')
print(test_conf_matrix)
print('\nTest Classification Report:')
print(test_classification_rep)

# Hyperparameters for RF

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score
from sklearn.impute import SimpleImputer

# Load the standardized data from Excel
df = pd.read_excel('/Users/asia/Desktop/Sir Amar/Pyrite_Standarized_data_file.xlsx')

# Identify features (X) and the target variable (y)
X = df[['Co', 'Ni', 'Cu', 'Zn', 'Se', 'Ag', 'Sb', 'Pb', 'Bi', 'As']]
y = df['ore-forming fluids Type']

# Split the data into training (60%), validation (20%), and testing sets (20%).
# The split happens BEFORE imputation so that the imputer never sees
# validation/test rows; the same rows land in each split as before because the
# shuffle depends only on the row count and random_state.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Replace NaN values with column means learned from the training set only,
# then apply those same means to the validation and test sets
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_val = imputer.transform(X_val)
X_test = imputer.transform(X_test)

# Initialize the Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Define the grid of hyperparameters to search
hyperparameter_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [5, 10, 15, 20, 25, 30],
    'random_state': [42]
}

# Set up the grid search: 5-fold cross-validation on the training set,
# scored by accuracy, using all available CPU cores
grid_cv = GridSearchCV(
    estimator=rf_classifier,
    param_grid=hyperparameter_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

# Run the search on the training data
grid_cv.fit(X_train, y_train)

# Get the best parameters
best_params = grid_cv.best_params_
print(f'Best parameters: {best_params}')

# Make predictions on the validation set using the model with the best parameters
y_val_pred = grid_cv.predict(X_val)

# Evaluate the model on the validation set
val_accuracy = accuracy_score(y_val, y_val_pred)
val_conf_matrix = confusion_matrix(y_val, y_val_pred)
val_classification_rep = classification_report(y_val, y_val_pred)

# Output validation set evaluation metrics
print(f'Validation Accuracy: {val_accuracy*100:.2f}%')
print('\nValidation Confusion Matrix:')
print(val_conf_matrix)
print('\nValidation Classification Report:')
print(val_classification_rep)

# Once the model is finalized after validation, evaluate it on the test set
y_test_pred = grid_cv.predict(X_test)

# Evaluate the model on the test set
test_accuracy = accuracy_score(y_test, y_test_pred)
test_conf_matrix = confusion_matrix(y_test, y_test_pred)
test_classification_rep = classification_report(y_test, y_test_pred)

# Output test set evaluation metrics
print(f'Test Accuracy: {test_accuracy*100:.2f}%')
print('\nTest Confusion Matrix:')
print(test_conf_matrix)
print('\nTest Classification Report:')
print(test_classification_rep)

# Hyperparameters for SVM

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score
from sklearn.impute import SimpleImputer

# Load the standardized data from Excel
df = pd.read_excel('/Users/asia/Desktop/Sir Amar/Pyrite_Standarized_data_file.xlsx')

# Identify features (X) and the target variable (y)
X = df[['Co', 'Ni', 'Cu', 'Zn', 'Se', 'Ag', 'Sb', 'Pb', 'Bi', 'As']]
y = df['ore-forming fluids Type']

# Split the data into training (60%), validation (20%), and testing sets (20%).
# The split happens BEFORE imputation so that the imputer never sees
# validation/test rows; the same rows land in each split as before because the
# shuffle depends only on the row count and random_state.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Replace NaN values with column means learned from the training set only,
# then apply those same means to the validation and test sets
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_val = imputer.transform(X_val)
X_test = imputer.transform(X_test)

# Initialize the SVM classifier
svm_classifier = SVC()

# Define the grid of hyperparameters to search
hyperparameter_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['rbf']
}

# Set up the grid search: 5-fold cross-validation on the training set,
# scored by accuracy, using all available CPU cores
grid_cv = GridSearchCV(
    estimator=svm_classifier,
    param_grid=hyperparameter_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

# Run the search on the training data
grid_cv.fit(X_train, y_train)

# Get the best parameters
best_params = grid_cv.best_params_
print(f'Best parameters: {best_params}')

# Make predictions on the validation set using the model with the best parameters
y_val_pred = grid_cv.predict(X_val)

# Evaluate the model on the validation set
val_accuracy = accuracy_score(y_val, y_val_pred)
val_conf_matrix = confusion_matrix(y_val, y_val_pred)
val_classification_rep = classification_report(y_val, y_val_pred)

# Output validation set evaluation metrics
print(f'Validation Accuracy: {val_accuracy*100:.2f}%')
print('\nValidation Confusion Matrix:')
print(val_conf_matrix)
print('\nValidation Classification Report:')
print(val_classification_rep)

# Once the model is finalized after validation, evaluate it on the test set
y_test_pred = grid_cv.predict(X_test)

# Evaluate the model on the test set
test_accuracy = accuracy_score(y_test, y_test_pred)
test_conf_matrix = confusion_matrix(y_test, y_test_pred)
test_classification_rep = classification_report(y_test, y_test_pred)

# Output test set evaluation metrics
print(f'Test Accuracy: {test_accuracy*100:.2f}%')
print('\nTest Confusion Matrix:')
print(test_conf_matrix)
print('\nTest Classification Report:')
print(test_classification_rep)

# Hyperparameters for MLP

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score
from sklearn.impute import SimpleImputer

# Load the standardized data from Excel
df = pd.read_excel('/Users/asia/Desktop/Sir Amar/Pyrite_Standarized_data_file.xlsx')

# Identify features (X) and the target variable (y)
X = df[['Co', 'Ni', 'Cu', 'Zn', 'Se', 'Ag', 'Sb', 'Pb', 'Bi', 'As']]
y = df['ore-forming fluids Type']

# Split the data into training (60%), validation (20%), and testing sets (20%).
# The split happens BEFORE imputation so that the imputer never sees
# validation/test rows; the same rows land in each split as before because the
# shuffle depends only on the row count and random_state.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Replace NaN values with column means learned from the training set only,
# then apply those same means to the validation and test sets
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_val = imputer.transform(X_val)
X_test = imputer.transform(X_test)

# Initialize the MLP classifier. random_state fixes the weight initialization
# and batch shuffling so every grid candidate is compared under the same seed
# and the selected parameters are reproducible from run to run.
mlp_classifier = MLPClassifier(max_iter=300, random_state=42)

# Define the grid of hyperparameters to search
hyperparameter_grid = {
    'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],
}

# Set up the grid search: 5-fold cross-validation on the training set,
# scored by accuracy, using all available CPU cores
grid_cv = GridSearchCV(
    estimator=mlp_classifier,
    param_grid=hyperparameter_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

# Run the search on the training data
grid_cv.fit(X_train, y_train)

# Get the best parameters
best_params = grid_cv.best_params_
print(f'Best parameters: {best_params}')

# Make predictions on the validation set using the model with the best parameters
y_val_pred = grid_cv.predict(X_val)

# Evaluate the model on the validation set
val_accuracy = accuracy_score(y_val, y_val_pred)
val_conf_matrix = confusion_matrix(y_val, y_val_pred)
val_classification_rep = classification_report(y_val, y_val_pred)

# Output validation set evaluation metrics
print(f'Validation Accuracy: {val_accuracy*100:.2f}%')
print('\nValidation Confusion Matrix:')
print(val_conf_matrix)
print('\nValidation Classification Report:')
print(val_classification_rep)

# Once the model is finalized after validation, evaluate it on the test set
y_test_pred = grid_cv.predict(X_test)

# Evaluate the model on the test set
test_accuracy = accuracy_score(y_test, y_test_pred)
test_conf_matrix = confusion_matrix(y_test, y_test_pred)
test_classification_rep = classification_report(y_test, y_test_pred)

# Output test set evaluation metrics
print(f'Test Accuracy: {test_accuracy*100:.2f}%')
print('\nTest Confusion Matrix:')
print(test_conf_matrix)
print('\nTest Classification Report:')
print(test_classification_rep)

# Hyperparameters for Gradient Boost

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score
from sklearn.impute import SimpleImputer

# Load the standardized data from Excel
df = pd.read_excel('/Users/asia/Desktop/Sir Amar/Pyrite_Standarized_data_file.xlsx')

# Identify features (X) and the target variable (y)
X = df[['Co', 'Ni', 'Cu', 'Zn', 'Se', 'Ag', 'Sb', 'Pb', 'Bi', 'As']]
y = df['ore-forming fluids Type']

# Split the data into training (60%), validation (20%), and testing sets (20%).
# The split happens BEFORE imputation so that the imputer never sees
# validation/test rows; the same rows land in each split as before because the
# shuffle depends only on the row count and random_state.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Replace NaN values with column means learned from the training set only,
# then apply those same means to the validation and test sets
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_val = imputer.transform(X_val)
X_test = imputer.transform(X_test)

# Initialize the Gradient Boosting classifier
gb_classifier = GradientBoostingClassifier(random_state=42)

# Define the grid of hyperparameters to search
hyperparameter_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 1],
    'max_depth': [3, 4, 5]
}

# Set up the grid search: 5-fold cross-validation on the training set,
# scored by accuracy, using all available CPU cores
grid_cv = GridSearchCV(
    estimator=gb_classifier,
    param_grid=hyperparameter_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

# Run the search on the training data
grid_cv.fit(X_train, y_train)

# Get the best parameters
best_params = grid_cv.best_params_
print(f'Best parameters: {best_params}')

# Make predictions on the validation set using the model with the best parameters
y_val_pred = grid_cv.predict(X_val)

# Evaluate the model on the validation set
val_accuracy = accuracy_score(y_val, y_val_pred)
val_conf_matrix = confusion_matrix(y_val, y_val_pred)
val_classification_rep = classification_report(y_val, y_val_pred)

# Output validation set evaluation metrics
print(f'Validation Accuracy: {val_accuracy*100:.2f}%')
print('\nValidation Confusion Matrix:')
print(val_conf_matrix)
print('\nValidation Classification Report:')
print(val_classification_rep)

# Once the model is finalized after validation, evaluate it on the test set
y_test_pred = grid_cv.predict(X_test)

# Evaluate the model on the test set
test_accuracy = accuracy_score(y_test, y_test_pred)
test_conf_matrix = confusion_matrix(y_test, y_test_pred)
test_classification_rep = classification_report(y_test, y_test_pred)

# Output test set evaluation metrics
print(f'Test Accuracy: {test_accuracy*100:.2f}%')
print('\nTest Confusion Matrix:')
print(test_conf_matrix)
print('\nTest Classification Report:')
print(test_classification_rep)

# AUC for RF

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.impute import SimpleImputer
# Fixed: the original imported the non-existent name `LabelBinarizerca`,
# which raises ImportError before anything else in the cell can run.
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import precision_score

# Load the standardized data from Excel
df = pd.read_excel('/Users/asia/Desktop/Sir Amar/Pyrite_Standarized_data_file.xlsx')

# Identify features (X) and the target variable (y)
X = df[['Co', 'Ni', 'Cu', 'Zn', 'Se', 'Ag', 'Sb', 'Pb', 'Bi', 'As']]
y = df['ore-forming fluids Type']

# Split the data into training (60%), validation (20%), and testing sets (20%).
# The split happens BEFORE imputation so that the imputer never sees
# validation/test rows; the same rows land in each split as before because the
# shuffle depends only on the row count and random_state.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Replace NaN values with column means learned from the training set only,
# then apply those same means to the validation and test sets
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_val = imputer.transform(X_val)
X_test = imputer.transform(X_test)

# Initialize the Random Forest classifier with specified parameters
rf_classifier = RandomForestClassifier(n_estimators=400, max_depth=20, random_state=42)

# Train the classifier on the training data
rf_classifier.fit(X_train, y_train)

# Make predictions on the validation set
y_val_pred = rf_classifier.predict(X_val)

# Evaluate the model on the validation set
val_accuracy = accuracy_score(y_val, y_val_pred)
val_conf_matrix = confusion_matrix(y_val, y_val_pred)
val_classification_rep = classification_report(y_val, y_val_pred)

# Output validation set evaluation metrics
print(f'Validation Accuracy: {val_accuracy*100:.2f}%')
print('\nValidation Confusion Matrix:')
print(val_conf_matrix)
print('\nValidation Classification Report:')
print(val_classification_rep)

# Calculate AUC on the validation set.
# The binarizer is fitted on the classes the model was trained on, so the
# one-hot columns line up with the predict_proba columns even when a class
# happens to be absent from the validation or test split.
y_val_pred_proba = rf_classifier.predict_proba(X_val)
lb = LabelBinarizer()
lb.fit(rf_classifier.classes_)
y_val_lb = lb.transform(y_val)
val_auc = roc_auc_score(y_val_lb, y_val_pred_proba, multi_class='ovr')
print(f'Validation AUC: {val_auc:.4f}')

# Once the model is finalized after validation, evaluate it on the test set
y_test_pred = rf_classifier.predict(X_test)

# Evaluate the model on the test set
test_accuracy = accuracy_score(y_test, y_test_pred)
test_conf_matrix = confusion_matrix(y_test, y_test_pred)
test_classification_rep = classification_report(y_test, y_test_pred)

# Output test set evaluation metrics
print(f'Test Accuracy: {test_accuracy*100:.2f}%')
print('\nTest Confusion Matrix:')
print(test_conf_matrix)
print('\nTest Classification Report:')
print(test_classification_rep)

# Calculate AUC on the test set (same binarizer, same column order)
y_test_pred_proba = rf_classifier.predict_proba(X_test)
y_test_lb = lb.transform(y_test)
test_auc = roc_auc_score(y_test_lb, y_test_pred_proba, multi_class='ovr')
print(f'Test AUC: {test_auc:.4f}')

# Output number of original and predicted type counts for validation set
val_original_counts = y_val.value_counts()
val_predicted_counts = pd.Series(y_val_pred).value_counts()
print('\nNumber of Original Type Counts (Validation Set):')
print(val_original_counts)
print('\nNumber of Predicted Type Counts (Validation Set):')
print(val_predicted_counts)

# Output number of original and predicted type counts for test set
test_original_counts = y_test.value_counts()
test_predicted_counts = pd.Series(y_test_pred).value_counts()
print('\nNumber of Original Type Counts (Test Set):')
print(test_original_counts)
print('\nNumber of Predicted Type Counts (Test Set):')
print(test_predicted_counts)

# Calculate the support-weighted precision score on the test set
precision = precision_score(y_test, y_test_pred, average='weighted')
print(f'Precision Score: {precision:.4f}')


# AUC for SVM

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelBinarizer

# Load the training table.
# NOTE(review): the file name suggests this is the log-transformed,
# standardized output of the Preprocessing step — confirm the path.
df = pd.read_excel('/Users/asia/Desktop/Sir Amar/Pyrite_Standarized_data_file.xlsx')

# Features (X): the ten trace-element columns; target (y): the fluid-type label
X = df[['Co', 'Ni', 'Cu', 'Zn', 'Se', 'Ag', 'Sb', 'Pb', 'Bi', 'As']]
y = df['ore-forming fluids Type']

# Replace NaN feature values with the mean of their column.
# NOTE(review): the imputer is fitted on all rows before the split, so the
# validation/test rows contribute to the imputed means (mild leakage).
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

# Split the data into training (60%), validation (20%), and testing sets (20%):
# 40% is held out first, then that hold-out is halved.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Initialize the SVM classifier with specified parameters.
# probability=True is required for predict_proba; random_state pins the
# internal shuffling used for the probability calibration so the reported
# AUC values are reproducible from run to run.
svm_classifier = SVC(C=100, kernel='rbf', gamma=0.1, probability=True, random_state=42)

# Train the classifier on the training data
svm_classifier.fit(X_train, y_train)

# Make predictions on the validation set
y_val_pred = svm_classifier.predict(X_val)

# Evaluate the model on the validation set
val_accuracy = accuracy_score(y_val, y_val_pred)
val_conf_matrix = confusion_matrix(y_val, y_val_pred)
val_classification_rep = classification_report(y_val, y_val_pred)

# Output validation set evaluation metrics
print(f'Validation Accuracy: {val_accuracy*100:.2f}%')
print('\nValidation Confusion Matrix:')
print(val_conf_matrix)
print('\nValidation Classification Report:')
print(val_classification_rep)

# Calculate AUC on the validation set.
# The binarizer is fitted on the classifier's own class list so that the
# one-hot columns line up with the predict_proba columns even when a class
# happens to be missing from the validation or test split.
y_val_pred_proba = svm_classifier.predict_proba(X_val)
lb = LabelBinarizer()
lb.fit(svm_classifier.classes_)
y_val_lb = lb.transform(y_val)
val_auc = roc_auc_score(y_val_lb, y_val_pred_proba, multi_class='ovr')
print(f'Validation AUC: {val_auc:.4f}')

# Once the model is finalized after validation, evaluate it on the test set
y_test_pred = svm_classifier.predict(X_test)

# Evaluate the model on the test set
test_accuracy = accuracy_score(y_test, y_test_pred)
test_conf_matrix = confusion_matrix(y_test, y_test_pred)
test_classification_rep = classification_report(y_test, y_test_pred)

# Output test set evaluation metrics
print(f'Test Accuracy: {test_accuracy*100:.2f}%')
print('\nTest Confusion Matrix:')
print(test_conf_matrix)
print('\nTest Classification Report:')
print(test_classification_rep)

# Calculate AUC on the test set (same binarizer, hence same column order)
y_test_pred_proba = svm_classifier.predict_proba(X_test)
y_test_lb = lb.transform(y_test)
test_auc = roc_auc_score(y_test_lb, y_test_pred_proba, multi_class='ovr')
print(f'Test AUC: {test_auc:.4f}')

# Output number of original and predicted type counts for validation set
val_original_counts = y_val.value_counts()
val_predicted_counts = pd.Series(y_val_pred).value_counts()
print('\nNumber of Original Type Counts (Validation Set):')
print(val_original_counts)
print('\nNumber of Predicted Type Counts (Validation Set):')
print(val_predicted_counts)

# Output number of original and predicted type counts for test set
test_original_counts = y_test.value_counts()
test_predicted_counts = pd.Series(y_test_pred).value_counts()
print('\nNumber of Original Type Counts (Test Set):')
print(test_original_counts)
print('\nNumber of Predicted Type Counts (Test Set):')
print(test_predicted_counts)

# Calculate precision score (support-weighted average over classes)
precision = precision_score(y_test, y_test_pred, average='weighted')
print(f'Precision Score: {precision:.4f}')


# AUC of MLP

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
# 'aaccuracy_score' was a typo that made this cell fail on import
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelBinarizer

# Load the training table.
# NOTE(review): the file name suggests this is the log-transformed,
# standardized output of the Preprocessing step — confirm the path.
df = pd.read_excel('/Users/asia/Desktop/Sir Amar/Pyrite_Standarized_data_file.xlsx')

# Features (X): the ten trace-element columns; target (y): the fluid-type label
X = df[['Co', 'Ni', 'Cu', 'Zn', 'Se', 'Ag', 'Sb', 'Pb', 'Bi', 'As']]
y = df['ore-forming fluids Type']

# Replace NaN feature values with the mean of their column.
# NOTE(review): the imputer is fitted on all rows before the split, so the
# validation/test rows contribute to the imputed means (mild leakage).
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

# Split the data into training (60%), validation (20%), and testing sets (20%)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Initialize the MLP classifier with specified parameters.
# random_state fixes weight initialisation and batch shuffling so the
# reported metrics are reproducible from run to run.
mlp_classifier = MLPClassifier(
    hidden_layer_sizes=(50, 100, 50),
    max_iter=300,
    activation='tanh',
    solver='adam',
    alpha=0.0001,
    learning_rate='constant',
    random_state=42,
)

# Train the classifier on the training data
mlp_classifier.fit(X_train, y_train)

# Make predictions on the validation set
y_val_pred = mlp_classifier.predict(X_val)

# Evaluate the model on the validation set
val_accuracy = accuracy_score(y_val, y_val_pred)
val_conf_matrix = confusion_matrix(y_val, y_val_pred)
val_classification_rep = classification_report(y_val, y_val_pred)

# Output validation set evaluation metrics
print(f'Validation Accuracy: {val_accuracy*100:.2f}%')
print('\nValidation Confusion Matrix:')
print(val_conf_matrix)
print('\nValidation Classification Report:')
print(val_classification_rep)

# Calculate AUC on the validation set.
# The binarizer is fitted on the classifier's own class list so that the
# one-hot columns line up with the predict_proba columns even when a class
# happens to be missing from the validation or test split.
lb = LabelBinarizer()
lb.fit(mlp_classifier.classes_)
y_val_lb = lb.transform(y_val)
y_val_pred_proba = mlp_classifier.predict_proba(X_val)
val_auc = roc_auc_score(y_val_lb, y_val_pred_proba, multi_class='ovr')
print(f'Validation AUC: {val_auc:.4f}')

# Once the model is finalized after validation, evaluate it on the test set
y_test_pred = mlp_classifier.predict(X_test)

# Evaluate the model on the test set
test_accuracy = accuracy_score(y_test, y_test_pred)
test_conf_matrix = confusion_matrix(y_test, y_test_pred)
test_classification_rep = classification_report(y_test, y_test_pred)

# Output test set evaluation metrics
print(f'Test Accuracy: {test_accuracy*100:.2f}%')
print('\nTest Confusion Matrix:')
print(test_conf_matrix)
print('\nTest Classification Report:')
print(test_classification_rep)

# Calculate AUC on the test set (same binarizer, hence same column order)
y_test_lb = lb.transform(y_test)
y_test_pred_proba = mlp_classifier.predict_proba(X_test)
test_auc = roc_auc_score(y_test_lb, y_test_pred_proba, multi_class='ovr')
print(f'Test AUC: {test_auc:.4f}')

# Output number of original and predicted type counts for validation set
val_original_counts = y_val.value_counts()
val_predicted_counts = pd.Series(y_val_pred).value_counts()
print('\nNumber of Original Type Counts (Validation Set):')
print(val_original_counts)
print('\nNumber of Predicted Type Counts (Validation Set):')
print(val_predicted_counts)

# Output number of original and predicted type counts for test set
test_original_counts = y_test.value_counts()
test_predicted_counts = pd.Series(y_test_pred).value_counts()
print('\nNumber of Original Type Counts (Test Set):')
print(test_original_counts)
print('\nNumber of Predicted Type Counts (Test Set):')
print(test_predicted_counts)

# Calculate precision score (support-weighted average over classes)
precision = precision_score(y_test, y_test_pred, average='weighted')
print(f'Precision Score: {precision:.4f}')


# AUC of Gradient Boost 

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelBinarizer

# Load the training table.
# NOTE(review): the file name suggests this is the log-transformed,
# standardized output of the Preprocessing step — confirm the path.
df = pd.read_excel('/Users/asia/Desktop/Sir Amar/Pyrite_Standarized_data_file.xlsx')

# Features (X): the ten trace-element columns; target (y): the fluid-type label
X = df[['Co', 'Ni', 'Cu', 'Zn', 'Se', 'Ag', 'Sb', 'Pb', 'Bi', 'As']]
y = df['ore-forming fluids Type']

# Replace NaN feature values with the mean of their column.
# NOTE(review): the imputer is fitted on all rows before the split, so the
# validation/test rows contribute to the imputed means (mild leakage).
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

# Split the data into training (60%), validation (20%), and testing sets (20%)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Initialize the Gradient Boosting classifier with the best parameters
gb_classifier = GradientBoostingClassifier(learning_rate=0.1, max_depth=5, n_estimators=300, random_state=42)

# Train the classifier on the training data
gb_classifier.fit(X_train, y_train)

# Make predictions on the validation set
y_val_pred = gb_classifier.predict(X_val)

# Evaluate the model on the validation set
val_accuracy = accuracy_score(y_val, y_val_pred)
val_conf_matrix = confusion_matrix(y_val, y_val_pred)
val_classification_rep = classification_report(y_val, y_val_pred)

# Output validation set evaluation metrics
print(f'Validation Accuracy: {val_accuracy*100:.2f}%')
print('\nValidation Confusion Matrix:')
print(val_conf_matrix)
print('\nValidation Classification Report:')
print(val_classification_rep)

# Calculate AUC on the validation set.
# The binarizer is fitted on the classifier's own class list so that the
# one-hot columns line up with the predict_proba columns even when a class
# happens to be missing from the validation or test split.
lb = LabelBinarizer()
lb.fit(gb_classifier.classes_)
y_val_lb = lb.transform(y_val)
y_val_pred_proba = gb_classifier.predict_proba(X_val)
val_auc = roc_auc_score(y_val_lb, y_val_pred_proba, multi_class='ovr')
print(f'Validation AUC: {val_auc:.4f}')

# Once the model is finalized after validation, evaluate it on the test set
y_test_pred = gb_classifier.predict(X_test)

# Evaluate the model on the test set
test_accuracy = accuracy_score(y_test, y_test_pred)
test_conf_matrix = confusion_matrix(y_test, y_test_pred)
test_classification_rep = classification_report(y_test, y_test_pred)

# Output test set evaluation metrics
print(f'Test Accuracy: {test_accuracy*100:.2f}%')
print('\nTest Confusion Matrix:')
print(test_conf_matrix)
print('\nTest Classification Report:')
print(test_classification_rep)

# Calculate AUC on the test set (same binarizer, hence same column order)
y_test_lb = lb.transform(y_test)
y_test_pred_proba = gb_classifier.predict_proba(X_test)
test_auc = roc_auc_score(y_test_lb, y_test_pred_proba, multi_class='ovr')
print(f'Test AUC: {test_auc:.4f}')

# Output number of original and predicted type counts for validation set
val_original_counts = y_val.value_counts()
val_predicted_counts = pd.Series(y_val_pred).value_counts()
print('\nNumber of Original Type Counts (Validation Set):')
print(val_original_counts)
print('\nNumber of Predicted Type Counts (Validation Set):')
print(val_predicted_counts)

# Output number of original and predicted type counts for test set
test_original_counts = y_test.value_counts()
test_predicted_counts = pd.Series(y_test_pred).value_counts()
print('\nNumber of Original Type Counts (Test Set):')
print(test_original_counts)
print('\nNumber of Predicted Type Counts (Test Set):')
print(test_predicted_counts)

# Calculate precision score (support-weighted average over classes)
precision = precision_score(y_test, y_test_pred, average='weighted')
print(f'Precision Score: {precision:.4f}')


# RF Testing on Unknown Data 

In [None]:
import pandas as pd
import numpy as np

# Specify the path to your Excel file
file_path = '/Users/asia/Desktop/Sir Amar/Test_data.xlsx'

# Specify columns to transform
columns_to_transform = ['Co', 'Ni', 'Cu', 'Zn', 'Se', 'Ag', 'Sb', 'Pb', 'Bi', 'As']

# Create a dictionary to store the transformed dataframes, keyed by sheet name
dfs = {}

# Open the workbook once and close it when done; every sheet is read from
# this single handle instead of re-opening the file by path for each sheet.
with pd.ExcelFile(file_path) as xls:
    # Get the names of all sheets in the Excel file
    sheet_names = xls.sheet_names

    # Loop over each sheet
    for sheet_name in sheet_names:
        # Read the sheet into a dataframe
        df = pd.read_excel(xls, sheet_name=sheet_name)

        # Apply natural logarithm to selected columns. Non-positive values
        # have no real logarithm, so they are masked to NaN (missing) first
        # rather than ending up as -inf in the output file.
        raw_values = df[columns_to_transform]
        df[columns_to_transform] = np.log(raw_values.where(raw_values > 0))

        # NOTE(review): no standardization is applied here, so the output is
        # log-transformed only even though the output file name says
        # "Standarized". The training table was standardized with its own
        # mean/std; for comparable inputs these sheets would presumably need
        # the *training* mean/std applied — confirm the intended scaling.

        # Round the values to four decimal places
        df = df.round(4)

        # Store the transformed dataframe in the dictionary
        dfs[sheet_name] = df

# Save the transformed dataframes to a new Excel file, one sheet per input sheet
with pd.ExcelWriter('Pyrite_Standarized_Test_data_file.xlsx') as writer:
    for sheet_name, df in dfs.items():
        df.to_excel(writer, sheet_name=sheet_name, index=False)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score
from sklearn.impute import SimpleImputer

# Load the training table.
# NOTE(review): the file name suggests this is the log-transformed,
# standardized output of the Preprocessing step — confirm the path.
df = pd.read_excel('/Users/asia/Desktop/Sir Amar/Pyrite_Standarized_data_file.xlsx')

# Features (X): the ten trace-element columns; target (y): the fluid-type label
X = df[['Co', 'Ni', 'Cu', 'Zn', 'Se', 'Ag', 'Sb', 'Pb', 'Bi', 'As']]
y = df['ore-forming fluids Type']

# Replace NaN feature values with the constant 0.08 (not the column mean).
# NOTE(review): the rationale for 0.08 is not documented — confirm.
imputer = SimpleImputer(strategy='constant', fill_value=0.08)
X = imputer.fit_transform(X)  # Apply the imputer to the X DataFrame


# Split the data into training (60%), validation (20%), and testing sets (20%).
# random_state is fixed so the split, and therefore every metric printed
# below, is reproducible on a fresh kernel.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Initialize the Random Forest classifier with specified parameters
rf_classifier = RandomForestClassifier(n_estimators=400, max_depth=20, random_state=42)

# Train the classifier on the training data
rf_classifier.fit(X_train, y_train)

# Make predictions on the validation set
y_val_pred = rf_classifier.predict(X_val)

# Evaluate the model on the validation set
val_accuracy = accuracy_score(y_val, y_val_pred)
val_conf_matrix = confusion_matrix(y_val, y_val_pred)
val_classification_rep = classification_report(y_val, y_val_pred)

# Output validation set evaluation metrics
print(f'Validation Accuracy: {val_accuracy*100:.2f}%')
print('\nValidation Confusion Matrix:')
print(val_conf_matrix)
print('\nValidation Classification Report:')
print(val_classification_rep)

# Once the model is finalized after validation, evaluate it on the test set
y_test_pred = rf_classifier.predict(X_test)

# Evaluate the model on the test set
test_accuracy = accuracy_score(y_test, y_test_pred)
test_conf_matrix = confusion_matrix(y_test, y_test_pred)
test_classification_rep = classification_report(y_test, y_test_pred)

# Output test set evaluation metrics
print(f'Test Accuracy: {test_accuracy*100:.2f}%')
print('\nTest Confusion Matrix:')
print(test_conf_matrix)
print('\nTest Classification Report:')
print(test_classification_rep)

# Load the unlabeled samples from the first sheet.
# NOTE(review): these rows come straight from Test_data.xlsx; no log or
# standardization step is applied in this cell, whereas the model was fitted
# on the file named "...Standarized..." — confirm both are on the same scale.
test_data1 = pd.read_excel('/Users/asia/Desktop/Sir Amar/Test_data.xlsx', sheet_name='Sample 1')

# Select the feature columns and fill NaN with the fitted imputer
# (imputation is the only transformation applied here)
X_test1 = test_data1[['Co', 'Ni', 'Cu', 'Zn', 'Se', 'Ag', 'Sb', 'Pb', 'Bi', 'As']]
X_test1 = imputer.transform(X_test1)

# Make predictions on the test data
y_test_pred1 = rf_classifier.predict(X_test1)

# Print the predicted classes
print("Predicted classes for test data from Sheet1:")
print(y_test_pred1)

# Load the unlabeled samples from the second sheet (same caveat as above)
test_data2 = pd.read_excel('/Users/asia/Desktop/Sir Amar/Test_data.xlsx', sheet_name='Sample 2')

# Select the feature columns and fill NaN with the fitted imputer
X_test2 = test_data2[['Co', 'Ni', 'Cu', 'Zn', 'Se', 'Ag', 'Sb', 'Pb', 'Bi', 'As']]
X_test2 = imputer.transform(X_test2)

# Make predictions on the test data
y_test_pred2 = rf_classifier.predict(X_test2)

# Print the predicted classes
print("Predicted classes for test data from Sheet2:")
print(y_test_pred2)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score
from sklearn.impute import SimpleImputer

# Load the training table.
# NOTE(review): the file name suggests this is the log-transformed,
# standardized output of the Preprocessing step — confirm the path.
df = pd.read_excel('/Users/asia/Desktop/Sir Amar/Pyrite_Standarized_data_file.xlsx')

# Features (X): the ten trace-element columns; target (y): the fluid-type label
X = df[['Co', 'Ni', 'Cu', 'Zn', 'Se', 'Ag', 'Sb', 'Pb', 'Bi', 'As']]
y = df['ore-forming fluids Type']

# Replace NaN feature values with the mean of their column
imputer = SimpleImputer(strategy='mean')

# Apply the imputer to the X DataFrame
X = imputer.fit_transform(X)

# Split the data into training (60%), validation (20%), and testing sets (20%).
# random_state is fixed so the split, and therefore every metric printed
# below, is reproducible on a fresh kernel.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Initialize the SVM classifier with specified parameters
svm_classifier = SVC(C=100, kernel='rbf', gamma=0.1)

# Train the classifier on the training data
svm_classifier.fit(X_train, y_train)

# Make predictions on the validation set
y_val_pred = svm_classifier.predict(X_val)

# Evaluate the model on the validation set
val_accuracy = accuracy_score(y_val, y_val_pred)
val_conf_matrix = confusion_matrix(y_val, y_val_pred)
val_classification_rep = classification_report(y_val, y_val_pred)

# Output validation set evaluation metrics
print(f'Validation Accuracy: {val_accuracy*100:.2f}%')
print('\nValidation Confusion Matrix:')
print(val_conf_matrix)
print('\nValidation Classification Report:')
print(val_classification_rep)

# Once the model is finalized after validation, evaluate it on the test set
y_test_pred = svm_classifier.predict(X_test)

# Evaluate the model on the test set
test_accuracy = accuracy_score(y_test, y_test_pred)
test_conf_matrix = confusion_matrix(y_test, y_test_pred)
test_classification_rep = classification_report(y_test, y_test_pred)

# Output test set evaluation metrics
print(f'Test Accuracy: {test_accuracy*100:.2f}%')
print('\nTest Confusion Matrix:')
print(test_conf_matrix)
print('\nTest Classification Report:')
print(test_classification_rep)

# Load the unlabeled samples from the first sheet.
# NOTE(review): these rows come straight from Test_data.xlsx; no log or
# standardization step is applied in this cell, whereas the model was fitted
# on the file named "...Standarized..." — confirm both are on the same scale.
test_data1 = pd.read_excel('/Users/asia/Desktop/Sir Amar/Test_data.xlsx', sheet_name='Sample 1')

# Select the feature columns and fill NaN with the fitted imputer
# (imputation is the only transformation applied here)
X_test1 = test_data1[['Co', 'Ni', 'Cu', 'Zn', 'Se', 'Ag', 'Sb', 'Pb', 'Bi', 'As']]
X_test1 = imputer.transform(X_test1)

# Make predictions on the test data
y_test_pred1 = svm_classifier.predict(X_test1)

# Print the predicted classes
print("Predicted classes for test data from Sheet1:")
print(y_test_pred1)

# Load the unlabeled samples from the second sheet (same caveat as above)
test_data2 = pd.read_excel('/Users/asia/Desktop/Sir Amar/Test_data.xlsx', sheet_name='Sample 2')

# Select the feature columns and fill NaN with the fitted imputer
X_test2 = test_data2[['Co', 'Ni', 'Cu', 'Zn', 'Se', 'Ag', 'Sb', 'Pb', 'Bi', 'As']]
X_test2 = imputer.transform(X_test2)

# Make predictions on the test data
y_test_pred2 = svm_classifier.predict(X_test2)

# Print the predicted classes
print("Predicted classes for test data from Sheet2:")
print(y_test_pred2)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score
from sklearn.impute import SimpleImputer

# Load the training table.
# NOTE(review): the file name suggests this is the log-transformed,
# standardized output of the Preprocessing step — confirm the path.
df = pd.read_excel('/Users/asia/Desktop/Sir Amar/Pyrite_Standarized_data_file.xlsx')

# Features (X): the ten trace-element columns; target (y): the fluid-type label
X = df[['Co', 'Ni', 'Cu', 'Zn', 'Se', 'Ag', 'Sb', 'Pb', 'Bi', 'As']]
y = df['ore-forming fluids Type']

# Replace NaN feature values with the constant 0.08 (not the column mean).
# NOTE(review): the rationale for 0.08 is not documented — confirm.
imputer = SimpleImputer(strategy='constant', fill_value=0.08)

# Apply the imputer to the X DataFrame
X = imputer.fit_transform(X)

# Split the data into training (60%), validation (20%), and testing sets (20%).
# random_state is fixed so the split, and therefore every metric printed
# below, is reproducible on a fresh kernel.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Initialize the MLP classifier with specified parameters.
# random_state fixes weight initialisation and batch shuffling so the
# predictions for the unknown samples do not change between runs.
mlp_classifier = MLPClassifier(
    hidden_layer_sizes=(50, 100, 50),
    max_iter=300,
    activation='tanh',
    solver='adam',
    alpha=0.0001,
    learning_rate='constant',
    random_state=42,
)

# Train the classifier on the training data
mlp_classifier.fit(X_train, y_train)

# Make predictions on the validation set
y_val_pred = mlp_classifier.predict(X_val)

# Evaluate the model on the validation set
val_accuracy = accuracy_score(y_val, y_val_pred)
val_conf_matrix = confusion_matrix(y_val, y_val_pred)
val_classification_rep = classification_report(y_val, y_val_pred)

# Output validation set evaluation metrics
print(f'Validation Accuracy: {val_accuracy*100:.2f}%')
print('\nValidation Confusion Matrix:')
print(val_conf_matrix)
print('\nValidation Classification Report:')
print(val_classification_rep)

# Once the model is finalized after validation, evaluate it on the test set
y_test_pred = mlp_classifier.predict(X_test)

# Evaluate the model on the test set
test_accuracy = accuracy_score(y_test, y_test_pred)
test_conf_matrix = confusion_matrix(y_test, y_test_pred)
test_classification_rep = classification_report(y_test, y_test_pred)

# Output test set evaluation metrics
print(f'Test Accuracy: {test_accuracy*100:.2f}%')
print('\nTest Confusion Matrix:')
print(test_conf_matrix)
print('\nTest Classification Report:')
print(test_classification_rep)

# Load the unlabeled samples from the first sheet.
# NOTE(review): these rows come straight from Test_data.xlsx; no log or
# standardization step is applied in this cell, whereas the model was fitted
# on the file named "...Standarized..." — confirm both are on the same scale.
test_data1 = pd.read_excel('/Users/asia/Desktop/Sir Amar/Test_data.xlsx', sheet_name='Sample 1')

# Select the feature columns and fill NaN with the fitted imputer
# (imputation is the only transformation applied here)
X_test1 = test_data1[['Co', 'Ni', 'Cu', 'Zn', 'Se', 'Ag', 'Sb', 'Pb', 'Bi', 'As']]
X_test1 = imputer.transform(X_test1)

# Make predictions on the test data
y_test_pred1 = mlp_classifier.predict(X_test1)

# Print the predicted classes
print("Predicted classes for test data from Sheet1:")
print(y_test_pred1)

# Load the unlabeled samples from the second sheet (same caveat as above)
test_data2 = pd.read_excel('/Users/asia/Desktop/Sir Amar/Test_data.xlsx', sheet_name='Sample 2')

# Select the feature columns and fill NaN with the fitted imputer
X_test2 = test_data2[['Co', 'Ni', 'Cu', 'Zn', 'Se', 'Ag', 'Sb', 'Pb', 'Bi', 'As']]
X_test2 = imputer.transform(X_test2)

# Make predictions on the test data
y_test_pred2 = mlp_classifier.predict(X_test2)

# Print the predicted classes
print("Predicted classes for test data from Sheet2:")
print(y_test_pred2)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score
from sklearn.impute import SimpleImputer

# Load the training table.
# NOTE(review): the file name suggests this is the log-transformed,
# standardized output of the Preprocessing step — confirm the path.
df = pd.read_excel('/Users/asia/Desktop/Sir Amar/Pyrite_Standarized_data_file.xlsx')

# Features (X): the ten trace-element columns; target (y): the fluid-type label
X = df[['Co', 'Ni', 'Cu', 'Zn', 'Se', 'Ag', 'Sb', 'Pb', 'Bi', 'As']]
y = df['ore-forming fluids Type']

# Replace NaN feature values with the constant 0.08 (not the column mean).
# NOTE(review): the rationale for 0.08 is not documented — confirm.
imputer = SimpleImputer(strategy='constant', fill_value=0.08)

# Apply the imputer to the X DataFrame
X = imputer.fit_transform(X)

# Split the data into training (60%), validation (20%), and testing sets (20%).
# random_state is fixed so the split, and therefore every metric printed
# below, is reproducible on a fresh kernel.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Initialize the Gradient Boosting classifier
gb_classifier = GradientBoostingClassifier(learning_rate=0.1, max_depth=5, n_estimators=300, random_state=42)

# Train the classifier on the training data
gb_classifier.fit(X_train, y_train)

# Make predictions on the validation set
y_val_pred = gb_classifier.predict(X_val)

# Evaluate the model on the validation set
val_accuracy = accuracy_score(y_val, y_val_pred)
val_conf_matrix = confusion_matrix(y_val, y_val_pred)
val_classification_rep = classification_report(y_val, y_val_pred)

# Output validation set evaluation metrics
print(f'Validation Accuracy: {val_accuracy*100:.2f}%')
print('\nValidation Confusion Matrix:')
print(val_conf_matrix)
print('\nValidation Classification Report:')
print(val_classification_rep)

# Once the model is finalized after validation, evaluate it on the test set
y_test_pred = gb_classifier.predict(X_test)

# Evaluate the model on the test set
test_accuracy = accuracy_score(y_test, y_test_pred)
test_conf_matrix = confusion_matrix(y_test, y_test_pred)
test_classification_rep = classification_report(y_test, y_test_pred)

# Output test set evaluation metrics
print(f'Test Accuracy: {test_accuracy*100:.2f}%')
print('\nTest Confusion Matrix:')
print(test_conf_matrix)
print('\nTest Classification Report:')
print(test_classification_rep)

# Load the unlabeled samples from the first sheet.
# NOTE(review): these rows come straight from Test_data.xlsx; no log or
# standardization step is applied in this cell, whereas the model was fitted
# on the file named "...Standarized..." — confirm both are on the same scale.
test_data1 = pd.read_excel('/Users/asia/Desktop/Sir Amar/Test_data.xlsx', sheet_name='Sample 1')

# Select the feature columns and fill NaN with the fitted imputer
# (imputation is the only transformation applied here)
X_test1 = test_data1[['Co', 'Ni', 'Cu', 'Zn', 'Se', 'Ag', 'Sb', 'Pb', 'Bi', 'As']]
X_test1 = imputer.transform(X_test1)

# Make predictions on the test data
y_test_pred1 = gb_classifier.predict(X_test1)

# Print the predicted classes
print("Predicted classes for test data from Sheet1:")
print(y_test_pred1)

# Load the unlabeled samples from the second sheet (same caveat as above)
test_data2 = pd.read_excel('/Users/asia/Desktop/Sir Amar/Test_data.xlsx', sheet_name='Sample 2')

# Select the feature columns and fill NaN with the fitted imputer
X_test2 = test_data2[['Co', 'Ni', 'Cu', 'Zn', 'Se', 'Ag', 'Sb', 'Pb', 'Bi', 'As']]
X_test2 = imputer.transform(X_test2)

# Make predictions on the test data
y_test_pred2 = gb_classifier.predict(X_test2)

# Print the predicted classes
print("Predicted classes for test data from Sheet2:")
print(y_test_pred2)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score
from sklearn.impute import SimpleImputer

# Load the training table.
# NOTE(review): the file name suggests this is the log-transformed,
# standardized output of the Preprocessing step — confirm the path.
df = pd.read_excel('/Users/asia/Desktop/Sir Amar/Pyrite_Standarized_data_file.xlsx')

# Features (X): the ten trace-element columns; target (y): the fluid-type label
X = df[['Co', 'Ni', 'Cu', 'Zn', 'Se', 'Ag', 'Sb', 'Pb', 'Bi', 'As']]
y = df['ore-forming fluids Type']

# Replace NaN feature values with the constant 0.08 (not the column mean).
# NOTE(review): the rationale for 0.08 is not documented — confirm.
imputer = SimpleImputer(strategy='constant', fill_value=0.08)

# Apply the imputer to the X DataFrame
X = imputer.fit_transform(X)

# Split the data into training (60%), validation (20%), and testing sets (20%).
# random_state is fixed so the split, the cross-validation folds drawn from
# it, and every metric printed below are reproducible on a fresh kernel.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Initialize the Gradient Boosting classifier
gb_classifier = GradientBoostingClassifier(learning_rate=0.1, max_depth=5, n_estimators=300, random_state=42)

# Apply 15-fold cross-validation on the training portion only; this scores
# clones of the estimator and leaves gb_classifier itself unfitted.
scores = cross_val_score(gb_classifier, X_train, y_train, cv=15)

print("Cross-validation scores: ", scores)
print("Average cross-validation score: ", scores.mean())

# Train the classifier on the training data
gb_classifier.fit(X_train, y_train)

# Make predictions on the validation set
y_val_pred = gb_classifier.predict(X_val)

# Evaluate the model on the validation set
val_accuracy = accuracy_score(y_val, y_val_pred)
val_conf_matrix = confusion_matrix(y_val, y_val_pred)
val_classification_rep = classification_report(y_val, y_val_pred)

# Output validation set evaluation metrics
print(f'Validation Accuracy: {val_accuracy*100:.2f}%')
print('\nValidation Confusion Matrix:')
print(val_conf_matrix)
print('\nValidation Classification Report:')
print(val_classification_rep)

# Once the model is finalized after validation, evaluate it on the test set
y_test_pred = gb_classifier.predict(X_test)

# Evaluate the model on the test set
test_accuracy = accuracy_score(y_test, y_test_pred)
test_conf_matrix = confusion_matrix(y_test, y_test_pred)
test_classification_rep = classification_report(y_test, y_test_pred)

# Output test set evaluation metrics
print(f'Test Accuracy: {test_accuracy*100:.2f}%')
print('\nTest Confusion Matrix:')
print(test_conf_matrix)
print('\nTest Classification Report:')
print(test_classification_rep)

# Load the unlabeled samples from the first sheet.
# NOTE(review): these rows come straight from Test_data.xlsx; no log or
# standardization step is applied in this cell, whereas the model was fitted
# on the file named "...Standarized..." — confirm both are on the same scale.
test_data1 = pd.read_excel('/Users/asia/Desktop/Sir Amar/Test_data.xlsx', sheet_name='Sample 1')

# Select the feature columns and fill NaN with the fitted imputer
# (imputation is the only transformation applied here)
X_test1 = test_data1[['Co', 'Ni', 'Cu', 'Zn', 'Se', 'Ag', 'Sb', 'Pb', 'Bi', 'As']]
X_test1 = imputer.transform(X_test1)

# Make predictions on the test data
y_test_pred1 = gb_classifier.predict(X_test1)

# Print the predicted classes
print("Predicted classes for test data from Sheet1:")
print(y_test_pred1)

# Load the unlabeled samples from the second sheet (same caveat as above)
test_data2 = pd.read_excel('/Users/asia/Desktop/Sir Amar/Test_data.xlsx', sheet_name='Sample 2')

# Select the feature columns and fill NaN with the fitted imputer
X_test2 = test_data2[['Co', 'Ni', 'Cu', 'Zn', 'Se', 'Ag', 'Sb', 'Pb', 'Bi', 'As']]
X_test2 = imputer.transform(X_test2)

# Make predictions on the test data
y_test_pred2 = gb_classifier.predict(X_test2)

# Print the predicted classes
print("Predicted classes for test data from Sheet2:")
print(y_test_pred2)

# K fold Cross Validation

In [None]:
import pandas as pd
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.impute import SimpleImputer

# Load the data table.
# NOTE(review): this reads the "Final Version" workbook, not the file named
# "...Standarized..." used by the other model cells — confirm that training
# on this table is intended.
df = pd.read_excel('/home/alifian/Geology Data Analysis/Pyrite_-21-feb(Final Version).xlsx')

# Features (X), target (y) and the grouping column used to form the folds
X = df[['Co', 'Ni', 'Cu', 'Zn', 'Se', 'Ag', 'Sb', 'Pb', 'Bi', 'As']]
y = df['ore-forming fluids Type']
groups = df['Location']

# Replace NaN feature values with the mean of their column.
# NOTE(review): the imputer is fitted on all rows, so each held-out location
# contributes to the means used for its own training fold (mild leakage).
imputer = SimpleImputer(strategy='mean')

# Apply the imputer to the X DataFrame (the result is a NumPy array)
X = imputer.fit_transform(X)

# Initialize the Random Forest classifier with specified parameters
rf_classifier = RandomForestClassifier(n_estimators=400, max_depth=20, random_state=42)

# Initialize LeaveOneGroupOut cross-validator: each fold holds out every row
# of exactly one Location and trains on the rest
logo = LeaveOneGroupOut()

for train_index, test_index in logo.split(X, y, groups):
    # The splitter yields positional indices, so the pandas Series are
    # sliced with .iloc; plain [] would look the integers up as index labels
    # and only works while the index is the default 0..n-1 range.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    location_test = groups.iloc[test_index]

    # Train the classifier on the training data (fit() discards any earlier fit)
    rf_classifier.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = rf_classifier.predict(X_test)

    # Evaluate the model on the test set
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred)

    # Output evaluation metrics for each location; every row of the fold has
    # the same Location, so the first one names the fold
    print(f'Test Location: {location_test.iloc[0]} Accuracy: {accuracy*100:.2f}%')
    print('\nLocation Confusion Matrix:')
    print(conf_matrix)
    print('\nLocation Classification Report:')
    print(classification_rep)
    print('------------------------')

In [None]:
import pandas as pd
from sklearn.model_selection import KFold
import pandas as pd
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.impute import SimpleImputer

# Leave-one-location-out evaluation of an RBF-kernel SVM classifier.
# Each pass holds out every sample of one 'Location', trains on the remaining
# locations, and prints accuracy, confusion matrix and classification report
# for the held-out location.

# Load your data from Excel
# NOTE(review): this loads 'Pyrite_-21-feb(Final Version).xlsx', not the
# standardized file written by the preprocessing cell; an RBF SVM is sensitive
# to feature scale - confirm this workbook is the intended input.
df = pd.read_excel('/home/alifian/Geology Data Analysis/Pyrite_-21-feb(Final Version).xlsx')

# Identify features (X), the target variable (y) and the grouping variable
X = df[['Co', 'Ni', 'Cu', 'Zn', 'Se', 'Ag', 'Sb', 'Pb', 'Bi', 'As']]
y = df['ore-forming fluids Type']
groups = df['Location']

# Imputer that replaces NaN values with the column mean. It is fitted inside
# the loop, on the training rows only, so that statistics of the held-out
# location never leak into the training data.
imputer = SimpleImputer(strategy='mean')

# Initialize the SVM classifier with specified parameters.
# probability=True is kept from the original configuration; this cell itself
# only calls predict().
svm_classifier = SVC(C=100, kernel='rbf', gamma=0.1, probability=True)

# Initialize LeaveOneGroupOut cross-validator
logo = LeaveOneGroupOut()

for train_index, test_index in logo.split(X, y, groups):
    # split() yields positional indices, so rows are selected with .iloc
    # (plain [] on a Series is label-based and only matches positions when the
    # index is the default RangeIndex).
    X_train = imputer.fit_transform(X.iloc[train_index])
    X_test = imputer.transform(X.iloc[test_index])
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    location_test = groups.iloc[test_index]

    # Train the classifier on the training data (fit() refits from scratch)
    svm_classifier.fit(X_train, y_train)

    # Make predictions on the held-out location
    y_pred = svm_classifier.predict(X_test)

    # Evaluate the model on the held-out location.
    # zero_division=0 reports 0 for classes with no predicted/true samples in
    # this fold, without emitting an UndefinedMetricWarning for every fold.
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred, zero_division=0)

    # Output evaluation metrics for each location
    print(f'Test Location: {location_test.iloc[0]}')
    print(f'Accuracy: {accuracy*100:.2f}%')
    print('\nConfusion Matrix:')
    print(conf_matrix)
    print('\nClassification Report:')
    print(classification_rep)
    print('------------------------')


In [None]:
import pandas as pd
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.impute import SimpleImputer

# Leave-one-location-out evaluation of an MLP classifier.
# Each pass holds out every sample of one 'Location', trains on the remaining
# locations, and prints accuracy, confusion matrix and classification report
# for the held-out location.

# Load your data from Excel
# NOTE(review): this loads 'Pyrite_-21-feb(Final Version).xlsx', not the
# standardized file written by the preprocessing cell; MLP training is
# sensitive to feature scale - confirm this workbook is the intended input.
df = pd.read_excel('/home/alifian/Geology Data Analysis/Pyrite_-21-feb(Final Version).xlsx')

# Identify features (X), the target variable (y) and the grouping variable
X = df[['Co', 'Ni', 'Cu', 'Zn', 'Se', 'Ag', 'Sb', 'Pb', 'Bi', 'As']]
y = df['ore-forming fluids Type']
groups = df['Location']

# Imputer that replaces NaN values with the column mean. It is fitted inside
# the loop, on the training rows only, so that statistics of the held-out
# location never leak into the training data.
imputer = SimpleImputer(strategy='mean')

# Initialize the MLP classifier with specified parameters.
# random_state is fixed so weight initialization and batch shuffling are
# reproducible between runs (the Random Forest cell is seeded the same way).
mlp_classifier = MLPClassifier(
    hidden_layer_sizes=(50, 100, 50),
    max_iter=300,
    activation='tanh',
    solver='adam',
    alpha=0.0001,
    learning_rate='constant',
    random_state=42,
)

# Initialize LeaveOneGroupOut cross-validator
logo = LeaveOneGroupOut()

for train_index, test_index in logo.split(X, y, groups):
    # split() yields positional indices, so rows are selected with .iloc
    # (plain [] on a Series is label-based and only matches positions when the
    # index is the default RangeIndex).
    X_train = imputer.fit_transform(X.iloc[train_index])
    X_test = imputer.transform(X.iloc[test_index])
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    location_test = groups.iloc[test_index]

    # Train the classifier on the training data (fit() refits from scratch
    # because warm_start is left at its default)
    mlp_classifier.fit(X_train, y_train)

    # Make predictions on the held-out location
    y_pred = mlp_classifier.predict(X_test)

    # Evaluate the model on the held-out location.
    # zero_division=0 reports 0 for classes with no predicted/true samples in
    # this fold, without emitting an UndefinedMetricWarning for every fold.
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred, zero_division=0)

    # Output evaluation metrics for each location
    print(f'Test Location: {location_test.iloc[0]} Accuracy: {accuracy*100:.2f}%')
    print('\nLocation Confusion Matrix:')
    print(conf_matrix)
    print('\nLocation Classification Report:')
    print(classification_rep)
    print('------------------------')