In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import joblib

from sklearn.neighbors import KNeighborsClassifier

from sklearn.neural_network import MLPClassifier 

import pandas as pd

In [2]:
# Read the CSV file into a DataFrame
file_path = 'features_extracted_V2.csv'  # Replace with your file path
df = pd.read_csv(file_path)

# Separating out a sample for prediction
predict_rows = df.sample(60, random_state=13)
df = df.drop(predict_rows.index)

# Separating features and label
df_Y = df['label']
df_X = df.drop('label', axis=1)

# Preparing the training data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(df_X)
df_X_scaled = scaler.transform(df_X)
joblib.dump(scaler, 'scaler_model.joblib')

# Split the dataset into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_X_scaled, df_Y, test_size=0.20, random_state=42)

# Transform your predict_rows for prediction

predict_features = predict_rows.drop('label', axis=1)
predict_labels = predict_rows['label']

# Scale the features of the 15 rows
predict_features_scaled = scaler.transform(predict_features)


In [3]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, roc_auc_score, confusion_matrix

best_precision_accuracy = 0
best_config = None
best_model = None

base_learners_set1 = [('rf', RandomForestClassifier(criterion='entropy', max_features='sqrt', min_samples_leaf=1, min_samples_split=3, n_estimators=100)), 
                      ('mlp', MLPClassifier(max_iter=500, activation='relu', alpha=0.001, hidden_layer_sizes=(20,), learning_rate='adaptive', solver='adam')),
                      ('knn', KNeighborsClassifier(algorithm='auto', leaf_size=15, n_neighbors=20, p=1, weights='distance'))]

base_learners_set2 = [('rf', RandomForestClassifier(criterion='entropy', max_features='sqrt', min_samples_leaf=1, min_samples_split=3, n_estimators=100)), 
                      ('mlp', MLPClassifier(max_iter=500, activation='relu', alpha=0.001, hidden_layer_sizes=(20,), learning_rate='adaptive', solver='adam')), 
                      ('svm', SVC(C=10, kernel='rbf', tol=0.001))]

base_learners_set3 = [('rf', RandomForestClassifier(criterion='entropy', max_features='sqrt', min_samples_leaf=1, min_samples_split=3, n_estimators=100)),
                      ('knn', KNeighborsClassifier(algorithm='auto', leaf_size=15, n_neighbors=20, p=1, weights='distance')), 
                      ('svm', SVC(C=10, kernel='rbf', tol=0.001))]

base_learners_set4 = [('mlp', MLPClassifier(max_iter=500, activation='relu', alpha=0.001, hidden_layer_sizes=(20,), learning_rate='adaptive', solver='adam')),
                      ('knn', KNeighborsClassifier(algorithm='auto', leaf_size=15, n_neighbors=20, p=1, weights='distance')), 
                      ('svm', SVC(C=10, kernel='rbf', tol=0.001))]

base_learners = [base_learners_set1, base_learners_set2, base_learners_set3, base_learners_set4]

for idx, base_learner_group in enumerate(base_learners):
    meta_learner = LogisticRegression()
    clf = StackingClassifier(estimators=base_learner_group, final_estimator=meta_learner)

    # Train the model on the full training data
    clf.fit(X_train, y_train)

    # Predict on the test set and evaluate
    test_predictions = clf.predict(X_test)
    precision = precision_score(y_test, test_predictions) * 100
    accuracy = accuracy_score(y_test, test_predictions) * 100
    combined_metric = (precision + accuracy) / 2  # You can adjust the combination based on your preference

    print(f'Model Configuration {idx + 1} - Test Set Evaluation:')
    print('Accuracy:', accuracy)
    print('Precision:', precision)
    print('F1 Score:', f1_score(y_test, test_predictions) * 100)
    print('Recall:', recall_score(y_test, test_predictions) * 100)
    print('ROC AUC:', roc_auc_score(y_test, test_predictions) * 100)
    print('Confusion Matrix:', confusion_matrix(y_test, test_predictions))
    print('Combined Metric:', combined_metric)
    print('-----------------------------------------\n')

    # Predict on the 15 rows and compare with actual labels
    predict_predictions = clf.predict(predict_features_scaled)
    print('Prediction on 15 Rows:')
    print('Predicted Labels:', predict_predictions)
    print('Actual Labels:', predict_labels.values)
    print('-----------------------------------------\n')

    # Save the best configuration and model based on the combined metric
    if combined_metric > best_precision_accuracy:
        best_precision_accuracy = combined_metric
        best_config = base_learner_group
        best_model = clf

# Now, best_config contains the base learners configuration, and best_model contains the best StackingClassifier.
print(f'The best model configuration is {best_config} with a combined metric of {best_precision_accuracy:.2f}')

# Save the best model to a file (you can choose the serialization method based on your preferences)

joblib.dump(best_model, 'best_model.pkl')





Model Configuration 1 - Test Set Evaluation:
Accuracy: 97.76384535005225
Precision: 97.16070414537195
F1 Score: 96.96797959761973
Recall: 96.77601809954751
ROC AUC: 97.55937133018476
Confusion Matrix: [[2967   50]
 [  57 1711]]
Combined Metric: 97.4622747477121
-----------------------------------------

Prediction on 15 Rows:
Predicted Labels: [0 1 0 0 0 1 0 1 1 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0
 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
Actual Labels: [0 1 0 0 0 1 0 1 1 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0
 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0]
-----------------------------------------

Model Configuration 2 - Test Set Evaluation:
Accuracy: 97.76384535005225
Precision: 97.21432632177374
F1 Score: 96.96626027785653
Recall: 96.71945701357465
ROC AUC: 97.54766354159011
Confusion Matrix: [[2968   49]
 [  58 1710]]
Combined Metric: 97.48908583591299
-----------------------------------------

Prediction on 15 Rows:
Predicted Labels: [0 

['best_model.pkl']