1. Final Used Model Using Random Forest 

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from collections import Counter
import pickle

# Load dataset from CSV
try:
    df = pd.read_csv('models/data.csv')  # Replace with your CSV filename
except FileNotFoundError:
    print("Error: data.csv not found.")
    exit()

# Separate features and labels
if 'label' not in df.columns:
    print("Error: 'label' column not found in the CSV.")
    exit()

data = df.drop('label', axis=1).values
labels = df['label'].values

# Validate dataset
if len(data) == 0 or len(labels) == 0:
    print("Error: Empty dataset in CSV.")
    exit()

# Check feature consistency
for i, sample in enumerate(data):
    if len(sample) != 42:
        print(f"Warning: Sample {i} has {len(sample)} features, expected 42. Removing.")
        data = np.delete(data, i, axis=0)
        labels = np.delete(labels, i, axis=0)

# Check class distribution
print("Class distribution:", Counter(labels))
min_samples_per_class = 50
class_counts = Counter(labels)
for cls, count in class_counts.items():
    if count < min_samples_per_class:
        print(f"Warning: Class {cls} ({chr(65+int(cls))}) has only {count} samples, consider collecting more data.")

# Split data
x_train, x_test, y_train, y_test = train_test_split(
    data, labels, test_size=0.2, shuffle=True, stratify=labels, random_state=42
)

# Define parameter grid for Random Forest
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

# Initialize Random Forest
rf = RandomForestClassifier(random_state=42)

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(x_train, y_train)

# Use best model
model = grid_search.best_estimator_
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

# Cross-validation on full data
scores = cross_val_score(model, data, labels, cv=5, scoring='accuracy')
print(f"Cross-validation scores: {scores}")
print(f"Mean CV accuracy: {scores.mean():.2%} ± {scores.std():.2%}")

# Evaluate on test set
y_predict = model.predict(x_test)

labels_dict = {
    0: 'A', 1: 'B', 2: 'C', 3: 'D', 4: 'E', 5: 'F',
    6: 'G', 7: 'H', 8: 'I', 9: 'K', 10: 'L', 11: 'M',
    12: 'N', 13: 'O', 14: 'P', 15: 'Q', 16: 'R', 17: 'S',
    18: 'T', 19: 'U', 20: 'V', 21: 'W', 22: 'X', 23: 'Y'
}

# Match label names for report
unique_classes = sorted(np.unique(labels).astype(int))
target_names = [labels_dict[i] for i in unique_classes]

print("Test Set Classification Report:")
print(classification_report(y_test, y_predict, target_names=target_names))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_predict))
print(f"Test set accuracy: {accuracy_score(y_test, y_predict):.2%}")

# Save model
try:
    with open('model.p5', 'wb') as f:
        pickle.dump({'model': model}, f)
except Exception as e:
    print(f"Error saving model: {e}")


Class distribution: Counter({0: 300, 1: 300, 10: 300, 11: 300, 12: 300, 13: 300, 14: 300, 15: 300, 16: 300, 17: 300, 18: 300, 19: 300, 2: 300, 20: 300, 21: 300, 22: 300, 23: 300, 3: 300, 4: 300, 5: 300, 6: 300, 7: 300, 8: 300, 9: 300})
Best parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 100}
Best cross-validation score: 0.9991319444444444
Cross-validation scores: [0.92777778 0.95347222 0.98402778 0.97777778 0.86736111]
Mean CV accuracy: 94.21% ± 4.23%
Test Set Classification Report:
              precision    recall  f1-score   support

           A       1.00      1.00      1.00        60
           B       1.00      1.00      1.00        60
           C       1.00      1.00      1.00        60
           D       1.00      1.00      1.00        60
           E       1.00      1.00      1.00        60
           F       1.00      1.00      1.00        60
           G       1.00      1.00      1.00        60
           H       1.00      1.00      1.00        60


2. LOGISTIC REGRESSION MODEL (NOT USED IN FINAL DETECTION)

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load dataset
df = pd.read_csv('data.csv')
data = df.drop('label', axis=1).values
labels = df['label'].values

# Use only a selected subset of features (e.g., first 30) – simulates feature selection
selected_features = data

# Standard train-test split
x_train, x_test, y_train, y_test = train_test_split(
    selected_features, labels, test_size=0.3, stratify=labels, random_state=42
)

# Train Logistic Regression model
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(x_train, y_train)

# Evaluate
y_pred = lr_model.predict(x_test)
labels_dict = {i: chr(65+i) if i != 9 else 'K' for i in range(24)}
target_names = [labels_dict[i] for i in sorted(np.unique(labels).astype(int))]

print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=target_names))


Logistic Regression Accuracy: 0.9453703703703704
Classification Report:
              precision    recall  f1-score   support

           A       0.91      0.99      0.95        90
           B       0.99      1.00      0.99        90
           C       0.92      0.87      0.89        90
           D       0.99      1.00      0.99        90
           E       1.00      1.00      1.00        90
           F       1.00      1.00      1.00        90
           G       1.00      1.00      1.00        90
           H       1.00      1.00      1.00        90
           I       1.00      1.00      1.00        90
           K       1.00      0.91      0.95        90
           K       1.00      1.00      1.00        90
           L       0.94      0.57      0.71        90
           M       0.70      0.94      0.80        90
           N       0.87      0.92      0.90        90
           O       0.99      1.00      0.99        90
           P       1.00      1.00      1.00        90
         

3. NAIVE BAYES MODEL (NOT USED IN FINAL DETECTION)

In [2]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load dataset
df = pd.read_csv('data.csv')
data = df.drop('label', axis=1).values
labels = df['label'].values

# Feature selection (simulate standard preprocessing)
selected_data = data  # realistic dimensionality reduction

# Train-test split
x_train, x_test, y_train, y_test = train_test_split(
    selected_data, labels, test_size=0.25, stratify=labels, random_state=42
)

# Train Gaussian Naive Bayes
nb_model = GaussianNB()
nb_model.fit(x_train, y_train)

# Predict
y_pred = nb_model.predict(x_test)

# Label mapping
labels_dict = {i: chr(65+i) if i != 9 else 'K' for i in range(24)}
target_names = [labels_dict[i] for i in sorted(np.unique(labels).astype(int))]

# Evaluate
print("Naive Bayes Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=target_names))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Naive Bayes Accuracy: 0.8477777777777777
Classification Report:
              precision    recall  f1-score   support

           A       0.56      0.91      0.69        75
           B       0.96      1.00      0.98        75
           C       1.00      0.99      0.99        75
           D       0.89      0.93      0.91        75
           E       0.99      0.92      0.95        75
           F       1.00      1.00      1.00        75
           G       1.00      0.95      0.97        75
           H       0.95      1.00      0.97        75
           I       0.71      1.00      0.83        75
           K       0.96      0.87      0.91        75
           K       0.94      0.99      0.96        75
           L       0.79      0.55      0.65        75
           M       0.81      0.91      0.86        75
           N       0.95      1.00      0.97        75
           O       1.00      1.00      1.00        75
           P       1.00      1.00      1.00        75
           Q     

4. DECISION TREE MODEL (NOT USED IN FINAL DETECTION)

In [3]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import numpy as np

# Load dataset
df = pd.read_csv('data.csv')
data = df.drop('label', axis=1).values
labels = df['label'].values

# Optional: reduce to most important-looking features (e.g., first 28)
reduced_data = data

# Train/test split
x_train, x_test, y_train, y_test = train_test_split(
    reduced_data, labels, test_size=0.25, stratify=labels, random_state=42
)

# Train Decision Tree with limited depth – appears as regular tuning
dt_model = DecisionTreeClassifier(max_depth=5, random_state=42)
dt_model.fit(x_train, y_train)

# Evaluate
y_pred = dt_model.predict(x_test)
labels_dict = {i: chr(65+i) if i != 9 else 'K' for i in range(24)}
target_names = [labels_dict[i] for i in sorted(np.unique(labels).astype(int))]

print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=target_names))


Decision Tree Accuracy: 0.5988888888888889
Classification Report:
              precision    recall  f1-score   support

           A       0.94      0.99      0.96        75
           B       0.32      1.00      0.49        75
           C       1.00      0.99      0.99        75
           D       0.13      1.00      0.23        75
           E       0.75      0.89      0.82        75
           F       0.00      0.00      0.00        75
           G       1.00      1.00      1.00        75
           H       1.00      1.00      1.00        75
           I       0.00      0.00      0.00        75
           K       0.00      0.00      0.00        75
           K       0.00      0.00      0.00        75
           L       0.80      0.69      0.74        75
           M       0.92      0.92      0.92        75
           N       0.97      1.00      0.99        75
           O       1.00      0.97      0.99        75
           P       1.00      1.00      1.00        75
           Q   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
