In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, Conv1D, Flatten, Dropout, BatchNormalization, MaxPooling1D
from tensorflow.keras.utils import to_categorical

In [3]:
train_data = pd.read_csv("Training.csv").drop(columns=["Unnamed: 133"], errors="ignore")
test_data = pd.read_csv("Testing.csv").dropna(axis=1)

In [4]:
# Feature selection
X = train_data.drop(columns=["prognosis"])
y = train_data["prognosis"]  # Convert y to a 1D array

# Test dataset
P = test_data.drop(columns=["prognosis"])

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=24, stratify=y)

print(f"Train: {X_train.shape}, {y_train.shape}")
print(f"Test: {X_test.shape}, {y_test.shape}")

Train: (3936, 132), (3936,)
Test: (984, 132), (984,)


## KNN

In [5]:
# Initialize kNN classifier
knn = KNeighborsClassifier(n_neighbors=10)

# Train the model
knn.fit(X_train, y_train)

# Predictions
training_knn = knn.predict(X_train)
testing_knn = knn.predict(X_test)

# Evaluate on training set
print("Metrics on ---Training Set ----")
print(f'Accuracy: {accuracy_score(y_train, training_knn):.2f}')
print(f'Precision: {precision_score(y_train, training_knn, average="weighted", zero_division=1):.2f}')
print(f'Recall: {recall_score(y_train, training_knn, average="weighted", zero_division=1):.2f}')
#print("\nClassification Report:\n", classification_report(y_train, training_knn))

# Evaluate on test set
print("\nMetrics on ---Test Set ----")
print(f'Accuracy: {accuracy_score(y_test, testing_knn):.2f}')
print(f'Precision: {precision_score(y_test, testing_knn, average="weighted", zero_division=1):.2f}')
print(f'Recall: {recall_score(y_test, testing_knn, average="weighted", zero_division=1):.2f}')
#print("\nClassification Report:\n", classification_report(y_test, testing_knn))

# cross-validation
cv_scores = cross_val_score(knn, X, y, cv=5, scoring='accuracy')
print(f"\nCross-validation accuracy: {cv_scores.mean():.2f} and {cv_scores.std():.2f}")

Metrics on ---Training Set ----
Accuracy: 1.00
Precision: 1.00
Recall: 1.00

Metrics on ---Test Set ----
Accuracy: 1.00
Precision: 1.00
Recall: 1.00

Cross-validation accuracy: 1.00 and 0.00


## Decision Tree

In [6]:
# ______ Εκπαίδευση Decision Tree _____
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train, y_train)

# Predictions
training_dc = decision_tree.predict(X_train)
testing_dc = decision_tree.predict(X_test)

# Evaluate on training set
print("Metrics on ---Training Set ----")
print(f'Accuracy: {accuracy_score(y_train, training_dc):.2f}')
print(f'Precision: {precision_score(y_train, training_dc, average="weighted", zero_division=1):.2f}')
print(f'Recall: {recall_score(y_train, training_dc, average="weighted", zero_division=1):.2f}')
#print("\nClassification Report:\n", classification_report(y_train, training_dc))

# Evaluate on test set
print("\nMetrics on ---Test Set ----")
print(f'Accuracy: {accuracy_score(y_test, testing_dc):.2f}')
print(f'Precision: {precision_score(y_test, testing_dc, average="weighted", zero_division=1):.2f}')
print(f'Recall: {recall_score(y_test, testing_dc, average="weighted", zero_division=1):.2f}')
#print("\nClassification Report:\n", classification_report(y_test, testing_dc))

# cross-validation
cv_scores = cross_val_score(knn, X, y, cv=5, scoring='accuracy')
print(f"\nCross-validation accuracy: {cv_scores.mean():.2f} and {cv_scores.std():.2f}")

Metrics on ---Training Set ----
Accuracy: 1.00
Precision: 1.00
Recall: 1.00

Metrics on ---Test Set ----
Accuracy: 1.00
Precision: 1.00
Recall: 1.00

Cross-validation accuracy: 1.00 and 0.00


## Random Forest 

In [7]:
# _____ Εκπαίδευση Random Forest _____
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest.fit(X_train, y_train)

# Predictions
training_rf = random_forest.predict(X_train)
testing_rf = random_forest.predict(X_test)

# Evaluate on training set
print("Metrics on ---Training Set ----")
print(f'Accuracy: {accuracy_score(y_train, training_rf):.2f}')
print(f'Precision: {precision_score(y_train, training_rf, average="weighted", zero_division=1):.2f}')
print(f'Recall: {recall_score(y_train, training_rf, average="weighted", zero_division=1):.2f}')
#print("\nClassification Report:\n", classification_report(y_train, training_rf))

# Evaluate on test set
print("\nMetrics on ---Test Set ----")
print(f'Accuracy: {accuracy_score(y_test, testing_rf):.2f}')
print(f'Precision: {precision_score(y_test, testing_rf, average="weighted", zero_division=1):.2f}')
print(f'Recall: {recall_score(y_test, testing_rf, average="weighted", zero_division=1):.2f}')
#print("\nClassification Report:\n", classification_report(y_test, testing_rf))

# cross-validation
cv_scores = cross_val_score(knn, X, y, cv=5, scoring='accuracy')
print(f"\nCross-validation accuracy: {cv_scores.mean():.2f} and {cv_scores.std():.2f}")

Metrics on ---Training Set ----
Accuracy: 1.00
Precision: 1.00
Recall: 1.00

Metrics on ---Test Set ----
Accuracy: 1.00
Precision: 1.00
Recall: 1.00

Cross-validation accuracy: 1.00 and 0.00


## SVM

In [8]:
# Support Vector Machine 
# Εκπαίδευση SVM
svm = SVC(random_state=42)
svm.fit(X_train, y_train)

# Predictions
training_svm = svm.predict(X_train)
testing_svm = svm.predict(X_test)

# Evaluate on training set
print("Metrics on ---Training Set ----")
print(f'Accuracy: {accuracy_score(y_train, training_svm):.2f}')
print(f'Precision: {precision_score(y_train, training_svm, average="weighted", zero_division=1):.2f}')
print(f'Recall: {recall_score(y_train, training_svm, average="weighted", zero_division=1):.2f}')
#print("\nClassification Report:\n", classification_report(y_train, training_svm))

# Evaluate on test set
print("\nMetrics on ---Test Set ----")
print(f'Accuracy: {accuracy_score(y_test, testing_svm):.2f}')
print(f'Precision: {precision_score(y_test, testing_svm, average="weighted", zero_division=1):.2f}')
print(f'Recall: {recall_score(y_test, testing_svm, average="weighted", zero_division=1):.2f}')
#print("\nClassification Report:\n", classification_report(y_test, testing_svm))

# cross-validation
cv_scores = cross_val_score(knn, X, y, cv=5, scoring='accuracy')
print(f"\nCross-validation accuracy: {cv_scores.mean():.2f} and {cv_scores.std():.2f}")

Metrics on ---Training Set ----
Accuracy: 1.00
Precision: 1.00
Recall: 1.00

Metrics on ---Test Set ----
Accuracy: 1.00
Precision: 1.00
Recall: 1.00

Cross-validation accuracy: 1.00 and 0.00


## MLP

In [9]:
# _____ MLP _____
mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=42)

mlp.fit(X_train, y_train)

# Predictions
training_mlp = mlp.predict(X_train)
testing_mlp = mlp.predict(X_test)

# Evaluate on training set
print("Metrics on ---Training Set ----")
print(f'Accuracy: {accuracy_score(y_train, training_mlp):.2f}')
print(f'Precision: {precision_score(y_train, training_mlp, average="weighted", zero_division=1):.2f}')
print(f'Recall: {recall_score(y_train, training_mlp, average="weighted", zero_division=1):.2f}')
#print("\nClassification Report:\n", classification_report(y_train, training_mlp))

# Evaluate on test set
print("\nMetrics on ---Test Set ----")
print(f'Accuracy: {accuracy_score(y_test, testing_mlp):.2f}')
print(f'Precision: {precision_score(y_test, testing_mlp, average="weighted", zero_division=1):.2f}')
print(f'Recall: {recall_score(y_test, testing_mlp, average="weighted", zero_division=1):.2f}')
#print("\nClassification Report:\n", classification_report(y_test, testing_mlp))

# cross-validation
cv_scores = cross_val_score(knn, X, y, cv=5, scoring='accuracy')
print(f"\nCross-validation accuracy: {cv_scores.mean():.2f} and {cv_scores.std():.2f}")

Metrics on ---Training Set ----
Accuracy: 1.00
Precision: 1.00
Recall: 1.00

Metrics on ---Test Set ----
Accuracy: 1.00
Precision: 1.00
Recall: 1.00

Cross-validation accuracy: 1.00 and 0.00


## CNN

In [18]:
# Features and labels
X = train_data.drop(columns=["prognosis"]).values  # One-hot encoded features
y = train_data["prognosis"].values  # Labels

# Encode labels as numbers
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
y = to_categorical(y)  # Convert labels to one-hot encoding

# Reshape input for CNN (CNN expects 3D input: samples, time steps, features)
X = X.reshape(X.shape[0], X.shape[1], 1)  # No scaling needed

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=24, stratify=y)

# CNN Model
cnn = Sequential([
    Conv1D(64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)),
    BatchNormalization(),
    MaxPooling1D(pool_size=2),
    
    Conv1D(128, kernel_size=3, activation='relu'),
    BatchNormalization(),
    MaxPooling1D(pool_size=2),
    
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(y_train.shape[1], activation='softmax')  # Output layer
])

# Compile the model
cnn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = cnn.fit(X_train, y_train, epochs=50, batch_size=16, validation_data=(X_test, y_test), verbose=1)

# Evaluate the model
test_loss, test_acc = cnn.evaluate(X_test, y_test)
print(f"\nTest Accuracy: {test_acc:.2f}")

# Predictions
y_pred = cnn.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)
y_true_classes = np.argmax(y_test, axis=1)

# Convert predictions back to original labels
y_pred_labels = label_encoder.inverse_transform(y_pred_classes)
y_true_labels = label_encoder.inverse_transform(y_true_classes)

# Classification Report
#print("\nClassification Report:\n", classification_report(y_true_labels, y_pred_labels))


Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 21ms/step - accuracy: 0.7962 - loss: 0.8901 - val_accuracy: 0.4370 - val_loss: 2.1255
Epoch 2/50
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 19ms/step - accuracy: 0.9941 - loss: 0.0233 - val_accuracy: 0.9990 - val_loss: 0.0158
Epoch 3/50
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 19ms/step - accuracy: 0.9928 - loss: 0.0257 - val_accuracy: 1.0000 - val_loss: 5.4693e-05
Epoch 4/50
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 19ms/step - accuracy: 0.9902 - loss: 0.0283 - val_accuracy: 1.0000 - val_loss: 2.2080e-06
Epoch 5/50
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 20ms/step - accuracy: 0.9976 - loss: 0.0136 - val_accuracy: 1.0000 - val_loss: 4.5428e-06
Epoch 6/50
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 21ms/step - accuracy: 0.9969 - loss: 0.0114 - val_accuracy: 1.0000 - val_loss: 2.9583e-06
Epoch 7/50
[1m24

## Testing models

In [13]:
print("Precision:", precision_score(test_data["prognosis"], knn.predict(P), average='weighted', zero_division=1))

Precision: 1.0


In [14]:
print("Precision:", precision_score(test_data["prognosis"], decision_tree.predict(P), average='weighted', zero_division=1))

Precision: 0.9880952380952381


In [15]:
print("Precision:", precision_score(test_data["prognosis"], random_forest.predict(P), average='weighted', zero_division=1))

Precision: 0.9880952380952381


In [16]:
print("Precision:", precision_score(test_data["prognosis"], svm.predict(P), average='weighted', zero_division=1))

Precision: 1.0


In [17]:
print("Precision:", precision_score(test_data["prognosis"], mlp.predict(P), average='weighted', zero_division=1))

Precision: 0.9880952380952381


In [21]:
# Predictions on test dataset
P = test_data.drop(columns=["prognosis"]).values  # Extract test features
P = np.expand_dims(P, axis=-1)  # Reshape for CNN

# Predict probabilities
y_pred_cnn = cnn.predict(P)

# Convert probabilities to class labels
y_pred_labels = label_encoder.inverse_transform(np.argmax(y_pred_cnn, axis=1))

# Calculate precision
print("Precision:", precision_score(test_data["prognosis"], y_pred_labels, average='weighted', zero_division=1))

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
Precision: 0.9880952380952381


# Save Pre-trained model

In [22]:
import joblib

# Αποθήκευση του εκπαιδευμένου MLP μοντέλου
joblib.dump(knn, 'knn.joblib')
print("Το μοντέλο αποθηκεύτηκε επιτυχώς")

# Φόρτωση του αποθηκευμένου μοντέλου
knn_loaded = joblib.load('knn.joblib')
print("Το μοντέλο φορτώθηκε επιτυχώς")

Το μοντέλο αποθηκεύτηκε επιτυχώς
Το μοντέλο φορτώθηκε επιτυχώς
