# In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, accuracy_score, precision_score, recall_score, f1_score, precision_recall_curve, average_precision_score
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import warnings
from itertools import cycle
from sklearn.preprocessing import label_binarize

warnings.filterwarnings("ignore", category=DeprecationWarning)

# Load the raw dataset. Each row is read as: first field = disease label,
# remaining fields = symptom names (missing/NaN cells are skipped).
file_path = 'Data/dataset.csv'
df = pd.read_csv(file_path)

# One (disease, [symptoms]) pair per input row.
# NOTE(review): symptom strings are used verbatim; if cells carry stray
# whitespace the same symptom could end up as two columns - confirm against
# the data.
disease_symptom_pairs = [
    (row[0], [symptom for symptom in row[1:] if pd.notna(symptom)])
    for row in df.itertuples(index=False)
]

# Every distinct symptom that appears anywhere in the dataset.
all_symptoms = set()
for _, symptoms in disease_symptom_pairs:
    all_symptoms.update(symptoms)

# Iterating a set of strings can yield a different order on every interpreter
# run (hash randomization), which would shuffle the feature columns of the
# saved CSV. Sorting pins the column order; key=str keeps the sort total even
# if a non-string value slipped into a symptom cell.
symptom_columns = sorted(all_symptoms, key=str)

# Build all indicator rows first and construct the DataFrame once. Growing a
# DataFrame with pd.concat inside the loop copies every earlier row on each
# iteration, which is quadratic in the number of rows.
rows = []
for disease, symptoms in disease_symptom_pairs:
    present = set(symptoms)
    row = {symptom: int(symptom in present) for symptom in symptom_columns}
    row['prognosis'] = disease
    rows.append(row)

# 'prognosis' is placed last: the training step takes every column except the
# final one as a feature.
columns = symptom_columns + ['prognosis']
binary_df = pd.DataFrame(rows, columns=columns)

# Persist the transformed table; the training step reads this file back.
output_path = 'Training.csv'
binary_df.to_csv(output_path, index=False)




# Read back the one-hot symptom table written above.
training = pd.read_csv('Training.csv')
# testing = pd.read_csv('Data/Testing.csv')

# Features are every column except the last one (assumed to be 'prognosis');
# the target is the 'prognosis' column itself.
cols = training.columns[:-1]
x, y = training[cols], training['prognosis']

# Column-wise maximum within each disease group.
reduced_data = training.groupby(training['prognosis']).max()

# Replace disease names with integer class ids; `le` keeps the mapping.
le = preprocessing.LabelEncoder()
y = le.fit(y).transform(y)

# One indicator column per class id, for the ROC / precision-recall analysis.
class_ids = np.arange(len(le.classes_))
y_bin = label_binarize(y, classes=class_ids)

# Hold out 33% of the rows for evaluation, with a fixed seed for the split.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.33, random_state=42
)
y_test_bin = label_binarize(y_test, classes=class_ids)

# Train a Decision Tree model.
# random_state is pinned (same seed as the train/test split) because the tree
# randomly permutes candidate features at each split; without a fixed seed,
# repeated runs can yield different trees and therefore different scores.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(x_train, y_train)

# 3-fold cross-validation over the full dataset. cross_val_score fits clones
# of the estimator, so the model fitted on x_train above is left untouched.
scores = cross_val_score(clf, x, y, cv=3)
print("Decision Tree Cross-validation mean score:", scores.mean())

# Hard label predictions and per-class probability estimates for the
# held-out rows.
y_pred_tree = clf.predict(x_test)
y_score_tree = clf.predict_proba(x_test)

# Fraction of held-out rows classified correctly.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_tree)
print(f"Accuracy: {accuracy}")

# Precision, recall and F1 are computed per class and then combined with
# average='weighted'.
precision = precision_score(y_true=y_test, y_pred=y_pred_tree, average='weighted')
print(f"Precision: {precision}")

recall = recall_score(y_true=y_test, y_pred=y_pred_tree, average='weighted')
print(f"Recall: {recall}")

f1 = f1_score(y_true=y_test, y_pred=y_pred_tree, average='weighted')
print(f"F1 Score: {f1}")

# Per-class text report.
print("Decision Tree Classification Report:")
report_text = classification_report(y_test, y_pred_tree)
print(report_text)

# Confusion matrix of true vs. predicted class ids, drawn as an annotated
# heatmap.
conf_matrix_tree = confusion_matrix(y_test, y_pred_tree)

plt.figure(figsize=(12, 6))
heatmap_ax = sns.heatmap(conf_matrix_tree, annot=True, fmt='d', cmap='Blues')
heatmap_ax.set_title('Decision Tree Confusion Matrix')
plt.show()

# Per-class (one-vs-rest) ROC curves and their areas, keyed by class index.
n_classes = y_bin.shape[1]
fpr, tpr, roc_auc = {}, {}, {}

for class_idx in range(n_classes):
    true_col = y_test_bin[:, class_idx]
    score_col = y_score_tree[:, class_idx]
    fpr[class_idx], tpr[class_idx], _ = roc_curve(true_col, score_col)
    roc_auc[class_idx] = auc(fpr[class_idx], tpr[class_idx])

# Micro-average: flatten every (sample, class) cell into one binary problem.
micro_fpr, micro_tpr, _ = roc_curve(y_test_bin.ravel(), y_score_tree.ravel())
fpr["micro"], tpr["micro"] = micro_fpr, micro_tpr
roc_auc["micro"] = auc(micro_fpr, micro_tpr)

# Macro-average: interpolate each class curve onto the union of all
# false-positive-rate points, then take the unweighted mean across classes.
per_class_fpr = [fpr[class_idx] for class_idx in range(n_classes)]
all_fpr = np.unique(np.concatenate(per_class_fpr))
mean_tpr = np.zeros_like(all_fpr)

for class_idx in range(n_classes):
    mean_tpr += np.interp(all_fpr, fpr[class_idx], tpr[class_idx])

mean_tpr /= n_classes

fpr["macro"], tpr["macro"] = all_fpr, mean_tpr
roc_auc["macro"] = auc(all_fpr, mean_tpr)

# Plot both averaged curves against the chance diagonal.
micro_label = 'micro-average ROC curve (area = {0:0.2f})'.format(roc_auc["micro"])
macro_label = 'macro-average ROC curve (area = {0:0.2f})'.format(roc_auc["macro"])

plt.figure()
plt.plot(fpr["micro"], tpr["micro"], label=micro_label,
         color='deeppink', linestyle=':', linewidth=4)
plt.plot(fpr["macro"], tpr["macro"], label=macro_label,
         color='navy', linestyle=':', linewidth=4)

plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Multiclass')
plt.legend(loc="lower right")
plt.show()

# Compute the Precision-Recall curve and average precision for each class.
# NOTE: these assignments rebind `precision` and `recall`, replacing the
# scalar weighted scores of the same names computed earlier in this file.
precision = {}
recall = {}
average_precision = {}

for i in range(n_classes):
    precision[i], recall[i], _ = precision_recall_curve(y_test_bin[:, i], y_score_tree[:, i])
    average_precision[i] = average_precision_score(y_test_bin[:, i], y_score_tree[:, i])

# Micro-average: flatten every (sample, class) cell into one binary problem.
precision["micro"], recall["micro"], _ = precision_recall_curve(y_test_bin.ravel(), y_score_tree.ravel())
average_precision["micro"] = average_precision_score(y_test_bin, y_score_tree, average="micro")

# Macro-average curve: interpolate each class curve onto the union of all
# recall points, then take the unweighted mean across classes.
all_recall = np.unique(np.concatenate([recall[i] for i in range(n_classes)]))
mean_precision = np.zeros_like(all_recall)

for i in range(n_classes):
    # precision_recall_curve returns recall in DECREASING order, while
    # np.interp requires increasing x-coordinates and silently produces
    # meaningless values otherwise. Reverse both arrays so the interpolation
    # is well-defined.
    mean_precision += np.interp(all_recall, recall[i][::-1], precision[i][::-1])

mean_precision /= n_classes

precision["macro"] = mean_precision
recall["macro"] = all_recall
# The macro score shown in the legend is sklearn's macro-averaged average
# precision, not the area under the interpolated curve plotted below.
average_precision["macro"] = average_precision_score(y_test_bin, y_score_tree, average="macro")

plt.figure()
plt.plot(recall["micro"], precision["micro"], color='gold', lw=2,
         label='micro-average Precision-Recall curve (area = {0:0.2f})'
               ''.format(average_precision["micro"]))

plt.plot(recall["macro"], precision["macro"], color='navy', linestyle=':', linewidth=4,
         label='macro-average Precision-Recall curve (area = {0:0.2f})'
               ''.format(average_precision["macro"]))

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve for Multiclass')
plt.legend(loc="lower right")
plt.show()

# Serialize the fitted classifier to disk with pickle.
model_path = 'decision_tree_model.pkl'
with open(model_path, 'wb') as model_file:
    pickle.dump(clf, model_file)
print("Model saved to decision_tree_model.pkl")

