In [9]:
#Importing Data from disease_features.csv

import pandas as pd
import numpy as np
import ast
import streamlit as st
df = pd.read_csv('disease_features.csv')


In [10]:
# ──────────────────────────────────────────────────────────────────────────────
# Task 1: TF‑IDF Feature Extraction and One‑Hot Encoding
# ──────────────────────────────────────────────────────────────────────────────

import ast
import numpy as np
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

# Step 1: Parse string‑encoded lists into real Python lists
columns_to_parse = ['Risk Factors', 'Symptoms', 'Signs', 'Subtypes']
for col in columns_to_parse:
    # If the cell is not null, convert the string "[a, b]" into a Python list ['a', 'b']
    df[col] = df[col].apply(lambda x: ast.literal_eval(x) if pd.notnull(x) else [])

# Step 2: Build text columns for TF‑IDF
# Join each list into one space‑separated string, ready for text vectorization
df['Risk Factors_str'] = df['Risk Factors'].apply(lambda lst: ' '.join(lst))
df['Symptoms_str']     = df['Symptoms'].apply(lambda lst: ' '.join(lst))
df['Signs_str']        = df['Signs'].apply(lambda lst: ' '.join(lst))

# Step 3: Vectorize each text column with TF‑IDF
vectorizers = {}    # To store each TfidfVectorizer object
tfidf_matrices = {} # To store the resulting sparse matrices

for col in ['Risk Factors_str', 'Symptoms_str', 'Signs_str']:
    # Initialize a fresh TF‑IDF vectorizer
    vec = TfidfVectorizer()
    # Fit to the column and transform into a sparse matrix
    mat = vec.fit_transform(df[col])
    # Keep references for later use
    vectorizers[col] = vec
    tfidf_matrices[col] = mat
    # Optional debug: print(f"TF‑IDF on {col} → shape {mat.shape}")

# Step 4: Combine the three TF‑IDF matrices into one
combined_tfidf = hstack([
    tfidf_matrices['Risk Factors_str'],
    tfidf_matrices['Symptoms_str'],
    tfidf_matrices['Signs_str']
])
# Convert to dense array if you need to inspect values directly
dense_matrix = combined_tfidf.toarray()

# Step 5: One‑Hot encode the original list columns
mlb = MultiLabelBinarizer()
onehot_matrices = {}

for col in ['Risk Factors', 'Symptoms', 'Signs']:
    # Transform each list of labels into a binary indicator matrix
    onehot = mlb.fit_transform(df[col])
    onehot_matrices[col] = onehot
    print(f"One‑hot encoding for {col}: {onehot.shape}")

# Stack the one‑hot matrices side by side into one array
combined_onehot = np.hstack([
    onehot_matrices['Risk Factors'],
    onehot_matrices['Symptoms'],
    onehot_matrices['Signs']
])

# ──────────────────────────────────────────────────────────────────────────────
# Comparison Metrics
# ──────────────────────────────────────────────────────────────────────────────

# 1. Sparsity: fraction of zero entries in each matrix
tfidf_sparsity  = 1 - (np.count_nonzero(dense_matrix) / dense_matrix.size)
onehot_sparsity = 1 - (np.count_nonzero(combined_onehot) / combined_onehot.size)
print("\nSparsity Comparison:")
print(f"TF‑IDF Sparsity:  {tfidf_sparsity:.2%}")
print(f"One‑hot Sparsity: {onehot_sparsity:.2%}")

# 2. Dimensionality: number of features in each encoding
print("\nDimensionality Comparison:")
print(f"TF‑IDF features:  {combined_tfidf.shape[1]}")
print(f"One‑hot features: {combined_onehot.shape[1]}")

# 3. Basic statistics: density and mean values
print("\nMatrix Statistics:")
print("TF‑IDF:")
print(f"- Non‑zero elements: {combined_tfidf.nnz}")
print(f"- Mean value:        {combined_tfidf.mean():.4f}")
print("\nOne‑hot:")
print(f"- Non‑zero elements: {np.count_nonzero(combined_onehot)}")
print(f"- Mean value:        {combined_onehot.mean():.4f}")

# 4. Feature distribution for TF‑IDF (non‑zero values only)
tfidf_values = combined_tfidf.data
print("\nTF‑IDF Value Distribution:")
print(f"- Min:    {tfidf_values.min():.4f}")
print(f"- Max:    {tfidf_values.max():.4f}")
print(f"- Mean:   {tfidf_values.mean():.4f}")
print(f"- Median: {np.median(tfidf_values):.4f}")
print(f"- Std Dev:{tfidf_values.std():.4f}")

# 5. One‑hot value distribution (only 0s and 1s)
onehot_values = combined_onehot.flatten()
print("\nOne‑hot Value Distribution:")
print(f"- Unique values: {np.unique(onehot_values)}")
print(f"- Min:           {onehot_values.min():.4f}")
print(f"- Max:           {onehot_values.max():.4f}")

# 6. Memory usage of the data arrays
print("\nMemory Usage:")
print(f"TF‑IDF:  {combined_tfidf.data.nbytes  / 1024:.2f} KB")
print(f"One‑hot: {combined_onehot.nbytes     / 1024:.2f} KB")

# 7. Information density: average non‑zero features per sample
print("\nInformation Density:")
print(f"TF‑IDF avg features per sample:  {combined_tfidf.nnz / combined_tfidf.shape[0]:.2f}")
print(f"One‑hot avg features per sample: {np.count_nonzero(combined_onehot) / combined_onehot.shape[0]:.2f}")


One‑hot encoding for Risk Factors: (25, 170)
One‑hot encoding for Symptoms: (25, 189)
One‑hot encoding for Signs: (25, 62)

Sparsity Comparison:
TF‑IDF Sparsity:  92.96%
One‑hot Sparsity: 95.15%

Dimensionality Comparison:
TF‑IDF features:  1020
One‑hot features: 421

Matrix Statistics:
TF‑IDF:
- Non‑zero elements: 1795
- Mean value:        0.0119

One‑hot:
- Non‑zero elements: 510
- Mean value:        0.0485

TF‑IDF Value Distribution:
- Min:    0.0239
- Max:    0.6903
- Mean:   0.1687
- Median: 0.1618
- Std Dev:0.0743

One‑hot Value Distribution:
- Unique values: [0 1]
- Min:           0.0000
- Max:           1.0000

Memory Usage:
TF‑IDF:  14.02 KB
One‑hot: 82.23 KB

Information Density:
TF‑IDF avg features per sample:  71.80
One‑hot avg features per sample: 20.40


In [11]:
# ──────────────────────────────────────────────────────────────────────────────
# Task 2: Dimensionality Reduction
# ──────────────────────────────────────────────────────────────────────────────

# Import necessary libraries for decomposition and plotting
from sklearn.decomposition import PCA, TruncatedSVD
import matplotlib.pyplot as plt
import numpy as np

# Step 1: Apply dimensionality reduction methods

# 1.1 Truncated SVD on TF‑IDF matrix
#    - TruncatedSVD works directly with sparse matrices
#    - We choose n_components = 3 to reduce to 3 latent dimensions
n_components = 3
svd = TruncatedSVD(n_components=n_components)
# Fit SVD to the combined TF‑IDF sparse matrix and transform
tfidf_reduced = svd.fit_transform(combined_tfidf)

# 1.2 PCA on one‑hot encoded matrix
#    - PCA requires a dense array, but one‑hot is already dense
pca = PCA(n_components=n_components)
# Fit PCA to the combined one‑hot array and transform
onehot_reduced = pca.fit_transform(combined_onehot)

# 1.3 Print explained variance ratios for each component
print("Explained Variance Ratios:\n")

# For TruncatedSVD (TF‑IDF)
print("TF‑IDF (TruncatedSVD):")
total_var_svd = svd.explained_variance_ratio_.sum()
print(f"Total variance explained: {total_var_svd:.4f}")
for idx, ratio in enumerate(svd.explained_variance_ratio_, start=1):
    print(f" Component {idx}: {ratio:.4f}")

# For PCA (One‑Hot)
print("\nOne‑hot (PCA):")
total_var_pca = pca.explained_variance_ratio_.sum()
print(f"Total variance explained: {total_var_pca:.4f}")
for idx, ratio in enumerate(pca.explained_variance_ratio_, start=1):
    print(f" Component {idx}: {ratio:.4f}")

# ──────────────────────────────────────────────────────────────────────────────
# Step 2: 2D Visualization of Reduced Dimensions
# ──────────────────────────────────────────────────────────────────────────────

# 2.1 Map each disease to a clinical category for coloring
category_mapping = {
    "Acute Coronary Syndrome":        "Cardiovascular",
    "Adrenal Insufficiency":           "Endocrine",
    "Alzheimer":                       "Neurological",
    "Aortic Dissection":               "Cardiovascular",
    "Asthma":                          "Respiratory",
    "Atrial Fibrillation":             "Cardiovascular",
    "Cardiomyopathy":                  "Cardiovascular",
    "COPD":                            "Respiratory",
    "Diabetes":                        "Endocrine",
    "Epilepsy":                        "Neurological",
    "Gastritis":                       "Gastrointestinal",
    "Gastro-oesophageal Reflux Disease":"Gastrointestinal",
    "Heart Failure":                   "Cardiovascular",
    "Hyperlipidemia":                  "Cardiovascular",
    "Hypertension":                    "Cardiovascular",
    "Migraine":                        "Neurological",
    "Multiple Sclerosis":              "Neurological",
    "Peptic Ulcer Disease":            "Gastrointestinal",
    "Pituitary Disease":               "Endocrine",
    "Pneumonia":                       "Respiratory",
    "Pulmonary Embolism":              "Cardiovascular",
    "Stroke":                          "Neurological",
    "Thyroid Disease":                 "Endocrine",
    "Tuberculosis":                    "Infectious",
    "Upper Gastrointestinal Bleeding": "Gastrointestinal"
}

# Add a 'Category' column to df by mapping disease names
df['Category'] = df['Disease'].map(category_mapping)

# Convert categories to numeric codes for color mapping
unique_categories = df['Category'].unique()
category_to_num = {cat: idx for idx, cat in enumerate(unique_categories)}
category_nums = df['Category'].map(category_to_num)

# 2.2 Create side‑by‑side 2D scatter plots
plt.figure(figsize=(16, 7))

# Plot for TF‑IDF reduced data
plt.subplot(1, 2, 1)
plt.scatter(
    tfidf_reduced[:, 0], tfidf_reduced[:, 1],
    c=category_nums, cmap='viridis', alpha=0.8
)
plt.title('TF‑IDF Vectorization (2D)')
plt.xlabel('Component 1')
plt.ylabel('Component 2')

# Build a custom legend using Line2D handles
from matplotlib.lines import Line2D
legend_handles = [
    Line2D([0], [0], marker='o', color='w',
           markerfacecolor=plt.cm.viridis(category_to_num[cat]/len(category_to_num)),
           markersize=8, label=cat)
    for cat in unique_categories
]
plt.legend(handles=legend_handles, title="Disease Categories")
plt.grid(True, linestyle='--', alpha=0.7)

# Plot for One‑Hot reduced data
plt.subplot(1, 2, 2)
plt.scatter(
    onehot_reduced[:, 0], onehot_reduced[:, 1],
    c=category_nums, cmap='viridis', alpha=0.8
)
plt.title('One‑Hot Encoding (2D)')
plt.xlabel('Component 1')
plt.ylabel('Component 2')
plt.legend(handles=legend_handles, title="Disease Categories")
plt.grid(True, linestyle='--', alpha=0.7)

# Adjust layout and display
plt.tight_layout()
plt.show()

# ──────────────────────────────────────────────────────────────────────────────
# Optional: 3D Visualization (if n_components == 3)
# ──────────────────────────────────────────────────────────────────────────────

if n_components == 3:
    from mpl_toolkits.mplot3d import Axes3D

    fig = plt.figure(figsize=(16, 7))

    # 3D scatter for TF‑IDF
    ax1 = fig.add_subplot(121, projection='3d')
    ax1.scatter(
        tfidf_reduced[:, 0], tfidf_reduced[:, 1], tfidf_reduced[:, 2],
        c=category_nums, cmap='viridis', alpha=0.8
    )
    ax1.set_title('TF‑IDF Vectorization (3D)')
    ax1.set_xlabel('Component 1')
    ax1.set_ylabel('Component 2')
    ax1.set_zlabel('Component 3')
    ax1.legend(handles=legend_handles, title="Disease Categories")

    # 3D scatter for One‑Hot
    ax2 = fig.add_subplot(122, projection='3d')
    ax2.scatter(
        onehot_reduced[:, 0], onehot_reduced[:, 1], onehot_reduced[:, 2],
        c=category_nums, cmap='viridis', alpha=0.8
    )
    ax2.set_title('One‑Hot Encoding (3D)')
    ax2.set_xlabel('Component 1')
    ax2.set_ylabel('Component 2')
    ax2.set_zlabel('Component 3')
    ax2.legend(handles=legend_handles, title="Disease Categories")

    plt.tight_layout()
    plt.show()

# ──────────────────────────────────────────────────────────────────────────────
# Discussion (as comments):
#
# - TF‑IDF clusters appear more distinct, indicating that weighted term frequencies
#   help separate disease categories in lower-dimensional space.
# - One‑Hot clusters overlap more, since all features are equally weighted.
# ──────────────────────────────────────────────────────────────────────────────


Explained Variance Ratios:

TF‑IDF (TruncatedSVD):
Total variance explained: 0.1313
 Component 1: 0.0089
 Component 2: 0.0657
 Component 3: 0.0567

One‑hot (PCA):
Total variance explained: 0.2801
 Component 1: 0.1106
 Component 2: 0.0951
 Component 3: 0.0744


In [12]:
#Task 3: Train KNN Models and Logistic Regression

#Step 1 #########################################################################################
# Prepare for KNN modeling with different k values and distance metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, cross_validate, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer
from sklearn.preprocessing import StandardScaler, MinMaxScaler  # Added for normalization
from sklearn.pipeline import Pipeline  # Added for creating pipelines with normalization
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")  # Suppress warnings

# Define the target variable (disease categories)
target = df['Category']

# Define k values and distance metrics to test
k_values = [3, 5, 7]
metrics = ['euclidean', 'manhattan', 'cosine']

# Define scoring metrics for cross-validation
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, average='weighted', zero_division=0),
    'recall': make_scorer(recall_score, average='weighted', zero_division=0),
    'f1': make_scorer(f1_score, average='weighted', zero_division=0)
}

# Create DataFrames to store results
results_df = pd.DataFrame(columns=['Model', 'Feature', 'Normalization', 'k', 'Metric', 'Accuracy', 'Precision', 'Recall', 'F1-Score'])

#Step 2 #########################################################################################
# Perform 3-fold cross-validation for KNN with different configurations
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Prepare data
tfidf_array = combined_tfidf.toarray()
onehot_array = combined_onehot

# Define normalization methods to test
normalizers = {
    'None': None,
    'StandardScaler': StandardScaler(),
    'MinMaxScaler': MinMaxScaler()
}

# For TF-IDF features
for norm_name, normalizer in normalizers.items():
    for k in k_values:
        for metric in metrics:
            try:
                if normalizer is None:
                    # No normalization
                    knn = KNeighborsClassifier(n_neighbors=k, metric=metric, weights='distance')
                    cv_results = cross_validate(knn, tfidf_array, target, cv=cv, scoring=scoring)
                else:
                    # With normalization using pipeline
                    pipeline = Pipeline([
                        ('normalizer', normalizer),
                        ('knn', KNeighborsClassifier(n_neighbors=k, metric=metric, weights='distance'))
                    ])
                    cv_results = cross_validate(pipeline, tfidf_array, target, cv=cv, scoring=scoring)
                
                # Store results
                new_row = pd.DataFrame([{
                    'Model': 'KNN',
                    'Feature': 'TF-IDF',
                    'Normalization': norm_name,
                    'k': k,
                    'Metric': metric,
                    'Accuracy': cv_results['test_accuracy'].mean(),
                    'Precision': cv_results['test_precision'].mean(),
                    'Recall': cv_results['test_recall'].mean(),
                    'F1-Score': cv_results['test_f1'].mean()
                }])
                results_df = pd.concat([results_df, new_row], ignore_index=True)
            except Exception as e:
                # Handle any errors
                new_row = pd.DataFrame([{
                    'Model': 'KNN',
                    'Feature': 'TF-IDF',
                    'Normalization': norm_name,
                    'k': k,
                    'Metric': metric,
                    'Accuracy': np.nan,
                    'Precision': np.nan,
                    'Recall': np.nan,
                    'F1-Score': np.nan
                }])
                results_df = pd.concat([results_df, new_row], ignore_index=True)

# For One-hot encoded features
for norm_name, normalizer in normalizers.items():
    for k in k_values:
        for metric in metrics:
            try:
                if normalizer is None:
                    # No normalization
                    knn = KNeighborsClassifier(n_neighbors=k, metric=metric, weights='distance')
                    cv_results = cross_validate(knn, onehot_array, target, cv=cv, scoring=scoring)
                else:
                    # With normalization using pipeline
                    pipeline = Pipeline([
                        ('normalizer', normalizer),
                        ('knn', KNeighborsClassifier(n_neighbors=k, metric=metric, weights='distance'))
                    ])
                    cv_results = cross_validate(pipeline, onehot_array, target, cv=cv, scoring=scoring)
                
                # Store results
                new_row = pd.DataFrame([{
                    'Model': 'KNN',
                    'Feature': 'One-hot',
                    'Normalization': norm_name,
                    'k': k,
                    'Metric': metric,
                    'Accuracy': cv_results['test_accuracy'].mean(),
                    'Precision': cv_results['test_precision'].mean(),
                    'Recall': cv_results['test_recall'].mean(),
                    'F1-Score': cv_results['test_f1'].mean()
                }])
                results_df = pd.concat([results_df, new_row], ignore_index=True)
            except Exception as e:
                # Handle any errors
                new_row = pd.DataFrame([{
                    'Model': 'KNN',
                    'Feature': 'One-hot',
                    'Normalization': norm_name,
                    'k': k,
                    'Metric': metric,
                    'Accuracy': np.nan,
                    'Precision': np.nan,
                    'Recall': np.nan,
                    'F1-Score': np.nan
                }])
                results_df = pd.concat([results_df, new_row], ignore_index=True)

#Step 3 #########################################################################################
# Train Logistic Regression models on both matrices with and without normalization
for norm_name, normalizer in normalizers.items():
    # For TF-IDF features
    try:
        if normalizer is None:
            # No normalization
            lr = LogisticRegression(max_iter=2000, solver='saga', multi_class='auto', class_weight='balanced')
            lr_results = cross_validate(lr, tfidf_array, target, cv=cv, scoring=scoring)
        else:
            # With normalization using pipeline
            pipeline = Pipeline([
                ('normalizer', normalizer),
                ('lr', LogisticRegression(max_iter=2000, solver='saga', multi_class='auto', class_weight='balanced'))
            ])
            lr_results = cross_validate(pipeline, tfidf_array, target, cv=cv, scoring=scoring)
        
        # Store results
        new_row = pd.DataFrame([{
            'Model': 'Logistic Regression',
            'Feature': 'TF-IDF',
            'Normalization': norm_name,
            'k': 'N/A',
            'Metric': 'N/A',
            'Accuracy': lr_results['test_accuracy'].mean(),
            'Precision': lr_results['test_precision'].mean(),
            'Recall': lr_results['test_recall'].mean(),
            'F1-Score': lr_results['test_f1'].mean()
        }])
        results_df = pd.concat([results_df, new_row], ignore_index=True)
    except Exception as e:
        # Handle any errors
        new_row = pd.DataFrame([{
            'Model': 'Logistic Regression',
            'Feature': 'TF-IDF',
            'Normalization': norm_name,
            'k': 'N/A',
            'Metric': 'N/A',
            'Accuracy': np.nan,
            'Precision': np.nan,
            'Recall': np.nan,
            'F1-Score': np.nan
        }])
        results_df = pd.concat([results_df, new_row], ignore_index=True)

    # For One-hot encoded features
    try:
        if normalizer is None:
            # No normalization
            lr = LogisticRegression(max_iter=2000, solver='saga', multi_class='auto', class_weight='balanced')
            lr_results = cross_validate(lr, onehot_array, target, cv=cv, scoring=scoring)
        else:
            # With normalization using pipeline
            pipeline = Pipeline([
                ('normalizer', normalizer),
                ('lr', LogisticRegression(max_iter=2000, solver='saga', multi_class='auto', class_weight='balanced'))
            ])
            lr_results = cross_validate(pipeline, onehot_array, target, cv=cv, scoring=scoring)
        
        # Store results
        new_row = pd.DataFrame([{
            'Model': 'Logistic Regression',
            'Feature': 'One-hot',
            'Normalization': norm_name,
            'k': 'N/A',
            'Metric': 'N/A',
            'Accuracy': lr_results['test_accuracy'].mean(),
            'Precision': lr_results['test_precision'].mean(),
            'Recall': lr_results['test_recall'].mean(),
            'F1-Score': lr_results['test_f1'].mean()
        }])
        results_df = pd.concat([results_df, new_row], ignore_index=True)
    except Exception as e:
        # Handle any errors
        new_row = pd.DataFrame([{
            'Model': 'Logistic Regression',
            'Feature': 'One-hot',
            'Normalization': norm_name,
            'k': 'N/A',
            'Metric': 'N/A',
            'Accuracy': np.nan,
            'Precision': np.nan,
            'Recall': np.nan,
            'F1-Score': np.nan
        }])
        results_df = pd.concat([results_df, new_row], ignore_index=True)

#Step 4 #########################################################################################
# Display results in smaller, more focused tables
# Format numeric columns to 4 decimal places
numeric_cols = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
for col in numeric_cols:
    results_df[col] = results_df[col].apply(lambda x: f"{x:.4f}" if not pd.isna(x) else "N/A")

# 1. Compare KNN with different normalizations
print("\n1. KNN Model Comparison by Normalization Method:")
for norm_name in results_df['Normalization'].unique():
    print(f"\n--- KNN with {norm_name} Normalization ---")
    
    # Filter data for KNN with this normalization
    knn_norm_data = results_df[(results_df['Model'] == 'KNN') & 
                              (results_df['Normalization'] == norm_name)]
    
    # Create a pivot table to compare TF-IDF vs One-hot
    pivot_table = pd.pivot_table(
        knn_norm_data,
        values=['Accuracy', 'F1-Score'],
        index=['k', 'Metric'],
        columns=['Feature'],
        aggfunc='first'  # Just take the first value since there should be only one
    )
    
    # Reorder columns for better readability
    if ('Accuracy', 'TF-IDF') in pivot_table.columns and ('Accuracy', 'One-hot') in pivot_table.columns:
        pivot_table = pivot_table[[('Accuracy', 'TF-IDF'), ('Accuracy', 'One-hot'), 
                                  ('F1-Score', 'TF-IDF'), ('F1-Score', 'One-hot')]]
    
    # Display the table
    print(pivot_table)

# 2. Compare Logistic Regression with different normalizations
print("\n2. Logistic Regression Model Comparison by Normalization Method:")
lr_data = results_df[results_df['Model'] == 'Logistic Regression']

# Create a pivot table to compare TF-IDF vs One-hot across normalizations
lr_pivot = pd.pivot_table(
    lr_data,
    values=['Accuracy', 'Precision', 'Recall', 'F1-Score'],
    index=['Normalization'],
    columns=['Feature'],
    aggfunc='first'  # Just take the first value since there should be only one
)

# Display the table
print(lr_pivot)

# 3. Best Models by Feature Type
print("\n3. Best Models by Feature Type:")
# Convert F1-Score to float for finding the best models
results_df['F1-Score_float'] = results_df['F1-Score'].apply(lambda x: float(x) if x != "N/A" else 0)

# Find best model for TF-IDF
best_tfidf = results_df[results_df['Feature'] == 'TF-IDF'].loc[results_df[results_df['Feature'] == 'TF-IDF']['F1-Score_float'].idxmax()]
best_onehot = results_df[results_df['Feature'] == 'One-hot'].loc[results_df[results_df['Feature'] == 'One-hot']['F1-Score_float'].idxmax()]

# Create a DataFrame with the best models
best_models = pd.DataFrame([best_tfidf, best_onehot])
best_models = best_models.drop('F1-Score_float', axis=1)
best_models.index = ['Best TF-IDF Model', 'Best One-hot Model']

# Display the table
print(best_models[['Model', 'Normalization', 'k', 'Metric', 'Accuracy', 'Precision', 'Recall', 'F1-Score']])

# 4. Top 5 Models Overall
print("\n4. Top 5 Models Overall:")
top_models = results_df.sort_values(by='F1-Score_float', ascending=False).head(5)
top_models = top_models.drop('F1-Score_float', axis=1)
top_models.index = range(1, len(top_models) + 1)  # Reset index to start from 1
print(top_models[['Model', 'Feature', 'Normalization', 'k', 'Metric', 'Accuracy', 'Precision', 'Recall', 'F1-Score']])

# 5. Effect of k Value on KNN Performance (Best Normalization and Metric)
print("\n5. Effect of k Value on KNN Performance:")
# Get the best normalization and metric from the top model
best_norm = top_models[top_models['Model'] == 'KNN']['Normalization'].iloc[0] if not top_models[top_models['Model'] == 'KNN'].empty else results_df['Normalization'].iloc[0]
best_metric = top_models[top_models['Model'] == 'KNN']['Metric'].iloc[0] if not top_models[top_models['Model'] == 'KNN'].empty else results_df['Metric'].iloc[0]

# Filter data for the best normalization and metric
k_effect_data = results_df[(results_df['Model'] == 'KNN') & 
                          (results_df['Normalization'] == best_norm) &
                          (results_df['Metric'] == best_metric)]

# Create a pivot table to compare k values
k_pivot = pd.pivot_table(
    k_effect_data,
    values=['Accuracy', 'F1-Score'],
    index=['k'],
    columns=['Feature'],
    aggfunc='first'
)

# Display the table
print(f"Using {best_norm} normalization and {best_metric} metric:")
print(k_pivot)

# Clean up temporary column
results_df = results_df.drop('F1-Score_float', axis=1)


1. KNN Model Comparison by Normalization Method:

--- KNN with None Normalization ---
            Accuracy         F1-Score        
Feature       TF-IDF One-hot   TF-IDF One-hot
k Metric                                     
3 cosine      0.6806  0.5972   0.6362  0.5574
  euclidean   0.4861  0.2731   0.3804  0.1616
  manhattan   0.3194  0.2731   0.1608  0.1616
5 cosine      0.7222  0.3981   0.6778  0.3178
  euclidean   0.4815  0.2731   0.3343  0.1505
  manhattan   0.3611  0.2731   0.2163  0.1505
7 cosine      0.5972  0.4444   0.5197  0.3986
  euclidean   0.3981  0.2778   0.2509  0.1545
  manhattan   0.3194  0.2778   0.1571  0.1545

--- KNN with StandardScaler Normalization ---
            Accuracy         F1-Score        
Feature       TF-IDF One-hot   TF-IDF One-hot
k Metric                                     
3 cosine      0.6806  0.5139   0.6245  0.4730
  euclidean   0.3194  0.2731   0.1571  0.1590
  manhattan   0.3194  0.2731   0.1571  0.1590
5 cosine      0.6019  0.3565   0.5111 