DATASET 1 : STUDENT PERFORMANCE

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
import matplotlib.pyplot as plt
import seaborn as sns
from ucimlrepo import fetch_ucirepo

# Fetch the dataset (UCI repository id 320) and prepare train/test matrices for it.
student_performance = fetch_ucirepo(id=320)

# data (as pandas dataframes)
X = student_performance.data.features
y = student_performance.data.targets

# Work on copies so the fetched frames stay untouched (avoids SettingWithCopyWarning)
X_copy = X.copy()
y_copy = y.copy()

# Check for missing values before preprocessing
print(f"Missing values in original X: {X_copy.isnull().sum().sum()}")
print(f"Missing values in original y: {y_copy.isnull().sum().sum()}")

# Treat the '?' placeholder as a missing value
X_copy = X_copy.replace('?', np.nan)

# Convert a column to numeric only when every non-missing entry parses as a number.
# Coercing the whole frame blindly would turn each text-valued column into an all-NaN
# column, which the NaN -> 0 fallback after scaling would silently flatten to a constant.
for col in X_copy.columns:
    as_numeric = pd.to_numeric(X_copy[col], errors='coerce')
    if as_numeric.isnull().sum() == X_copy[col].isnull().sum():
        X_copy[col] = as_numeric

numeric_cols = list(X_copy.select_dtypes(include=[np.number]).columns)
categorical_cols = [col for col in X_copy.columns if col not in numeric_cols]

# Impute: column mean for numeric features, most frequent value for categorical ones
for col in numeric_cols:
    if X_copy[col].isnull().any():
        X_copy[col] = X_copy[col].fillna(X_copy[col].mean())
for col in categorical_cols:
    if X_copy[col].isnull().any():
        X_copy[col] = X_copy[col].fillna(X_copy[col].mode().iloc[0])

# One-hot encode the categorical features; drop_first avoids perfectly collinear
# indicator columns (a two-level feature becomes a single 0/1 column).
if categorical_cols:
    X_copy = pd.get_dummies(X_copy, columns=categorical_cols, drop_first=True, dtype=float)

# Verify no missing values remain
print(f"Missing values after preprocessing: {X_copy.isnull().sum().sum()}")

# For the student performance dataset, we need to select a target variable
# Let's use G3 (final grade) as our primary target
if 'G3' in y_copy.columns:
    target = y_copy['G3']
else:
    # If G3 doesn't exist, use the first target column
    target = y_copy.iloc[:, 0]

# Make sure target has no missing values
target = target.fillna(target.mean())

# Convert to a binary classification problem: 1 when the grade is >= 10, else 0
target_class = (target >= 10).astype(int)

# Standardizing features
# NOTE(review): the scaler is fit on the full dataset before the split below, so test-set
# statistics influence the training features; consider fitting on X_train only.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_copy)

# Double-check for NaN values after scaling
if np.isnan(X_scaled).any():
    print("Warning: NaN values found after scaling. Replacing with 0...")
    X_scaled = np.nan_to_num(X_scaled)

# Splitting data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, target_class, test_size=0.2, random_state=42)

# Logistic Regression as the base classifier
model = LogisticRegression(max_iter=1000)

# Fit a classifier and collect its test-set performance metrics
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score its predictions on the test split.

    Returns a dict with the keys "accuracy", "precision", "recall" and "f1_score";
    the last three are computed with average='weighted' and zero_division=0.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    weighted = {"average": 'weighted', "zero_division": 0}
    scores = {"accuracy": accuracy_score(y_test, predictions)}
    for key, scorer in (("precision", precision_score), ("recall", recall_score), ("f1_score", f1_score)):
        scores[key] = scorer(y_test, predictions, **weighted)
    return scores

# Classifiers to evaluate.
# The stochastic estimators (decision tree, MLP) are seeded so that repeated runs write
# the same metrics, in line with the random_state=42 used for the train/test split.
classifiers = {
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "SVM": SVC(),
    "ANN": MLPClassifier(max_iter=1000, random_state=42),
}

# Dimensionality reduction methods: maps "<Method>_<d>" -> (reduced X_train, reduced X_test)
methods = {}

# Dedicated seeded generator so the random "naive" baseline picks the same columns on
# every run, in line with the fixed random_state used for the train/test split.
naive_rng = np.random.default_rng(42)


def _run_sklearn_sfs(d, direction):
    """Select d features with sklearn's SequentialFeatureSelector; return (train, test, indices)."""
    selector = SequentialFeatureSelector(
        model, n_features_to_select=d, direction=direction
    ).fit(X_train, y_train)
    chosen = np.where(selector.get_support())[0]
    return selector.transform(X_train), selector.transform(X_test), chosen


def _run_mlxtend_sfs(d, forward, floating):
    """Select d features with mlxtend's SFS (accuracy, cv=5); return (train, test, indices)."""
    selector = SFS(
        model, k_features=d, forward=forward, floating=floating, scoring='accuracy', cv=5
    ).fit(X_train, y_train)
    chosen = list(selector.k_feature_idx_)
    return X_train[:, chosen], X_test[:, chosen], chosen


# Each entry: (label printed on success, label printed on failure, key prefix, runner).
# NOTE(review): the "Bidirectional Search" entry is configured as plain forward selection
# (forward=True, floating=False), so it performs a forward search rather than a true
# bidirectional one.
selection_searches = [
    ("Step-wise Forward Selection", "Step-wise Forward Selection", "SFS_Forward",
     lambda d: _run_sklearn_sfs(d, 'forward')),
    ("Step-wise Backward Removal", "Step-wise Backward Removal", "SFS_Backward",
     lambda d: _run_sklearn_sfs(d, 'backward')),
    ("Bidirectional Search", "Bidirectional Search", "SFS_Bi",
     lambda d: _run_mlxtend_sfs(d, forward=True, floating=False)),
    ("Step-wise Floating Forward Selection", "Floating Forward Selection", "SFS_Float_Forward",
     lambda d: _run_mlxtend_sfs(d, forward=True, floating=True)),
    ("Step-wise Floating Backward Removal", "Floating Backward Selection", "SFS_Float_Backward",
     lambda d: _run_mlxtend_sfs(d, forward=False, floating=True)),
]

# Test a range of dimensions: d runs from 1 up to min(11, number of features) - 1
for d in range(1, min(11, X_copy.shape[1])):
    print(f"\n### Feature Selection and Dimensionality Reduction with d = {d} ###\n")

    # Naïve Search (random selection of d distinct feature columns)
    feature_indices = naive_rng.choice(X_scaled.shape[1], d, replace=False)
    X_train_naive = X_train[:, feature_indices]
    X_test_naive = X_test[:, feature_indices]
    print(f"Naïve Search Selected Features: {feature_indices}")
    methods[f"Naive_{d}"] = (X_train_naive, X_test_naive)

    # Sequential feature-selection variants; a failing search is reported and skipped
    for success_label, failure_label, key_prefix, runner in selection_searches:
        try:
            X_train_sel, X_test_sel, chosen_features = runner(d)
            print(f"{success_label} Selected Features: {chosen_features}")
            methods[f"{key_prefix}_{d}"] = (X_train_sel, X_test_sel)
        except Exception as e:
            print(f"{failure_label} failed: {e}")

    # Principal Component Analysis (PCA), fit on the training split only
    try:
        pca = PCA(n_components=d)
        X_train_pca = pca.fit_transform(X_train)
        X_test_pca = pca.transform(X_test)
        print(f"PCA Explained Variance Ratio: {pca.explained_variance_ratio_}")
        methods[f"PCA_{d}"] = (X_train_pca, X_test_pca)
    except Exception as e:
        print(f"PCA failed: {e}")

    # Linear Discriminant Analysis (LDA)
    try:
        # LDA yields at most (number of classes - 1) components, so cap d accordingly
        n_components = min(d, len(np.unique(y_train)) - 1)
        if n_components > 0:  # Only perform LDA if we have enough classes
            lda = LDA(n_components=n_components)
            X_train_lda = lda.fit_transform(X_train, y_train)
            X_test_lda = lda.transform(X_test)
            if hasattr(lda, 'explained_variance_ratio_'):
                print(f"LDA Explained Variance Ratio: {lda.explained_variance_ratio_}")
            methods[f"LDA_{d}"] = (X_train_lda, X_test_lda)
    except Exception as e:
        print(f"LDA failed: {e}")

# Score every classifier on every reduced feature set; a failing fit is logged and
# recorded as all-zero metrics.
results = {}
for method_name, (X_tr, X_te) in methods.items():
    method_scores = results.setdefault(method_name, {})
    for clf_name, clf in classifiers.items():
        try:
            clf_scores = evaluate_model(clf, X_tr, X_te, y_train, y_test)
            print(f"Evaluated {clf_name} with {method_name}: {clf_scores['accuracy']:.4f} accuracy")
        except Exception as e:
            print(f"Error evaluating {clf_name} with {method_name}: {e}")
            clf_scores = {"accuracy": 0, "precision": 0, "recall": 0, "f1_score": 0}
        method_scores[clf_name] = clf_scores

# Flatten to one row per (method, classifier) pair for analysis
flat_results = {}
for method_name, method_scores in results.items():
    for clf_name, clf_scores in method_scores.items():
        flat_results[(method_name, clf_name)] = clf_scores
results_df = pd.DataFrame.from_dict(flat_results, orient='index')

# Persist the metrics table
results_df.to_csv("student_performance_metrics.csv")

print("Feature selection and dimensionality reduction completed.")
print("Results saved to 'student_performance_metrics.csv'")

In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score its predictions on the test split.

    Returns a dict with the keys "accuracy", "precision", "recall" and "f1_score";
    the last three are computed with average='weighted' and zero_division=0.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    weighted = {"average": 'weighted', "zero_division": 0}
    scores = {"accuracy": accuracy_score(y_test, predictions)}
    for key, scorer in (("precision", precision_score), ("recall", recall_score), ("f1_score", f1_score)):
        scores[key] = scorer(y_test, predictions, **weighted)
    return scores

# Classifiers to evaluate.
# The stochastic estimators (decision tree, MLP) are seeded so that repeated runs write
# the same metrics, in line with the random_state=42 used for the train/test split.
classifiers = {
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "SVM": SVC(),
    "ANN": MLPClassifier(max_iter=1000, random_state=42),
}

# Score every classifier on every reduced feature set; a failing fit is logged and
# recorded as all-zero metrics.
results = {}
for method_name, (X_tr, X_te) in methods.items():
    method_scores = results.setdefault(method_name, {})
    for clf_name, clf in classifiers.items():
        try:
            clf_scores = evaluate_model(clf, X_tr, X_te, y_train, y_test)
            print(f"Evaluated {clf_name} with {method_name}: {clf_scores['accuracy']:.4f} accuracy")
        except Exception as e:
            print(f"Error evaluating {clf_name} with {method_name}: {e}")
            clf_scores = {"accuracy": 0, "precision": 0, "recall": 0, "f1_score": 0}
        method_scores[clf_name] = clf_scores

# Flatten to one row per (method, classifier) pair for analysis
flat_results = {}
for method_name, method_scores in results.items():
    for clf_name, clf_scores in method_scores.items():
        flat_results[(method_name, clf_name)] = clf_scores
results_df = pd.DataFrame.from_dict(flat_results, orient='index')

# Persist the metrics table
results_df.to_csv("student_performance_metrics.csv")

print("Model evaluation completed. Results saved to 'student_performance_metrics.csv'")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the saved metrics back, using the first two CSV columns as the row index
results_df = pd.read_csv("student_performance_metrics.csv", index_col=[0, 1])

# Fall back to building a MultiIndex from tuples if the reader did not produce one
if not isinstance(results_df.index, pd.MultiIndex):
    results_df.index = pd.MultiIndex.from_tuples(results_df.index)

# Show the structure of the DataFrame (index, then columns)
print("DataFrame Structure:", results_df.index, results_df.columns, sep="\n")

# 4i) Table Summary
print("Table Summary of Performance Metrics:", results_df, sep="\n")

# 4ii) Pie Charts
def plot_pie_charts(results_df, metric):
    """Draw a 3x3 grid of pies, one per level-0 index group, for the ``metric`` column.

    Only the first nine groups (in groupby order) are drawn; each pie's slices are the
    metric values of the rows in that group.
    """
    fig, axes = plt.subplots(3, 3, figsize=(18, 18))
    fig.suptitle(f'Pie Charts for {metric.capitalize()}', fontsize=20)
    # axes.flat walks the grid row by row and runs out after nine panels
    for ax, (method, group) in zip(axes.flat, results_df.groupby(level=0)):
        slice_values = group.xs(method, level=0)[metric]
        slice_values.plot.pie(ax=ax, autopct='%1.1f%%', title=f'{method}')
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()

for metric in ("accuracy", "precision", "recall", "f1_score"):
    plot_pie_charts(results_df, metric)

# 4iii) Bar Charts
def plot_bar_charts(results_df, metric):
    """Grouped bar chart of ``metric``: level-0 labels on the x-axis, one series per level-1 label."""
    pivoted = results_df[metric].unstack(level=1)
    ax = pivoted.plot(kind='bar', figsize=(15, 10), title=f'Bar Charts for {metric.capitalize()}')
    ax.set_ylabel(metric)
    plt.xticks(rotation=45)
    # Anchor the legend outside the plotting area, to the right
    ax.legend(title='Classifiers', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()

for metric in ("accuracy", "precision", "recall", "f1_score"):
    plot_bar_charts(results_df, metric)

# 4iv) Line Graphs
def plot_line_graphs(results_df, metric):
    """Line chart of ``metric``: level-0 labels on the x-axis, one marked line per level-1 label."""
    pivoted = results_df[metric].unstack(level=1)
    ax = pivoted.plot(kind='line', marker='o', figsize=(15, 10), title=f'Line Graphs for {metric.capitalize()}')
    ax.set_ylabel(metric)
    plt.xticks(rotation=45)
    # Anchor the legend outside the plotting area, to the right
    ax.legend(title='Classifiers', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()

for metric in ("accuracy", "precision", "recall", "f1_score"):
    plot_line_graphs(results_df, metric)

# 4vi) Box Plots
def plot_box_plots(results_df, metric):
    """Box plot of ``metric`` with one box per level-1 label, pooling all level-0 rows."""
    pivoted = results_df[metric].unstack(level=1)
    plt.figure(figsize=(15, 10))
    ax = sns.boxplot(data=pivoted)
    ax.set_title(f'Box Plots for {metric.capitalize()}')
    ax.set_ylabel(metric)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

for metric in ("accuracy", "precision", "recall", "f1_score"):
    plot_box_plots(results_df, metric)


DATASET 2 : WINE

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
import matplotlib.pyplot as plt
import seaborn as sns
from ucimlrepo import fetch_ucirepo

# Fetch the wine dataset (UCI repository id 109) and prepare train/test matrices for it.
wine = fetch_ucirepo(id=109)

# data (as pandas dataframes)
X = wine.data.features
y = wine.data.targets

# metadata
print(wine.metadata)

# variable information
print(wine.variables)

# Work on copies so the fetched frames stay untouched (avoids SettingWithCopyWarning)
X_copy = X.copy()
y_copy = y.copy()

# Report missing values before preprocessing
print(f"Missing values in original X: {X_copy.isnull().sum().sum()}")
print(f"Missing values in original y: {y_copy.isnull().sum().sum()}")

# Treat '?' as missing, then coerce every column to numeric (unparseable cells become NaN)
X_copy = X_copy.replace('?', np.nan).apply(pd.to_numeric, errors='coerce')

# Mean-impute each column that still has gaps
columns_with_gaps = [col for col in X_copy.columns if X_copy[col].isnull().sum() > 0]
for col in columns_with_gaps:
    X_copy[col] = X_copy[col].fillna(X_copy[col].mean())

# Verify no missing values remain
print(f"Missing values after preprocessing: {X_copy.isnull().sum().sum()}")

# The first target column is used as the class label; gaps are filled with the mode
# because this is a classification task
target = y_copy.iloc[:, 0]
target = target.fillna(target.mode()[0])

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_copy)

# Safety net: replace any NaN produced by scaling with 0
if np.isnan(X_scaled).any():
    print("Warning: NaN values found after scaling. Replacing with 0...")
    X_scaled = np.nan_to_num(X_scaled)

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X_scaled, target, test_size=0.2, random_state=42)

# Logistic Regression as the base classifier for the feature-selection searches
model = LogisticRegression(max_iter=1000)

# Fit a classifier and collect its test-set performance metrics
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score its predictions on the test split.

    Returns a dict with the keys "accuracy", "precision", "recall" and "f1_score";
    the last three are computed with average='weighted' and zero_division=0.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    weighted = {"average": 'weighted', "zero_division": 0}
    scores = {"accuracy": accuracy_score(y_test, predictions)}
    for key, scorer in (("precision", precision_score), ("recall", recall_score), ("f1_score", f1_score)):
        scores[key] = scorer(y_test, predictions, **weighted)
    return scores

# Classifiers to evaluate.
# The stochastic estimators (decision tree, MLP) are seeded so that repeated runs write
# the same metrics, in line with the random_state=42 used for the train/test split.
classifiers = {
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "SVM": SVC(),
    "ANN": MLPClassifier(max_iter=1000, random_state=42),
}

# Dimensionality reduction methods: maps "<Method>_<d>" -> (reduced X_train, reduced X_test)
methods = {}

# Dedicated seeded generator so the random "naive" baseline picks the same columns on
# every run, in line with the fixed random_state used for the train/test split.
naive_rng = np.random.default_rng(42)


def _run_sklearn_sfs(d, direction):
    """Select d features with sklearn's SequentialFeatureSelector; return (train, test, indices)."""
    selector = SequentialFeatureSelector(
        model, n_features_to_select=d, direction=direction
    ).fit(X_train, y_train)
    chosen = np.where(selector.get_support())[0]
    return selector.transform(X_train), selector.transform(X_test), chosen


def _run_mlxtend_sfs(d, forward, floating):
    """Select d features with mlxtend's SFS (accuracy, cv=5); return (train, test, indices)."""
    selector = SFS(
        model, k_features=d, forward=forward, floating=floating, scoring='accuracy', cv=5
    ).fit(X_train, y_train)
    chosen = list(selector.k_feature_idx_)
    return X_train[:, chosen], X_test[:, chosen], chosen


# Each entry: (label printed on success, label printed on failure, key prefix, runner).
# NOTE(review): the "Bidirectional Search" entry is configured as plain forward selection
# (forward=True, floating=False), so it performs a forward search rather than a true
# bidirectional one.
selection_searches = [
    ("Step-wise Forward Selection", "Step-wise Forward Selection", "SFS_Forward",
     lambda d: _run_sklearn_sfs(d, 'forward')),
    ("Step-wise Backward Removal", "Step-wise Backward Removal", "SFS_Backward",
     lambda d: _run_sklearn_sfs(d, 'backward')),
    ("Bidirectional Search", "Bidirectional Search", "SFS_Bi",
     lambda d: _run_mlxtend_sfs(d, forward=True, floating=False)),
    ("Step-wise Floating Forward Selection", "Floating Forward Selection", "SFS_Float_Forward",
     lambda d: _run_mlxtend_sfs(d, forward=True, floating=True)),
    ("Step-wise Floating Backward Removal", "Floating Backward Selection", "SFS_Float_Backward",
     lambda d: _run_mlxtend_sfs(d, forward=False, floating=True)),
]

# Test a range of dimensions: d runs from 1 up to min(11, number of features) - 1
for d in range(1, min(11, X_copy.shape[1])):
    print(f"\n### Feature Selection and Dimensionality Reduction with d = {d} ###\n")

    # Naïve Search (random selection of d distinct feature columns)
    feature_indices = naive_rng.choice(X_scaled.shape[1], d, replace=False)
    X_train_naive = X_train[:, feature_indices]
    X_test_naive = X_test[:, feature_indices]
    print(f"Naïve Search Selected Features: {feature_indices}")
    methods[f"Naive_{d}"] = (X_train_naive, X_test_naive)

    # Sequential feature-selection variants; a failing search is reported and skipped
    for success_label, failure_label, key_prefix, runner in selection_searches:
        try:
            X_train_sel, X_test_sel, chosen_features = runner(d)
            print(f"{success_label} Selected Features: {chosen_features}")
            methods[f"{key_prefix}_{d}"] = (X_train_sel, X_test_sel)
        except Exception as e:
            print(f"{failure_label} failed: {e}")

    # Principal Component Analysis (PCA), fit on the training split only
    try:
        pca = PCA(n_components=d)
        X_train_pca = pca.fit_transform(X_train)
        X_test_pca = pca.transform(X_test)
        print(f"PCA Explained Variance Ratio: {pca.explained_variance_ratio_}")
        methods[f"PCA_{d}"] = (X_train_pca, X_test_pca)
    except Exception as e:
        print(f"PCA failed: {e}")

    # Linear Discriminant Analysis (LDA)
    try:
        # LDA yields at most (number of classes - 1) components, so cap d accordingly
        n_components = min(d, len(np.unique(y_train)) - 1)
        if n_components > 0:  # Only perform LDA if we have enough classes
            lda = LDA(n_components=n_components)
            X_train_lda = lda.fit_transform(X_train, y_train)
            X_test_lda = lda.transform(X_test)
            if hasattr(lda, 'explained_variance_ratio_'):
                print(f"LDA Explained Variance Ratio: {lda.explained_variance_ratio_}")
            methods[f"LDA_{d}"] = (X_train_lda, X_test_lda)
    except Exception as e:
        print(f"LDA failed: {e}")

# Score every classifier on every reduced feature set; a failing fit is logged and
# recorded as all-zero metrics.
results = {}
for method_name, (X_tr, X_te) in methods.items():
    method_scores = results.setdefault(method_name, {})
    for clf_name, clf in classifiers.items():
        try:
            clf_scores = evaluate_model(clf, X_tr, X_te, y_train, y_test)
            print(f"Evaluated {clf_name} with {method_name}: {clf_scores['accuracy']:.4f} accuracy")
        except Exception as e:
            print(f"Error evaluating {clf_name} with {method_name}: {e}")
            clf_scores = {"accuracy": 0, "precision": 0, "recall": 0, "f1_score": 0}
        method_scores[clf_name] = clf_scores

# Flatten to one row per (method, classifier) pair for analysis
flat_results = {}
for method_name, method_scores in results.items():
    for clf_name, clf_scores in method_scores.items():
        flat_results[(method_name, clf_name)] = clf_scores
results_df = pd.DataFrame.from_dict(flat_results, orient='index')

# Persist the metrics table
results_df.to_csv("wine_dataset_metrics.csv")
print("Feature selection and dimensionality reduction completed.")
print("Results saved to 'wine_dataset_metrics.csv'")

{'uci_id': 109, 'name': 'Wine', 'repository_url': 'https://archive.ics.uci.edu/dataset/109/wine', 'data_url': 'https://archive.ics.uci.edu/static/public/109/data.csv', 'abstract': 'Using chemical analysis to determine the origin of wines', 'area': 'Physics and Chemistry', 'tasks': ['Classification'], 'characteristics': ['Tabular'], 'num_instances': 178, 'num_features': 13, 'feature_types': ['Integer', 'Real'], 'demographics': [], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1992, 'last_updated': 'Mon Aug 28 2023', 'dataset_doi': '10.24432/C5PC7J', 'creators': ['Stefan Aeberhard', 'M. Forina'], 'intro_paper': {'ID': 246, 'type': 'NATIVE', 'title': 'Comparative analysis of statistical pattern recognition methods in high dimensional settings', 'authors': 'S. Aeberhard, D. Coomans, O. Vel', 'venue': 'Pattern Recognition', 'year': 1994, 'journal': None, 'DOI': '10.1016/0031-3203(94)90145-7', 'URL': 'https:



Evaluated ANN with Naive_4: 0.8889 accuracy
Evaluated Decision Tree with SFS_Forward_4: 0.9722 accuracy
Evaluated KNN with SFS_Forward_4: 0.9722 accuracy
Evaluated Naive Bayes with SFS_Forward_4: 0.9722 accuracy
Evaluated SVM with SFS_Forward_4: 1.0000 accuracy
Evaluated ANN with SFS_Forward_4: 1.0000 accuracy
Evaluated Decision Tree with SFS_Backward_4: 1.0000 accuracy
Evaluated KNN with SFS_Backward_4: 1.0000 accuracy
Evaluated Naive Bayes with SFS_Backward_4: 1.0000 accuracy
Evaluated SVM with SFS_Backward_4: 1.0000 accuracy
Evaluated ANN with SFS_Backward_4: 1.0000 accuracy
Evaluated Decision Tree with SFS_Bi_4: 0.9722 accuracy
Evaluated KNN with SFS_Bi_4: 0.9722 accuracy
Evaluated Naive Bayes with SFS_Bi_4: 0.9722 accuracy
Evaluated SVM with SFS_Bi_4: 1.0000 accuracy
Evaluated ANN with SFS_Bi_4: 1.0000 accuracy
Evaluated Decision Tree with SFS_Float_Forward_4: 0.9722 accuracy
Evaluated KNN with SFS_Float_Forward_4: 0.9722 accuracy
Evaluated Naive Bayes with SFS_Float_Forward_4: 0.

In [None]:
pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Fit a classifier and collect its test-set performance metrics
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score its predictions on the test split.

    Returns a dict with the keys "accuracy", "precision", "recall" and "f1_score";
    the last three are computed with average='weighted' and zero_division=0.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    weighted = {"average": 'weighted', "zero_division": 0}
    scores = {"accuracy": accuracy_score(y_test, predictions)}
    for key, scorer in (("precision", precision_score), ("recall", recall_score), ("f1_score", f1_score)):
        scores[key] = scorer(y_test, predictions, **weighted)
    return scores

# Classifiers to evaluate.
# The stochastic estimators (decision tree, MLP) are seeded so that repeated runs write
# the same metrics, in line with the random_state=42 used for the train/test split.
classifiers = {
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "SVM": SVC(),
    "ANN": MLPClassifier(max_iter=1000, random_state=42),
}

# Only evaluate when the feature-selection cell has already populated `methods`
if "methods" not in locals():
    print("Error: `methods` dictionary not found. Ensure feature selection has been performed.")
else:
    # Score every classifier on every reduced feature set; a failing fit is logged and
    # recorded as all-zero metrics.
    results = {}
    for method_name, (X_tr, X_te) in methods.items():
        method_scores = results.setdefault(method_name, {})
        for clf_name, clf in classifiers.items():
            try:
                clf_scores = evaluate_model(clf, X_tr, X_te, y_train, y_test)
                print(f"Evaluated {clf_name} with {method_name}: {clf_scores['accuracy']:.4f} accuracy")
            except Exception as e:
                print(f"Error evaluating {clf_name} with {method_name}: {e}")
                clf_scores = {"accuracy": 0, "precision": 0, "recall": 0, "f1_score": 0}
            method_scores[clf_name] = clf_scores

    # Flatten to one row per (method, classifier) pair for analysis
    flat_results = {}
    for method_name, method_scores in results.items():
        for clf_name, clf_scores in method_scores.items():
            flat_results[(method_name, clf_name)] = clf_scores
    results_df = pd.DataFrame.from_dict(flat_results, orient='index')

    # Persist the metrics table
    results_df.to_csv("wine_dataset_metrics.csv")

    print("Model evaluation completed. Results saved to 'wine_dataset_metrics.csv'")


Evaluated Decision Tree with Naive_1: 0.5000 accuracy
Evaluated KNN with Naive_1: 0.5833 accuracy
Evaluated Naive Bayes with Naive_1: 0.5000 accuracy
Evaluated SVM with Naive_1: 0.6111 accuracy
Evaluated ANN with Naive_1: 0.6111 accuracy
Evaluated Decision Tree with SFS_Forward_1: 0.6944 accuracy
Evaluated KNN with SFS_Forward_1: 0.7500 accuracy
Evaluated Naive Bayes with SFS_Forward_1: 0.8333 accuracy
Evaluated SVM with SFS_Forward_1: 0.8056 accuracy
Evaluated ANN with SFS_Forward_1: 0.8333 accuracy
Evaluated Decision Tree with SFS_Backward_1: 0.5278 accuracy
Evaluated KNN with SFS_Backward_1: 0.6667 accuracy
Evaluated Naive Bayes with SFS_Backward_1: 0.8056 accuracy
Evaluated SVM with SFS_Backward_1: 0.7778 accuracy
Evaluated ANN with SFS_Backward_1: 0.8056 accuracy
Evaluated Decision Tree with SFS_Bi_1: 0.6944 accuracy
Evaluated KNN with SFS_Bi_1: 0.7500 accuracy
Evaluated Naive Bayes with SFS_Bi_1: 0.8333 accuracy
Evaluated SVM with SFS_Bi_1: 0.8056 accuracy
Evaluated ANN with SFS_



Evaluated ANN with Naive_4: 0.8889 accuracy
Evaluated Decision Tree with SFS_Forward_4: 0.9722 accuracy
Evaluated KNN with SFS_Forward_4: 0.9722 accuracy
Evaluated Naive Bayes with SFS_Forward_4: 0.9722 accuracy
Evaluated SVM with SFS_Forward_4: 1.0000 accuracy
Evaluated ANN with SFS_Forward_4: 1.0000 accuracy
Evaluated Decision Tree with SFS_Backward_4: 1.0000 accuracy
Evaluated KNN with SFS_Backward_4: 1.0000 accuracy
Evaluated Naive Bayes with SFS_Backward_4: 1.0000 accuracy
Evaluated SVM with SFS_Backward_4: 1.0000 accuracy
Evaluated ANN with SFS_Backward_4: 1.0000 accuracy
Evaluated Decision Tree with SFS_Bi_4: 0.9722 accuracy
Evaluated KNN with SFS_Bi_4: 0.9722 accuracy
Evaluated Naive Bayes with SFS_Bi_4: 0.9722 accuracy
Evaluated SVM with SFS_Bi_4: 1.0000 accuracy
Evaluated ANN with SFS_Bi_4: 1.0000 accuracy
Evaluated Decision Tree with SFS_Float_Forward_4: 0.9722 accuracy
Evaluated KNN with SFS_Float_Forward_4: 0.9722 accuracy
Evaluated Naive Bayes with SFS_Float_Forward_4: 0.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the saved metrics back, using the first two CSV columns as the row index
results_df = pd.read_csv("wine_dataset_metrics.csv", index_col=[0, 1])

# Fall back to building a MultiIndex from tuples if the reader did not produce one
if not isinstance(results_df.index, pd.MultiIndex):
    results_df.index = pd.MultiIndex.from_tuples(results_df.index)

# Show the structure of the DataFrame (index, then columns)
print("DataFrame Structure:", results_df.index, results_df.columns, sep="\n")

# 4i) Table Summary
print("Table Summary of Performance Metrics:", results_df, sep="\n")

# 4ii) Pie Charts
def plot_pie_charts(results_df, metric):
    """Draw a 3x3 grid of pies, one per level-0 index group, for the ``metric`` column.

    Only the first nine groups (in groupby order) are drawn; each pie's slices are the
    metric values of the rows in that group.
    """
    fig, axes = plt.subplots(3, 3, figsize=(18, 18))
    fig.suptitle(f'Pie Charts for {metric.capitalize()}', fontsize=20)
    # axes.flat walks the grid row by row and runs out after nine panels
    for ax, (method, group) in zip(axes.flat, results_df.groupby(level=0)):
        slice_values = group.xs(method, level=0)[metric]
        slice_values.plot.pie(ax=ax, autopct='%1.1f%%', title=f'{method}')
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()

for metric in ("accuracy", "precision", "recall", "f1_score"):
    plot_pie_charts(results_df, metric)

# 4iii) Bar Charts
def plot_bar_charts(results_df, metric):
    """Grouped bar chart of ``metric``: level-0 labels on the x-axis, one series per level-1 label."""
    pivoted = results_df[metric].unstack(level=1)
    ax = pivoted.plot(kind='bar', figsize=(15, 10), title=f'Bar Charts for {metric.capitalize()}')
    ax.set_ylabel(metric)
    plt.xticks(rotation=45)
    # Anchor the legend outside the plotting area, to the right
    ax.legend(title='Classifiers', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()

for metric in ("accuracy", "precision", "recall", "f1_score"):
    plot_bar_charts(results_df, metric)

# 4iv) Line Graphs
def plot_line_graphs(results_df, metric):
    """Line chart of ``metric``: level-0 labels on the x-axis, one marked line per level-1 label."""
    pivoted = results_df[metric].unstack(level=1)
    ax = pivoted.plot(kind='line', marker='o', figsize=(15, 10), title=f'Line Graphs for {metric.capitalize()}')
    ax.set_ylabel(metric)
    plt.xticks(rotation=45)
    # Anchor the legend outside the plotting area, to the right
    ax.legend(title='Classifiers', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()

for metric in ("accuracy", "precision", "recall", "f1_score"):
    plot_line_graphs(results_df, metric)

# 4vi) Box Plots
def plot_box_plots(results_df, metric):
    """Box plot of ``metric`` with one box per level-1 label, pooling all level-0 rows."""
    pivoted = results_df[metric].unstack(level=1)
    plt.figure(figsize=(15, 10))
    ax = sns.boxplot(data=pivoted)
    ax.set_title(f'Box Plots for {metric.capitalize()}')
    ax.set_ylabel(metric)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

for metric in ("accuracy", "precision", "recall", "f1_score"):
    plot_box_plots(results_df, metric)


DATASET 3 : LUNG CANCER

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
import matplotlib.pyplot as plt
import seaborn as sns
from ucimlrepo import fetch_ucirepo

# Fetch lung cancer dataset
lung_cancer = fetch_ucirepo(id=62)

# Data (as pandas dataframes)
X = lung_cancer.data.features
y = lung_cancer.data.targets

# Display metadata and variable information
print("Dataset Metadata:")
print(lung_cancer.metadata)
print("\nVariable Information:")
print(lung_cancer.variables)

# Create a copy of X and y to avoid the SettingWithCopyWarning
X_copy = X.copy()
y_copy = y.copy()

# Check for missing values before preprocessing
print(f"\nMissing values in original X: {X_copy.isnull().sum().sum()}")
print(f"Missing values in original y: {y_copy.isnull().sum().sum()}")

# Turn '?' placeholders and any other non-numeric entries into NaN
X_copy = X_copy.replace('?', np.nan)
X_copy = X_copy.apply(pd.to_numeric, errors='coerce')

# For the lung cancer dataset, we need to extract the target variable
# The target is typically the 'Class' column or similar
if 'Class' in y_copy.columns:
    target = y_copy['Class']
elif 'TARGET' in y_copy.columns:
    target = y_copy['TARGET']
else:
    # If specific target columns don't exist, use the first target column
    target_col = y_copy.columns[0]
    target = y_copy[target_col]
    print(f"Using {target_col} as target variable")

# Make sure target has no missing values
target = target.fillna(target.mode()[0])  # Fill missing values with the most common class

# Convert target to numeric if it's not already
target = pd.to_numeric(target, errors='coerce')
# Coercion can introduce new NaN values, so fill once more if needed
if target.isnull().sum() > 0:
    target = target.fillna(target.mode()[0])

# Split BEFORE imputing and scaling: the imputation means and the scaler
# statistics must come from the training rows only, otherwise information
# about the test rows leaks into preprocessing. The same random_state keeps
# the same rows in each split as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_copy, target, test_size=0.2, random_state=42
)

# Fill missing feature values with the training-set mean of each column
train_means = X_train_raw.mean()
X_train_raw = X_train_raw.fillna(train_means)
X_test_raw = X_test_raw.fillna(train_means)
X_copy = X_copy.fillna(train_means)

# Verify no missing values remain
print(f"Missing values after preprocessing: {X_copy.isnull().sum().sum()}")

# Standardizing features with statistics learned from the training split
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Full scaled matrix, kept because later cells read X_scaled (e.g. its column count)
X_scaled = scaler.transform(X_copy)

# Double-check for NaN values after scaling
if np.isnan(X_train).any() or np.isnan(X_test).any():
    print("Warning: NaN values found after scaling. Replacing with 0...")
    X_train = np.nan_to_num(X_train)
    X_test = np.nan_to_num(X_test)
    X_scaled = np.nan_to_num(X_scaled)

# Logistic Regression as the base classifier
model = LogisticRegression(max_iter=1000)
# Function to evaluate and collect performance metrics
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score its test-split predictions.

    Returns:
        dict with keys "accuracy", "precision", "recall" and "f1_score".
        Precision, recall and F1 are weighted averages over classes, and an
        undefined score counts as 0 (``zero_division=0``).
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    # Shared settings for the three class-averaged metrics
    averaging = {"average": 'weighted', "zero_division": 0}
    return {
        "accuracy": accuracy_score(y_test, predictions),
        "precision": precision_score(y_test, predictions, **averaging),
        "recall": recall_score(y_test, predictions, **averaging),
        "f1_score": f1_score(y_test, predictions, **averaging),
    }

# Classifiers to evaluate, keyed by the display name used in results and plots
classifiers = {}
classifiers["Decision Tree"] = DecisionTreeClassifier()
classifiers["KNN"] = KNeighborsClassifier()
classifiers["Naive Bayes"] = GaussianNB()
classifiers["SVM"] = SVC()
classifiers["ANN"] = MLPClassifier(max_iter=1000)

# Dimensionality reduction methods: "<Method>_<d>" -> (reduced train matrix, reduced test matrix)
methods = {}
# Seeded generator so the random-subset baseline is reproducible across runs,
# in line with the fixed random_state used for the train/test split
rng = np.random.default_rng(42)
# LDA yields at most (number of classes - 1) components; this does not depend on d
max_lda_components = len(np.unique(y_train)) - 1
# Test a range of dimensions: d runs from 1 to at most 10 and stays below the feature count
for d in range(1, min(11, X_copy.shape[1])):
    print(f"\n### Feature Selection and Dimensionality Reduction with d = {d} ###\n")

    # Naïve Search (Random selection of d features)
    feature_indices = rng.choice(X_scaled.shape[1], size=d, replace=False)
    X_train_naive = X_train[:, feature_indices]
    X_test_naive = X_test[:, feature_indices]
    print(f"Naïve Search Selected Features: {feature_indices}")
    methods[f"Naive_{d}"] = (X_train_naive, X_test_naive)

    # Each method below is best-effort: a failure is reported and the method
    # is simply left out of `methods` for this d.

    # Step-wise Forward Selection
    try:
        sfs_forward = SequentialFeatureSelector(model, n_features_to_select=d, direction='forward').fit(X_train, y_train)
        X_train_sfs = sfs_forward.transform(X_train)
        X_test_sfs = sfs_forward.transform(X_test)
        print(f"Step-wise Forward Selection Selected Features: {np.where(sfs_forward.get_support())[0]}")
        methods[f"SFS_Forward_{d}"] = (X_train_sfs, X_test_sfs)
    except Exception as e:
        print(f"Step-wise Forward Selection failed: {e}")

    # Step-wise Backward Removal
    try:
        sfs_backward = SequentialFeatureSelector(model, n_features_to_select=d, direction='backward').fit(X_train, y_train)
        X_train_sbs = sfs_backward.transform(X_train)
        X_test_sbs = sfs_backward.transform(X_test)
        print(f"Step-wise Backward Removal Selected Features: {np.where(sfs_backward.get_support())[0]}")
        methods[f"SFS_Backward_{d}"] = (X_train_sbs, X_test_sbs)
    except Exception as e:
        print(f"Step-wise Backward Removal failed: {e}")

    # Bidirectional Search (using MLxtend's SFS)
    # NOTE(review): forward=True with floating=False looks like plain sequential
    # forward selection rather than a bidirectional search - confirm against the
    # mlxtend documentation before interpreting the "SFS_Bi" results.
    try:
        sfs_bi = SFS(model, k_features=d, forward=True, floating=False, scoring='accuracy', cv=5).fit(X_train, y_train)
        bi_columns = list(sfs_bi.k_feature_idx_)
        X_train_bi = X_train[:, bi_columns]
        X_test_bi = X_test[:, bi_columns]
        print(f"Bidirectional Search Selected Features: {bi_columns}")
        methods[f"SFS_Bi_{d}"] = (X_train_bi, X_test_bi)
    except Exception as e:
        print(f"Bidirectional Search failed: {e}")

    # Step-wise Floating Forward Selection
    try:
        sfs_float_forward = SFS(model, k_features=d, forward=True, floating=True, scoring='accuracy', cv=5).fit(X_train, y_train)
        float_forward_columns = list(sfs_float_forward.k_feature_idx_)
        X_train_sfsf = X_train[:, float_forward_columns]
        X_test_sfsf = X_test[:, float_forward_columns]
        print(f"Step-wise Floating Forward Selection Selected Features: {float_forward_columns}")
        methods[f"SFS_Float_Forward_{d}"] = (X_train_sfsf, X_test_sfsf)
    except Exception as e:
        print(f"Floating Forward Selection failed: {e}")

    # Step-wise Floating Backward Removal
    try:
        sfs_float_backward = SFS(model, k_features=d, forward=False, floating=True, scoring='accuracy', cv=5).fit(X_train, y_train)
        float_backward_columns = list(sfs_float_backward.k_feature_idx_)
        X_train_sfsb = X_train[:, float_backward_columns]
        X_test_sfsb = X_test[:, float_backward_columns]
        print(f"Step-wise Floating Backward Removal Selected Features: {float_backward_columns}")
        methods[f"SFS_Float_Backward_{d}"] = (X_train_sfsb, X_test_sfsb)
    except Exception as e:
        print(f"Floating Backward Selection failed: {e}")

    # Principal Component Analysis (PCA)
    try:
        pca = PCA(n_components=d)
        X_train_pca = pca.fit_transform(X_train)
        X_test_pca = pca.transform(X_test)
        print(f"PCA Explained Variance Ratio: {pca.explained_variance_ratio_}")
        methods[f"PCA_{d}"] = (X_train_pca, X_test_pca)
    except Exception as e:
        print(f"PCA failed: {e}")

    # Linear Discriminant Analysis (LDA)
    # The number of components is capped at (classes - 1), so for d above that
    # cap the entry "LDA_<d>" holds fewer than d components.
    try:
        n_components = min(d, max_lda_components)
        if n_components > 0:  # Only perform LDA if we have enough classes
            lda = LDA(n_components=n_components)
            X_train_lda = lda.fit_transform(X_train, y_train)
            X_test_lda = lda.transform(X_test)
            if hasattr(lda, 'explained_variance_ratio_'):
                print(f"LDA Explained Variance Ratio: {lda.explained_variance_ratio_}")
            methods[f"LDA_{d}"] = (X_train_lda, X_test_lda)
    except Exception as e:
        print(f"LDA failed: {e}")

# Evaluate classifiers on each method
results = {}
for method_name, (X_tr, X_te) in methods.items():
    results[method_name] = {}
    for clf_name, clf in classifiers.items():
        try:
            scores = evaluate_model(clf, X_tr, X_te, y_train, y_test)
        except Exception as e:
            # A failed evaluation is reported and recorded as all-zero metrics
            print(f"Error evaluating {clf_name} with {method_name}: {e}")
            scores = {"accuracy": 0, "precision": 0, "recall": 0, "f1_score": 0}
        else:
            print(f"Evaluated {clf_name} with {method_name}: {scores['accuracy']:.4f} accuracy")
        results[method_name][clf_name] = scores

# Convert results to DataFrame for analysis: one row per (method, classifier) pair
results_df = pd.DataFrame.from_dict(
    {
        (method, clf_label): metric_values
        for method, per_classifier in results.items()
        for clf_label, metric_values in per_classifier.items()
    },
    orient='index',
)

# Save the results to a CSV file
results_df.to_csv("lung_cancer_metrics.csv")
print("Feature selection and dimensionality reduction completed.")
print("Results saved to 'lung_cancer_metrics.csv'")

In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Function to evaluate a model
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score its test-split predictions.

    Returns:
        dict with keys "accuracy", "precision", "recall" and "f1_score".
        Precision, recall and F1 are weighted averages over classes, and an
        undefined score counts as 0 (``zero_division=0``).
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    # Shared settings for the three class-averaged metrics
    averaging = {"average": 'weighted', "zero_division": 0}
    return {
        "accuracy": accuracy_score(y_test, predictions),
        "precision": precision_score(y_test, predictions, **averaging),
        "recall": recall_score(y_test, predictions, **averaging),
        "f1_score": f1_score(y_test, predictions, **averaging),
    }

# Define classifiers, keyed by the display name used in results and plots
classifiers = {}
classifiers["Decision Tree"] = DecisionTreeClassifier()
classifiers["KNN"] = KNeighborsClassifier()
classifiers["Naive Bayes"] = GaussianNB()
classifiers["SVM"] = SVC()
classifiers["ANN"] = MLPClassifier(max_iter=1000)

# Dictionary to store results: results[method][classifier] -> metrics dict
results = {}

# Evaluate classifiers for each dimensionality reduction/feature selection method
# (`methods`, `y_train` and `y_test` come from the previous cell)
for method_name, (X_tr, X_te) in methods.items():
    results[method_name] = {}
    for clf_name, clf in classifiers.items():
        try:
            scores = evaluate_model(clf, X_tr, X_te, y_train, y_test)
        except Exception as e:
            # A failed evaluation is reported and recorded as all-zero metrics
            print(f"Error evaluating {clf_name} with {method_name}: {e}")
            scores = {"accuracy": 0, "precision": 0, "recall": 0, "f1_score": 0}
        else:
            print(f"Evaluated {clf_name} with {method_name}: {scores['accuracy']:.4f} accuracy")
        results[method_name][clf_name] = scores

# Convert results to DataFrame: one row per (method, classifier) pair
results_df = pd.DataFrame.from_dict(
    {
        (method, clf_label): metric_values
        for method, per_classifier in results.items()
        for clf_label, metric_values in per_classifier.items()
    },
    orient='index',
)

# Save results to CSV
results_df.to_csv("lung_cancer_metrics.csv")
print("Model evaluation completed. Results saved to 'lung_cancer_metrics.csv'")


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the performance metrics data; the first two CSV columns form the row index
results_df = pd.read_csv("lung_cancer_metrics.csv", index_col=[0, 1])

# Ensure the index is a MultiIndex
index_is_multi = isinstance(results_df.index, pd.MultiIndex)
if not index_is_multi:
    results_df.index = pd.MultiIndex.from_tuples(results_df.index)

# Check the structure of the DataFrame
print("DataFrame Structure:", results_df.index, results_df.columns, sep="\n")

# Table Summary
print("Table Summary of Performance Metrics:", results_df, sep="\n")

# Function to plot pie charts
def plot_pie_charts(results_df, metric):
    """Show a 3x3 grid of pie charts for one metric, one pie per level-0 index group.

    Only the first nine groups (in ``groupby`` order) are drawn because the
    grid has nine panels; any further groups are not plotted. Panels left
    without a group are hidden instead of being shown as empty axes.

    Parameters:
        results_df: DataFrame with a two-level index and a column named ``metric``.
        metric: Name of the column whose values become the pie slices.
    """
    fig, axes = plt.subplots(3, 3, figsize=(18, 18))
    fig.suptitle(f'Pie Charts for {metric.capitalize()}', fontsize=20)

    # Row-major panel order, the same order as indexing axes[idx // 3, idx % 3]
    panels = list(axes.flat)
    drawn = 0
    # zip stops after nine groups, which keeps the plot inside the grid
    for ax, (method, group) in zip(panels, results_df.groupby(level=0)):
        # Drop the level-0 label so the slices are labelled by level 1 only
        slices = group.xs(method, level=0)[metric]
        slices.plot.pie(ax=ax, autopct='%1.1f%%', title=f'{method}')
        drawn += 1

    # Hide panels that received no group (fewer than nine groups)
    for ax in panels[drawn:]:
        ax.axis('off')

    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()

# Function to plot bar charts
def plot_bar_charts(results_df, metric):
    """Show a grouped bar chart of one metric column.

    Level 1 of ``results_df``'s index is pivoted into columns (shown in the
    legend as 'Classifiers'); the remaining index level labels the x-axis.

    Parameters:
        results_df: DataFrame with a two-level index and a column named ``metric``.
        metric: Name of the column to plot; also used for the title and y-label.
    """
    pivoted = results_df[metric].unstack(level=1)
    pivoted.plot(
        kind='bar',
        figsize=(15, 10),
        title=f'Bar Charts for {metric.capitalize()}',
    )
    plt.ylabel(metric)
    plt.xticks(rotation=45)
    # Anchor the legend outside the axes so it does not cover the bars
    plt.legend(title='Classifiers', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()

# Function to plot line graphs
def plot_line_graphs(results_df, metric):
    """Show a line graph of one metric column, one line per level-1 index value.

    Level 1 of ``results_df``'s index is pivoted into columns (shown in the
    legend as 'Classifiers'); the remaining index level labels the x-axis.

    Parameters:
        results_df: DataFrame with a two-level index and a column named ``metric``.
        metric: Name of the column to plot; also used for the title and y-label.
    """
    pivoted = results_df[metric].unstack(level=1)
    pivoted.plot(
        kind='line',
        marker='o',
        figsize=(15, 10),
        title=f'Line Graphs for {metric.capitalize()}',
    )
    plt.ylabel(metric)
    plt.xticks(rotation=45)
    # Anchor the legend outside the axes so it does not cover the lines
    plt.legend(title='Classifiers', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()

# Function to plot box plots
def plot_box_plots(results_df, metric):
    """Show one box per level-1 index value, summarising a metric column.

    Parameters:
        results_df: DataFrame with a two-level index and a column named ``metric``.
        metric: Name of the column to plot; also used for the title and y-label.
    """
    metric_values = results_df[metric]
    plt.figure(figsize=(15, 10))
    # Pivot level 1 of the index into columns so seaborn draws one box per column
    sns.boxplot(data=metric_values.unstack(level=1))
    plt.title(f'Box Plots for {metric.capitalize()}')
    plt.ylabel(metric)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

# Generate plots for each metric: pie, bar, line and box, in that order
metrics = ["accuracy", "precision", "recall", "f1_score"]
plotters = (plot_pie_charts, plot_bar_charts, plot_line_graphs, plot_box_plots)
for metric in metrics:
    for draw in plotters:
        draw(results_df, metric)


DATASET 4 : MUSHROOM

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from ucimlrepo import fetch_ucirepo

# Fetch dataset
mushroom = fetch_ucirepo(id=73)

# Data (as pandas dataframes)
X = mushroom.data.features
y = mushroom.data.targets

# Encode categorical features as one indicator column per category
X_encoded = pd.get_dummies(X)

# Encode target variable (first target column) as integer labels
le = LabelEncoder()
y_encoded = le.fit_transform(y.iloc[:, 0])

# Split BEFORE scaling: the scaler statistics must come from the training
# rows only, otherwise information about the test rows leaks into
# preprocessing. The same random_state keeps the same rows in each split.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_encoded, y_encoded, test_size=0.2, random_state=42
)

# Standardize features with statistics learned from the training split
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Full scaled matrix, kept so the name X_scaled stays available to later code
X_scaled = scaler.transform(X_encoded)

# Logistic Regression as the base classifier
model = LogisticRegression(max_iter=1000)

# Function to evaluate and collect performance metrics
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score its test-split predictions.

    Returns:
        dict with keys "accuracy", "precision", "recall" and "f1_score".
        Precision, recall and F1 are weighted averages over classes, and an
        undefined score counts as 0 (``zero_division=0``).
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    # Shared settings for the three class-averaged metrics
    averaging = {"average": 'weighted', "zero_division": 0}
    return {
        "accuracy": accuracy_score(y_test, predictions),
        "precision": precision_score(y_test, predictions, **averaging),
        "recall": recall_score(y_test, predictions, **averaging),
        "f1_score": f1_score(y_test, predictions, **averaging),
    }

# Classifiers to evaluate, keyed by the display name used in results and plots
classifiers = {}
classifiers["Decision Tree"] = DecisionTreeClassifier()
classifiers["KNN"] = KNeighborsClassifier()
classifiers["Naive Bayes"] = GaussianNB()
classifiers["SVM"] = SVC()
classifiers["ANN"] = MLPClassifier(max_iter=1000)

# Dimensionality reduction methods: "<Method>_<d>" -> (reduced train matrix, reduced test matrix)
methods = {}
for d in range(1, min(11, X_encoded.shape[1])):
    print(f"\n### Feature Selection and Dimensionality Reduction with d = {d} ###\n")

    # Step-wise Forward Selection, driven by the base classifier
    sfs_forward = SequentialFeatureSelector(model, n_features_to_select=d, direction='forward')
    sfs_forward.fit(X_train, y_train)
    X_train_sfs = sfs_forward.transform(X_train)
    X_test_sfs = sfs_forward.transform(X_test)
    methods[f"SFS_Forward_{d}"] = (X_train_sfs, X_test_sfs)

    # Principal Component Analysis (PCA): fitted on the training split only
    pca = PCA(n_components=d)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)
    methods[f"PCA_{d}"] = (X_train_pca, X_test_pca)

    # Linear Discriminant Analysis (LDA): at most (number of classes - 1) components
    class_count = len(np.unique(y_train))
    n_components = min(d, class_count - 1)
    if n_components > 0:
        lda = LDA(n_components=n_components)
        X_train_lda = lda.fit_transform(X_train, y_train)
        X_test_lda = lda.transform(X_test)
        methods[f"LDA_{d}"] = (X_train_lda, X_test_lda)

# Evaluate classifiers on each method: results[method][classifier] -> metrics dict
results = {}
for method_name, (X_tr, X_te) in methods.items():
    results[method_name] = {}
    for clf_name, clf in classifiers.items():
        scores = evaluate_model(clf, X_tr, X_te, y_train, y_test)
        results[method_name][clf_name] = scores

# Convert results to DataFrame: one row per (method, classifier) pair
results_df = pd.DataFrame.from_dict(
    {
        (method, clf_label): metric_values
        for method, per_classifier in results.items()
        for clf_label, metric_values in per_classifier.items()
    },
    orient='index',
)

# Save the results to a CSV file
results_df.to_csv("mushroom_performance_metrics.csv")

print("Feature selection and dimensionality reduction completed.")
print("Results saved to 'mushroom_performance_metrics.csv'")



### Feature Selection and Dimensionality Reduction with d = 1 ###


### Feature Selection and Dimensionality Reduction with d = 2 ###


### Feature Selection and Dimensionality Reduction with d = 3 ###


### Feature Selection and Dimensionality Reduction with d = 4 ###


### Feature Selection and Dimensionality Reduction with d = 5 ###


### Feature Selection and Dimensionality Reduction with d = 6 ###


### Feature Selection and Dimensionality Reduction with d = 7 ###


### Feature Selection and Dimensionality Reduction with d = 8 ###


### Feature Selection and Dimensionality Reduction with d = 9 ###


### Feature Selection and Dimensionality Reduction with d = 10 ###

Feature selection and dimensionality reduction completed.
Results saved to 'mushroom_performance_metrics.csv'


In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Function to evaluate and collect performance metrics
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score its test-split predictions.

    Returns:
        dict with keys "accuracy", "precision", "recall" and "f1_score".
        Precision, recall and F1 are weighted averages over classes, and an
        undefined score counts as 0 (``zero_division=0``).
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    # Shared settings for the three class-averaged metrics
    averaging = {"average": 'weighted', "zero_division": 0}
    return {
        "accuracy": accuracy_score(y_test, predictions),
        "precision": precision_score(y_test, predictions, **averaging),
        "recall": recall_score(y_test, predictions, **averaging),
        "f1_score": f1_score(y_test, predictions, **averaging),
    }

# Classifiers to evaluate, keyed by the display name used in results and plots
classifiers = {}
classifiers["Decision Tree"] = DecisionTreeClassifier()
classifiers["KNN"] = KNeighborsClassifier()
classifiers["Naive Bayes"] = GaussianNB()
classifiers["SVM"] = SVC()
classifiers["ANN"] = MLPClassifier(max_iter=1000)

# Evaluate classifiers on each method
# (`methods`, `y_train` and `y_test` come from the previous cell)
results = {}
for method_name, (X_tr, X_te) in methods.items():
    results[method_name] = {}
    for clf_name, clf in classifiers.items():
        try:
            scores = evaluate_model(clf, X_tr, X_te, y_train, y_test)
        except Exception as e:
            # A failed evaluation is reported and recorded as all-zero metrics
            print(f"Error evaluating {clf_name} with {method_name}: {e}")
            scores = {"accuracy": 0, "precision": 0, "recall": 0, "f1_score": 0}
        else:
            print(f"Evaluated {clf_name} with {method_name}: {scores['accuracy']:.4f} accuracy")
        results[method_name][clf_name] = scores

# Convert results to DataFrame for analysis: one row per (method, classifier) pair
results_df = pd.DataFrame.from_dict(
    {
        (method, clf_label): metric_values
        for method, per_classifier in results.items()
        for clf_label, metric_values in per_classifier.items()
    },
    orient='index',
)

# Save the results to a CSV file
results_df.to_csv("mushroom_classification_metrics.csv")

print("Model evaluation completed. Results saved to 'mushroom_classification_metrics.csv'")


Evaluated Decision Tree with SFS_Forward_1: 0.8782 accuracy
Evaluated KNN with SFS_Forward_1: 0.8782 accuracy
Evaluated Naive Bayes with SFS_Forward_1: 0.8782 accuracy
Evaluated SVM with SFS_Forward_1: 0.8782 accuracy
Evaluated ANN with SFS_Forward_1: 0.8782 accuracy
Evaluated Decision Tree with PCA_1: 0.8535 accuracy
Evaluated KNN with PCA_1: 0.8917 accuracy
Evaluated Naive Bayes with PCA_1: 0.8868 accuracy
Evaluated SVM with PCA_1: 0.8843 accuracy
Evaluated ANN with PCA_1: 0.9065 accuracy
Evaluated Decision Tree with LDA_1: 1.0000 accuracy
Evaluated KNN with LDA_1: 1.0000 accuracy
Evaluated Naive Bayes with LDA_1: 1.0000 accuracy
Evaluated SVM with LDA_1: 1.0000 accuracy
Evaluated ANN with LDA_1: 1.0000 accuracy
Evaluated Decision Tree with SFS_Forward_2: 0.9360 accuracy
Evaluated KNN with SFS_Forward_2: 0.9360 accuracy
Evaluated Naive Bayes with SFS_Forward_2: 0.9354 accuracy
Evaluated SVM with SFS_Forward_2: 0.9360 accuracy
Evaluated ANN with SFS_Forward_2: 0.9360 accuracy
Evaluate

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# Load the performance metrics data; the first two CSV columns form the row index
results_df = pd.read_csv("mushroom_classification_metrics.csv", index_col=[0, 1])

# Ensure the index is a MultiIndex
index_is_multi = isinstance(results_df.index, pd.MultiIndex)
if not index_is_multi:
    results_df.index = pd.MultiIndex.from_tuples(results_df.index)

# Check the structure of the DataFrame
print("DataFrame Structure:", results_df.index, results_df.columns, sep="\n")

# Table Summary
print("Table Summary of Performance Metrics:", results_df, sep="\n")

# Pie Charts
def plot_pie_charts(results_df, metric):
    """Show a 3x3 grid of pie charts for one metric, one pie per level-0 index group.

    Only the first nine groups (in ``groupby`` order) are drawn because the
    grid has nine panels; any further groups are not plotted. Panels left
    without a group are hidden instead of being shown as empty axes.

    Parameters:
        results_df: DataFrame with a two-level index and a column named ``metric``.
        metric: Name of the column whose values become the pie slices.
    """
    fig, axes = plt.subplots(3, 3, figsize=(18, 18))
    fig.suptitle(f'Pie Charts for {metric.capitalize()}', fontsize=20)

    # Row-major panel order, the same order as indexing axes[idx // 3, idx % 3]
    panels = list(axes.flat)
    drawn = 0
    # zip stops after nine groups, which keeps the plot inside the grid
    for ax, (method, group) in zip(panels, results_df.groupby(level=0)):
        # Drop the level-0 label so the slices are labelled by level 1 only
        slices = group.xs(method, level=0)[metric]
        slices.plot.pie(ax=ax, autopct='%1.1f%%', title=f'{method}')
        drawn += 1

    # Hide panels that received no group (fewer than nine groups)
    for ax in panels[drawn:]:
        ax.axis('off')

    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()

# Draw the pie-chart figure once for every performance metric
for metric in ("accuracy", "precision", "recall", "f1_score"):
    plot_pie_charts(results_df, metric)

# Bar Charts
def plot_bar_charts(results_df, metric):
    """Show a grouped bar chart of one metric column.

    Level 1 of ``results_df``'s index is pivoted into columns (shown in the
    legend as 'Classifiers'); the remaining index level labels the x-axis.

    Parameters:
        results_df: DataFrame with a two-level index and a column named ``metric``.
        metric: Name of the column to plot; also used for the title and y-label.
    """
    pivoted = results_df[metric].unstack(level=1)
    pivoted.plot(
        kind='bar',
        figsize=(15, 10),
        title=f'Bar Charts for {metric.capitalize()}',
    )
    plt.ylabel(metric)
    plt.xticks(rotation=45)
    # Anchor the legend outside the axes so it does not cover the bars
    plt.legend(title='Classifiers', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()

# Draw the bar chart once for every performance metric
for metric in ("accuracy", "precision", "recall", "f1_score"):
    plot_bar_charts(results_df, metric)

# Line Graphs
def plot_line_graphs(results_df, metric):
    """Show a line graph of one metric column, one line per level-1 index value.

    Level 1 of ``results_df``'s index is pivoted into columns (shown in the
    legend as 'Classifiers'); the remaining index level labels the x-axis.

    Parameters:
        results_df: DataFrame with a two-level index and a column named ``metric``.
        metric: Name of the column to plot; also used for the title and y-label.
    """
    pivoted = results_df[metric].unstack(level=1)
    pivoted.plot(
        kind='line',
        marker='o',
        figsize=(15, 10),
        title=f'Line Graphs for {metric.capitalize()}',
    )
    plt.ylabel(metric)
    plt.xticks(rotation=45)
    # Anchor the legend outside the axes so it does not cover the lines
    plt.legend(title='Classifiers', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()

# Draw the line graph once for every performance metric
for metric in ("accuracy", "precision", "recall", "f1_score"):
    plot_line_graphs(results_df, metric)

# Box Plots
def plot_box_plots(results_df, metric):
    """Show one box per level-1 index value, summarising a metric column.

    Parameters:
        results_df: DataFrame with a two-level index and a column named ``metric``.
        metric: Name of the column to plot; also used for the title and y-label.
    """
    metric_values = results_df[metric]
    plt.figure(figsize=(15, 10))
    # Pivot level 1 of the index into columns so seaborn draws one box per column
    sns.boxplot(data=metric_values.unstack(level=1))
    plt.title(f'Box Plots for {metric.capitalize()}')
    plt.ylabel(metric)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

# Draw the box plot once for every performance metric
for metric in ("accuracy", "precision", "recall", "f1_score"):
    plot_box_plots(results_df, metric)


DATASET 5 : SPAMBASE

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
import matplotlib.pyplot as plt
import seaborn as sns
from ucimlrepo import fetch_ucirepo

# fetch Spambase dataset
spambase = fetch_ucirepo(id=94)

# data (as pandas dataframes)
X = spambase.data.features
y = spambase.data.targets

# Create a copy of X and y to avoid the SettingWithCopyWarning
X_copy = X.copy()
y_copy = y.copy()

# Check for missing values before preprocessing
print(f"Missing values in original X: {X_copy.isnull().sum().sum()}")
print(f"Missing values in original y: {y_copy.isnull().sum().sum()}")

# Turn '?' placeholders and any other non-numeric entries into NaN
X_copy = X_copy.replace('?', np.nan)
X_copy = X_copy.apply(pd.to_numeric, errors='coerce')

# The first target column is used as the label
# (presumably the binary spam flag, 1 for spam and 0 for non-spam - confirm against the dataset metadata)
target = y_copy.iloc[:, 0]

# Make sure target has no missing values
if target.isnull().sum() > 0:
    target = target.fillna(target.mode()[0])  # Fill with most common value

# Split BEFORE imputing and scaling: the imputation means and the scaler
# statistics must come from the training rows only, otherwise information
# about the test rows leaks into preprocessing. The same random_state keeps
# the same rows in each split as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_copy, target, test_size=0.2, random_state=42
)

# Fill missing feature values with the training-set mean of each column
train_means = X_train_raw.mean()
X_train_raw = X_train_raw.fillna(train_means)
X_test_raw = X_test_raw.fillna(train_means)
X_copy = X_copy.fillna(train_means)

# Verify no missing values remain
print(f"Missing values after preprocessing: {X_copy.isnull().sum().sum()}")

# Standardizing features with statistics learned from the training split
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Full scaled matrix, kept because later code reads X_scaled (e.g. its column count)
X_scaled = scaler.transform(X_copy)

# Double-check for NaN values after scaling
if np.isnan(X_train).any() or np.isnan(X_test).any():
    print("Warning: NaN values found after scaling. Replacing with 0...")
    X_train = np.nan_to_num(X_train)
    X_test = np.nan_to_num(X_test)
    X_scaled = np.nan_to_num(X_scaled)

# Logistic Regression as the base classifier
model = LogisticRegression(max_iter=1000)

# Function to evaluate and collect performance metrics
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score its test-split predictions.

    Returns:
        dict with keys "accuracy", "precision", "recall" and "f1_score".
        Precision, recall and F1 are weighted averages over classes, and an
        undefined score counts as 0 (``zero_division=0``).
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    # Shared settings for the three class-averaged metrics
    averaging = {"average": 'weighted', "zero_division": 0}
    return {
        "accuracy": accuracy_score(y_test, predictions),
        "precision": precision_score(y_test, predictions, **averaging),
        "recall": recall_score(y_test, predictions, **averaging),
        "f1_score": f1_score(y_test, predictions, **averaging),
    }

# Classifiers to evaluate, keyed by the display name used in results and plots
classifiers = {}
classifiers["Decision Tree"] = DecisionTreeClassifier()
classifiers["KNN"] = KNeighborsClassifier()
classifiers["Naive Bayes"] = GaussianNB()
classifiers["SVM"] = SVC()
classifiers["ANN"] = MLPClassifier(max_iter=1000)

# Dimensionality reduction methods: "<Method>_<d>" -> (reduced train matrix, reduced test matrix)
methods = {}
# Seeded generator so the random-subset baseline is reproducible across runs,
# in line with the fixed random_state used for the train/test split
rng = np.random.default_rng(42)
# LDA yields at most (number of classes - 1) components; this does not depend on d
max_lda_components = len(np.unique(y_train)) - 1
# Test a range of dimensions: d runs from 1 to at most 15 and stays below the feature count
for d in range(1, min(16, X_copy.shape[1])):
    print(f"\n### Feature Selection and Dimensionality Reduction with d = {d} ###\n")

    # Naïve Search (Random selection of d features)
    feature_indices = rng.choice(X_scaled.shape[1], size=d, replace=False)
    X_train_naive = X_train[:, feature_indices]
    X_test_naive = X_test[:, feature_indices]
    print(f"Naïve Search Selected Features: {feature_indices}")
    methods[f"Naive_{d}"] = (X_train_naive, X_test_naive)

    # Each method below is best-effort: a failure is reported and the method
    # is simply left out of `methods` for this d.

    # Step-wise Forward Selection
    try:
        sfs_forward = SequentialFeatureSelector(model, n_features_to_select=d, direction='forward').fit(X_train, y_train)
        X_train_sfs = sfs_forward.transform(X_train)
        X_test_sfs = sfs_forward.transform(X_test)
        print(f"Step-wise Forward Selection Selected Features: {np.where(sfs_forward.get_support())[0]}")
        methods[f"SFS_Forward_{d}"] = (X_train_sfs, X_test_sfs)
    except Exception as e:
        print(f"Step-wise Forward Selection failed: {e}")

    # Step-wise Backward Removal
    try:
        sfs_backward = SequentialFeatureSelector(model, n_features_to_select=d, direction='backward').fit(X_train, y_train)
        X_train_sbs = sfs_backward.transform(X_train)
        X_test_sbs = sfs_backward.transform(X_test)
        print(f"Step-wise Backward Removal Selected Features: {np.where(sfs_backward.get_support())[0]}")
        methods[f"SFS_Backward_{d}"] = (X_train_sbs, X_test_sbs)
    except Exception as e:
        print(f"Step-wise Backward Removal failed: {e}")

    # Bidirectional Search (using MLxtend's SFS)
    # NOTE(review): forward=True with floating=False looks like plain sequential
    # forward selection rather than a bidirectional search - confirm against the
    # mlxtend documentation before interpreting the "SFS_Bi" results.
    try:
        sfs_bi = SFS(model, k_features=d, forward=True, floating=False, scoring='accuracy', cv=5).fit(X_train, y_train)
        bi_columns = list(sfs_bi.k_feature_idx_)
        X_train_bi = X_train[:, bi_columns]
        X_test_bi = X_test[:, bi_columns]
        print(f"Bidirectional Search Selected Features: {bi_columns}")
        methods[f"SFS_Bi_{d}"] = (X_train_bi, X_test_bi)
    except Exception as e:
        print(f"Bidirectional Search failed: {e}")

    # Step-wise Floating Forward Selection
    try:
        sfs_float_forward = SFS(model, k_features=d, forward=True, floating=True, scoring='accuracy', cv=5).fit(X_train, y_train)
        float_forward_columns = list(sfs_float_forward.k_feature_idx_)
        X_train_sfsf = X_train[:, float_forward_columns]
        X_test_sfsf = X_test[:, float_forward_columns]
        print(f"Step-wise Floating Forward Selection Selected Features: {float_forward_columns}")
        methods[f"SFS_Float_Forward_{d}"] = (X_train_sfsf, X_test_sfsf)
    except Exception as e:
        print(f"Floating Forward Selection failed: {e}")

    # Step-wise Floating Backward Removal
    try:
        sfs_float_backward = SFS(model, k_features=d, forward=False, floating=True, scoring='accuracy', cv=5).fit(X_train, y_train)
        float_backward_columns = list(sfs_float_backward.k_feature_idx_)
        X_train_sfsb = X_train[:, float_backward_columns]
        X_test_sfsb = X_test[:, float_backward_columns]
        print(f"Step-wise Floating Backward Removal Selected Features: {float_backward_columns}")
        methods[f"SFS_Float_Backward_{d}"] = (X_train_sfsb, X_test_sfsb)
    except Exception as e:
        print(f"Floating Backward Selection failed: {e}")

    # Principal Component Analysis (PCA)
    try:
        pca = PCA(n_components=d)
        X_train_pca = pca.fit_transform(X_train)
        X_test_pca = pca.transform(X_test)
        print(f"PCA Explained Variance Ratio: {pca.explained_variance_ratio_}")
        methods[f"PCA_{d}"] = (X_train_pca, X_test_pca)
    except Exception as e:
        print(f"PCA failed: {e}")

    # Linear Discriminant Analysis (LDA)
    # The number of components is capped at (classes - 1), so for d above that
    # cap the entry "LDA_<d>" holds fewer than d components.
    try:
        n_components = min(d, max_lda_components)
        if n_components > 0:  # Only perform LDA if we have enough classes
            lda = LDA(n_components=n_components)
            X_train_lda = lda.fit_transform(X_train, y_train)
            X_test_lda = lda.transform(X_test)
            if hasattr(lda, 'explained_variance_ratio_'):
                print(f"LDA Explained Variance Ratio: {lda.explained_variance_ratio_}")
            methods[f"LDA_{d}"] = (X_train_lda, X_test_lda)
    except Exception as e:
        print(f"LDA failed: {e}")

# Evaluate classifiers on each method: results[method][classifier] -> metrics dict
results = {}
for method_name, (X_tr, X_te) in methods.items():
    results[method_name] = {}
    for clf_name, clf in classifiers.items():
        try:
            results[method_name][clf_name] = evaluate_model(clf, X_tr, X_te, y_train, y_test)
            print(f"Evaluated {clf_name} with {method_name}: {results[method_name][clf_name]['accuracy']:.4f} accuracy")
        except Exception as e:
            # A failed evaluation is reported and recorded as all-zero metrics
            print(f"Error evaluating {clf_name} with {method_name}: {e}")
            results[method_name][clf_name] = {"accuracy": 0, "precision": 0, "recall": 0, "f1_score": 0}

# Convert results to DataFrame for analysis: one row per (method, classifier) pair
results_df = pd.DataFrame.from_dict({(i, j): results[i][j]
                                     for i in results.keys()
                                     for j in results[i].keys()},
                                    orient='index')

# Save the results to a CSV file
results_df.to_csv("spambase_metrics.csv")

# Accuracy table: methods as rows, classifiers as columns.
# Built in one step as a float frame (instead of filling an object-dtype
# frame cell by cell) so sorting operates on real numbers.
accuracy_df = pd.DataFrame(
    {
        clf: {method: results[method][clf]['accuracy'] for method in results}
        for clf in classifiers
    },
    index=list(results),
    columns=list(classifiers),
    dtype=float,
)

# Get the top 10 method-classifier combinations by accuracy
top_combinations = accuracy_df.stack().sort_values(ascending=False).head(10)
print("\nTop 10 method-classifier combinations by accuracy:")
print(top_combinations)

print("\nFeature selection and dimensionality reduction completed.")
print("Results saved to 'spambase_metrics.csv'")

Missing values in original X: 0
Missing values in original y: 0
Missing values after preprocessing: 0

### Feature Selection and Dimensionality Reduction with d = 1 ###

Naïve Search Selected Features: [9]
Step-wise Forward Selection Selected Features: [52]
Step-wise Backward Removal Selected Features: [52]
Bidirectional Search Selected Features: [52]
Step-wise Floating Forward Selection Selected Features: [52]
Step-wise Floating Backward Removal Selected Features: [52]
PCA Explained Variance Ratio: [0.10776224]
LDA Explained Variance Ratio: [1.]

### Feature Selection and Dimensionality Reduction with d = 2 ###

Naïve Search Selected Features: [ 5 32]
Step-wise Forward Selection Selected Features: [ 6 52]
Step-wise Backward Removal Selected Features: [ 6 52]
Bidirectional Search Selected Features: [6, 52]
Step-wise Floating Forward Selection Selected Features: [6, 52]
Step-wise Floating Backward Removal Selected Features: [6, 52]
PCA Explained Variance Ratio: [0.10776224 0.058507  ]
L

In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Cross-validated accuracy helper
def cross_val_evaluate(model, X, y, cv=5):
    """Score ``model`` with cross-validation and summarise the fold accuracies.

    Args:
        model: Estimator handed to ``cross_val_score``.
        X: Feature data.
        y: Target labels.
        cv: Cross-validation setting forwarded to ``cross_val_score``
            (defaults to 5).

    Returns:
        A ``(mean, std)`` tuple of the per-fold accuracy scores.
    """
    fold_accuracies = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
    mean_accuracy = fold_accuracies.mean()
    accuracy_spread = fold_accuracies.std()
    return mean_accuracy, accuracy_spread

# Helper: ROC AUC that degrades to 0 instead of aborting the whole evaluation
def _roc_auc_or_zero(model, X_test, y_test, y_pred):
    """Return the ROC AUC of a fitted ``model`` on the test split, or 0.

    Binary problems are scored from the second ``predict_proba`` column.
    Problems with any other number of probability columns are scored
    one-vs-rest with weighted averaging, matching the weighted averaging used
    for precision/recall/F1. Models without ``predict_proba`` are scored from
    their hard predictions.

    Returns 0 when ``y_test`` holds fewer than two distinct labels or when
    the scorer rejects the inputs.
    """
    if len(set(y_test)) <= 1:
        return 0
    try:
        if not hasattr(model, "predict_proba"):
            return roc_auc_score(y_test, y_pred)
        proba = model.predict_proba(X_test)
        if proba.shape[1] == 2:
            return roc_auc_score(y_test, proba[:, 1])
        # Multiclass: the scorer needs the full probability matrix; the label
        # order is taken from the fitted model when it exposes ``classes_``.
        return roc_auc_score(
            y_test,
            proba,
            multi_class='ovr',
            average='weighted',
            labels=getattr(model, "classes_", None),
        )
    except (ValueError, IndexError):
        # An undefined AUC must not discard the other, valid metrics.
        return 0


# Define function to evaluate models and compute additional metrics
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Args:
        model: Estimator exposing ``fit`` and ``predict`` (and optionally
            ``predict_proba``).
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        Dict with keys ``accuracy``, ``precision``, ``recall``, ``f1_score``
        (the latter three weighted-averaged, with undefined values reported
        as 0) and ``roc_auc`` (0 when it cannot be computed).

    Raises:
        Whatever ``fit``/``predict`` or the label-based metrics raise; only
        the ROC AUC computation is shielded.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred, average='weighted', zero_division=0),
        "recall": recall_score(y_test, y_pred, average='weighted', zero_division=0),
        "f1_score": f1_score(y_test, y_pred, average='weighted', zero_division=0),
        "roc_auc": _roc_auc_or_zero(model, X_test, y_test, y_pred),
    }

# Classifier line-up, keyed by the display name used in the result tables
classifiers = {
    "Decision Tree": DecisionTreeClassifier(),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "SVM": SVC(probability=True),
    "ANN": MLPClassifier(max_iter=1000),
}

# Score every classifier on every (train, test) feature representation.
# A failing combination is reported and recorded as all-zero metrics so the
# remaining combinations still run.
results = {}
for method_name, (X_tr, X_te) in methods.items():
    method_scores = {}
    results[method_name] = method_scores
    for clf_name, clf in classifiers.items():
        try:
            method_scores[clf_name] = evaluate_model(clf, X_tr, X_te, y_train, y_test)
            print(f"Evaluated {clf_name} with {method_name}: {results[method_name][clf_name]['accuracy']:.4f} accuracy")
        except Exception as e:
            print(f"Error evaluating {clf_name} with {method_name}: {e}")
            method_scores[clf_name] = {
                "accuracy": 0,
                "precision": 0,
                "recall": 0,
                "f1_score": 0,
                "roc_auc": 0,
            }

# Flatten to one row per (method, classifier) pair for tabular analysis
flat_scores = {}
for method_key, per_classifier in results.items():
    for clf_key, clf_metrics in per_classifier.items():
        flat_scores[(method_key, clf_key)] = clf_metrics
results_df = pd.DataFrame.from_dict(flat_scores, orient='index')

# Persist the metric table
results_df.to_csv("model_performance_metrics.csv")

print("Model evaluation completed. Results saved to 'model_performance_metrics.csv'")


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the saved metrics back; the first two CSV columns become the row index
results_df = pd.read_csv("student_performance_metrics.csv", index_col=[0, 1])

# Fallback: rebuild a MultiIndex if the loaded index is not already one
if not isinstance(results_df.index, pd.MultiIndex):
    results_df.index = pd.MultiIndex.from_tuples(results_df.index)

# Show the row index and the column labels
print("DataFrame Structure:")
print(results_df.index, results_df.columns, sep="\n")

# 4i) Table Summary
print("Table Summary of Performance Metrics:")
print(results_df)

# Build a three-column grid of axes large enough for a given number of plots
def create_subplots(num_plots, title):
    """Create a titled figure with a 3-column subplot grid.

    Args:
        num_plots: Number of plots the grid must accommodate.
        title: Figure-level title.

    Returns:
        ``(fig, axes)`` where ``axes`` is the flattened array of grid axes.
        The grid holds ``3 * rows`` axes, which can exceed ``num_plots``.
    """
    n_cols = 3
    # Ceiling division: enough rows to fit every plot
    n_rows = -(-num_plots // n_cols)
    fig, grid = plt.subplots(n_rows, n_cols, figsize=(18, n_rows * 6))
    fig.suptitle(title, fontsize=20)
    return fig, grid.flatten()

# 4ii) Pie Charts
def plot_pie_charts(results_df, metric):
    """Draw one pie chart per first-level index value for ``metric``.

    Each pie shows how the ``metric`` column is split across the
    second-level index entries of that group.

    Args:
        results_df: DataFrame with a two-level row index and a ``metric``
            column.
        metric: Name of the column to plot.
    """
    unique_methods = results_df.index.get_level_values(0).unique()
    fig, axes = create_subplots(len(unique_methods), f'Pie Charts for {metric.capitalize()}')

    for ax, method in zip(axes, unique_methods):
        data = results_df.xs(method, level=0)[metric]
        data.plot.pie(ax=ax, autopct='%1.1f%%', title=f'{method}')

    # The grid is padded to a multiple of three; hide the leftover axes so
    # they do not show up as empty framed plots.
    for unused_ax in axes[len(unique_methods):]:
        unused_ax.set_visible(False)

    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()

for metric in ["accuracy", "precision", "recall", "f1_score"]:
    plot_pie_charts(results_df, metric)

# 4iii) Bar Charts
def plot_bar_charts(results_df, metric):
    """Draw a grouped bar chart of ``metric``.

    The second index level is pivoted into columns, giving one bar group per
    first-level index value and one bar per column.

    Args:
        results_df: DataFrame with a two-level row index and a ``metric``
            column.
        metric: Name of the column to plot.
    """
    pivoted = results_df[metric].unstack(level=1)
    chart_title = f'Bar Chart for {metric.capitalize()}'
    ax = pivoted.plot(kind='bar', figsize=(15, 10), title=chart_title)
    ax.set_ylabel(metric)
    plt.xticks(rotation=45)
    # Legend sits outside the plot area, to the right
    ax.legend(title='Classifiers', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()

for metric in ["accuracy", "precision", "recall", "f1_score"]:
    plot_bar_charts(results_df, metric)

# 4iv) Line Graphs
def plot_line_graphs(results_df, metric):
    """Draw a line graph of ``metric``, one line per second-level index value.

    Args:
        results_df: DataFrame with a two-level row index and a ``metric``
            column.
        metric: Name of the column to plot.
    """
    results_metric = results_df[metric].unstack(level=1)
    results_metric.plot(kind='line', marker='o', figsize=(15, 10), title=f'Line Graph for {metric.capitalize()}')
    plt.ylabel(metric)
    if pd.api.types.is_numeric_dtype(results_metric.index):
        # Numeric index values are real x coordinates; only rotate the labels.
        plt.xticks(rotation=45)
    else:
        # For a non-numeric index pandas plots the rows at positions
        # 0..n-1 but labels only the automatically chosen ticks, so some
        # rows end up without a label. Put one labelled tick on every row.
        plt.xticks(range(len(results_metric.index)), results_metric.index, rotation=45)
    plt.legend(title='Classifiers', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()

for metric in ["accuracy", "precision", "recall", "f1_score"]:
    plot_line_graphs(results_df, metric)

# 4vi) Box Plots
def plot_box_plots(results_df, metric):
    """Draw box plots of ``metric``.

    The second index level is pivoted into columns; each column gets one
    box summarising its values across the first index level.

    Args:
        results_df: DataFrame with a two-level row index and a ``metric``
            column.
        metric: Name of the column to plot.
    """
    pivoted = results_df[metric].unstack(level=1)
    _, ax = plt.subplots(figsize=(15, 10))
    sns.boxplot(data=pivoted, ax=ax)
    ax.set_title(f'Box Plot for {metric.capitalize()}')
    ax.set_ylabel(metric)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

for metric in ["accuracy", "precision", "recall", "f1_score"]:
    plot_box_plots(results_df, metric)
