In [None]:
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder, StandardScaler

def load_and_explore_datasets(folder_path):
    """Load the train/val/test CSV splits and print a quick exploration report.

    Reads ``train.csv``, ``val.csv`` and ``test.csv`` from *folder_path*, then
    prints, for each split in turn: the ``DataFrame.info()`` summary and head,
    the per-column missing-value counts, and the ``Label`` class distribution.

    Args:
        folder_path: Directory containing the three CSV files.

    Returns:
        Tuple ``(train_data, val_data, test_data)`` of DataFrames.

    Raises:
        FileNotFoundError: If one of the CSV files is missing.
        KeyError: If a split has no ``Label`` column.
    """
    splits = [
        ("Train", pd.read_csv(os.path.join(folder_path, "train.csv"))),
        ("Validation", pd.read_csv(os.path.join(folder_path, "val.csv"))),
        ("Test", pd.read_csv(os.path.join(folder_path, "test.csv"))),
    ]

    for name, frame in splits:
        print(f"{name} Dataset:")
        # info() writes its report itself and returns None; wrapping it in
        # print() would emit a stray "None" line after every summary.
        frame.info()
        print(frame.head(), "\n")

    for name, frame in splits:
        print(f"Missing values in {name} dataset:")
        print(frame.isnull().sum(), "\n")

    for name, frame in splits:
        print(f"Class distribution in {name} dataset:")
        print(frame['Label'].value_counts(), "\n")

    train_data, val_data, test_data = (frame for _, frame in splits)
    return train_data, val_data, test_data


In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import pandas as pd

def _encode_categorical(frame, encoders):
    """Encode categorical columns with encoders that were fitted on train data.

    Values not seen during fitting are mapped to -1 instead of raising.
    """
    encoded = {}
    for column, encoder in encoders.items():
        codes = {value: code for code, value in enumerate(encoder.classes_)}
        encoded[column] = frame[column].map(codes).fillna(-1).astype('int64')
    return pd.DataFrame(encoded, index=frame.index)


def _process_features(name, data, numeric_cols, feature_encoders, scaler, fit_scaler):
    """Select, encode and scale the feature columns of one dataset split."""
    display_name = name.capitalize()
    print(f"\nProcessing features for the {name} dataset...")
    features = data.drop(columns=['Label'])
    numeric = features[numeric_cols]
    categorical = features[list(feature_encoders)]

    print(f"{display_name} features (numeric):", numeric.shape)
    print(f"{display_name} features (categorical):", categorical.shape)

    print(f"\nEncoding categorical features in {name} data...")
    encoded = _encode_categorical(categorical, feature_encoders)
    unseen_total = int((encoded == -1).sum().sum())
    if unseen_total:
        print(f"{unseen_total} categorical value(s) not seen in train data were encoded as -1")
    combined = pd.concat([numeric, encoded], axis=1)

    print(f"\nCombined {name} features (before scaling):\n", combined.head())

    print(f"\nScaling {name} features...")
    scaled = scaler.fit_transform(combined) if fit_scaler else scaler.transform(combined)
    print(f"Scaled {name} features shape:", scaled.shape)
    return scaled


def preprocess_data(train_data, val_data, test_data):
    """Encode labels and build scaled feature matrices for all three splits.

    The label encoder, the per-column categorical encoders and the scaler are
    all fitted on the training split only and then applied unchanged to the
    validation and test splits, so every split shares one encoding.

    Args:
        train_data: Training DataFrame containing a ``Label`` column.
        val_data: Validation DataFrame with the same columns as *train_data*.
        test_data: Test DataFrame with the same columns as *train_data*.

    Returns:
        ``(X_train, y_train, X_val, y_val, X_test, y_test, le, scaler, labels)``
        where ``le`` is the LabelEncoder fitted on the training labels,
        ``scaler`` is the StandardScaler fitted on the training features and
        ``labels`` lists the distinct raw labels found across all splits.

    Raises:
        ValueError: If validation/test contain a label absent from train
            (raised by ``LabelEncoder.transform``).
        KeyError: If a split lacks ``Label`` or a feature column used by train.
    """
    le = LabelEncoder()
    scaler = StandardScaler()

    # Extract the 'Label' column for each dataset
    train_labels = train_data['Label']
    val_labels = val_data['Label']
    test_labels = test_data['Label']

    # Print unique labels before encoding
    print("Unique labels in train data:", train_labels.unique())
    print("Unique labels in validation data:", val_labels.unique())
    print("Unique labels in test data:", test_labels.unique())

    print("\nFitting LabelEncoder on training labels...")
    le.fit(train_labels)  # Fit on train data only
    print("Classes found by LabelEncoder:", le.classes_)

    # Encode the labels for train, validation, and test datasets
    print("\nEncoding labels for train, validation, and test datasets...")
    train_labels_encoded = le.transform(train_labels)
    val_labels_encoded = le.transform(val_labels)
    test_labels_encoded = le.transform(test_labels)

    # Print the encoded labels for verification
    print("\nEncoded Train Labels (first 10):", train_labels_encoded[:10])
    print("Encoded Validation Labels (first 10):", val_labels_encoded[:10])
    print("Encoded Test Labels (first 10):", test_labels_encoded[:10])

    # The feature schema is decided by the training split so that every split
    # yields the same columns in the same order for the scaler.
    # NOTE(review): only int64 columns count as numeric, so float features are
    # dropped — kept as in the original; confirm this is intended.
    train_features = train_data.drop(columns=['Label'])
    numeric_cols = list(train_features.select_dtypes(include=['int64']).columns)
    categorical_cols = list(train_features.select_dtypes(include=['object']).columns)

    # One dedicated encoder per categorical column. Reusing `le` here would
    # refit it on feature values and break decoding of label predictions.
    feature_encoders = {
        column: LabelEncoder().fit(train_features[column])
        for column in categorical_cols
    }

    X_train = _process_features("train", train_data, numeric_cols, feature_encoders, scaler, True)
    X_val = _process_features("validation", val_data, numeric_cols, feature_encoders, scaler, False)
    X_test = _process_features("test", test_data, numeric_cols, feature_encoders, scaler, False)

    y_train = train_labels_encoded
    y_val = val_labels_encoded
    y_test = test_labels_encoded

    print("\nFinal shapes of processed data:")
    print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
    print(f"X_val shape: {X_val.shape}, y_val shape: {y_val.shape}")
    print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

    # Get unique labels across all datasets (train, val, test) and store in a list
    labels = list(set(train_labels.unique()).union(val_labels.unique(), test_labels.unique()))
    try:
        labels.sort()  # deterministic order instead of set iteration order
    except TypeError:
        pass  # unorderable mix of label types: keep the set's order

    return X_train, y_train, X_val, y_val, X_test, y_test, le, scaler, labels


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

def plot_class_distribution(data, column='Label'):
    """Draw a count plot of the values found in one column of *data*.

    Args:
        data: DataFrame holding the column to plot.
        column: Name of the column whose value counts are drawn.

    Note:
        Sets the global matplotlib ``font.size`` to 8 and shows the figure.
    """
    plt.rcParams.update({'font.size': 8})
    plt.figure(figsize=(10, 6))

    # countplot draws on the current axes and hands them back.
    axis = sns.countplot(x=column, data=data, palette='Set2')
    axis.set_title(f"Class Distribution in {column} Column", fontsize=14)
    axis.set_xlabel(column, fontsize=12)
    axis.set_ylabel('Count', fontsize=12)
    axis.tick_params(axis='both', labelsize=10)
    axis.grid(True)

    plt.show()


In [None]:
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
import numpy as np

def y_axis_formatter(x, pos):
    """Return tick value *x* expressed in thousands, with one decimal place.

    *pos* is the tick position passed by matplotlib; it is not used.
    """
    in_thousands = x / 1000
    return format(in_thousands, '.1f')

def x_axis_formatter(x, pos):
    """Return tick value *x* expressed in units of 100000, with one decimal.

    *pos* is the tick position passed by matplotlib; it is not used.
    """
    in_hundred_thousands = x / 100000
    return format(in_hundred_thousands, '.1f')

def decimal_formatter(x, pos):
    """Format a tick value compactly: integers as-is, others with <= 2 decimals.

    Trailing zeros (and a dangling decimal point) are stripped, so ``2.50``
    becomes ``'2.5'`` and ``3.0`` becomes ``'3'``.

    Args:
        x: Tick value.
        pos: Tick position passed by matplotlib; not used.

    Returns:
        The formatted label. Non-finite values (NaN, +/-inf) are returned via
        ``str(x)`` instead of raising.
    """
    try:
        is_whole = x == int(x)
    except (ValueError, OverflowError):
        # int() rejects NaN (ValueError) and infinities (OverflowError).
        return str(x)
    if is_whole:
        return f'{int(x)}'
    label = f'{x:.2f}'.rstrip('0').rstrip('.')
    # Tiny negative values round to "-0.00", which strips down to "-0".
    return '0' if label == '-0' else label

def _axis_order_of_magnitude(low, high):
    """Return floor(log10) of the largest absolute axis limit, or 0 if undefined."""
    magnitude = max(abs(low), abs(high))
    if not np.isfinite(magnitude) or magnitude == 0:
        return 0
    return int(np.floor(np.log10(magnitude)))


def _scaled_tick_formatter(order):
    """Build a tick formatter that shows values in units of 10**order."""
    scale = 10.0 ** order
    return FuncFormatter(lambda value, pos: f'{value / scale:.1f}')


def plot_numeric_feature_distribution(data, numeric_features):
    """Plot a histogram grid for the given numeric columns of *data*.

    Each subplot shows its tick labels in units of a power of ten, and that
    power is written in the subplot's top-right corner so that
    ``tick label * 10**order`` gives the real value.

    Args:
        data: DataFrame holding the features.
        numeric_features: Column labels to plot (list, Index or array).
    """
    axes_grid = data[numeric_features].hist(bins=20, figsize=(35, 30))
    plt.suptitle("Distribution of Numeric Features in Dataset", fontsize=40)

    for axes in np.ravel(axes_grid):
        # DataFrame.hist titles each used subplot with its column name; unused
        # grid cells carry no title. Reading the name back from the axes avoids
        # depending on the order in which pandas laid the columns out.
        feature_name = axes.get_title()
        if not feature_name:
            continue

        axes.set_title(f"Feature {feature_name}", fontsize=28)
        axes.tick_params(axis='x', labelsize=24)
        axes.tick_params(axis='y', labelsize=24)
        axes.set_xlabel("Feature Value", fontsize=26)
        axes.set_ylabel("Frequency", fontsize=26)

        x_min, x_max = axes.get_xlim()
        y_min, y_max = axes.get_ylim()

        # Use the largest absolute limit so non-positive ranges do not hit
        # log10(<= 0), and scale the ticks by the same order that is displayed.
        x_order = _axis_order_of_magnitude(x_min, x_max)
        y_order = _axis_order_of_magnitude(y_min, y_max)

        axes.xaxis.set_major_formatter(_scaled_tick_formatter(x_order))
        axes.yaxis.set_major_formatter(_scaled_tick_formatter(y_order))

        # Braces around the exponent keep multi-character exponents
        # (e.g. -1 or 10) inside the superscript when mathtext renders them.
        axes.text(0.95, 0.98, f'x-axis: $10^{{{x_order}}}$', transform=axes.transAxes,
                  fontsize=26, ha='right', va='top', color='black')
        axes.text(0.95, 0.92, f'y-axis: $10^{{{y_order}}}$', transform=axes.transAxes,
                  fontsize=26, ha='right', va='top', color='black')

    plt.tight_layout()
    plt.subplots_adjust(top=0.94)
    plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def plot_correlation_matrix(data, numeric_features):
    """Show an annotated heatmap of the pairwise correlations between columns.

    Args:
        data: DataFrame holding the features.
        numeric_features: Column labels whose correlations are plotted.
    """
    selected = data[numeric_features]
    correlations = selected.corr()

    plt.figure(figsize=(12, 8))
    heatmap_options = {
        'annot': True,
        'cmap': 'coolwarm',
        'fmt': '.2f',
        'linewidths': 0.5,
    }
    sns.heatmap(correlations, **heatmap_options)

    # Same tick label size on both axes.
    for axis_name in ('x', 'y'):
        plt.tick_params(axis=axis_name, labelsize=10)

    plt.title("Correlation Matrix of Numeric Features", fontsize=14)
    plt.show()



In [None]:
import matplotlib.pyplot as plt

def plot_training_history(history, model_name, optimizer_name, activation_function, lr):
    """Plot train/validation accuracy and loss curves side by side.

    Args:
        history: Object whose ``history`` mapping holds the ``accuracy``,
            ``val_accuracy``, ``loss`` and ``val_loss`` series.
        model_name: Model name shown in both subplot titles.
        optimizer_name: Optimizer name shown in both subplot titles.
        activation_function: Activation name shown in both subplot titles.
        lr: Learning rate shown in both subplot titles.
    """
    heading = f"{model_name} - {optimizer_name} - Activation: {activation_function} - LR: {lr}"
    curves = history.history
    panels = (('accuracy', 'Accuracy'), ('loss', 'Loss'))

    plt.figure(figsize=(12, 5))

    # Left panel: accuracy; right panel: loss.
    for position, (metric, display_name) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(curves[metric], label=f'Train {display_name}')
        plt.plot(curves[f'val_{metric}'], label=f'Validation {display_name}')
        plt.title(heading)
        plt.xlabel('Epochs')
        plt.ylabel(display_name)
        plt.legend()

    plt.tight_layout()
    plt.show()


In [None]:
import platform
import pandas as pd
import sys
import os
import subprocess

# Optional dependencies: each import is attempted once at module load and the
# outcome is recorded in a module-level flag, so the functions below can skip
# the corresponding section instead of failing when a package is missing.

# PyTorch — consulted first for GPU / CUDA details.
try:
    import torch
    TORCH_INSTALLED = True
except ImportError:
    TORCH_INSTALLED = False

# TensorFlow — used for GPU detection when the PyTorch branch is not taken.
try:
    import tensorflow as tf
    TF_INSTALLED = True
except ImportError:
    TF_INSTALLED = False

# psutil — used to report total system memory.
try:
    import psutil
    PSUTIL_INSTALLED = True
except ImportError:
    PSUTIL_INSTALLED = False


def get_package_version(package_name):
    """Return the installed version of *package_name* as reported by the package.

    The module's ``__version__`` attribute is preferred. When the name cannot
    be imported, or the module exposes no ``__version__``, the installed
    distribution metadata is consulted. This covers packages whose
    distribution name differs from their import name (e.g. ``pillow``, which
    is imported as ``PIL``).

    Args:
        package_name: Import name or distribution name of the package.

    Returns:
        The version, ``"Not Installed"`` when the package cannot be found, or
        ``"Unknown Version"`` when it imports but no version can be determined.
    """
    try:
        module = __import__(package_name)
    except ImportError:
        module = None
        fallback = "Not Installed"
    else:
        fallback = "Unknown Version"

    version = getattr(module, "__version__", None)
    if version is not None:
        return version

    try:
        # importlib.metadata exists from Python 3.8; PackageNotFoundError is a
        # subclass of ImportError, so one handler covers both failure modes.
        from importlib import metadata
        return metadata.version(package_name)
    except ImportError:
        return fallback


def get_environment_details():
    """Collect system information and the versions of the relevant libraries.

    Returns:
        Tuple ``(env_details, package_versions)``: the first dict maps a
        component name (Python, OS, CPU, GPU, memory) to its value, the second
        maps each tracked library name to what ``get_package_version`` reports.
    """
    # Interpreter, operating system and CPU.
    env_details = {}
    env_details["Python Version"] = platform.python_version()
    env_details["OS"] = platform.system()
    env_details["OS Version"] = platform.version()
    env_details["Processor"] = platform.processor()
    env_details["Platform"] = platform.platform()
    env_details["CPU Count"] = os.cpu_count()

    # GPU: PyTorch with CUDA first, then TensorFlow, otherwise no GPU.
    gpu_info = {"GPU Device": "No GPU Detected"}
    if TORCH_INSTALLED and torch.cuda.is_available():
        gpu_info = {
            "GPU Device": torch.cuda.get_device_name(0),
            "CUDA Version": torch.version.cuda,
        }
    elif TF_INSTALLED:
        tf_gpus = tf.config.list_physical_devices('GPU')
        if tf_gpus:
            gpu_info = {
                "GPU Device": tf_gpus[0].name,
                "CUDA Version": "Via TensorFlow",
            }
    env_details.update(gpu_info)

    # Total RAM in GiB, when psutil is importable.
    env_details["Total Memory (GB)"] = (
        round(psutil.virtual_memory().total / (1024 ** 3), 2)
        if PSUTIL_INSTALLED
        else "Install psutil for details"
    )

    # Versions of the libraries relevant to the experiments.
    tracked_packages = (
        "numpy", "pandas", "tensorflow", "torch", "sklearn",
        "matplotlib", "scipy", "seaborn", "pillow", "imblearn",
    )
    package_versions = {}
    for package in tracked_packages:
        package_versions[package] = get_package_version(package)

    return env_details, package_versions


def _render_table(frame):
    """Render a DataFrame as a Markdown table, or as plain text if unavailable."""
    try:
        return frame.to_markdown(index=False)
    except (ImportError, AttributeError):
        # to_markdown needs the optional 'tabulate' package (ImportError) and
        # does not exist on old pandas releases (AttributeError).
        return frame.to_string(index=False)


def display_environment_table(env_details, package_versions, save_to_file=True):
    """Print the environment and package tables and optionally save them.

    Args:
        env_details: Mapping of component name to value.
        package_versions: Mapping of library name to version string.
        save_to_file: When true, also write both tables to
            ``environment_details.md`` in the current working directory.
    """
    # Create DataFrames for better presentation
    env_data = pd.DataFrame(
        list(env_details.items()),
        columns=["Component", "Version"]
    )

    package_data = pd.DataFrame(
        list(package_versions.items()),
        columns=["Library", "Version"]
    )

    # Render each table once and reuse the text for console and file output.
    env_table = _render_table(env_data)
    package_table = _render_table(package_data)

    # Print to console
    print("\n--- Environment Details ---")
    print(env_table)

    print("\n--- Installed Packages ---")
    print(package_table)

    if save_to_file:
        # Save both to a Markdown file; explicit UTF-8 keeps the output
        # independent of the platform's default encoding.
        with open("environment_details.md", "w", encoding="utf-8") as f:
            f.write("# Experiment Environment Details\n\n")
            f.write("## System Information\n")
            f.write(env_table + "\n\n")
            f.write("## Installed Packages\n")
            f.write(package_table + "\n")
        print("\nEnvironment details saved to `environment_details.md`.")
