# Comparative Analysis of all Data Sets

## Table of Contents

1. [Imports & Data Set Loading](#imports--data-set-loading)
2. [Detailed Analysis](#detailed-analysis)
    1. [Size Visualization](#1-size-visualization)
    2. [Data Types and Missing Values](#2-data-types-and-missing-values)
    3. [Identify Target Variables](#3-identify-target-variables)
    4. [Target Variable Analysis](#4-target-variable-analysis)
    5. [Feature Type Analysis](#5-feature-type-analysis)
    6. [Statistical Summary of Numerical Features](#6-statistical-summary-of-numerical-features)
    7. [Correlation Analysis](#7-correlation-analysis)
    8. [Data Quality Check](#8-data-quality-check)
3. [Baseline Model Performance](#baseline-model-performance)

## Imports & Data Set Loading

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import warnings

# Suppress every warning category so library warnings do not clutter the
# notebook output.
warnings.filterwarnings(action='ignore')

# Global plot appearance: the seaborn-flavoured matplotlib style sheet first,
# then the "husl" colour palette, then a 10x6 inch default figure size.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
plt.rcParams.update({'figure.figsize': (10, 6)})

In [None]:
# File stems (CSV name without extension) of the two dataset groups.
real_datasets = ['bank', 'credit', 'income']
synthetic_datasets = ['train_A', 'train_B', 'train_C']

# The CSV files live in ../data/real and ../data/synthetic.
dataset_folder = os.path.join("..", "data")
real_dataset_path = os.path.join(dataset_folder, "real")
synthetic_dataset_path = os.path.join(dataset_folder, "synthetic")

# Read every CSV into `datasets`, keyed by file stem. Real datasets are
# inserted first, so later cells iterate real -> synthetic.
datasets = {}
for folder, names in ((real_dataset_path, real_datasets),
                      (synthetic_dataset_path, synthetic_datasets)):
    for name in names:
        datasets[name] = pd.read_csv(os.path.join(folder, f'{name}.csv'))

print(f"\nTotal datasets loaded: {len(datasets)}")

# One summary line per dataset: origin, row/column counts and the in-memory
# size (deep=True also counts the contents of object columns).
print("Dataset Overview:")
print("-" * 50)
for name, df in datasets.items():
    memory_mb = df.memory_usage(deep=True).sum() / 1024**2
    dataset_type = "Real" if name in real_datasets else "Synthetic"
    print(f"{name:12} | {dataset_type:9} | {df.shape[0]:6,} rows × {df.shape[1]:2} cols | {memory_mb:.1f} MB")

## Detailed Analysis

#### 1. Size Visualization

What sizes do the 6 available data sets have? How many features (columns) and samples (rows) do they each have?


Which data sets have comparatively many/ few features or samples available?

Results:
- "Credit" has considerably fewer samples available than "bank" or "income" -> need to make good use of them
- The synthetic data sets all have comparatively few samples but many features -> good feature extraction needed

In [None]:
from matplotlib.patches import Patch

# One entry per dataset, in loading order. The "feature" count is the number
# of columns in the frame (shape[1]), i.e. every column that was loaded.
dataset_names = list(datasets)
sample_counts = [datasets[name].shape[0] for name in dataset_names]
feature_counts = [datasets[name].shape[1] for name in dataset_names]

# Bar colour encodes the dataset origin (real vs. synthetic).
colors = ['lightblue' if name in real_datasets else 'lightcoral' for name in dataset_names]

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# Left panel: rows per dataset. Right panel: columns per dataset.
panels = (
    (ax1, sample_counts, 'Number of Samples per Dataset', 'Sample Count'),
    (ax2, feature_counts, 'Number of Features per Dataset', 'Feature Count'),
)
for panel_ax, counts, title, ylabel in panels:
    panel_ax.bar(dataset_names, counts, color=colors)
    panel_ax.set_title(title)
    panel_ax.set_ylabel(ylabel)
    panel_ax.tick_params(axis='x', rotation=45)

# The colour legend is attached to the right-hand panel only.
legend_elements = [Patch(facecolor='lightblue', label='Real'),
                   Patch(facecolor='lightcoral', label='Synthetic')]
ax2.legend(handles=legend_elements)

plt.tight_layout()
plt.show()

#### 2. Data Types and Missing Values

What data types do we have as features? Do we need to handle any missing values?

Results:
- None of the data sets contains missing values -> implementation of missing value handling optional
- The real data sets contain a mixture of categorical (dtype('O'), also including the label column) and floating point features -> need to implement categorical feature encoding
- The synthetic data sets contain only floating point features and the integer label column

In [None]:
# Per dataset: print its shape, how many columns it has of each dtype, and a
# per-column breakdown of missing values (if there are any).
for name, df in datasets.items():
    print(f"\n{'='*20} {name.upper()} {'='*20}")

    print(f"Shape: {df.shape}")
    print(f"Data types: {df.dtypes.value_counts().to_dict()}")

    # `missing` holds the NaN count of every column.
    missing = df.isna().sum()
    missing_count = missing.sum()
    if missing_count == 0:
        print("Missing values: None")
        continue

    print(f"Missing values: {missing_count} total")
    # List only the affected columns, with their share of all rows.
    for col, count in missing[missing > 0].items():
        pct = (count / len(df)) * 100
        print(f"  - {col}: {count} ({pct:.1f}%)")

#### 3. Identify Target Variables

What is the label column called in each data set?

In [None]:
# Identify the target (label) column of every dataset.
#
# Strategy:
#   1. Look for a column whose lower-cased name equals one of the usual label
#      names; earlier entries of `common_target_names` take priority.
#   2. Otherwise fall back to the last column, but only if it has at most
#      10 distinct values.
# The result is stored in `target_info` (dataset name -> column name or None).

print("Target Variable Detection:")
print("-" * 40)

target_info = {}
common_target_names = ['target', 'label', 'class', 'y', 'outcome', 'result']

for name, df in datasets.items():
    # Lower-case the column names once instead of once per candidate.
    lowered = df.columns.str.lower()
    matched = next((candidate for candidate in common_target_names
                    if candidate in lowered), None)

    if matched is not None:
        # Recover the column name in its original spelling.
        target_col = next(col for col in df.columns if col.lower() == matched)
        print(f"{name:12} | Found target: '{target_col}'")
    else:
        last_col = df.columns[-1]
        unique_vals = df[last_col].nunique()
        if unique_vals <= 10:  # few distinct values -> likely a class label
            target_col = last_col
            print(f"{name:12} | Assumed target: '{target_col}' (last column, {unique_vals} unique values)")
        else:
            target_col = None
            print(f"{name:12} | No clear target identified")

    target_info[name] = target_col

#### 4. Target Variable Analysis

How many samples per class do we have in each data set?

Results:
- All data sets have binary labels
- The real data sets (especially "bank" and "target") have a strong class imbalance -> need a strategy to handle this
- The synthetic data sets have quite balanced classes

In [None]:
# Plot the class distribution of every dataset's target column, one panel per
# dataset, three panels per row.
#
# The number of rows is derived from the number of datasets (instead of a
# fixed 2x3 grid), so adding a dataset does not overflow the grid and removing
# one does not leave empty axes behind. With six datasets the figure is the
# same 2x3 / 18x10 inch layout as before.
n_grid_cols = 3
n_grid_rows = max(1, -(-len(datasets) // n_grid_cols))  # ceiling division
fig, axes = plt.subplots(n_grid_rows, n_grid_cols,
                         figsize=(18, 5 * n_grid_rows),
                         squeeze=False)  # always a 2-D array of axes
fig.suptitle('Target Variable Distributions', fontsize=16, fontweight='bold')

for idx, (name, df) in enumerate(datasets.items()):
    row, col = divmod(idx, n_grid_cols)
    ax = axes[row, col]

    target_col = target_info[name]

    # No usable target for this dataset: show a placeholder panel.
    if not (target_col and target_col in df.columns):
        ax.text(0.5, 0.5, 'No clear target\nidentified', ha='center', va='center',
                transform=ax.transAxes, fontsize=12)
        ax.set_title(f'{name.upper()}\nNo target found')
        continue

    # One bar per class; value_counts() orders classes by descending frequency.
    value_counts = df[target_col].value_counts()
    positions = range(len(value_counts))
    bars = ax.bar(positions, value_counts.values,
                  color='skyblue' if name in real_datasets else 'lightcoral')

    # Write the absolute count slightly above each bar.
    for bar, count in zip(bars, value_counts.values):
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2., height + height*0.01,
                f'{count}', ha='center', va='bottom')

    ax.set_title(f'{name.upper()}\nTarget: {target_col}')
    ax.set_xlabel('Class')
    ax.set_ylabel('Count')
    ax.set_xticks(positions)
    ax.set_xticklabels(value_counts.index, rotation=0)

    # Balance ratio = rarest class count / most frequent class count
    # (1.0 means perfectly balanced).
    balance_ratio = value_counts.min() / value_counts.max()
    ax.text(0.02, 0.98, f'Balance: {balance_ratio:.2f}',
            transform=ax.transAxes, va='top', fontsize=10,
            bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.8))

# Remove grid cells that have no dataset assigned to them.
for spare_ax in axes.flat[len(datasets):]:
    fig.delaxes(spare_ax)

plt.tight_layout()
plt.show()

#### 5. Feature Type Analysis

How many numerical (float) vs categorical features does each data set have? How large can we expect the value space for each categorical feature to be?

Results:
- The majority of features in the real data sets are categorical -> need to encode these somehow
- Each categorical feature just has a very limited range of possible values -> compact encoding should be possible

In [None]:
# Per dataset: count numerical vs. categorical (object dtype) features,
# excluding the target column, and report the cardinality of the first few
# categorical features.
print("Feature Type Breakdown:")
print("-" * 30)

for name, df in datasets.items():
    print(f"\n{name.upper()}:")

    target = target_info[name]

    # Split the feature columns by dtype and leave the label column out.
    numerical_cols = [col for col in df.select_dtypes(include=[np.number]).columns
                      if col != target]
    categorical_cols = [col for col in df.select_dtypes(include=['object']).columns
                        if col != target]

    # The column names are only listed when there are at most five of them.
    print(f"  Numerical features: {len(numerical_cols)}")
    if len(numerical_cols) <= 5:
        print(f"    {numerical_cols}")

    print(f"  Categorical features: {len(categorical_cols)}")
    if len(categorical_cols) <= 5:
        print(f"    {categorical_cols}")

    if not categorical_cols:
        continue

    # Cardinality is reported for the first three categorical features only.
    print("  Categorical feature cardinality:")
    for col in categorical_cols[:3]:
        cardinality = df[col].nunique()
        print(f"    {col}: {cardinality} unique values")

#### 6. Statistical Summary of Numerical Features

What are the means / standard deviations / minima / maxima of the numerical features?

In [None]:
# Print describe() statistics (count, mean, std, min, quartiles, max), rounded
# to two decimals, for the numerical feature columns of every dataset.
for name, df in datasets.items():
    target = target_info[name]
    # Numerical columns without the label column.
    numerical_cols = [col for col in df.select_dtypes(include=[np.number]).columns
                      if col != target]

    # Datasets without numerical features produce no output.
    if not numerical_cols:
        continue

    print(f"\n{'='*15} {name.upper()} - NUMERICAL STATS {'='*15}")
    print(df[numerical_cols].describe().round(2))

#### 7. Correlation Analysis

For the numerical features: Are any of them strongly correlated with each other?

Result: Both train_B and train_C contain some strongly correlated features -> one feature of each correlated pair can be dropped from the model, as it probably carries duplicate information

In [None]:
# Correlation analysis of the numerical columns of every dataset.
#
# Step 1 prints every column pair whose absolute correlation exceeds 0.7.
# Step 2 draws a heatmap for each dataset with at most 10 numerical columns.
# NOTE: unlike the neighbouring cells, the target column is not removed here,
# so a numeric label column takes part in the correlations.
corr_matrices = []

for name, df in datasets.items():
    numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()

    # At least two numerical columns are needed to form a pair.
    if len(numerical_cols) <= 1:
        continue

    print(f"\n{'='*10} CORRELATION ANALYSIS: {name.upper()} {'='*10}")

    corr_matrix = df[numerical_cols].corr()
    labels = corr_matrix.columns

    # Walk the strict upper triangle (i < j) so that each pair is visited
    # once and the diagonal is skipped.
    high_corr_pairs = [
        {
            'feature1': labels[i],
            'feature2': labels[j],
            'correlation': corr_matrix.iloc[i, j],
        }
        for i, j in zip(*np.triu_indices(len(labels), k=1))
        if abs(corr_matrix.iloc[i, j]) > 0.7
    ]

    if not high_corr_pairs:
        print("No high correlations found (|r| > 0.7)")
    else:
        print("High correlations found (|r| > 0.7):")
        for pair in high_corr_pairs:
            print(f"  {pair['feature1']} ↔ {pair['feature2']}: {pair['correlation']:.3f}")

    # Only small matrices are kept for plotting.
    if len(numerical_cols) <= 10:
        corr_matrices.append((name, corr_matrix))

# Draw all collected heatmaps in one figure, two per row.
if corr_matrices:
    n = len(corr_matrices)
    ncols = 2
    nrows = int(np.ceil(n / ncols))
    fig, axes = plt.subplots(nrows, ncols, figsize=(7 * ncols, 6 * nrows))
    axes = np.asarray(axes).ravel()  # uniform 1-D access for any grid shape

    for ax, (name, corr_matrix) in zip(axes, corr_matrices):
        # Hide the diagonal and the upper triangle of the matrix.
        mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
        sns.heatmap(
            corr_matrix, mask=mask, annot=True, cmap='coolwarm', center=0,
            square=True, fmt='.2f', cbar_kws={"shrink": .8}, ax=ax
        )
        ax.set_title(f'Correlation Matrix: {name.upper()}')

    # Remove the grid cells that received no heatmap.
    for spare_ax in axes[n:]:
        fig.delaxes(spare_ax)

    plt.tight_layout()
    plt.show()

#### 8. Data Quality Check

Are there any duplicate samples in the data sets?

Result: Only "income" has duplicate samples -> should remove those before training

In [None]:
# Run four simple quality checks per dataset and collect the findings in
# `quality_issues` (dataset name -> list of human-readable issue strings).
print("Data Quality Assessment:")
print("-" * 30)

quality_issues = {}

for name, df in datasets.items():
    issues = []

    # Distinct (non-NaN) value count of every column, computed once and
    # shared by the constant-column and high-cardinality checks.
    distinct = df.nunique()

    # 1) Fully duplicated rows
    duplicates = df.duplicated().sum()
    if duplicates > 0:
        issues.append(f"{duplicates} duplicate rows")

    # 2) Columns with at most one distinct value
    constant_cols = [col for col in df.columns if distinct[col] <= 1]
    if constant_cols:
        issues.append(f"{len(constant_cols)} constant columns: {constant_cols}")

    # 3) Object-dtype columns with more than 50 distinct values
    categorical_cols = df.select_dtypes(include=['object']).columns
    high_card_cols = [col for col in categorical_cols if distinct[col] > 50]
    if high_card_cols:
        issues.append(f"{len(high_card_cols)} high-cardinality categorical columns")

    # 4) Share of missing cells over the whole frame; reported above 5 %
    missing_pct = (df.isnull().sum().sum() / df.size) * 100
    if missing_pct > 5:
        issues.append(f"{missing_pct:.1f}% missing values")

    quality_issues[name] = issues

    print(f"\n{name.upper()}:")
    if not issues:
        print("No quality issues detected")
    for issue in issues:
        print(f"{issue}")

## Baseline Model Performance

How good are the classification results that a simple Random Forest Classifier can achieve on each data set?

In [None]:
def _clean_text(series):
    """Return *series* as clean strings.

    Every value is converted to ``str``; a leading ``b'`` and a trailing ``'``
    (left over from byte-string representations such as ``b'yes'``) are
    removed, and surrounding whitespace is stripped.
    """
    return (
        series.astype(str)
        .str.replace(r"^b'|'$", "", regex=True)
        .str.strip()
    )


# Train and evaluate a Random Forest baseline on every dataset that has a
# detected target column. The metrics are stored in `baseline_results`
# (dataset name -> dict of metric values and feature counts).
print("Baseline Performance Evaluation:")
print("-" * 35)

baseline_results = {}

for name, df in datasets.items():
    target_col = target_info[name]

    if not target_col:
        print(f"\n{name.upper()}: Skipped (no target variable)")
        continue

    print(f"\n{name.upper()}:")

    # Best effort per dataset: a failure is reported and the loop moves on to
    # the next dataset instead of aborting the whole evaluation.
    try:
        # Split the frame into features and target
        X = df.drop(columns=[target_col])
        y = df[target_col]

        # String targets may carry byte-string residue or stray whitespace
        if y.dtype == 'object':
            y = _clean_text(y)

        print(f"  Target values: {sorted(y.unique())}")

        # Minimal preprocessing: clean every object column and replace its
        # values by integer codes. The encoder is fitted on the full column,
        # i.e. before the train/test split.
        X_processed = X.copy()
        for col in X.select_dtypes(include=['object']).columns:
            X_processed[col] = LabelEncoder().fit_transform(_clean_text(X_processed[col]))

        # Keep numeric columns only; anything that is still non-numeric
        # (neither number nor encoded object column) is dropped.
        X_processed = X_processed.select_dtypes(include=[np.number])

        if X_processed.empty:
            print("No usable features after preprocessing")
            continue

        # Stratified 80/20 split (stratification needs at least two classes)
        X_train, X_test, y_train, y_test = train_test_split(
            X_processed, y, test_size=0.2, random_state=42,
            stratify=y if len(np.unique(y)) > 1 else None
        )

        # Random Forest baseline with fixed seed for reproducibility
        rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_test)

        # Macro averaging is used for binary and multiclass targets alike: it
        # needs no `pos_label`, so it also works for string class labels.
        macro = {'average': 'macro', 'zero_division': 0}
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, **macro)
        recall = recall_score(y_test, y_pred, **macro)
        f1 = f1_score(y_test, y_pred, **macro)

        # Store results
        baseline_results[name] = {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'features_used': X_processed.shape[1],
            'original_features': X.shape[1]
        }

        # Print results
        print(f"  Features: {X_processed.shape[1]}/{X.shape[1]} usable")
        print(f"  Accuracy:  {accuracy:.4f}")
        print(f"  Precision: {precision:.4f}")
        print(f"  Recall:    {recall:.4f}")
        print(f"  F1-Score:  {f1:.4f}")

    except Exception as e:
        print(f"Error: {str(e)}")

In [None]:
# Compare the baseline metrics across datasets: one bar chart per metric in a
# 2x2 grid, bars coloured by dataset origin. Nothing is drawn when no baseline
# could be computed.
if baseline_results:
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    fig.suptitle('Baseline Performance Comparison', fontsize=16, fontweight='bold')

    # Result key -> panel title, in plotting order (row by row).
    metric_titles = {
        'accuracy': 'Accuracy',
        'precision': 'Precision',
        'recall': 'Recall',
        'f1': 'F1-Score',
    }

    # Dataset order and colours are the same for every panel.
    names = list(baseline_results)
    colors = ['lightblue' if name in real_datasets else 'lightcoral' for name in names]

    for ax, (metric, metric_name) in zip(axes.flat, metric_titles.items()):
        values = [baseline_results[name][metric] for name in names]

        bars = ax.bar(names, values, color=colors)
        ax.set_title(metric_name)
        ax.set_ylim(0, 1)  # all four metrics live in [0, 1]
        ax.tick_params(axis='x', rotation=45)

        # Print the metric value just above each bar.
        for bar, value in zip(bars, values):
            ax.text(bar.get_x() + bar.get_width()/2., bar.get_height() + 0.01,
                    f'{value:.3f}', ha='center', va='bottom', fontsize=9)

    plt.tight_layout()
    plt.show()


Results:
- Among the real data sets, "bank" and "income" seem to be the easiest data sets for prediction
- Among the synthetic data sets, the classification difficulty seems to increase along A -> B -> C
- A simple Random Forest is already quite strong and can achieve accuracy scores of up to 94%