In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

np.random.seed(42)

# ---- Part 1 banner ----
rule = "=" * 80
print(rule)
print("PART 1: DATA LOADING AND PREPROCESSING")
print(rule)

# Read the raw CSV into the working frame used by the rest of this cell
df = pd.read_csv('/content/data.csv')

print("\n[INFO] Data loaded successfully!")
print(f"[INFO] Shape: {df.shape}")
print(f"[INFO] Columns: {df.columns.tolist()}")

# One titled section per overview table: row preview, column dtypes and
# describe() summary statistics, printed in that order.
overview_sections = (
    ("FIRST 5 ROWS", df.head()),
    ("DATA TYPES", df.dtypes),
    ("BASIC STATISTICS", df.describe()),
)
for section_title, section_table in overview_sections:
    print("\n" + rule)
    print(section_title)
    print(rule)
    print(section_table)

# Missing values analysis: per-column null count and percentage,
# restricted to columns that actually contain nulls.
print("\n" + "=" * 80)
print("MISSING VALUES ANALYSIS")
print("=" * 80)
null_counts = df.isnull().sum()
missing_df = pd.DataFrame({
    'Column': df.columns,
    'Missing_Count': null_counts.values,
    'Missing_Percentage': (null_counts.values / len(df) * 100).round(2),
})
has_missing = missing_df['Missing_Count'] > 0
missing_df = missing_df.loc[has_missing].sort_values('Missing_Percentage', ascending=False)
print(missing_df)

# Target variable distribution: how many classes there are and how many are rare
print("\n" + "=" * 80)
print("TARGET VARIABLE: CLASSIFICATION")
print("=" * 80)
target_counts = df['classification'].value_counts()
# value_counts() skips NaN, so its length equals nunique()
print(f"Total unique classes: {len(target_counts)}")
print("\nTop 20 classes:")
print(target_counts.head(20))
singleton_classes = target_counts.eq(1).sum()
rare_classes = target_counts.lt(10).sum()
print(f"\nClasses with count = 1: {singleton_classes}")
print(f"Classes with count < 10: {rare_classes}")

# Handle class imbalance - keep frequent classes, pool rare ones into 'OTHER'
print("\n" + "=" * 80)
print("HANDLING CLASS IMBALANCE")
print("=" * 80)

# Keep classes with at least 10 samples
min_samples = 10
valid_classes = target_counts[target_counts >= min_samples].index.tolist()
print(f"[INFO] Classes with >= {min_samples} samples: {len(valid_classes)}")

# Vectorised membership test instead of a per-row lambda that scans the
# valid_classes list for every record. Rows whose class is rare (or missing)
# are relabelled 'OTHER'; all other labels are kept unchanged.
is_frequent_class = df['classification'].isin(valid_classes)
df['classification_grouped'] = df['classification'].where(is_frequent_class, 'OTHER')

print("\nNew class distribution:")
new_dist = df['classification_grouped'].value_counts()
print(new_dist)

# Stratified sampling for efficient processing
print("\n" + "=" * 80)
print("STRATIFIED SAMPLING (500-1000 samples)")
print("=" * 80)

sample_size = 800  # Target sample size

grouped_labels = df['classification_grouped']
total_rows = len(df)

# Per-class quota: proportional share of sample_size, floored at 5 rows
# and capped at the number of rows the class actually has.
class_samples = {}
for cls in grouped_labels.unique():
    cls_count = int((grouped_labels == cls).sum())
    proportional = int(sample_size * cls_count / total_rows)
    class_samples[cls] = min(max(proportional, 5), cls_count)

print(f"Sample distribution per class:")
# Largest quota first; ties keep their insertion order (stable sort)
for cls, n in sorted(class_samples.items(), key=lambda item: item[1], reverse=True):
    print(f"  {cls}: {n}")

# Draw the rows: take a class whole when its quota covers it, otherwise
# sample without replacement using a fixed seed.
sampled_dfs = []
for cls, quota in class_samples.items():
    members = df.loc[grouped_labels == cls]
    take_all = len(members) <= quota
    sampled_dfs.append(members if take_all else members.sample(n=quota, random_state=42))

# Stack the per-class pieces, then shuffle the combined rows
stacked = pd.concat(sampled_dfs, ignore_index=True)
df_sampled = stacked.sample(frac=1, random_state=42)

print(f"\n[INFO] Original dataset: {len(df)} samples")
print(f"[INFO] Sampled dataset: {len(df_sampled)} samples ({len(df_sampled)/len(df)*100:.1f}%)")
print(f"\nSampled class distribution:")
print(df_sampled['classification_grouped'].value_counts())

# Predictor columns kept for modelling (identifiers, free text and sequence
# columns are left out)
feature_cols = [
    'experimentalTechnique', 'macromoleculeType', 'residueCount',
    'resolution', 'structureMolecularWeight', 'crystallizationMethod',
    'crystallizationTempK', 'densityMatthews', 'densityPercentSol', 'phValue'
]

# Predictors followed by the grouped target label
target_col = 'classification_grouped'
model_columns = [*feature_cols, target_col]
df_sampled_clean = df_sampled.loc[:, model_columns].copy()
df_full_clean = df.loc[:, model_columns].copy()

# Persist both the full and the sampled versions (full first)
for frame, out_path in (
    (df_full_clean, '/content/sample_data/data_full_clean.csv'),
    (df_sampled_clean, '/content/sample_data/data_sampled_clean.csv'),
):
    frame.to_csv(out_path, index=False)

print("\n" + "=" * 80)
print("FILES SAVED")
print("=" * 80)
print("✓ data_full_clean.csv - Full dataset with cleaned columns")
print("✓ data_sampled_clean.csv - Stratified sample (500-1000 rows)")



PART 1: DATA LOADING AND PREPROCESSING

[INFO] Data loaded successfully!
[INFO] Shape: (5000, 18)
[INFO] Columns: ['Unnamed: 0.1', 'Unnamed: 0', 'structureId', 'classification', 'experimentalTechnique', 'macromoleculeType', 'residueCount', 'resolution', 'structureMolecularWeight', 'crystallizationMethod', 'crystallizationTempK', 'densityMatthews', 'densityPercentSol', 'pdbxDetails', 'phValue', 'publicationYear', 'chainId', 'sequence']

FIRST 5 ROWS
   Unnamed: 0.1  Unnamed: 0 structureId classification experimentalTechnique  \
0             0       92478        4DF7      HYDROLASE     X-RAY DIFFRACTION   
1             1       62150        3CCB      HYDROLASE     X-RAY DIFFRACTION   
2             2       98342        4I7O      HYDROLASE     X-RAY DIFFRACTION   
3             3       10545        1II3      HYDROLASE     X-RAY DIFFRACTION   
4             4       81767        3SI7      HYDROLASE     X-RAY DIFFRACTION   

  macromoleculeType  residueCount  resolution  structureMolecularW

In [None]:


import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, PolynomialFeatures
import warnings
warnings.filterwarnings('ignore')

np.random.seed(42)

print("=" * 80)

# Read back the stratified sample written by the previous cell
df = pd.read_csv('/content/sample_data/data_sampled_clean.csv')
n_rows, n_cols = df.shape
print(f"[INFO] Loaded {n_rows} samples")
print(f"[INFO] Features: {n_cols - 1}, Target: classification_grouped")

# The label column is the target; every other column is a predictor
y = df['classification_grouped']
X = df.drop(columns=['classification_grouped'])

print("\n" + "=" * 80)
print("STEP 1: HANDLE MISSING VALUES")
print("=" * 80)

# Strategy: median for numerical columns, mode for categorical columns.
# NOTE(review): the fill values are computed on every row, before the
# train/validation/test split that a later cell performs.
print("Missing values before imputation:")
missing_before = X.isnull().sum()
print(missing_before[missing_before > 0])

# Column groups by dtype; cat_features is reused by the encoding step
num_features = X.select_dtypes(include=[np.number]).columns.tolist()
cat_features = X.select_dtypes(include=['object']).columns.tolist()

print(f"\nNumerical features: {len(num_features)}")
print(f"Categorical features: {len(cat_features)}")

# The filled column is assigned back to X rather than calling
# X[col].fillna(..., inplace=True): that form updates a temporary column
# selection, which under pandas Copy-on-Write leaves X unchanged (and the
# warning about it is hidden by the global warnings filter above).

# Impute numerical with median
for col in num_features:
    if X[col].isnull().any():
        median_val = X[col].median()
        X[col] = X[col].fillna(median_val)
        print(f"  Imputed {col} with median: {median_val:.2f}")

# Impute categorical with mode (first mode when several values tie)
for col in cat_features:
    if X[col].isnull().any():
        mode_val = X[col].mode()[0]
        X[col] = X[col].fillna(mode_val)
        print(f"  Imputed {col} with mode: {mode_val}")

print("\nMissing values after imputation:")
print(f"Total missing: {X.isnull().sum().sum()}")

# Integer-encode each categorical column into a new '<name>_encoded' column
print("\n" + "=" * 80)
print("STEP 2: ENCODE CATEGORICAL VARIABLES")
print("=" * 80)

# Fitted encoders are kept per source column so the codes can be mapped back
label_encoders = {}
for col in cat_features:
    encoder = LabelEncoder()
    X[f'{col}_encoded'] = encoder.fit_transform(X[col])
    label_encoders[col] = encoder
    print(f"Encoded {col}: {len(encoder.classes_)} categories")

# Only the encoded versions go forward; the raw string columns are dropped
X_encoded = X.drop(columns=cat_features)

print(f"\nFeatures after encoding: {X_encoded.shape[1]}")

# NOVELTY: Create interaction features
print("\n" + "=" * 80)
print("STEP 3: CREATE INTERACTION FEATURES (NOVELTY)")
print("=" * 80)

# Domain-specific interactions for protein structures, described as
# (new column, required source columns, formula, confirmation message).
# A feature is only created when all of its source columns are present;
# the table order fixes the column order in X_encoded.
interaction_specs = [
    # Interaction 1: Density-related features
    ('density_interaction',
     ('densityMatthews', 'densityPercentSol'),
     lambda frame: frame['densityMatthews'] * frame['densityPercentSol'],
     "✓ Created: density_interaction (Matthews × PercentSol)"),
    # Interaction 2: Structure size features (+1 in the denominator)
    ('size_ratio',
     ('residueCount', 'structureMolecularWeight'),
     lambda frame: frame['residueCount'] / (frame['structureMolecularWeight'] + 1),
     "✓ Created: size_ratio (residueCount / molecularWeight)"),
    # Interaction 3: Resolution quality indicator (+0.1 in the denominator)
    ('resolution_quality',
     ('resolution',),
     lambda frame: 1 / (frame['resolution'] + 0.1),
     "✓ Created: resolution_quality (inverse resolution)"),
    # Interaction 4: Temperature-pH interaction
    ('temp_ph_interaction',
     ('crystallizationTempK', 'phValue'),
     lambda frame: frame['crystallizationTempK'] * frame['phValue'],
     "✓ Created: temp_ph_interaction (temperature × pH)"),
    # Interaction 5: Density ratio (weight divided by 1000, +1)
    ('density_per_weight',
     ('densityMatthews', 'structureMolecularWeight'),
     lambda frame: frame['densityMatthews'] / (frame['structureMolecularWeight'] / 1000 + 1),
     "✓ Created: density_per_weight"),
]

interactions = []
for feature_name, required_cols, formula, message in interaction_specs:
    if all(source in X_encoded.columns for source in required_cols):
        X_encoded[feature_name] = formula(X_encoded)
        interactions.append(feature_name)
        print(message)

print(f"\nTotal interaction features created: {len(interactions)}")

# NOVELTY: Create polynomial features for selected numerical features
print("\n" + "=" * 80)
print("STEP 4: POLYNOMIAL FEATURES (NOVELTY)")
print("=" * 80)

# Key numerical columns for the degree-2 expansion, limited to those present
poly_features = [
    f for f in ('residueCount', 'resolution', 'structureMolecularWeight')
    if f in X_encoded.columns
]

print(f"Applying polynomial features (degree=2) to: {poly_features}")

if poly_features:
    poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)
    X_poly = poly.fit_transform(X_encoded[poly_features])
    poly_feature_names = poly.get_feature_names_out(poly_features)

    # The expansion also returns the input columns themselves; copy over only
    # the generated terms (squares and pairwise products), by position.
    new_poly_features = []
    for position, term_name in enumerate(poly_feature_names):
        if term_name in poly_features:
            continue
        X_encoded[term_name] = X_poly[:, position]
        new_poly_features.append(term_name)

    print(f"✓ Added {len(new_poly_features)} polynomial features")
    print(f"  Examples: {new_poly_features[:5]}")

print(f"\nTotal features after engineering: {X_encoded.shape[1]}")

# Group the engineered columns by origin for later analysis
print("\n" + "=" * 80)
print("FEATURE SUMMARY")
print("=" * 80)

all_columns = X_encoded.columns
# "Original" = untouched numerical inputs plus the integer-encoded categoricals
original_features = [
    col for col in all_columns
    if col.endswith('_encoded') or col in num_features
]
# Polynomial terms are recognised by their generated names ('^2' or a space)
polynomial_like = [col for col in all_columns if '^2' in col or ' ' in col]

print(f"Original features (encoded): {len(original_features)}")
print(f"Interaction features: {len(interactions)}")
print(f"Polynomial features: {len(polynomial_like)}")
print(f"TOTAL FEATURES: {X_encoded.shape[1]}")

# Persist the engineered feature matrix and the (still string-valued) target
X_encoded.to_csv('/content/sample_data/features_engineered.csv', index=False)
y.to_csv('/content/sample_data/target.csv', index=False)

# Feature metadata: counts and name lists per feature group
feature_metadata = {
    'total_features': X_encoded.shape[1],
    'original_features': original_features,
    'interaction_features': interactions,
    'polynomial_features': [
        col for col in all_columns
        if '^2' in col or (' ' in col and col not in interactions)
    ],
    'feature_names': all_columns.tolist()
}

import json
with open('/content/sample_data/feature_metadata.json', 'w') as metadata_file:
    json.dump(feature_metadata, metadata_file, indent=2)

print("\n" + "=" * 80)
print("FILES SAVED")
print("=" * 80)
print("✓ features_engineered.csv - Engineered features")
print("✓ target.csv - Target variable")
print("✓ feature_metadata.json - Feature metadata")


[INFO] Loaded 841 samples
[INFO] Features: 10, Target: classification_grouped

STEP 1: HANDLE MISSING VALUES
Missing values before imputation:
macromoleculeType         16
resolution                48
crystallizationMethod    222
crystallizationTempK     215
densityMatthews           62
densityPercentSol         62
phValue                  178
dtype: int64

Numerical features: 7
Categorical features: 3
  Imputed resolution with median: 2.00
  Imputed crystallizationTempK with median: 293.00
  Imputed densityMatthews with median: 2.41
  Imputed densityPercentSol with median: 48.95
  Imputed phValue with median: 6.70
  Imputed macromoleculeType with mode: Protein
  Imputed crystallizationMethod with mode: VAPOR DIFFUSION, HANGING DROP

Missing values after imputation:
Total missing: 0

STEP 2: ENCODE CATEGORICAL VARIABLES
Encoded experimentalTechnique: 6 categories
Encoded macromoleculeType: 5 categories
Encoded crystallizationMethod: 22 categories

Features after encoding: 10

STEP 3: C

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings('ignore')

np.random.seed(42)

print("=" * 80)
print("PART 3: FEATURE SELECTION")
print("=" * 80)

# Inputs written by the feature-engineering cell
X = pd.read_csv('/content/sample_data/features_engineered.csv')
target_frame = pd.read_csv('/content/sample_data/target.csv')
y = target_frame['classification_grouped']

n_samples, n_features = X.shape
print(f"[INFO] Loaded {n_samples} samples with {n_features} features")
print(f"[INFO] Target classes: {y.nunique()}")

# Map the string class labels to integer codes
le = LabelEncoder()
y_encoded = le.fit_transform(y)
print(f"[INFO] Target encoded: {len(le.classes_)} classes")

# Standardise every feature, keeping the column names
print("\n" + "=" * 80)
print("STEP 1: FEATURE SCALING")
print("=" * 80)

scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

print("✓ Features scaled using StandardScaler")

# METHOD 1: Variance Threshold (Remove low variance features)
# NOTE(review): X_scaled comes out of StandardScaler, so non-constant columns
# already have variance close to 1; on this input the 0.01 cut-off can in
# practice only catch constant columns.
print("\n" + "=" * 80)
print("STEP 2: VARIANCE THRESHOLD")
print("=" * 80)

variance = X_scaled.var()
low_var_features = variance.index[variance < 0.01].tolist()
print(f"Features with variance < 0.01: {len(low_var_features)}")

if low_var_features:
    print(f"Low variance features: {low_var_features[:5]}")
    X_scaled_filtered = X_scaled.drop(columns=low_var_features)
else:
    # Nothing to remove: continue with an independent copy
    X_scaled_filtered = X_scaled.copy()

print(f"Features after variance filtering: {X_scaled_filtered.shape[1]}")

# The three scorers below rate the same candidate columns
candidate_features = X_scaled_filtered.columns

# METHOD 2: ANOVA F-test (Univariate feature selection)
print("\n" + "=" * 80)
print("STEP 3: ANOVA F-TEST FEATURE SELECTION")
print("=" * 80)

k_best = min(30, X_scaled_filtered.shape[1])  # Select top 30 features
selector_anova = SelectKBest(score_func=f_classif, k=k_best)
X_anova = selector_anova.fit_transform(X_scaled_filtered, y_encoded)

# F-score of every candidate (not just the k kept), best first
anova_scores = pd.DataFrame(
    {'feature': candidate_features, 'anova_score': selector_anova.scores_}
).sort_values('anova_score', ascending=False)

print(f"✓ Selected top {k_best} features using ANOVA F-test")
print(f"\nTop 10 features by ANOVA score:")
print(anova_scores.head(10))

# METHOD 3: Mutual Information
print("\n" + "=" * 80)
print("STEP 4: MUTUAL INFORMATION FEATURE SELECTION")
print("=" * 80)

mi_scores = mutual_info_classif(X_scaled_filtered, y_encoded, random_state=42)
mi_df = pd.DataFrame(
    {'feature': candidate_features, 'mi_score': mi_scores}
).sort_values('mi_score', ascending=False)

print(f"✓ Calculated mutual information scores")
print(f"\nTop 10 features by Mutual Information:")
print(mi_df.head(10))

# METHOD 4: Random Forest Feature Importance
print("\n" + "=" * 80)
print("STEP 5: RANDOM FOREST FEATURE IMPORTANCE")
print("=" * 80)

rf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)
rf.fit(X_scaled_filtered, y_encoded)

rf_importance = pd.DataFrame(
    {'feature': candidate_features, 'rf_importance': rf.feature_importances_}
).sort_values('rf_importance', ascending=False)

print(f"✓ Calculated Random Forest feature importances")
print(f"\nTop 10 features by RF importance:")
print(rf_importance.head(10))

# NOVELTY: Ensemble Feature Selection (Combine multiple methods)
print("\n" + "=" * 80)
print("STEP 6: ENSEMBLE FEATURE SELECTION (NOVELTY)")
print("=" * 80)


def _min_max_normalize(scores):
    """Rescale a score column to the 0-1 range.

    Parameters
    ----------
    scores : pd.Series
        Raw scores from one selection method.

    Returns
    -------
    pd.Series
        (scores - min) / (max - min), on the same index. When the range is
        zero or undefined (all scores equal, or all NaN) every entry is 0.0
        instead of NaN from a division by zero; individual NaN scores also
        become 0.0 so they rank last rather than poisoning the weighted sum.
    """
    lowest, highest = scores.min(), scores.max()
    span = highest - lowest
    if not span > 0:  # also true when span is NaN
        return pd.Series(0.0, index=scores.index)
    return ((scores - lowest) / span).fillna(0.0)


# Normalize scores to 0-1 range
anova_scores['anova_norm'] = _min_max_normalize(anova_scores['anova_score'])
mi_df['mi_norm'] = _min_max_normalize(mi_df['mi_score'])
rf_importance['rf_norm'] = _min_max_normalize(rf_importance['rf_importance'])

# Merge all scores (one row per feature)
ensemble_scores = anova_scores[['feature', 'anova_norm']].merge(
    mi_df[['feature', 'mi_norm']], on='feature'
).merge(
    rf_importance[['feature', 'rf_norm']], on='feature'
)

# Calculate ensemble score (weighted average; weights match the printout below)
ensemble_scores['ensemble_score'] = (
    0.4 * ensemble_scores['anova_norm'] +
    0.3 * ensemble_scores['mi_norm'] +
    0.3 * ensemble_scores['rf_norm']
)

ensemble_scores = ensemble_scores.sort_values('ensemble_score', ascending=False)

print("✓ Combined feature selection methods with weighted average:")
print("  - ANOVA F-test: 40%")
print("  - Mutual Information: 30%")
print("  - Random Forest: 30%")

print(f"\nTop 15 features by ENSEMBLE score:")
print(ensemble_scores.head(15)[['feature', 'ensemble_score']])

# Select top features (at most 25, or all of them when fewer are available)
n_features_selected = min(25, X_scaled_filtered.shape[1])
selected_features = ensemble_scores.head(n_features_selected)['feature'].tolist()

print(f"\n✓ Selected {len(selected_features)} features for modeling")

# Final feature matrix: the selected columns, in ensemble-rank order
X_selected = X_scaled_filtered.loc[:, selected_features]

print("\n" + "=" * 80)
print("FEATURE SELECTION SUMMARY")
print("=" * 80)
print(f"Original features: {X.shape[1]}")
print(f"After variance threshold: {X_scaled_filtered.shape[1]}")
print(f"Final selected features: {X_selected.shape[1]}")
print(f"Reduction: {(1 - X_selected.shape[1]/X.shape[1])*100:.1f}%")

# Persist the selected features, the per-method scores and the encoded target
X_selected.to_csv('/content/sample_data/features_selected.csv', index=False)
ensemble_scores.to_csv('/content/sample_data/feature_scores.csv', index=False)
pd.Series(y_encoded).to_csv('/content/sample_data/target_encoded.csv', index=False)

# Plain-text list of the selected feature names, one per line
with open('/content/sample_data/selected_features.txt', 'w') as names_file:
    names_file.writelines(f"{feat}\n" for feat in selected_features)

print("\n" + "=" * 80)
print("FILES SAVED")
print("=" * 80)
print("✓ features_selected.csv - Selected features for modeling")
print("✓ feature_scores.csv - All feature scores from different methods")
print("✓ target_encoded.csv - Encoded target variable")
print("✓ selected_features.txt - List of selected feature names")



PART 3: FEATURE SELECTION
[INFO] Loaded 841 samples with 21 features
[INFO] Target classes: 29
[INFO] Target encoded: 29 classes

STEP 1: FEATURE SCALING
✓ Features scaled using StandardScaler

STEP 2: VARIANCE THRESHOLD
Features with variance < 0.01: 0
Features after variance filtering: 21

STEP 3: ANOVA F-TEST FEATURE SELECTION
✓ Selected top 21 features using ANOVA F-test

Top 10 features by ANOVA score:
                                  feature  anova_score
8               macromoleculeType_encoded   118.976743
11                             size_ratio   109.025013
2                structureMolecularWeight    35.264974
20             structureMolecularWeight^2    31.914778
19    resolution structureMolecularWeight    29.706094
17  residueCount structureMolecularWeight    28.847570
15                         residueCount^2    21.388839
0                            residueCount    15.106171
16                residueCount resolution    10.870562
14                     density_per_weig

In [None]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
import warnings
warnings.filterwarnings('ignore')

np.random.seed(42)

print("=" * 80)
print("PART 4: MODEL TRAINING - HYBRID ENSEMBLE")
print("=" * 80)

# Selected features and integer-encoded target from the feature-selection cell
X = pd.read_csv('/content/sample_data/features_selected.csv')
y = pd.read_csv('/content/sample_data/target_encoded.csv').to_numpy().ravel()

print(f"[INFO] Loaded {X.shape[0]} samples with {X.shape[1]} features")
print(f"[INFO] Number of classes: {np.unique(y).size}")

# Split data: 70% train, 15% validation, 15% test
print("\n" + "=" * 80)
print("STEP 1: TRAIN-VALIDATION-TEST SPLIT")
print("=" * 80)

# First carve off the 15% test set, stratified by class
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y,
    test_size=0.15,
    random_state=42,
    stratify=y,
)

# Then split the remaining 85% again: 0.176 * 0.85 ≈ 0.15 of total
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full,
    test_size=0.176,
    random_state=42,
    stratify=y_train_full,
)

for split_name, split_X in (("Training", X_train), ("Validation", X_val), ("Test", X_test)):
    print(f"{split_name} set: {split_X.shape[0]} samples ({split_X.shape[0]/len(X)*100:.1f}%)")

# Candidate base learners for the ensemble, keyed by display name
print("\n" + "=" * 80)
print("STEP 2: DEFINE BASE MODELS")
print("=" * 80)

base_models = dict([
    ('Logistic Regression', LogisticRegression(max_iter=1000, random_state=42, C=0.1)),
    ('Decision Tree', DecisionTreeClassifier(max_depth=10, random_state=42, min_samples_split=10)),
    ('Random Forest', RandomForestClassifier(n_estimators=100, max_depth=15, random_state=42, n_jobs=-1)),
    ('Gradient Boosting', GradientBoostingClassifier(n_estimators=50, max_depth=5, random_state=42)),
    # probability=True so the SVM exposes predict_proba like the other models
    ('SVM', SVC(kernel='rbf', C=1.0, probability=True, random_state=42)),
    ('Naive Bayes', GaussianNB()),
    ('K-Nearest Neighbors', KNeighborsClassifier(n_neighbors=7)),
])

print(f"Number of base models: {len(base_models)}")
for name in base_models:
    print(f"  - {name}")

# Fit every base model, then score it by 5-fold CV and on the validation set
print("\n" + "=" * 80)
print("STEP 3: TRAIN BASE MODELS WITH CROSS-VALIDATION")
print("=" * 80)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
model_scores = {}
trained_models = {}

for name, model in base_models.items():
    print(f"\n{name}:")
    print(f"  Training...")

    # fit() returns the estimator itself, so the fitted model is stored directly
    trained_models[name] = model.fit(X_train, y_train)

    # Cross-validated accuracy on the training split only
    cv_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='accuracy', n_jobs=-1)
    cv_mean, cv_std = cv_scores.mean(), cv_scores.std()

    # Accuracy of the fitted model on the validation split
    val_acc = model.score(X_val, y_val)

    model_scores[name] = {'cv_mean': cv_mean, 'cv_std': cv_std, 'val_acc': val_acc}

    print(f"  Cross-validation: {cv_mean:.4f} (+/- {cv_std:.4f})")
    print(f"  Validation accuracy: {val_acc:.4f}")

# One row per model, best validation accuracy first
print("\n" + "=" * 80)
print("BASE MODEL COMPARISON")
print("=" * 80)

comparison_df = pd.DataFrame(model_scores).T
comparison_df = comparison_df.sort_values('val_acc', ascending=False)
print(comparison_df)

# The three best models by validation accuracy feed the ensembles
top_3_models = comparison_df.index[:3].tolist()
print(f"\nTop 3 models for stacking ensemble:")
for rank, name in enumerate(top_3_models, 1):
    print(f"  {rank}. {name} (Val Acc: {comparison_df.loc[name, 'val_acc']:.4f})")

# NOVELTY: Create Stacking Ensemble
print("\n" + "=" * 80)
print("STEP 4: CREATE STACKING ENSEMBLE (NOVELTY)")
print("=" * 80)

# Only this step needs out-of-fold predictions, so the import is kept local
from sklearn.model_selection import cross_val_predict

# Class-probability outputs of the top 3 base models become the meta-features
stacking_train_predictions = []
stacking_val_predictions = []

for name in top_3_models:
    model = trained_models[name]

    # Training meta-features are OUT-OF-FOLD probabilities: every training row
    # is scored by a clone that did not see it during fitting. Scoring X_train
    # with the model already fitted on X_train yields over-confident
    # probabilities that do not match what the meta-learner receives for
    # unseen data. cross_val_predict clones the estimator, so the fitted
    # model stored in trained_models is left untouched.
    train_pred_proba = cross_val_predict(
        model, X_train, y_train, cv=cv, method='predict_proba', n_jobs=-1
    )
    # Validation meta-features come from the model fitted on the full training split
    val_pred_proba = model.predict_proba(X_val)

    stacking_train_predictions.append(train_pred_proba)
    stacking_val_predictions.append(val_pred_proba)

    print(f"✓ Added {name} predictions to stacking features")

# Concatenate predictions: one block of class-probability columns per base model
X_train_stacking = np.concatenate(stacking_train_predictions, axis=1)
X_val_stacking = np.concatenate(stacking_val_predictions, axis=1)

print(f"\nStacking features shape (train): {X_train_stacking.shape}")
print(f"Stacking features shape (val): {X_val_stacking.shape}")

# Train meta-learner (Logistic Regression)
print("\nTraining meta-learner (Logistic Regression)...")
meta_learner = LogisticRegression(max_iter=1000, random_state=42, C=1.0)
meta_learner.fit(X_train_stacking, y_train)

# Evaluate stacking ensemble
stacking_val_acc = meta_learner.score(X_val_stacking, y_val)
print(f"✓ Stacking ensemble validation accuracy: {stacking_val_acc:.4f}")

# Baseline for comparison: simple hard-voting over the same top 3 models
print("\n" + "=" * 80)
print("STEP 5: COMPARE WITH VOTING ENSEMBLE")
print("=" * 80)

# Row i holds the validation-set label predictions of the i-th top model
voting_predictions = np.array(
    [trained_models[name].predict(X_val) for name in top_3_models]
)


def _majority_label(votes):
    """Return the most frequent label in `votes`; ties go to the smallest label."""
    labels, tallies = np.unique(votes, return_counts=True)
    return labels[tallies.argmax()]


# One majority decision per validation sample (column of the vote matrix)
final_voting_pred = [
    _majority_label(voting_predictions[:, sample_idx])
    for sample_idx in range(voting_predictions.shape[1])
]

voting_val_acc = np.mean(np.array(final_voting_pred) == y_val)
print(f"Voting ensemble validation accuracy: {voting_val_acc:.4f}")

print("\n" + "=" * 80)
print("ENSEMBLE COMPARISON")
print("=" * 80)
best_single = top_3_models[0]
print(f"Stacking Ensemble: {stacking_val_acc:.4f}")
print(f"Voting Ensemble: {voting_val_acc:.4f}")
print(f"Best Single Model ({best_single}): {comparison_df.loc[best_single, 'val_acc']:.4f}")

# Stacking is chosen only when strictly better; a tie falls to voting
stacking_wins = stacking_val_acc > voting_val_acc
if stacking_wins:
    print("\n✓ Stacking ensemble performs better!")
    final_ensemble_type = "stacking"
else:
    print("\n✓ Voting ensemble performs better!")
    final_ensemble_type = "voting"

# Persist models, the held-out test split and the comparison results
print("\n" + "=" * 80)
print("SAVING MODELS AND RESULTS")
print("=" * 80)

import pickle

# Fitted base models first, then the stacking meta-learner
for artifact, pickle_path in (
    (trained_models, '/content/sample_data/trained_models.pkl'),
    (meta_learner, '/content/sample_data/meta_learner.pkl'),
):
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(artifact, pickle_file)

# Held-out test split for the evaluation step
X_test.to_csv('/content/sample_data/X_test.csv', index=False)
pd.Series(y_test).to_csv('/content/sample_data/y_test.csv', index=False)

# Per-model CV / validation scores (index = model name, so it is kept)
comparison_df.to_csv('/content/sample_data/model_comparison.csv')

# Ensemble configuration read by later steps
config = {
    'top_models': top_3_models,
    'best_ensemble': final_ensemble_type,
    'stacking_val_acc': float(stacking_val_acc),
    'voting_val_acc': float(voting_val_acc)
}

import json
with open('/content/sample_data/ensemble_config.json', 'w') as config_file:
    json.dump(config, config_file, indent=2)

print("✓ trained_models.pkl - All trained base models")
print("✓ meta_learner.pkl - Meta-learner for stacking")
print("✓ model_comparison.csv - Model performance comparison")
print("✓ ensemble_config.json - Ensemble configuration")
print("✓ X_test.csv, y_test.csv - Test data for evaluation")



PART 4: MODEL TRAINING - HYBRID ENSEMBLE
[INFO] Loaded 841 samples with 21 features
[INFO] Number of classes: 29

STEP 1: TRAIN-VALIDATION-TEST SPLIT
Training set: 588 samples (69.9%)
Validation set: 126 samples (15.0%)
Test set: 127 samples (15.1%)

STEP 2: DEFINE BASE MODELS
Number of base models: 7
  - Logistic Regression
  - Decision Tree
  - Random Forest
  - Gradient Boosting
  - SVM
  - Naive Bayes
  - K-Nearest Neighbors

STEP 3: TRAIN BASE MODELS WITH CROSS-VALIDATION

Logistic Regression:
  Training...
  Cross-validation: 0.7653 (+/- 0.0094)
  Validation accuracy: 0.7778

Decision Tree:
  Training...
  Cross-validation: 0.7228 (+/- 0.0245)
  Validation accuracy: 0.7460

Random Forest:
  Training...
  Cross-validation: 0.7636 (+/- 0.0132)
  Validation accuracy: 0.7857

Gradient Boosting:
  Training...
  Cross-validation: 0.6649 (+/- 0.0346)
  Validation accuracy: 0.6508

SVM:
  Training...
  Cross-validation: 0.7687 (+/- 0.0031)
  Validation accuracy: 0.7698

Naive Bayes:
  Tr

In [None]:


import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
import warnings
import json
import pickle
warnings.filterwarnings('ignore')

np.random.seed(42)


print("=" * 80)

# Load data: the full selected-feature matrix and encoded target. These are
# the same files the model-training cell loads before it makes its own
# train/validation/test split, so despite the names X_train / y_train hold
# every row.
X_train = pd.read_csv('/content/sample_data/features_selected.csv')
y_train = pd.read_csv('/content/sample_data/target_encoded.csv').values.ravel()

# Split for tuning (use 70% of data for faster tuning); the other 30% is discarded.
# NOTE(review): this is a fresh stratified split of the full file, not the
# training split created in the model-training cell, so X_tune can contain
# rows that were saved there as X_test.csv - confirm whether tuning should be
# restricted to that cell's training rows.
from sklearn.model_selection import train_test_split
X_tune, _, y_tune, _ = train_test_split(X_train, y_train, train_size=0.7, random_state=42, stratify=y_train)

print(f"[INFO] Using {len(X_tune)} samples for hyperparameter tuning")
print(f"[INFO] Features: {X_tune.shape[1]}")

# Load ensemble configuration to see which models performed best
# ('top_models' is the list of model names written by the model-training cell)
with open('/content/sample_data/ensemble_config.json', 'r') as f:
    config = json.load(f)

top_models = config['top_models']
print(f"\nTop models to optimize: {top_models}")

# Define hyperparameter grids
print("\n" + "=" * 80)
print("STEP 1: DEFINE HYPERPARAMETER GRIDS")
print("=" * 80)

# One entry per tunable model:
#   (model name, estimator factory, search space, summary line to print).
# Factories keep estimator construction lazy: only models listed in
# `top_models` are actually instantiated.
grid_specs = [
    (
        'Random Forest',
        lambda: RandomForestClassifier(random_state=42, n_jobs=-1),
        {
            'n_estimators': [50, 100, 150],
            'max_depth': [10, 15, 20],
            'min_samples_split': [5, 10],
            'min_samples_leaf': [2, 4]
        },
        "✓ Random Forest: 3×3×2×2 = 36 combinations",
    ),
    (
        'Gradient Boosting',
        lambda: GradientBoostingClassifier(random_state=42),
        {
            'n_estimators': [50, 100],
            'max_depth': [3, 5, 7],
            'learning_rate': [0.01, 0.1, 0.2],
            'min_samples_split': [5, 10]
        },
        "✓ Gradient Boosting: 2×3×3×2 = 36 combinations",
    ),
    (
        'Logistic Regression',
        lambda: LogisticRegression(max_iter=1000, random_state=42),
        {
            'C': [0.01, 0.1, 1.0, 10.0],
            'penalty': ['l2'],
            'solver': ['lbfgs', 'liblinear']
        },
        "✓ Logistic Regression: 4×1×2 = 8 combinations",
    ),
    (
        'SVM',
        lambda: SVC(probability=True, random_state=42),
        {
            'C': [0.1, 1.0, 10.0],
            'kernel': ['rbf', 'linear'],
            'gamma': ['scale', 'auto']
        },
        "✓ SVM: 3×2×2 = 12 combinations",
    ),
]

# Keep only the grids for models selected in the ensemble configuration,
# preserving the declaration order above.
param_grids = {}
for spec_name, make_estimator, search_space, summary_line in grid_specs:
    if spec_name not in top_models:
        continue
    param_grids[spec_name] = {
        'model': make_estimator(),
        'params': search_space,
    }
    print(summary_line)

# Perform Grid Search
print("\n" + "=" * 80)
print("STEP 2: GRID SEARCH CV ")
print("=" * 80)

# Shared 3-fold stratified splitter (3 folds chosen for speed).
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Outputs of the search, all keyed by model name.
optimized_models = {}
best_params = {}
tuning_results = {}

for model_name, config_dict in param_grids.items():
    banner = '=' * 60
    print(f"\n{banner}")
    print(f"Optimizing: {model_name}")
    print(f"{banner}")

    model, params = config_dict['model'], config_dict['params']

    # Grid size is the product of the number of candidates per parameter.
    n_combinations = np.prod([len(candidates) for candidates in params.values()])
    print(f"Testing {n_combinations} parameter combinations...")

    # Exhaustive search scored by accuracy on the tuning subset.
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=params,
        scoring='accuracy',
        cv=cv,
        n_jobs=-1,
        verbose=1,
    )
    grid_search.fit(X_tune, y_tune)

    # Record the winning estimator, its parameters and its CV score.
    winning_params = grid_search.best_params_
    winning_score = grid_search.best_score_
    optimized_models[model_name] = grid_search.best_estimator_
    best_params[model_name] = winning_params
    tuning_results[model_name] = {
        'best_score': winning_score,
        'best_params': winning_params,
    }

    print(f"\n✓ Best CV Score: {winning_score:.4f}")
    print(f"✓ Best Parameters:")
    for param in winning_params:
        print(f"    {param}: {winning_params[param]}")

# Summary of optimization results
print("\n" + "=" * 80)
print("HYPERPARAMETER OPTIMIZATION SUMMARY")
print("=" * 80)

# One short recap per tuned model: best CV score, then the winning parameters.
for model_name in tuning_results:
    results = tuning_results[model_name]
    best_cv_score = results['best_score']
    chosen_params = results['best_params']
    print(f"\n{model_name}:")
    print(f"  Best CV Score: {best_cv_score:.4f}")
    print(f"  Best Parameters: {chosen_params}")

# Retrain optimized models on the training data, keeping the test rows held out
print("\n" + "=" * 80)
print("STEP 3: RETRAIN WITH OPTIMIZED PARAMETERS")
print("=" * 80)

X_train_full = pd.read_csv('/content/sample_data/features_selected.csv')
y_train_full = pd.read_csv('/content/sample_data/target_encoded.csv').values.ravel()

# Load the held-out split BEFORE refitting so its rows can be excluded below.
X_test = pd.read_csv('/content/sample_data/X_test.csv')
y_test = pd.read_csv('/content/sample_data/y_test.csv').values.ravel()

# features_selected.csv is the same file the tuning subset was sampled from,
# i.e. the complete dataset.  Fitting on all of it and then scoring on
# X_test.csv would evaluate the models on rows they were trained on and
# inflate the reported test accuracy.  Rows are matched on their feature
# values, rounded so CSV float round-tripping cannot hide a match.
feature_cols = list(X_train_full.columns)
if set(feature_cols) <= set(X_test.columns):
    test_row_keys = set(map(tuple, X_test[feature_cols].round(8).to_numpy()))
    is_test_row = np.fromiter(
        (tuple(row) in test_row_keys
         for row in X_train_full[feature_cols].round(8).to_numpy()),
        dtype=bool,
        count=len(X_train_full),
    )
    n_overlap = int(is_test_row.sum())
    if 0 < n_overlap < len(X_train_full):
        X_train_full = X_train_full.loc[~is_test_row].reset_index(drop=True)
        y_train_full = y_train_full[~is_test_row]
        print(f"[INFO] Excluded {n_overlap} rows that also appear in the test set; "
              f"retraining on {len(X_train_full)} samples")
    elif n_overlap == 0:
        print("[WARN] No training rows matched the test set; "
              "verify that X_test.csv is disjoint from features_selected.csv")
    else:
        # Nothing would be left to train on, so fall back to the full file.
        print("[WARN] Every training row matches a test row; cannot hold the test set out")
else:
    print("[WARN] X_test.csv columns differ from features_selected.csv; "
          "test rows could not be excluded from retraining")

final_optimized_models = {}

# The estimators are refit in place; the dict simply re-exposes them by name.
for model_name, model in optimized_models.items():
    print(f"Retraining {model_name} on full training data...")
    model.fit(X_train_full, y_train_full)
    final_optimized_models[model_name] = model
    print(f"✓ {model_name} retrained")

# Evaluate on the held-out test set
print("\n" + "=" * 80)
print("STEP 4: EVALUATE OPTIMIZED MODELS ON TEST SET")
print("=" * 80)

test_results = {}
for model_name, model in final_optimized_models.items():
    test_acc = model.score(X_test, y_test)
    test_results[model_name] = test_acc
    print(f"{model_name}: {test_acc:.4f}")

# Compare with baseline (before tuning)
print("\n" + "=" * 80)
print("IMPROVEMENT ANALYSIS")
print("=" * 80)

# Baseline scores, indexed by model name (first CSV column).
baseline_comparison = pd.read_csv('/content/sample_data/model_comparison.csv', index_col=0)

print("Model Performance :")
for model_name in final_optimized_models:
    # Models without a baseline row are skipped silently.
    if model_name not in baseline_comparison.index:
        continue

    baseline_val = baseline_comparison.loc[model_name, 'val_acc']
    tuned_test = test_results[model_name]
    # Difference in percentage points; note that the baseline is a validation
    # score while the tuned figure is a test score.
    improvement = (tuned_test - baseline_val) * 100

    print(f"{model_name}:")
    print(f"  Baseline (validation): {baseline_val:.4f}")
    print(f"  Tuned (test): {tuned_test:.4f}")
    print(f"  Change: {improvement:+.2f}%")

# Save optimized models
print("\n" + "=" * 80)
print("SAVING OPTIMIZED MODELS")
print("=" * 80)

# Fitted estimators, keyed by model name.
with open('/content/sample_data/optimized_models.pkl', 'wb') as f:
    pickle.dump(final_optimized_models, f)

# Winning hyperparameters per model.
with open('/content/sample_data/best_hyperparameters.json', 'w') as f:
    json.dump(best_params, f, indent=2)

# Test accuracies as a two-column table, best model first.
test_results_df = pd.DataFrame(
    list(test_results.items()),
    columns=['Model', 'Test_Accuracy'],
).sort_values('Test_Accuracy', ascending=False)
test_results_df.to_csv('/content/sample_data/optimized_test_results.csv', index=False)

for saved_line in (
    "✓ optimized_models.pkl - Optimized models",
    "✓ best_hyperparameters.json - Best hyperparameters",
    "✓ optimized_test_results.csv - Test results",
):
    print(saved_line)



[INFO] Using 588 samples for hyperparameter tuning
[INFO] Features: 21

Top models to optimize: ['Random Forest', 'Logistic Regression', 'SVM']

STEP 1: DEFINE HYPERPARAMETER GRIDS
✓ Random Forest: 3×3×2×2 = 36 combinations
✓ Logistic Regression: 4×1×2 = 8 combinations
✓ SVM: 3×2×2 = 12 combinations

STEP 2: GRID SEARCH CV 

Optimizing: Random Forest
Testing 36 parameter combinations...
Fitting 3 folds for each of 36 candidates, totalling 108 fits

✓ Best CV Score: 0.7602
✓ Best Parameters:
    max_depth: 10
    min_samples_leaf: 2
    min_samples_split: 5
    n_estimators: 50

Optimizing: Logistic Regression
Testing 8 parameter combinations...
Fitting 3 folds for each of 8 candidates, totalling 24 fits

✓ Best CV Score: 0.7653
✓ Best Parameters:
    C: 0.1
    penalty: l2
    solver: lbfgs

Optimizing: SVM
Testing 12 parameter combinations...
Fitting 3 folds for each of 12 candidates, totalling 36 fits

✓ Best CV Score: 0.7738
✓ Best Parameters:
    C: 0.1
    gamma: scale
    kernel:

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report
)
import pickle
import warnings
warnings.filterwarnings('ignore')

np.random.seed(42)

print("=" * 80)

# Held-out test split used for every metric in this cell.
X_test = pd.read_csv('/content/sample_data/X_test.csv')
y_test = pd.read_csv('/content/sample_data/y_test.csv').values.ravel()

n_test_samples = len(y_test)
n_test_classes = len(np.unique(y_test))
print(f"[INFO] Test set: {n_test_samples} samples")
print(f"[INFO] Number of classes: {n_test_classes}")

# Tuned estimators keyed by model name.  The pickle is an artifact of this
# pipeline; never unpickle a file from an untrusted source.
with open('/content/sample_data/optimized_models.pkl', 'rb') as f:
    models = pickle.load(f)

n_models = len(models)
print(f"[INFO] Loaded {n_models} optimized models")

# Evaluate each model
print("\n" + "=" * 80)
print("STEP 1: EVALUATE INDIVIDUAL MODELS")
print("=" * 80)

# Per-model metrics and raw predictions, keyed by model name.
evaluation_results = {}

# Averaging options shared by the three class-weighted metrics.
weighted_kwargs = {'average': 'weighted', 'zero_division': 0}

for model_name, model in models.items():
    banner = '=' * 60
    print(f"\n{banner}")
    print(f"Evaluating: {model_name}")
    print(f"{banner}")

    # Predict once and reuse the labels for every metric.
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, **weighted_kwargs)
    recall = recall_score(y_test, y_pred, **weighted_kwargs)
    f1 = f1_score(y_test, y_pred, **weighted_kwargs)

    evaluation_results[model_name] = dict(
        accuracy=accuracy,
        precision=precision,
        recall=recall,
        f1_score=f1,
        predictions=y_pred,
    )

    for label, value in (
        ("Accuracy: ", accuracy),
        ("Precision:", precision),
        ("Recall:   ", recall),
        ("F1-Score: ", f1),
    ):
        print(f"{label} {value:.4f}")

# Create evaluation comparison table
print("\n" + "=" * 80)
print("MODEL COMPARISON TABLE")
print("=" * 80)

# One row per model, ranked by weighted F1 (best first).
comparison_df = pd.DataFrame(
    [
        {
            'Model': name,
            'Accuracy': scores['accuracy'],
            'Precision': scores['precision'],
            'Recall': scores['recall'],
            'F1-Score': scores['f1_score'],
        }
        for name, scores in evaluation_results.items()
    ],
    columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score'],
)

comparison_df = comparison_df.sort_values('F1-Score', ascending=False)
print(comparison_df.to_string(index=False))

# The top row after sorting is the best model.
top_row = comparison_df.iloc[0]
best_model_name = top_row['Model']
best_model = models[best_model_name]
best_predictions = evaluation_results[best_model_name]['predictions']

print(f"\n✓ Best Model: {best_model_name}")
print(f"  F1-Score: {top_row['F1-Score']:.4f}")

# Generate confusion matrix for best model
print("\n" + "=" * 80)
print("STEP 2: CONFUSION MATRIX (BEST MODEL)")
print("=" * 80)

# Rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=best_predictions)

print(f"\nConfusion Matrix for {best_model_name}:")
print(cm)

# Calculate per-class metrics
print("\n" + "=" * 80)
print("STEP 3: PER-CLASS PERFORMANCE")
print("=" * 80)

# Classes that actually occur in the test labels.
unique_classes = np.unique(y_test)
n_classes = len(unique_classes)

print(f"\nDetailed metrics for each class:")
print(f"{'Class':<10} {'Precision':<12} {'Recall':<12} {'F1-Score':<12} {'Support':<10}")
print("-" * 60)

for class_id in unique_classes:
    # One-vs-rest view of the problem: 1 for this class, 0 for everything else.
    y_test_binary = (y_test == class_id).astype(int)
    y_pred_binary = (best_predictions == class_id).astype(int)

    # Binary precision / recall / F1 for the current class.
    prec, rec, f1 = (
        scorer(y_test_binary, y_pred_binary, zero_division=0)
        for scorer in (precision_score, recall_score, f1_score)
    )
    # Support is the number of test samples belonging to this class.
    support = y_test_binary.sum()

    print(f"Class {class_id:<4} {prec:<12.4f} {rec:<12.4f} {f1:<12.4f} {support:<10}")

# Generate classification report
print("\n" + "=" * 80)
print("STEP 4: CLASSIFICATION REPORT")
print("=" * 80)

# Text summary of per-class precision / recall / F1 for the best model.
report_text = classification_report(y_test, best_predictions)
print(f"\nClassification Report for {best_model_name}:")
print(report_text)

# Calculate and display confusion matrix statistics
print("\n" + "=" * 80)
print("STEP 5: CONFUSION MATRIX STATISTICS")
print("=" * 80)

# True Positives, False Positives, etc. for each class
tp = np.diag(cm)                 # correct predictions sit on the diagonal
fp = cm.sum(axis=0) - tp         # column total minus the diagonal
fn = cm.sum(axis=1) - tp         # row total minus the diagonal
tn = cm.sum() - (tp + fp + fn)   # everything not counted above

# confusion_matrix (called without `labels`) orders rows/columns by the sorted
# labels that occur in either the true or the predicted vector.  Row k is
# therefore NOT class k in general, and the matrix can be larger than the
# number of classes present in y_test alone.  Recover the real labels so each
# row is reported under its class id and no row is dropped.
cm_labels = np.union1d(y_test, best_predictions)
if len(cm_labels) != cm.shape[0]:
    # Defensive fallback: keep positional labels if the sizes ever disagree.
    cm_labels = np.arange(cm.shape[0])

print(f"\n{'Class':<10} {'TP':<8} {'FP':<8} {'FN':<8} {'TN':<8}")
print("-" * 45)
for row_idx, class_id in enumerate(cm_labels):
    print(f"Class {class_id:<4} {tp[row_idx]:<8} {fp[row_idx]:<8} {fn[row_idx]:<8} {tn[row_idx]:<8}")

# Calculate overall error metrics
print("\n" + "=" * 80)
print("STEP 6: ERROR ANALYSIS")
print("=" * 80)

# Hit / miss counts for the best model on the test set.
total_samples = len(y_test)
correct_predictions = np.sum(y_test == best_predictions)
incorrect_predictions = total_samples - correct_predictions

correct_pct = correct_predictions/total_samples*100
incorrect_pct = incorrect_predictions/total_samples*100

print(f"Total Test Samples: {total_samples}")
print(f"Correct Predictions: {correct_predictions} ({correct_pct:.2f}%)")
print(f"Incorrect Predictions: {incorrect_predictions} ({incorrect_pct:.2f}%)")

# Calculate bias and variance indicators
print("\n" + "=" * 80)
print("STEP 7: BIAS-VARIANCE ANALYSIS")
print("=" * 80)

# Compare training and test performance (if training data available).
# NOTE(review): features_selected.csv looks like the complete dataset, so the
# "training" accuracy below probably includes the test rows -- confirm.
try:
    X_train = pd.read_csv('/content/sample_data/features_selected.csv')
    y_train = pd.read_csv('/content/sample_data/target_encoded.csv').values.ravel()

    train_pred = best_model.predict(X_train)
    train_acc = accuracy_score(y_train, train_pred)
    test_acc = evaluation_results[best_model_name]['accuracy']
    accuracy_gap = train_acc - test_acc

    print(f"Training Accuracy: {train_acc:.4f}")
    print(f"Test Accuracy: {test_acc:.4f}")
    print(f"Difference: {abs(accuracy_gap):.4f}")

    # Heuristic verdict: a gap above 0.10 is flagged as overfitting, otherwise
    # a test accuracy below 0.60 is flagged as underfitting.
    if accuracy_gap > 0.10:
        verdict_lines = (
            "\n⚠ High variance detected (overfitting)",
            "   Model performs significantly better on training data",
        )
    elif test_acc < 0.60:
        verdict_lines = (
            "\n⚠ High bias detected (underfitting)",
            "   Model performs poorly on both training and test data",
        )
    else:
        verdict_lines = (
            "\n✓ Good bias-variance tradeoff",
            "   Model generalizes well",
        )
    for verdict_line in verdict_lines:
        print(verdict_line)

except Exception as e:
    # Best effort: the analysis is skipped (with a message) if anything fails.
    print(f"Could not perform bias-variance analysis: {e}")

# Save all evaluation results
print("\n" + "=" * 80)
print("SAVING EVALUATION RESULTS")
print("=" * 80)

# Save comparison table
comparison_df.to_csv('/content/sample_data/model_evaluation_comparison.csv', index=False)

# Save confusion matrix.
# confusion_matrix orders rows/columns by the sorted labels seen in the true
# or predicted vector; store those labels as the CSV header so the class id of
# every column survives the round trip (consumers that read `.values` are
# unaffected because only the header line changes).
cm_labels = np.union1d(y_test, best_predictions)
if len(cm_labels) == cm.shape[0]:
    cm_df = pd.DataFrame(cm, columns=cm_labels)
    class_labels = cm_labels.tolist()
else:
    # Defensive fallback: positional header, as before.
    cm_df = pd.DataFrame(cm)
    class_labels = list(range(cm.shape[0]))
cm_df.to_csv('/content/sample_data/confusion_matrix.csv', index=False)

# Save detailed results (plain Python types only, so json can serialize them)
best_metrics = evaluation_results[best_model_name]
detailed_results = {
    'best_model': best_model_name,
    'metrics': {
        'accuracy': float(best_metrics['accuracy']),
        'precision': float(best_metrics['precision']),
        'recall': float(best_metrics['recall']),
        'f1_score': float(best_metrics['f1_score'])
    },
    'confusion_matrix': cm.tolist(),
    # Class id for each confusion-matrix row/column, in matrix order.
    'class_labels': class_labels,
    'n_classes': int(n_classes),
    'test_samples': int(total_samples)
}

import json
with open('/content/sample_data/evaluation_results.json', 'w') as f:
    json.dump(detailed_results, f, indent=2)

print("✓ model_evaluation_comparison.csv - Model comparison table")
print("✓ confusion_matrix.csv - Confusion matrix")
print("✓ evaluation_results.json - Detailed evaluation results")



[INFO] Test set: 127 samples
[INFO] Number of classes: 24
[INFO] Loaded 3 optimized models

STEP 1: EVALUATE INDIVIDUAL MODELS

Evaluating: Random Forest
Accuracy:  0.8031
Precision: 0.6942
Recall:    0.8031
F1-Score:  0.7255

Evaluating: Logistic Regression
Accuracy:  0.7953
Precision: 0.6578
Recall:    0.7953
F1-Score:  0.7098

Evaluating: SVM
Accuracy:  0.7874
Precision: 0.6215
Recall:    0.7874
F1-Score:  0.6942

MODEL COMPARISON TABLE
              Model  Accuracy  Precision   Recall  F1-Score
      Random Forest  0.803150   0.694215 0.803150  0.725513
Logistic Regression  0.795276   0.657803 0.795276  0.709846
                SVM  0.787402   0.621471 0.787402  0.694208

✓ Best Model: Random Forest
  F1-Score: 0.7255

STEP 2: CONFUSION MATRIX (BEST MODEL)

Confusion Matrix for Random Forest:
[[ 1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  1  0  0  0  0  0  0  0  0  0  

In [None]:


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import warnings
warnings.filterwarnings('ignore')

# Set style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (10, 8)
plt.rcParams['font.size'] = 10

print("=" * 80)
print("VISUALIZATION")
print("=" * 80)

# Inputs produced by the evaluation step: the test split, the per-model
# metric table and the best model's confusion matrix.
X_test = pd.read_csv('/content/sample_data/X_test.csv')
y_test = pd.read_csv('/content/sample_data/y_test.csv').values.ravel()
comparison_df = pd.read_csv('/content/sample_data/model_evaluation_comparison.csv')
cm = pd.read_csv('/content/sample_data/confusion_matrix.csv').to_numpy()

print(f"[INFO] Loaded evaluation data")

# The evaluation summary records which model was ranked best.
import json
with open('/content/sample_data/evaluation_results.json', 'r') as results_file:
    eval_results = json.load(results_file)
best_model_name = eval_results['best_model']

print(f"[INFO] Best model: {best_model_name}")

# Visualization 1: Confusion Matrix Heatmap
print("\n" + "=" * 80)
print("VISUALIZATION 1: CONFUSION MATRIX HEATMAP")
print("=" * 80)

# The matrix rows/columns follow the sorted class ids seen in the true or
# predicted labels, so position k is not class k in general.  When the matrix
# has exactly as many rows as there are distinct test labels, those labels ARE
# the row/column classes; otherwise fall back to positional ticks.
test_labels = np.unique(y_test)
if len(test_labels) == len(cm):
    cm_tick_labels = list(test_labels)
else:
    cm_tick_labels = list(range(len(cm)))

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', square=True,
            xticklabels=cm_tick_labels, yticklabels=cm_tick_labels,
            cbar_kws={'label': 'Count'}, ax=ax)
plt.title(f'Confusion Matrix - {best_model_name}', fontsize=14, fontweight='bold')
plt.xlabel('Predicted Class', fontsize=12)
plt.ylabel('True Class', fontsize=12)
plt.tight_layout()
plt.savefig('/content/sample_data/confusion_matrix_heatmap.png', dpi=300, bbox_inches='tight')
print("✓ Saved: confusion_matrix_heatmap.png")
plt.close()

# Visualization 2: Model Comparison - Accuracy
print("\n" + "=" * 80)
print("VISUALIZATION 2: MODEL COMPARISON - ACCURACY")
print("=" * 80)

# Highlight the best model in green; every other bar is blue.
bar_colors = [
    '#2ecc71' if candidate == best_model_name else '#3498db'
    for candidate in comparison_df['Model']
]

fig, ax = plt.subplots(figsize=(12, 6))
bars = ax.bar(comparison_df['Model'], comparison_df['Accuracy'], color=bar_colors)
ax.set_xlabel('Model', fontsize=12)
ax.set_ylabel('Accuracy', fontsize=12)
ax.set_title('Model Comparison - Accuracy on Test Set', fontsize=14, fontweight='bold')
ax.set_ylim([0, 1.0])
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', alpha=0.3)

# Write each accuracy just above its bar.
for bar in bars:
    height = bar.get_height()
    label_x = bar.get_x() + bar.get_width()/2.
    ax.text(label_x, height, f'{height:.3f}',
            ha='center', va='bottom', fontsize=9)

plt.tight_layout()
plt.savefig('/content/sample_data/model_comparison_accuracy.png', dpi=300, bbox_inches='tight')
print("✓ Saved: model_comparison_accuracy.png")
plt.close()

# Visualization 3: Model Comparison - All Metrics
print("\n" + "=" * 80)
print("VISUALIZATION 3: MODEL COMPARISON - ALL METRICS")
print("=" * 80)

fig, ax = plt.subplots(figsize=(14, 8))

# Grouped bars: one group per model, one bar per metric.
x = np.arange(len(comparison_df))
width = 0.2

metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
colors = ['#3498db', '#e74c3c', '#f39c12', '#2ecc71']

for offset, (metric, metric_color) in enumerate(zip(metrics, colors)):
    ax.bar(x + offset*width, comparison_df[metric], width,
           label=metric, color=metric_color, alpha=0.8)

ax.set_xlabel('Model', fontsize=12)
ax.set_ylabel('Score', fontsize=12)
ax.set_title('Model Performance Comparison - Multiple Metrics', fontsize=14, fontweight='bold')
# Centre each tick under its group of four bars.
ax.set_xticks(x + width * 1.5)
ax.set_xticklabels(comparison_df['Model'], rotation=45, ha='right')
ax.legend(loc='lower right', fontsize=10)
ax.set_ylim([0, 1.0])
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig('/content/sample_data/model_comparison_all_metrics.png', dpi=300, bbox_inches='tight')
print("✓ Saved: model_comparison_all_metrics.png")
plt.close()

# Visualization 4: Feature Importance (for tree-based models)
print("\n" + "=" * 80)
print("VISUALIZATION 4: FEATURE IMPORTANCE")
print("=" * 80)

try:
    with open('/content/sample_data/optimized_models.pkl', 'rb') as f:
        models = pickle.load(f)

    # Only the best model is inspected; nothing is drawn if it is missing
    # from the pickle.
    if best_model_name in models:
        best_model = models[best_model_name]

        if not hasattr(best_model, 'feature_importances_'):
            print("  Best model doesn't have feature_importances_ attribute")
        else:
            feature_importance = best_model.feature_importances_
            feature_names = X_test.columns

            # Pair every feature with its importance and keep the 20 largest.
            importance_df = pd.DataFrame({
                'Feature': feature_names,
                'Importance': feature_importance
            })
            importance_df = importance_df.sort_values('Importance', ascending=False).head(20)

            # Horizontal bars with the most important feature at the top.
            fig, ax = plt.subplots(figsize=(12, 10))
            bars = ax.barh(importance_df['Feature'], importance_df['Importance'],
                           color='#3498db')
            ax.set_xlabel('Importance', fontsize=12)
            ax.set_ylabel('Feature', fontsize=12)
            ax.set_title(f'Top 20 Feature Importances - {best_model_name}',
                         fontsize=14, fontweight='bold')
            ax.invert_yaxis()
            plt.grid(axis='x', alpha=0.3)
            plt.tight_layout()
            plt.savefig('/content/sample_data/feature_importance.png', dpi=300, bbox_inches='tight')
            print("✓ Saved: feature_importance.png")
            plt.close()

except Exception as e:
    # Best effort: a failure here must not stop the remaining figures.
    print(f"  Could not create feature importance plot: {e}")

# Visualization 5: Correlation Matrix
print("\n" + "=" * 80)
print("VISUALIZATION 5: CORRELATION MATRIX")
print("=" * 80)

# Load original features for correlation
X_features = pd.read_csv('/content/sample_data/features_selected.csv')

# Restrict to the first 15 features listed in feature_scores.csv so the
# heatmap stays readable.
# NOTE(review): assumes that file is already ordered best-first -- confirm.
feature_scores = pd.read_csv('/content/sample_data/feature_scores.csv')
top_15_features = feature_scores['feature'].head(15).tolist()
X_subset = X_features.loc[:, top_15_features]

# Pairwise correlation between the selected features.
corr_matrix = X_subset.corr()

# Plot
fig, ax = plt.subplots(figsize=(14, 12))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={'label': 'Correlation'},
)
plt.title('Feature Correlation Matrix (Top 15 Features)',
          fontsize=14, fontweight='bold')
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.savefig('/content/sample_data/correlation_matrix.png', dpi=300, bbox_inches='tight')
print("✓ Saved: correlation_matrix.png")
plt.close()

# Visualization 6: Class Distribution
print("\n" + "=" * 80)
print("VISUALIZATION 6: CLASS DISTRIBUTION")
print("=" * 80)

# Number of test samples per class, ordered by class id.
class_counts = pd.Series(y_test).value_counts().sort_index()
bar_positions = range(len(class_counts))

fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(bar_positions, class_counts.values,
              color='#3498db', alpha=0.8)
ax.set_xlabel('Class', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
ax.set_title('Test Set Class Distribution', fontsize=14, fontweight='bold')
# Bars sit at positions 0..n-1; label each with its actual class id.
ax.set_xticks(bar_positions)
ax.set_xticklabels([f'Class {class_id}' for class_id in class_counts.index])
plt.grid(axis='y', alpha=0.3)

# Write the count above each bar.
for bar in bars:
    height = bar.get_height()
    label_x = bar.get_x() + bar.get_width()/2.
    ax.text(label_x, height, f'{int(height)}',
            ha='center', va='bottom', fontsize=9)

plt.tight_layout()
plt.savefig('/content/sample_data/class_distribution.png', dpi=300, bbox_inches='tight')
print("✓ Saved: class_distribution.png")
plt.close()

# Create summary visualization
print("\n" + "=" * 80)
print("VISUALIZATION 7: SUMMARY DASHBOARD")
print("=" * 80)

# 3x2 grid: metrics across the top row, confusion matrix and class counts in
# the middle row, summary table across the bottom row.
fig = plt.figure(figsize=(16, 12))
gs = fig.add_gridspec(3, 2, hspace=0.3, wspace=0.3)

# Subplot 1: Model Comparison
ax1 = fig.add_subplot(gs[0, :])
x = np.arange(len(comparison_df))
width = 0.2
for i, metric in enumerate(['Accuracy', 'Precision', 'Recall', 'F1-Score']):
    ax1.bar(x + i*width, comparison_df[metric], width,
            label=metric, alpha=0.8)
ax1.set_xlabel('Model')
ax1.set_ylabel('Score')
ax1.set_title('Model Performance Comparison', fontweight='bold')
ax1.set_xticks(x + width * 1.5)
ax1.set_xticklabels(comparison_df['Model'], rotation=45, ha='right')
ax1.legend()
ax1.grid(axis='y', alpha=0.3)
ax1.set_ylim([0, 1.0])

# Subplot 2: Confusion Matrix
# Matrix position k is not class k in general: rows/columns follow the sorted
# class ids seen in the labels.  When the matrix has as many rows as there are
# distinct test labels, those labels are the row/column classes; otherwise
# keep positional ticks.
test_labels = np.unique(y_test)
if len(test_labels) == len(cm):
    cm_tick_labels = list(test_labels)
else:
    cm_tick_labels = list(range(len(cm)))

ax2 = fig.add_subplot(gs[1, 0])
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', square=True,
            xticklabels=cm_tick_labels, yticklabels=cm_tick_labels,
            cbar=False, ax=ax2)
ax2.set_title(f'Confusion Matrix - {best_model_name}', fontweight='bold')
ax2.set_xlabel('Predicted')
ax2.set_ylabel('True')
ax2.tick_params(labelsize=7)

# Subplot 3: Class Distribution
# Bars are drawn at positions 0..n-1, so label them with the real class ids
# (matching the standalone class-distribution figure).
ax3 = fig.add_subplot(gs[1, 1])
ax3.bar(range(len(class_counts)), class_counts.values, color='#3498db', alpha=0.8)
ax3.set_xticks(range(len(class_counts)))
ax3.set_xticklabels(class_counts.index, rotation=90, fontsize=8)
ax3.set_xlabel('Class')
ax3.set_ylabel('Count')
ax3.set_title('Test Set Class Distribution', fontweight='bold')
ax3.grid(axis='y', alpha=0.3)

# Subplot 4: Metrics Table
ax4 = fig.add_subplot(gs[2, :])
ax4.axis('tight')
ax4.axis('off')

best_metrics = eval_results['metrics']
table_data = [
    ['Metric', 'Value'],
    ['Best Model', best_model_name],
    ['Accuracy', f"{best_metrics['accuracy']:.4f}"],
    ['Precision', f"{best_metrics['precision']:.4f}"],
    ['Recall', f"{best_metrics['recall']:.4f}"],
    ['F1-Score', f"{best_metrics['f1_score']:.4f}"],
    ['Test Samples', str(eval_results['test_samples'])],
    ['Classes', str(eval_results['n_classes'])],
]

table = ax4.table(cellText=table_data, cellLoc='left', loc='center',
                  colWidths=[0.3, 0.3])
table.auto_set_font_size(False)
table.set_fontsize(11)
table.scale(1, 2)

# Style header row
for col in range(2):
    table[(0, col)].set_facecolor('#3498db')
    table[(0, col)].set_text_props(weight='bold', color='white')

# Style data rows: shade every even-numbered row for readability
for row in range(1, len(table_data)):
    if row % 2 == 0:
        for col in range(2):
            table[(row, col)].set_facecolor('#ecf0f1')

ax4.set_title('Model Performance Summary', fontweight='bold', fontsize=14, pad=20)

fig.savefig('/content/sample_data/summary_dashboard.png', dpi=300, bbox_inches='tight')
print("✓ Saved: summary_dashboard.png")
plt.close(fig)

# Summary
print("\n" + "=" * 80)
print("ALL VISUALIZATIONS SAVED")
print("=" * 80)

# Figures this cell is expected to produce, in creation order.
# NOTE: the list is static; it does not check that each file was written.
for figure_name in (
    "confusion_matrix_heatmap.png",
    "model_comparison_accuracy.png",
    "model_comparison_all_metrics.png",
    "feature_importance.png",
    "correlation_matrix.png",
    "class_distribution.png",
    "summary_dashboard.png",
):
    print(f"✓ {figure_name}")


VISUALIZATION
[INFO] Loaded evaluation data
[INFO] Best model: Random Forest

VISUALIZATION 1: CONFUSION MATRIX HEATMAP
✓ Saved: confusion_matrix_heatmap.png

VISUALIZATION 2: MODEL COMPARISON - ACCURACY
✓ Saved: model_comparison_accuracy.png

VISUALIZATION 3: MODEL COMPARISON - ALL METRICS
✓ Saved: model_comparison_all_metrics.png

VISUALIZATION 4: FEATURE IMPORTANCE
✓ Saved: feature_importance.png

VISUALIZATION 5: CORRELATION MATRIX
✓ Saved: correlation_matrix.png

VISUALIZATION 6: CLASS DISTRIBUTION
✓ Saved: class_distribution.png

VISUALIZATION 7: SUMMARY DASHBOARD
✓ Saved: summary_dashboard.png

ALL VISUALIZATIONS SAVED
✓ confusion_matrix_heatmap.png
✓ model_comparison_accuracy.png
✓ model_comparison_all_metrics.png
✓ feature_importance.png
✓ correlation_matrix.png
✓ class_distribution.png
✓ summary_dashboard.png


In [None]:


import pandas as pd
import numpy as np
import json
from datetime import datetime

print("=" * 80)
print("PART 8: FINAL PROJECT REPORT GENERATION")
print("=" * 80)


def _load_json(path):
    """Parse the JSON file at ``path`` and return the decoded object.

    The file is opened inside a ``with`` block so the handle is closed as soon
    as parsing finishes, including when ``json.load`` raises.
    """
    with open(path, encoding='utf-8') as fh:
        return json.load(fh)


# Load all results
# Every artefact below is required by the report sections that follow, so a
# failure to load any of them aborts the cell.
try:
    comparison_df = pd.read_csv('/content/sample_data/model_evaluation_comparison.csv')
    eval_results = _load_json('/content/sample_data/evaluation_results.json')
    feature_metadata = _load_json('/content/sample_data/feature_metadata.json')
    best_params = _load_json('/content/sample_data/best_hyperparameters.json')
    feature_scores = pd.read_csv('/content/sample_data/feature_scores.csv')

    print("[INFO] All result files loaded successfully")
except Exception as e:
    print(f"[ERROR] Could not load result files: {e}")
    # `exit()` is the interactive-shell helper and is not meant for programs.
    # NOTE(review): in IPython/Jupyter kernels it appears to schedule a kernel
    # shutdown without interrupting the running cell, which would let the code
    # below fail with NameError. Raising SystemExit stops execution here in
    # both plain Python and notebooks, with the same exit status.
    raise SystemExit(1) from e

# Generate report
# The report is built as a list of text lines; it opens with the title banner
# and the generation timestamp.
report = [
    "=" * 80,
    "PROTEIN STRUCTURE CLASSIFICATION - FINAL PROJECT REPORT",
    "=" * 80,
    f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
    "",
]

# Section 1: Project Overview (static description of the task)
report += [
    "=" * 80,
    "1. PROJECT OVERVIEW",
    "=" * 80,
    "",
    "Objective: Classify protein structures based on their biochemical properties",
    "Dataset: Protein structure data with 10 features",
    "Target: Protein classification (multi-class classification)",
    "",
]

# Section 2: Novelty
# Mostly fixed text; only the engineered-feature total is read from
# feature_metadata.
report.extend([
    "=" * 80,
    "2. PROJECT NOVELTY",
    "=" * 80,
    "",
    "This project incorporates several novel approaches:",
    "",
    "2.1 Advanced Feature Engineering",
    "   - Interaction features: Domain-specific feature combinations",
    "   - Polynomial features: Degree-2 polynomial expansion",
    f"   - Total engineered features: {feature_metadata['total_features']}",
    "",
    "2.2 Ensemble Feature Selection",
    "   - Combined ANOVA F-test, Mutual Information, and Random Forest",
    "   - Weighted ensemble scoring (40% + 30% + 30%)",
    "",
    "2.3 Hybrid Ensemble Learning",
    "   - Stacking ensemble with diverse base models",
    "   - Meta-learner: Logistic Regression",
    "   - Comparison with voting ensemble",
    "",
    "2.4 Hyperparameter Optimization",
    "   - Grid Search CV with stratified k-fold",
    "   - Optimized all ensemble components",
    "",
])

# Section 3: Methodology
# Feature counts are taken from feature_metadata; the remaining lines are
# fixed descriptive text.
report.extend((
    "=" * 80,
    "3. METHODOLOGY",
    "=" * 80,
    "",
    "3.1 Data Preprocessing",
    "   - Stratified sampling: 800 samples from 5000",
    "   - Missing value imputation: Median (numerical), Mode (categorical)",
    "   - Label encoding for categorical variables",
    "   - Standard scaling for numerical features",
    "",
    "3.2 Feature Engineering",
    f"   - Interaction features: {len(feature_metadata['interaction_features'])}",
    f"   - Polynomial features: {len(feature_metadata['polynomial_features'])}",
    f"   - Original features (encoded): {len(feature_metadata['original_features'])}",
    "",
    "3.3 Feature Selection",
    f"   - Features before selection: {feature_metadata['total_features']}",
    f"   - Features after selection: {len(feature_metadata['feature_names'])}",
    "",
    "3.4 Model Training",
    "   - Train/Val/Test split: 70%/15%/15%",
    "   - Cross-validation: 5-fold stratified",
    "   - Base models: Logistic Regression, Decision Tree, Random Forest,",
    "     Gradient Boosting, SVM, Naive Bayes, K-Nearest Neighbors",
    "",
))

# Section 4: Results
# 4.1 reports the best model's metrics from eval_results, formatted to four
# decimals; 4.2 opens a fixed-width comparison table.
report += [
    "=" * 80,
    "4. RESULTS",
    "=" * 80,
    "",
    "4.1 Best Model Performance",
    f"   Best Model: {eval_results['best_model']}",
    f"   Accuracy:   {eval_results['metrics']['accuracy']:.4f}",
    f"   Precision:  {eval_results['metrics']['precision']:.4f}",
    f"   Recall:     {eval_results['metrics']['recall']:.4f}",
    f"   F1-Score:   {eval_results['metrics']['f1_score']:.4f}",
    "",
    "4.2 Model Comparison",
    "",
    f"{'Model':<30} {'Accuracy':<12} {'Precision':<12} {'Recall':<12} {'F1-Score':<12}",
    "-" * 88,
]

# One table line per row of comparison_df, in DataFrame row order.
report.extend(
    f"{model_row['Model']:<30} {model_row['Accuracy']:<12.4f} {model_row['Precision']:<12.4f} "
    f"{model_row['Recall']:<12.4f} {model_row['F1-Score']:<12.4f}"
    for _, model_row in comparison_df.iterrows()
)

# 4.3: test-set size and class count as stored in eval_results.
report += [
    "",
    "4.3 Test Set Statistics",
    f"   Total test samples: {eval_results['test_samples']}",
    f"   Number of classes: {eval_results['n_classes']}",
    "",
]

# Section 5: Hyperparameters
# One numbered sub-section (5.1, 5.2, ...) per entry of best_params, followed
# by that model's parameter/value pairs.
report.append("=" * 80)
report.append("5. OPTIMIZED HYPERPARAMETERS")
report.append("=" * 80)
report.append("")
# enumerate() supplies the sub-section number directly, instead of rebuilding
# the key list and searching it on every iteration.
for section_no, (model_name, params) in enumerate(best_params.items(), start=1):
    report.append(f"5.{section_no} {model_name}")
    for param, value in params.items():
        report.append(f"   {param}: {value}")
    report.append("")

# Section 6: Top Features
# Lists the first 15 rows of feature_scores with a 1-based rank.
# NOTE(review): assumes feature_scores is already sorted by ensemble_score -- confirm.
top_15 = feature_scores.head(15)
report.extend([
    "=" * 80,
    "6. TOP FEATURES (by Ensemble Score)",
    "=" * 80,
    "",
    f"{'Rank':<6} {'Feature':<40} {'Score':<10}",
    "-" * 60,
])
# Walk the two columns in parallel rather than materialising each row.
for rank, (feat_name, feat_score) in enumerate(
        zip(top_15['feature'], top_15['ensemble_score']), start=1):
    report.append(f"{rank:<6} {feat_name:<40} {feat_score:<10.4f}")
report.append("")

# Section 7: Confusion Matrix
# Renders eval_results['confusion_matrix'] as a fixed-width text grid with
# P<j> column labels (predicted) and T<i> row labels (true).
report.append("=" * 80)
report.append("7. CONFUSION MATRIX")
report.append("=" * 80)
report.append("")
cm = eval_results['confusion_matrix']
report.append("Confusion Matrix (rows=true, columns=predicted):")
report.append("")

# Format confusion matrix
cm_array = np.array(cm)
n_classes = len(cm_array)

# Column width: 5 characters by default, widened only when the largest count
# or the longest label would otherwise run into the neighbouring column.
widest_count = len(str(cm_array.max())) if cm_array.size else 1
widest_label = len(f"P{max(n_classes - 1, 0)}")
cell_width = max(5, widest_count + 1, widest_label + 1)

# Header
header = " " * cell_width + "".join(
    f"{'P' + str(i):<{cell_width}}" for i in range(n_classes))
report.append(header)
report.append(" " * cell_width + "-" * (cell_width * n_classes))

# Rows
# The row label is padded to the same width as the header indent, so rows stay
# aligned with their columns for class indices of 10 and above as well.
for i, row in enumerate(cm_array):
    row_str = f"{'T' + str(i):<{cell_width}}" + "".join(
        f"{val:<{cell_width}}" for val in row)
    report.append(row_str)
report.append("")

# Section 8: Evaluation Metrics Formulas
# Static reference text: the four metric definitions and a legend for the
# TP/TN/FP/FN abbreviations.
report.extend([
    "=" * 80,
    "8. EVALUATION METRICS (FORMULAS)",
    "=" * 80,
    "",
    "Accuracy  = (TP + TN) / (TP + TN + FP + FN)",
    "Precision = TP / (TP + FP)",
    "Recall    = TP / (TP + FN)",
    "F1-Score  = 2 * (Precision * Recall) / (Precision + Recall)",
    "",
    "Where:",
    "  TP = True Positives",
    "  TN = True Negatives",
    "  FP = False Positives",
    "  FN = False Negatives",
    "",
])

# Section 9: Cross-Validation
# Static description of the cross-validation setup and its stated benefits.
report += [
    "=" * 80,
    "9. CROSS-VALIDATION STRATEGY",
    "=" * 80,
    "",
    "Method: Stratified K-Fold Cross-Validation",
    "K-Folds: 5",
    "Benefits:",
    "  - Maintains class distribution in each fold",
    "  - Reduces overfitting",
    "  - Provides robust performance estimates",
    "  - Better for imbalanced datasets",
    "",
]

# Section 10: Bias-Variance Analysis
# Static narrative text; nothing in this section is computed from results.
report.extend((
    "=" * 80,
    "10. BIAS-VARIANCE TRADEOFF",
    "=" * 80,
    "",
    "Model Complexity: Medium-High",
    "  - Ensemble methods reduce variance",
    "  - Feature engineering increases model expressiveness",
    "  - Regularization (L2 in Logistic Regression) controls overfitting",
    "",
    "Expected Behavior:",
    "  - Low bias: Model captures complex patterns",
    "  - Controlled variance: Cross-validation and ensemble prevent overfitting",
    "",
))

# Section 11: Key Findings
# Findings 1, 3 and 4 are fixed narrative text. Finding 2 names the model that
# actually scored best (eval_results['best_model']) rather than asserting that
# the stacking ensemble won, so the report cannot contradict its own results
# section.
report.append("=" * 80)
report.append("11. KEY FINDINGS")
report.append("=" * 80)
report.append("")
report.append("1. Feature Engineering Impact:")
report.append("   Interaction and polynomial features improved model performance")
report.append("")
report.append("2. Ensemble Approach:")
report.append(f"   Best-performing model on the test set: {eval_results['best_model']}")
report.append("")
report.append("3. Hyperparameter Optimization:")
report.append("   Grid search improved model accuracy by fine-tuning parameters")
report.append("")
report.append("4. Feature Selection:")
report.append("   Ensemble feature selection identified most predictive features")
report.append("")

# Section 12: Conclusion
# Closing summary; the two final metrics are read from eval_results and
# formatted to four decimals. A rule line ends the report.
report += [
    "=" * 80,
    "12. CONCLUSION",
    "=" * 80,
    "",
    "This project successfully implemented a hybrid ensemble approach for",
    "protein structure classification with the following novelties:",
    "",
    "- Advanced feature engineering with domain-specific interactions",
    "- Ensemble feature selection combining multiple methods",
    "- Stacking ensemble with optimized hyperparameters",
    "- Comprehensive evaluation using multiple metrics",
    "",
    f"Final Model Accuracy: {eval_results['metrics']['accuracy']:.4f}",
    f"Final Model F1-Score: {eval_results['metrics']['f1_score']:.4f}",
    "",
    "=" * 80,
]

# Save report
# Join the accumulated lines into a single newline-separated document.
report_text = "\n".join(report)

# The report embeds data-derived text (model names, feature names, parameter
# values), so the encoding is pinned to UTF-8 instead of relying on the
# platform default.
with open('/content/sample_data/PROJECT_REPORT.txt', 'w', encoding='utf-8') as f:
    f.write(report_text)

# Echo the full report into the cell output as well.
print("\n" + report_text)

print("\n" + "=" * 80)
print("REPORT SAVED")
print("=" * 80)
print("✓ PROJECT_REPORT.txt - Complete project report")

# Closing banner: fixed list of the artefact names produced across the project.
print("\n[SUCCESS] Part 8 completed!")
print("\nAll project parts completed successfully!")
print("\nGenerated files:")
print("  Data: data_sampled_clean.csv, features_engineered.csv")
print("  Models: trained_models.pkl, optimized_models.pkl")
print("  Results: model_evaluation_comparison.csv, confusion_matrix.csv")
print("  Visualizations: *.png files")
print("  Report: PROJECT_REPORT.txt")

PART 8: FINAL PROJECT REPORT GENERATION
[INFO] All result files loaded successfully

PROTEIN STRUCTURE CLASSIFICATION - FINAL PROJECT REPORT
Generated: 2025-12-19 14:16:19

1. PROJECT OVERVIEW

Objective: Classify protein structures based on their biochemical properties
Dataset: Protein structure data with 10 features
Target: Protein classification (multi-class classification)

2. PROJECT NOVELTY

This project incorporates several novel approaches:

2.1 Advanced Feature Engineering
   - Interaction features: Domain-specific feature combinations
   - Polynomial features: Degree-2 polynomial expansion
   - Total engineered features: 21

2.2 Ensemble Feature Selection
   - Combined ANOVA F-test, Mutual Information, and Random Forest
   - Weighted ensemble scoring (40% + 30% + 30%)

2.3 Hybrid Ensemble Learning
   - Stacking ensemble with diverse base models
   - Meta-learner: Logistic Regression
   - Comparison with voting ensemble

2.4 Hyperparameter Optimization
   - Grid Search CV with