In [1]:
# -*- coding: utf-8 -*-
"""
CODE 1: UPDATED FOR COLUMN COMPATIBILITY
"""

# =============================================================================
# 1. INITIAL SETUP
# =============================================================================
!pip install pandas numpy plotly scikit-learn -q
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV

# =============================================================================
# 2. DATA LOADING WITH COLUMN VALIDATION
# =============================================================================
from google.colab import files

def load_data():
    """Handles CSV upload and column checks"""
    try:
        uploaded = files.upload()
        file_name = list(uploaded.keys())[0]
        df = pd.read_csv(file_name)

        print(f"‚úÖ Successfully loaded {file_name}")
        print(f"üìä Dataset shape: {df.shape}")
        print("\nColumns in your data:", df.columns.tolist())  # Critical for debugging

        return df
    except Exception as e:
        print(f"‚ùå Error: {str(e)}")
        return None

df = load_data()

# Validate expected columns
expected_columns = [
    'age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
    'contact', 'month', 'day_of_wk', 'duration', 'campaign', 'pdays',
    'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
    'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'
]

if not df.empty:
    missing_cols = set(expected_columns) - set(df.columns)
    extra_cols = set(df.columns) - set(expected_columns)

    if missing_cols:
        print(f"\n‚ö†Ô∏è Missing columns: {list(missing_cols)}")
    if extra_cols:
        print(f"‚ö†Ô∏è Extra columns: {list(extra_cols)}")

    print("\n==============================================")
    print("‚úÖ DATA INSPECTION COMPLETE")
    print("==============================================")

# =============================================================================
# 3. ENHANCED DATA CLEANING (UPDATED TO HANDLE NAN)
# =============================================================================
if not df.empty:
    # Create a copy for cleaning
    df_clean = df.copy()

    # A. Handle missing values - COMPREHENSIVE VERSION
    print("\nüîç Missing values before cleaning:")
    print(df_clean.isnull().sum())

    # Special handling for pdays (999 = no previous contact)
    if 'pdays' in df_clean:
        # Create contact flag first
        df_clean['previous_contact'] = df_clean['pdays'].apply(lambda x: 0 if x == 999 else 1)
        # Now convert 999 to NaN
        df_clean['pdays'] = df_clean['pdays'].replace(999, np.nan)
        print("\n‚ôªÔ∏è Created 'previous_contact' flag and replaced pdays=999 with NaN")

    # Fill numerical missing values with median
    num_cols = df_clean.select_dtypes(include=np.number).columns
    for col in num_cols:
        if df_clean[col].isnull().any():
            median_val = df_clean[col].median()
            df_clean[col].fillna(median_val, inplace=True)
            print(f"  ‚Ä¢ Filled missing values in {col} with median: {median_val:.2f}")

    # Fill categorical missing values with mode
    cat_cols = df_clean.select_dtypes(include='object').columns
    for col in cat_cols:
        if df_clean[col].isnull().any():
            mode_val = df_clean[col].mode()[0]
            df_clean[col].fillna(mode_val, inplace=True)
            print(f"  ‚Ä¢ Filled missing values in {col} with mode: '{mode_val}'")

    # B. Handle duplicates
    initial_count = len(df_clean)
    df_clean = df_clean.drop_duplicates()
    final_count = len(df_clean)
    dup_count = initial_count - final_count
    print(f"\n‚ôªÔ∏è Removed {dup_count} duplicate rows")

    # C. Clean categorical inconsistencies
    for col in cat_cols:
        df_clean[col] = df_clean[col].str.lower().str.strip()
        print(f"  ‚Ä¢ Standardized casing/spaces in {col}")

    print("\nüîç Missing values after cleaning:")
    print(df_clean.isnull().sum())

    print("\n==============================================")
    print("‚úÖ DATA CLEANING COMPLETE")
    print(f"Final shape: {df_clean.shape}")
    print("==============================================")

    # Replace original df with cleaned version
    df = df_clean
else:
    print("Skipping cleaning due to empty dataframe")

# =============================================================================
# 4. UPDATED PREPROCESSING WITH NAN HANDLING
# =============================================================================
if not df.empty:
    print("\n==============================================")
    print("‚öôÔ∏è UPDATED FEATURE ENGINEERING")
    print("==============================================")

    # A. Encode target variable
    df['y_encoded'] = df['y'].map({'no': 0, 'yes': 1})
    print("‚úÖ Target variable encoded: 'no'‚Üí0, 'yes'‚Üí1")

    # B. Prepare features and target
    X = df.drop(['y', 'y_encoded'], axis=1)
    y = df['y_encoded']

    # C. Identify feature types
    categorical_cols = X.select_dtypes(include='object').columns
    numerical_cols = X.select_dtypes(include=np.number).columns

    print(f"\nüìä Feature Types:")
    print(f"- Categorical ({len(categorical_cols)}): {list(categorical_cols)}")
    print(f"- Numerical ({len(numerical_cols)}): {list(numerical_cols)}")

    # D. Create preprocessing pipelines with imputation
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import OneHotEncoder, StandardScaler
    from sklearn.impute import SimpleImputer
    from sklearn.pipeline import Pipeline

    # Preprocessing pipeline with NaN handling
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler())
            ]), numerical_cols),
            ('cat', Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='most_frequent')),
                ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore'))
            ]), categorical_cols)
        ])

    # E. Apply preprocessing
    print("\nüîÑ Applying preprocessing with NaN handling:")
    print("- Impute + Scale numerical features")
    print("- Impute + Encode categorical features")

    X_processed = preprocessor.fit_transform(X)

    # Get feature names after transformation
    num_features = numerical_cols.tolist()
    cat_encoder = preprocessor.named_transformers_['cat'].named_steps['encoder']
    cat_features = cat_encoder.get_feature_names_out(categorical_cols)
    all_features = num_features + cat_features.tolist()

    # Create processed DataFrame
    X_processed_df = pd.DataFrame(X_processed, columns=all_features)

    # Final NaN check
    nan_check = X_processed_df.isna().sum().sum()
    print(f"\nüîç NaN values in processed data: {nan_check}")
    if nan_check > 0:
        print("‚ùå WARNING: NaN values still present in processed data")
        print(X_processed_df.isna().sum())
    else:
        print("‚úÖ Successfully removed all NaN values")

    print("\nüîç Processed Features Preview:")
    print(X_processed_df.head())

    print(f"\nüìê Processed dataset shape: {X_processed_df.shape}")
    print("‚úÖ All features processed and ready for modeling")

    # Update our datasets
    processed_df = pd.concat([X_processed_df, y.reset_index(drop=True)], axis=1)

    print("\n==============================================")
    print("‚úÖ PREPROCESSING COMPLETE")
    print("==============================================")
else:
    print("Skipping preprocessing due to empty dataframe")

Saving bankmarketing.csv to bankmarketing.csv
‚úÖ Successfully loaded bankmarketing.csv
üìä Dataset shape: (41188, 21)

Columns in your data: ['age', 'job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed', 'y']

‚ö†Ô∏è Missing columns: ['day_of_wk']
‚ö†Ô∏è Extra columns: ['day_of_week']

‚úÖ DATA INSPECTION COMPLETE

üîç Missing values before cleaning:
age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64

‚ôªÔ∏è Created 'previous_contact'

In [2]:
# =============================================================================
# 5. TRAIN-TEST SPLIT
# =============================================================================

# Split into train (80%) and test (20%)
X_train, X_test, y_train, y_test = train_test_split(
    X_processed_df, y, test_size=0.2, random_state=42, stratify=y
)

print("\n==============================================")
print("‚úÇÔ∏è TRAIN-TEST SPLIT COMPLETE")
print("==============================================")
print(f"Train set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")
print(f"Positive class in train: {y_train.mean():.2%}")
print(f"Positive class in test: {y_test.mean():.2%}")

# =============================================================================
# 6. MODEL TRAINING & EVALUATION
# =============================================================================

# Initialize models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5)
}

# Store results
results = []

# Train and evaluate models
print("\n==============================================")
print("ü§ñ MODEL TRAINING & EVALUATION")
print("==============================================")

for name, model in models.items():
    print(f"\nüöÄ Training {name}...")

    # Train model
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Store results
    results.append({
        "Model": name,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1-Score": f1
    })

    # Print metrics
    print(f"\nüìä {name} Performance:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")

    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    print("\nConfusion Matrix:")
    print(cm)

    # Classification report
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Display results comparison
print("\n==============================================")
print("üèÜ MODEL COMPARISON")
print("==============================================")
print(results_df)

# Visualize metrics comparison
fig = px.bar(
    results_df.melt(id_vars="Model", var_name="Metric", value_name="Value"),
    x="Model",
    y="Value",
    color="Metric",
    barmode="group",
    title="Model Performance Comparison",
    text="Value",
    facet_col="Metric",
    facet_col_wrap=2,
    height=500
)
fig.update_traces(texttemplate="%{text:.3f}", textposition="outside")
fig.update_layout(uniformtext_minsize=8, uniformtext_mode="hide")
fig.show()

# =============================================================================
# 7. KNN HYPERPARAMETER TUNING (BONUS)
# =============================================================================

print("\n==============================================")
print("üéõÔ∏è KNN HYPERPARAMETER TUNING")
print("==============================================")

# Define parameter grid
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

knn = KNeighborsClassifier()
grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='f1', n_jobs=-1)

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_knn = grid_search.best_estimator_

print(f"Best Parameters: {best_params}")

# Evaluate best model
y_pred_knn = best_knn.predict(X_test)
f1_tuned = f1_score(y_test, y_pred_knn)

print(f"\nüîç Before Tuning F1: {results[2]['F1-Score']:.4f}")
print(f"After Tuning F1: {f1_tuned:.4f}")

# Update results
results_df.loc[results_df['Model'] == 'K-Nearest Neighbors', 'F1-Score'] = f1_tuned

print("\n==============================================")
print("‚úÖ ALL MODELS EVALUATED")
print("==============================================")


‚úÇÔ∏è TRAIN-TEST SPLIT COMPLETE
Train set: 32940 samples
Test set: 8236 samples
Positive class in train: 11.27%
Positive class in test: 11.27%

ü§ñ MODEL TRAINING & EVALUATION

üöÄ Training Logistic Regression...

üìä Logistic Regression Performance:
Accuracy: 0.9093
Precision: 0.6486
Recall: 0.4256
F1-Score: 0.5140

Confusion Matrix:
[[7094  214]
 [ 533  395]]

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.97      0.95      7308
           1       0.65      0.43      0.51       928

    accuracy                           0.91      8236
   macro avg       0.79      0.70      0.73      8236
weighted avg       0.90      0.91      0.90      8236


üöÄ Training Random Forest...

üìä Random Forest Performance:
Accuracy: 0.9112
Precision: 0.6374
Recall: 0.4925
F1-Score: 0.5556

Confusion Matrix:
[[7048  260]
 [ 471  457]]

Classification Report:
              precision    recall  f1-score   support

           0       0.94


üéõÔ∏è KNN HYPERPARAMETER TUNING
Best Parameters: {'metric': 'euclidean', 'n_neighbors': 9, 'weights': 'distance'}

üîç Before Tuning F1: 0.4984
After Tuning F1: 0.5109

‚úÖ ALL MODELS EVALUATED
