In [None]:
# Employee Salary Prediction - Advanced ML & Deep Learning Analysis
# This notebook demonstrates comprehensive salary prediction using various AI/ML techniques
# Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import warnings
warnings.filterwarnings('ignore')

plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("Libraries imported successfully!")

In [None]:
# Load and Explore Dataset
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Read the census CSV, turn the '?' placeholders into NaN, discard every
# row that still has a missing value and renumber the remaining rows.
data = (
    pd.read_csv('adult.csv')
    .replace('?', np.nan)
    .dropna()
    .reset_index(drop=True)
)
data

In [None]:
# Data Visualization and Analysis
def visualize_data(data):
    """Plot an overview of the dataset: six summary panels and a correlation heatmap.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'age', 'income', 'education', 'workclass',
        'hours-per-week' and 'gender'. Any of the known numerical columns
        that are present are used for the correlation heatmap.

    Returns
    -------
    None
        Both figures are rendered with ``plt.show()``.
    """
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))

    # Age distribution
    ax = axes[0, 0]
    ax.hist(data['age'], bins=30, alpha=0.7, color='skyblue', edgecolor='black')
    ax.set_title('Age Distribution')
    ax.set_xlabel('Age')
    ax.set_ylabel('Frequency')

    # Income distribution
    income_counts = data['income'].value_counts()
    ax = axes[0, 1]
    ax.pie(income_counts.values, labels=income_counts.index, autopct='%1.1f%%',
           colors=['lightcoral', 'lightblue'])
    ax.set_title('Income Distribution')

    # Education levels (ten most frequent)
    education_counts = data['education'].value_counts()
    ax = axes[0, 2]
    ax.barh(education_counts.index[:10], education_counts.values[:10])
    ax.set_title('Top 10 Education Levels')
    ax.set_xlabel('Count')

    # Workclass distribution (eight most frequent)
    workclass_counts = data['workclass'].value_counts()
    ax = axes[1, 0]
    ax.bar(workclass_counts.index[:8], workclass_counts.values[:8],
           color='lightgreen', alpha=0.7)
    ax.set_title('Workclass Distribution')
    ax.set_xlabel('Workclass')
    ax.set_ylabel('Count')
    ax.tick_params(axis='x', rotation=45)

    # Hours per week vs Income
    income_groups = data.groupby('income')['hours-per-week'].mean()
    ax = axes[1, 1]
    ax.bar(income_groups.index, income_groups.values, color=['orange', 'purple'])
    ax.set_title('Average Hours per Week by Income')
    ax.set_xlabel('Income Category')
    ax.set_ylabel('Average Hours/Week')

    # Gender and Income correlation
    gender_income = pd.crosstab(data['gender'], data['income'])
    ax = axes[1, 2]
    gender_income.plot(kind='bar', ax=ax, color=['pink', 'lightblue'])
    ax.set_title('Gender vs Income')
    ax.set_xlabel('Gender')
    ax.set_ylabel('Count')
    # Keep the legend entries pandas derives from the crosstab columns instead
    # of overriding them with hard-coded labels that could disagree with the data.
    ax.legend(title='Income')

    plt.tight_layout()
    plt.show()

    # Correlation heatmap for numerical features. The education column is
    # spelled 'education-num' or 'educational-num' depending on the CSV
    # flavour, so use whichever candidates exist rather than requiring all
    # of them (which silently skipped the heatmap on a single mismatch).
    numerical_candidates = ['age', 'fnlwgt', 'education-num', 'educational-num',
                            'capital-gain', 'capital-loss', 'hours-per-week']
    numerical_cols = [col for col in numerical_candidates if col in data.columns]
    if len(numerical_cols) >= 2:
        plt.figure(figsize=(10, 8))
        correlation_matrix = data[numerical_cols].corr()
        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0,
                    square=True, fmt='.2f')
        plt.title('Correlation Matrix of Numerical Features')
        plt.show()
    else:
        print("Skipping correlation heatmap: fewer than two numerical columns found.")

visualize_data(data)


In [None]:
# Data Preprocessing and Cleaning
def preprocess_data(data):
    """Clean the raw census frame and return a filtered copy.

    Steps, in order:
      1. Replace '?' placeholders with NaN.
      2. Fill missing values in every non-numeric column with that column's mode.
      3. Drop rows whose 'workclass' is 'Never-worked' or 'Without-pay'.
      4. Drop rows whose 'education' is 'Preschool', '1st-4th' or '5th-6th'.
      5. Keep only ages inside the 1.5*IQR fence, additionally clamped to [17, 75].

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'workclass', 'education' and 'age'. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The cleaned rows (original index labels are kept).
    """
    print("Starting data preprocessing...")

    print("Handling missing values...")
    # DataFrame.replace returns a new frame, so the caller's data stays untouched.
    processed_data = data.replace('?', np.nan)

    null_counts = processed_data.isnull().sum()
    print(f"Null values per column:\n{null_counts[null_counts > 0]}")

    for col in null_counts[null_counts > 0].index:
        # Only categorical/text columns are imputed; numeric gaps are left as-is.
        if pd.api.types.is_numeric_dtype(processed_data[col]):
            continue
        modes = processed_data[col].mode()
        if modes.empty:
            # Column is entirely null: there is no mode to impute with.
            continue
        # Assign back instead of `fillna(..., inplace=True)` on a column
        # selection, which is a chained assignment and does not reliably
        # write through to the frame.
        processed_data[col] = processed_data[col].fillna(modes.iloc[0])

    processed_data = processed_data[~processed_data['workclass'].isin(['Never-worked', 'Without-pay'])]

    processed_data = processed_data[~processed_data['education'].isin(['Preschool', '1st-4th', '5th-6th'])]

    # IQR fence on age, never wider than the fixed [17, 75] window.
    Q1 = processed_data['age'].quantile(0.25)
    Q3 = processed_data['age'].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = max(17, Q1 - 1.5 * IQR)
    upper_bound = min(75, Q3 + 1.5 * IQR)
    processed_data = processed_data[processed_data['age'].between(lower_bound, upper_bound)]

    print(f"Data shape after preprocessing: {processed_data.shape}")
    return processed_data

processed_data = preprocess_data(data)


In [None]:
# Feature Engineering and Encoding
def feature_engineering(data):
    """Derive model features from the cleaned census frame.

    Adds 'age_group', 'hours_category', 'capital_net', 'has_capital_gain',
    'has_capital_loss' and 'education_grouped', then drops 'education' and
    'fnlwgt' when present.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'age', 'hours-per-week', 'capital-gain', 'capital-loss'
        and 'education'. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the engineered columns.
    """
    print("Starting feature engineering...")

    # Work on a copy: the caller usually passes a filtered slice of another
    # frame, and adding columns to it in place would silently change the
    # caller's data (and trigger SettingWithCopyWarning).
    data = data.copy()

    # Right-closed bins: (0, 25], (25, 35], (35, 50], (50, 65], (65, 100].
    data['age_group'] = pd.cut(data['age'], bins=[0, 25, 35, 50, 65, 100],
                               labels=['Young', 'Adult', 'Middle-aged', 'Senior', 'Elderly'])

    data['hours_category'] = pd.cut(data['hours-per-week'], bins=[0, 20, 40, 60, 100],
                                    labels=['Part-time', 'Full-time', 'Overtime', 'Workaholic'])

    data['capital_net'] = data['capital-gain'] - data['capital-loss']

    data['has_capital_gain'] = (data['capital-gain'] > 0).astype(int)
    data['has_capital_loss'] = (data['capital-loss'] > 0).astype(int)

    education_mapping = {
        'Doctorate': 'Advanced',
        'Prof-school': 'Advanced',
        'Masters': 'Advanced',
        'Bachelors': 'Bachelors',
        'Some-college': 'Some-college',
        'Assoc-acdm': 'Associate',
        'Assoc-voc': 'Associate',
        'HS-grad': 'High-school',
        '12th': 'High-school',
        '11th': 'High-school',
        '10th': 'High-school',
        '9th': 'High-school',
        '7th-8th': 'Elementary',
    }
    # Levels absent from the mapping fall into 'Other'.
    data['education_grouped'] = data['education'].map(education_mapping).fillna('Other')

    columns_to_drop = ['education', 'fnlwgt']  # fnlwgt is a sampling weight, not useful for prediction
    data = data.drop(columns=[col for col in columns_to_drop if col in data.columns])

    print(f"Feature engineering completed. New shape: {data.shape}")
    return data

engineered_data = feature_engineering(processed_data)


In [None]:
data

In [None]:
# Advanced Label Encoding and Normalization
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler

def encode_and_normalize(data):
    """Label-encode the categorical features and target, then scale the features.

    Parameters
    ----------
    data : pandas.DataFrame
        Engineered frame containing an 'income' target column.

    Returns
    -------
    tuple
        ``(X, y, X_minmax, X_standard, label_encoders, target_encoder,
        scaler_minmax, scaler_standard)`` where ``X`` is the encoded feature
        frame, ``y`` the encoded target, ``X_minmax`` / ``X_standard`` the
        scaled feature arrays, ``label_encoders`` a dict of fitted encoders
        keyed by column, and the remaining items the fitted target encoder
        and scalers.

    NOTE(review): every encoder and scaler is fitted on all rows passed in.
    If this runs before the train/test split, test-set statistics influence
    the scaling - confirm whether that is acceptable for the evaluation.
    """
    features = data.drop("income", axis=1)
    target = data["income"]

    # One encoder per object/category column, kept for use at prediction time.
    label_encoders = {}
    for col in features.columns:
        col_dtype = features[col].dtype
        is_categorical = col_dtype == 'object' or col_dtype.name == 'category'
        if not is_categorical:
            continue
        encoder = LabelEncoder()
        features[col] = encoder.fit_transform(features[col])
        label_encoders[col] = encoder

    target_encoder = LabelEncoder()
    encoded_target = target_encoder.fit_transform(target)

    scaler_minmax = MinMaxScaler()
    X_minmax = scaler_minmax.fit_transform(features)

    scaler_standard = StandardScaler()
    X_standard = scaler_standard.fit_transform(features)

    print(f"Encoded features shape: {features.shape}")
    return (features, encoded_target, X_minmax, X_standard,
            label_encoders, target_encoder, scaler_minmax, scaler_standard)

X, y, X_minmax, X_standard, label_encoders, target_encoder, scaler_minmax, scaler_standard = encode_and_normalize(engineered_data)


In [None]:
# Train-Test Split with Stratification
def create_train_test_split(X_minmax, X_standard, y, test_size=0.2, random_state=42):
    """Split both scaled feature sets into stratified train/test partitions.

    The two splits use the same ``test_size``, ``random_state`` and
    stratification target.

    Returns
    -------
    tuple
        ``(X_train_mm, X_test_mm, y_train_mm, y_test_mm,
        X_train_std, X_test_std, y_train_std, y_test_std)``
    """
    split_options = dict(test_size=test_size, random_state=random_state, stratify=y)

    minmax_parts = train_test_split(X_minmax, y, **split_options)
    standard_parts = train_test_split(X_standard, y, **split_options)

    X_train_mm, X_test_mm, y_train_mm, y_test_mm = minmax_parts

    print(f"Training set shape: {X_train_mm.shape}")
    print(f"Test set shape: {X_test_mm.shape}")
    print(f"Class distribution in training: {np.bincount(y_train_mm)}")
    print(f"Class distribution in test: {np.bincount(y_test_mm)}")

    return (*minmax_parts, *standard_parts)

X_train_mm, X_test_mm, y_train_mm, y_test_mm, X_train_std, X_test_std, y_train_std, y_test_std = create_train_test_split(X_minmax, X_standard, y)


In [None]:
# Traditional Machine Learning Models with Hyperparameter Tuning
def train_traditional_models(X_train, X_test, y_train, y_test, scaling_method="MinMax"):
    """Grid-search five classical classifiers and score each on the test split.

    Parameters
    ----------
    X_train, X_test : array-like
        Scaled feature matrices.
    y_train, y_test : array-like
        Encoded target vectors.
    scaling_method : str
        Only used in the progress message.

    Returns
    -------
    dict
        Maps model name to a dict with keys 'model' (best estimator),
        'accuracy' (test accuracy), 'best_params' and 'predictions'.
    """
    print(f"Training traditional ML models with {scaling_method} scaling...")

    # Each entry pairs an estimator with the hyperparameter grid to search.
    candidates = {
        'KNN': (
            KNeighborsClassifier(),
            {'n_neighbors': [3, 5, 7, 9], 'weights': ['uniform', 'distance']},
        ),
        'Logistic Regression': (
            LogisticRegression(random_state=42, max_iter=1000),
            {'C': [0.1, 1, 10], 'solver': ['liblinear', 'lbfgs']},
        ),
        'Random Forest': (
            RandomForestClassifier(random_state=42),
            {'n_estimators': [50, 100, 200], 'max_depth': [10, 20, None]},
        ),
        'SVM': (
            SVC(random_state=42),
            {'C': [0.1, 1, 10], 'kernel': ['rbf', 'linear']},
        ),
        'Gradient Boosting': (
            GradientBoostingClassifier(random_state=42),
            {'n_estimators': [50, 100], 'learning_rate': [0.1, 0.2], 'max_depth': [3, 5]},
        ),
    }

    results = {}
    for name, (estimator, grid) in candidates.items():
        print(f"\nTraining {name}...")

        # 5-fold CV on the training split, all cores.
        search = GridSearchCV(estimator, grid, cv=5, scoring='accuracy', n_jobs=-1)
        search.fit(X_train, y_train)

        tuned_model = search.best_estimator_
        test_predictions = tuned_model.predict(X_test)
        accuracy = accuracy_score(y_test, test_predictions)

        results[name] = dict(
            model=tuned_model,
            accuracy=accuracy,
            best_params=search.best_params_,
            predictions=test_predictions,
        )

        print(f"{name} - Accuracy: {accuracy:.4f}")
        print(f"Best parameters: {search.best_params_}")

    return results

In [None]:
print("=== Training with MinMax Scaling ===")
results_minmax = train_traditional_models(X_train_mm, X_test_mm, y_train_mm, y_test_mm, "MinMax")

In [None]:
print("\n=== Training with Standard Scaling ===")
results_standard = train_traditional_models(X_train_std, X_test_std, y_train_std, y_test_std, "Standard")

In [None]:
# Deep Learning Model with TensorFlow/Keras
def create_deep_learning_model(input_dim, hidden_layers=(128, 64, 32), dropout_rate=0.3):
    """Build an (uncompiled) feed-forward binary classifier.

    Every hidden layer is Dense(relu) -> BatchNormalization -> Dropout; the
    output is a single sigmoid unit.

    Parameters
    ----------
    input_dim : int
        Number of input features.
    hidden_layers : sequence of int
        Units per hidden layer, in order. Must not be empty.
    dropout_rate : float
        Dropout rate applied after every hidden layer.

    Returns
    -------
    tensorflow.keras.Sequential

    Raises
    ------
    ValueError
        If ``hidden_layers`` is empty.
    """
    # An immutable default plus a local copy avoids sharing one list object
    # across calls.
    hidden_layers = list(hidden_layers)
    if not hidden_layers:
        raise ValueError("hidden_layers must contain at least one layer size")

    model = Sequential()

    # Declare the input shape explicitly rather than through the deprecated
    # `input_dim=` keyword on the first Dense layer.
    model.add(tf.keras.Input(shape=(input_dim,)))

    for units in hidden_layers:
        model.add(Dense(units, activation='relu'))
        model.add(BatchNormalization())
        model.add(Dropout(dropout_rate))

    model.add(Dense(1, activation='sigmoid'))

    return model

def train_deep_learning_model(X_train, X_test, y_train, y_test, validation_split=0.2, random_state=42):
    """Train the neural network with early stopping and score it on the test split.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features / labels. The last ``validation_split`` fraction
        is held out by Keras for early stopping.
    X_test, y_test : array-like
        Held-out data, used only for the final evaluation.
    validation_split : float
        Fraction of the training data reserved for validation.
    random_state : int or None
        Seed passed to ``tf.random.set_seed``; ``None`` leaves the global
        TensorFlow seed untouched.

    Returns
    -------
    tuple
        ``(model, history, y_pred, accuracy)`` - fitted model, Keras History,
        0/1 test predictions and test accuracy.
    """
    print("Training Deep Learning Model...")

    if random_state is not None:
        tf.random.set_seed(random_state)

    model = create_deep_learning_model(X_train.shape[1])

    model.compile(optimizer=Adam(learning_rate=0.001),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    # Validate on a slice of the training data. Monitoring the test set here
    # would let early stopping pick the weights that happen to score best on
    # the test set, so the accuracy reported below would be optimistic.
    history = model.fit(X_train, y_train,
                        validation_split=validation_split,
                        epochs=100,
                        batch_size=32,
                        callbacks=[early_stopping],
                        verbose=1)

    loss, accuracy = model.evaluate(X_test, y_test, verbose=0)

    # Sigmoid outputs -> hard 0/1 labels at the 0.5 threshold.
    y_pred_proba = model.predict(X_test)
    y_pred = (y_pred_proba > 0.5).astype(int).flatten()

    print(f"Deep Learning Model - Accuracy: {accuracy:.4f}")

    return model, history, y_pred, accuracy

# Train deep learning model
dl_model, dl_history, dl_predictions, dl_accuracy = train_deep_learning_model(X_train_std, X_test_std, y_train_std, y_test_std)


In [None]:
# Accuracy Comparison Between MinMax and Standard Scaling
def plot_scaling_comparison(results_minmax, results_standard, dl_accuracy):
    """Bar chart comparing test accuracy under MinMax vs Standard scaling.

    Parameters
    ----------
    results_minmax, results_standard : dict
        Model name -> result dict with an 'accuracy' entry. Only models
        present in both dicts are compared.
    dl_accuracy : float
        Test accuracy of the deep learning model. It is drawn as a single
        bar of its own rather than being repeated under both scaling
        methods, because only one accuracy value is available for it.
    """
    # Compare only models available under both scalings (avoids a KeyError
    # when the two result dicts differ).
    model_names = [name for name in results_minmax if name in results_standard]
    minmax_accuracies = [results_minmax[name]['accuracy'] for name in model_names]
    standard_accuracies = [results_standard[name]['accuracy'] for name in model_names]

    x = np.arange(len(model_names))
    width = 0.35

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(x - width/2, minmax_accuracies, width, label='MinMax Scaling', alpha=0.8)
    ax.bar(x + width/2, standard_accuracies, width, label='Standard Scaling', alpha=0.8)

    # Deep learning: one centred bar with its own legend entry.
    dl_position = len(model_names)
    ax.bar(dl_position, dl_accuracy, width,
           label='Deep Learning (Standard Scaling only)', alpha=0.8)

    ax.set_xlabel('Models')
    ax.set_ylabel('Accuracy')
    ax.set_title('Model Performance Comparison: MinMax vs Standard Scaling')
    ax.set_xticks(list(x) + [dl_position])
    ax.set_xticklabels(model_names + ['Deep Learning'], rotation=45, ha='right')
    ax.legend(loc='upper left', bbox_to_anchor=(1.02, 1))
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    plt.show()

plot_scaling_comparison(results_minmax, results_standard, dl_accuracy)

In [None]:
# Highlighting the Best Performing Model
def highlight_best_model(results_standard, dl_accuracy):
    """Plot every model's accuracy and mark the highest bar in red.

    Returns
    -------
    tuple
        ``(best_model, best_accuracy)`` - name and accuracy of the top model
        among the standard-scaling results plus the deep learning model.
    """
    import matplotlib.pyplot as plt
    import numpy as np

    # Classical models first, deep learning appended last.
    model_names = [*results_standard.keys(), 'Deep Learning']
    standard_accuracies = [entry['accuracy'] for entry in results_standard.values()]
    standard_accuracies += [dl_accuracy]

    # argmax returns the first maximum, so ties go to the earlier model.
    best_idx = np.argmax(standard_accuracies)
    best_model, best_accuracy = model_names[best_idx], standard_accuracies[best_idx]

    colors = ['skyblue'] * len(model_names)
    colors[best_idx] = 'red'

    plt.figure(figsize=(10, 6))
    plt.bar(model_names, standard_accuracies, color=colors)
    plt.title(f'Best Model: {best_model} (Accuracy: {best_accuracy:.4f})')
    plt.ylabel('Accuracy')
    plt.xticks(rotation=45)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

    return best_model, best_accuracy

best_model_name, best_accuracy = highlight_best_model(results_standard, dl_accuracy)

In [None]:
# Confusion Matrix
def plot_confusion_matrix_only(y_test, y_pred, model_name, class_labels):
    """Render the confusion matrix of one model as an annotated heatmap.

    Rows are the actual classes and columns the predicted ones; both axes
    are labelled with ``class_labels``.
    """
    matrix = confusion_matrix(y_test, y_pred)

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(
        matrix,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_labels,
        yticklabels=class_labels,
        ax=ax,
    )
    ax.set_title(f'Confusion Matrix - {model_name}')
    ax.set_ylabel('Actual')
    ax.set_xlabel('Predicted')
    plt.show()

plot_confusion_matrix_only(y_test_std, results_standard[best_model_name]['predictions'], best_model_name, target_encoder.classes_)

In [None]:
# Displays only the top 10 feature importances (for tree-based models)
def plot_feature_importance(model, model_name, feature_names=None, top_n=10):
    """Plot the most important features of a fitted tree-based model.

    Parameters
    ----------
    model : estimator
        Fitted model. If it has no ``feature_importances_`` attribute a
        message is printed and nothing is plotted.
    model_name : str
        Used in the plot title.
    feature_names : sequence of str, optional
        Names matching the model's input columns. When omitted, or when the
        length does not match, generic 'Feature_i' names are used.
    top_n : int
        Number of features to show.
    """
    feature_importance = getattr(model, 'feature_importances_', None)
    if feature_importance is None:
        print(f"{model_name} does not expose feature importances; nothing to plot.")
        return

    # The models are trained on plain arrays, so real column names have to be
    # supplied by the caller; fall back to positional names otherwise.
    if feature_names is None or len(feature_names) != len(feature_importance):
        feature_names = [f'Feature_{i}' for i in range(len(feature_importance))]

    importance_df = pd.DataFrame({
        'feature': list(feature_names),
        'importance': feature_importance
    }).sort_values('importance', ascending=False)

    plt.figure(figsize=(10, 6))
    sns.barplot(data=importance_df.head(top_n), x='importance', y='feature')
    plt.title(f'Top {top_n} Feature Importance - {model_name}')
    plt.xlabel('Importance')
    plt.show()

if best_model_name in ['Random Forest', 'Gradient Boosting']:
    # X is the encoded feature frame; its columns are in the order the models were trained on.
    plot_feature_importance(results_standard[best_model_name]['model'], best_model_name,
                            feature_names=list(X.columns))

In [None]:
# Cross-Validation Analysis
def cross_validation_analysis(results_standard, X_train_std, y_train_std):
    """Run 5-fold cross-validation for every tuned model and plot the results.

    Returns
    -------
    dict
        Model name -> ``{'mean': ..., 'std': ..., 'scores': ...}`` with the
        per-fold accuracy array under 'scores'.
    """
    print("Cross-Validation Analysis:")
    print("-" * 50)

    cv_results = {}
    for name, result in results_standard.items():
        fold_scores = cross_val_score(result['model'], X_train_std, y_train_std,
                                      cv=5, scoring='accuracy')
        mean_score = fold_scores.mean()
        score_std = fold_scores.std()

        cv_results[name] = {'mean': mean_score, 'std': score_std, 'scores': fold_scores}

        print(f"{name}:")
        print(f"  Mean CV Accuracy: {mean_score:.4f} (+/- {score_std * 2:.4f})")
        print(f"  Individual CV Scores: {fold_scores}")
        print()

    # Bar per model, error bars show one standard deviation across folds.
    model_names = list(cv_results)
    means = [cv_results[model]['mean'] for model in model_names]
    stds = [cv_results[model]['std'] for model in model_names]

    plt.figure(figsize=(12, 6))
    plt.bar(model_names, means, yerr=stds, capsize=5, alpha=0.7)
    plt.title('Cross-Validation Results (5-Fold)')
    plt.ylabel('Accuracy')
    plt.xlabel('Models')
    plt.xticks(rotation=45, ha='right')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

    return cv_results

cv_results = cross_validation_analysis(results_standard, X_train_std, y_train_std)

In [None]:
data

In [None]:
# Fixed Improved Prediction Function - Clean and Concise
def predict_salary_clean(new_data, model_name, results_standard, scaler_standard, label_encoders, target_encoder,
                         deep_model=None):
    """Predict the income category for new, raw (un-engineered) records.

    The input goes through the same feature engineering, label encoding and
    standard scaling as the training data before the chosen model is applied.

    Parameters
    ----------
    new_data : pandas.DataFrame
        Raw records. The education level may be given as 'education-num' /
        'educational-num' (numeric) or as 'education' (text). Not modified.
    model_name : str
        A key of ``results_standard`` or 'Deep Learning'.
    results_standard : dict
        Model name -> result dict holding the fitted estimator under 'model'.
    scaler_standard : fitted scaler
        Its ``feature_names_in_`` (when available) defines the column order.
    label_encoders : dict
        Column name -> fitted LabelEncoder.
    target_encoder : fitted LabelEncoder
        Used to decode the predicted class.
    deep_model : Keras model, optional
        Network used for 'Deep Learning'; defaults to the notebook-level
        ``dl_model``.

    Returns
    -------
    numpy.ndarray
        Decoded income label for every input row.

    Raises
    ------
    KeyError
        If ``model_name`` is not 'Deep Learning' and not in ``results_standard``.
    """
    print(f"Making predictions using {model_name}...")

    features = new_data.copy()

    # The training data spells this column 'educational-num'.
    if 'education-num' in features.columns:
        features['educational-num'] = features['education-num']
        features = features.drop(columns='education-num')

    # Feature engineering - same bins as used for the training data.
    # Each derived feature is only built when its source column exists.
    if 'age' in features.columns:
        features['age_group'] = pd.cut(features['age'],
                                       bins=[0, 25, 35, 50, 65, 100],
                                       labels=['Young', 'Adult', 'Middle-aged', 'Senior', 'Elderly'])
    if 'hours-per-week' in features.columns:
        features['hours_category'] = pd.cut(features['hours-per-week'],
                                            bins=[0, 20, 40, 60, 100],
                                            labels=['Part-time', 'Full-time', 'Overtime', 'Workaholic'])
    if 'capital-gain' in features.columns:
        features['has_capital_gain'] = (features['capital-gain'] > 0).astype(int)
    if 'capital-loss' in features.columns:
        features['has_capital_loss'] = (features['capital-loss'] > 0).astype(int)
    if 'capital-gain' in features.columns and 'capital-loss' in features.columns:
        features['capital_net'] = features['capital-gain'] - features['capital-loss']

    education_mapping = {
        'Doctorate': 'Advanced', 'Prof-school': 'Advanced', 'Masters': 'Advanced',
        'Bachelors': 'Bachelors', 'Some-college': 'Some-college',
        'Assoc-acdm': 'Associate', 'Assoc-voc': 'Associate',
        'HS-grad': 'High-school', '12th': 'High-school', '11th': 'High-school',
        '10th': 'High-school', '9th': 'High-school', '7th-8th': 'Elementary'
    }

    # Adult-dataset education codes: 10 = Some-college, 11 = Assoc-voc,
    # 12 = Assoc-acdm (these three were previously swapped around).
    education_num_mapping = {
        16: 'Doctorate', 15: 'Prof-school', 14: 'Masters', 13: 'Bachelors',
        12: 'Assoc-acdm', 11: 'Assoc-voc', 10: 'Some-college', 9: 'HS-grad',
        8: '12th', 7: '11th', 6: '10th', 5: '9th', 4: '7th-8th'
    }

    if 'educational-num' in features.columns:
        # Codes outside the mapping fall back to 'HS-grad'.
        education = features['educational-num'].map(education_num_mapping).fillna('HS-grad')
    elif 'education' in features.columns:
        education = features['education']
    else:
        education = None
    if education is not None:
        features['education_grouped'] = education.map(education_mapping).fillna('Other')

    # Column order the scaler/models expect. Prefer what the scaler recorded
    # when it was fitted; the literal list is the fallback.
    default_columns = [
        'age', 'workclass', 'educational-num', 'marital-status', 'occupation',
        'relationship', 'race', 'gender', 'capital-gain', 'capital-loss',
        'hours-per-week', 'native-country', 'age_group', 'hours_category',
        'capital_net', 'has_capital_gain', 'has_capital_loss', 'education_grouped'
    ]
    expected_columns = list(getattr(scaler_standard, 'feature_names_in_', default_columns))
    missing_columns = [col for col in expected_columns if col not in features.columns]

    # reindex puts the columns in training order; absent ones become NaN for now.
    final_data = features.reindex(columns=expected_columns)

    # Label encoding, row by row: a LabelEncoder code is the position of the
    # label in `classes_`, so a lookup table reproduces `transform` while
    # letting only the unseen values (not the whole column) fall back to
    # the first class.
    for col, encoder in label_encoders.items():
        if col not in final_data.columns or col in missing_columns:
            continue
        code_of = {label: code for code, label in enumerate(encoder.classes_)}
        codes = final_data[col].astype(object).map(code_of)
        unseen = codes.isna()
        if unseen.any():
            print(f"Warning: {int(unseen.sum())} unseen value(s) in '{col}'; "
                  f"using '{encoder.classes_[0]}' as fallback.")
            codes = codes.fillna(0)
        final_data[col] = codes.astype(int)

    # Best-effort handling of missing inputs: substitute the (encoded)
    # training mean so the feature scales to a neutral 0, instead of copying
    # an unrelated cell of the input into the column.
    if missing_columns:
        print(f"Warning: input is missing {missing_columns}; substituting training means.")
        training_means = getattr(scaler_standard, 'mean_', None)
        for col in missing_columns:
            position = expected_columns.index(col)
            final_data[col] = training_means[position] if training_means is not None else 0

    # Scale the data
    scaled_new_data = scaler_standard.transform(final_data)

    # Make prediction
    if model_name == 'Deep Learning':
        network = dl_model if deep_model is None else deep_model
        prediction_proba = network.predict(scaled_new_data)
        prediction = (prediction_proba > 0.5).astype(int).flatten()
    else:
        if model_name not in results_standard:
            raise KeyError(f"Unknown model '{model_name}'. Available: {list(results_standard)}")
        prediction = results_standard[model_name]['model'].predict(scaled_new_data)

    # Decode prediction
    return target_encoder.inverse_transform(prediction)

# Test with new sample
new_sample = pd.DataFrame({
    'age': [27], 'workclass': ['Private'], 'education-num': [12], 'marital-status': ['Married-civ-spouse'],
    'occupation': ['Tech-support'], 'relationship': ['Wife'], 'race': ['White'], 'gender': ['Female'],
    'capital-gain': [0], 'capital-loss': [0], 'hours-per-week': [38],
    'native-country': ['United-States']
})

# Clean prediction call
try:
    prediction = predict_salary_clean(new_sample, best_model_name, results_standard, scaler_standard, label_encoders, target_encoder)
    print(f"Predicted salary category: {prediction[0]}")
    # best_accuracy is the model's overall test accuracy, not a confidence
    # for this particular prediction, so label it accordingly.
    print(f"Model test accuracy: {best_accuracy:.1%}")
except Exception as e:
    print(f"Error in clean prediction: {e}")

In [None]:
# Even more robust version with better error handling
def predict_salary_robust(new_data, model_name="Gradient Boosting"):
    """Predict the income category of the first record, never raising.

    Delegates to ``predict_salary_clean`` (using the notebook-level
    ``results_standard``, ``scaler_standard``, ``label_encoders`` and
    ``target_encoder``) so the education level of the input is actually
    used instead of being hard-coded to 'High-school', and so there is one
    preprocessing pipeline rather than several copies.

    Parameters
    ----------
    new_data : pandas.DataFrame
        Raw record(s); not modified.
    model_name : str
        Key of ``results_standard`` (or 'Deep Learning').

    Returns
    -------
    str
        The predicted label of the first row, or the message
        "Unable to predict - please check input data format" on any error.
    """
    try:
        data = new_data.copy()

        # Without any education code, assume code 9.
        if 'education-num' not in data.columns and 'educational-num' not in data.columns:
            data['educational-num'] = 9  # Default to HS-grad equivalent

        prediction = predict_salary_clean(data, model_name, results_standard,
                                          scaler_standard, label_encoders, target_encoder)
        return prediction[0]

    except Exception as e:
        # Report the cause and return the sentinel string callers expect.
        print(f"Error in prediction: {e}")
        return "Unable to predict - please check input data format"

# Test robust function
print("\n=== Testing Robust Prediction Function ===")
robust_result = predict_salary_robust(new_sample)
print(f"Robust prediction result: {robust_result}")

In [None]:
# Alternative: Even more concise version
def predict_salary_simple(new_data, model_name="Gradient Boosting"):
    """Predict the income category of the first record.

    Thin wrapper around ``predict_salary_clean`` that supplies the
    notebook-level ``results_standard``, ``scaler_standard``,
    ``label_encoders`` and ``target_encoder``. Delegating means the input's
    education level is used (it used to be hard-coded to 'High-school') and
    the preprocessing is not duplicated here.

    Parameters
    ----------
    new_data : pandas.DataFrame
        Raw record(s); the callee works on a copy.
    model_name : str
        Key of ``results_standard`` (or 'Deep Learning').

    Returns
    -------
    str
        The predicted label of the first row. Errors propagate to the caller.
    """
    prediction = predict_salary_clean(new_data, model_name, results_standard,
                                      scaler_standard, label_encoders, target_encoder)
    return prediction[0]

# Usage example for simple function
try:
    result = predict_salary_simple(new_sample)
    print(f"\nSimple prediction result: {result}")
except Exception as e:
    print(f"Error in simple prediction: {e}")

In [None]:
# Cell 14: Model Optimization Summary
def optimization_summary():
    """Print a summary of the techniques used in this notebook.

    Reads the notebook-level ``best_model_name`` and ``best_accuracy``.
    The bullet points are kept in line with what the code above actually
    does (e.g. a fixed Adam learning rate, no classification report).
    """
    sections = [
        ("1. DATA PREPROCESSING:", [
            "Handled missing values ('?' rows dropped at load; mode imputation as fallback)",
            "Removed inconsistent entries (Never-worked, Without-pay)",
            "Filtered out very low education levels",
            "Outlier detection and removal using IQR method",
        ]),
        ("2. FEATURE ENGINEERING:", [
            "Created age groups and work hour categories",
            "Generated capital net feature (gain - loss)",
            "Added binary features for capital gain/loss",
            "Grouped education levels for better representation",
            "Removed redundant features (fnlwgt)",
        ]),
        ("3. NORMALIZATION TECHNIQUES:", [
            "MinMax Scaling (0-1 range)",
            "Standard Scaling (mean=0, std=1)",
            "Compared performance of both methods",
        ]),
        ("4. HYPERPARAMETER OPTIMIZATION:", [
            "Grid Search CV for all traditional ML models",
            "5-fold cross-validation for model selection",
            "Early stopping for deep learning model",
        ]),
        ("5. DEEP LEARNING OPTIMIZATIONS:", [
            "Batch normalization for stable training",
            "Dropout layers for regularization",
            "Adam optimizer (fixed learning rate 0.001)",
            "Multiple hidden layers with decreasing units",
        ]),
        ("6. MODEL EVALUATION:", [
            "Stratified train-test split",
            "Cross-validation analysis",
            "Confusion matrix of the best model",
            "Feature importance analysis (tree-based models)",
        ]),
    ]

    print("=== MODEL OPTIMIZATION SUMMARY ===")
    print()

    for heading, bullets in sections:
        print(heading)
        for bullet in bullets:
            print(f"   - {bullet}")
        print()

    print(f"BEST MODEL: {best_model_name}")
    print(f"BEST ACCURACY: {best_accuracy:.4f}")

optimization_summary()

print("\n" + "="*60)
print("EMPLOYEE SALARY PREDICTION ANALYSIS COMPLETE!")
print("="*60)