# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, LeaveOneOut
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
import warnings
warnings.filterwarnings('ignore')

# 1. Load the data
print("=== LOADING EMPLOYEE SALARY DATA ===")
# Update the path below so it points at your own workbook.
data = pd.read_excel("dummy_users.xlsx")

# Quick sanity check: overall dimensions plus a preview of the first rows.
print(f"Dataset shape: {data.shape}")
print("\nFirst 5 rows:", data.head(), sep="\n")

# 2. Data Analysis and Preprocessing
# Report column dtypes, per-column missing-value counts and summary
# statistics of the target before anything is modified.
print("\n=== DATA EXPLORATION ===")
print("Data types:", data.dtypes, sep="\n")
print("\nMissing values:", data.isnull().sum(), sep="\n")
print("\nTarget variable (salary) statistics:", data['salary'].describe(), sep="\n")

# Remove unnecessary columns
# Only the columns that actually exist in the sheet are dropped, so the
# script keeps working when some of them are absent.
columns_to_drop = ['id', 'name', 'phone_number']
present_columns = set(data.columns)
available_drops = [col for col in columns_to_drop if col in present_columns]
if available_drops:
    data = data.drop(columns=available_drops)
    print(f"Dropped columns: {available_drops}")

# 3. Define features and target
# Everything except the target column is treated as a feature.
target = 'salary'
X = data.drop(columns=[target])
y = data[target]

print(f"\nFeatures: {list(X.columns)}")
print(f"Target: {target}")

# 4. Identify categorical and numerical columns
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()
# 'number' matches every numeric dtype (int32, float32, uint8, ...), not only
# int64/float64, so narrower numeric columns are not silently left out of the
# scaling step that relies on this list.
numerical_features = X.select_dtypes(include='number').columns.tolist()

print(f"\nCategorical features: {categorical_features}")
print(f"Numerical features: {numerical_features}")

# Show unique values for each categorical feature
print("\n=== CATEGORICAL FEATURE ANALYSIS ===")
for col in categorical_features:
    unique_vals = X[col].unique()
    print(f"{col}: {len(unique_vals)} unique values -> {list(unique_vals)}")

# 5. Encode categorical variables using LabelEncoder (better for small datasets)
# Each categorical column gets its own fitted encoder, stored in
# `label_encoders` under the column name so the same mapping can be reused
# later on new rows.
# NOTE(review): integer codes impose an arbitrary ordering on nominal
# categories, which linear and distance-based models will treat as meaningful
# - confirm this is acceptable or consider one-hot encoding.
print("\n=== ENCODING CATEGORICAL VARIABLES ===")
label_encoders = {}
X_encoded = X.copy()

for column in categorical_features:
    encoder = LabelEncoder()
    codes = encoder.fit_transform(X[column])
    X_encoded[column] = codes
    label_encoders[column] = encoder
    # Show the learned category -> integer mapping.
    mapping = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
    print(f"Encoded {column}: {mapping}")

# 6-7. Split the data, then scale features using training-row statistics only.
#
# The row partition is decided first (on positional indices) so that the
# StandardScaler can be fitted on the training rows alone; fitting it on the
# whole dataset would leak the test rows' mean/variance into training.
# `X_scaled` still holds every row (transformed with the training-fitted
# scaler) because later steps use the full scaled frame.

# Create salary bins for stratification (three equal-width ranges).
y_bins = pd.cut(y, bins=3, labels=['Low', 'Medium', 'High'])

# Split data with stratification
row_positions = np.arange(len(X_encoded))
try:
    train_pos, test_pos = train_test_split(
        row_positions, test_size=0.25, random_state=42, stratify=y_bins
    )
except (ValueError, TypeError) as split_error:
    # Stratification is rejected when a salary bin has too few members;
    # fall back to a plain random split and say why.
    print(f"Stratified split not possible ({split_error}); using a random split instead.")
    train_pos, test_pos = train_test_split(
        row_positions, test_size=0.25, random_state=42
    )

# Scale numerical features: fit on the training rows, apply to all rows.
scaler = StandardScaler()
X_scaled = X_encoded.copy()
if numerical_features:
    scaler.fit(X_encoded.iloc[train_pos][numerical_features])
    X_scaled[numerical_features] = scaler.transform(X_encoded[numerical_features])

X_train, X_test = X_scaled.iloc[train_pos], X_scaled.iloc[test_pos]
y_train, y_test = y.iloc[train_pos], y.iloc[test_pos]

print(f"\nFinal dataset shape: {X_scaled.shape}")
print("First 5 rows of processed data:")
print(X_scaled.head())

print(f"\nSalary distribution:")
print(y_bins.value_counts())

print(f"\nTraining set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")

# 8. Define models suitable for small datasets
# The registry maps a display name to an unfitted estimator. It is assembled
# family by family from small tuples of hyper-parameters; insertion order
# (linear, ridge, lasso, tree, forest, k-NN) is the order models are trained in.
models = {'Linear Regression': LinearRegression()}
models.update({
    f'Ridge Regression (α={alpha})': Ridge(alpha=alpha)
    for alpha in (0.1, 1.0, 10.0)
})
models.update({
    f'Lasso Regression (α={alpha})': Lasso(alpha=alpha)
    for alpha in (0.1, 1.0)
})
models.update({
    f'Decision Tree (depth={depth})': DecisionTreeRegressor(max_depth=depth, random_state=42)
    for depth in (3, 5)
})
models.update({
    f'Random Forest ({n_trees} trees)': RandomForestRegressor(
        n_estimators=n_trees, max_depth=5, random_state=42
    )
    for n_trees in (10, 50)
})
models.update({
    f'K-Nearest Neighbors (k={k})': KNeighborsRegressor(n_neighbors=k)
    for k in (3, 5)
})

# 9. Train and evaluate models using Leave-One-Out CV for small dataset
#
# For every model this records hold-out metrics (R², MSE, MAE, RMSE on the
# test split) plus a Leave-One-Out estimate computed on the full scaled data.
#
# R² cannot be scored per LOO fold: each fold's test set is a single sample,
# for which R² is undefined (scikit-learn yields NaN). Instead the per-fold
# squared errors are collected and pooled into a single R² over all left-out
# predictions, together with the LOO RMSE.
print("\n=== TRAINING AND EVALUATING MODELS ===")
results = {}
loo = LeaveOneOut()

for name, model in models.items():
    print(f"\nTraining {name}...")

    # A failure in one model is reported and skipped so the remaining models
    # are still evaluated.
    try:
        # Train model
        model.fit(X_train, y_train)

        # Make predictions
        y_pred = model.predict(X_test)

        # Calculate metrics
        r2 = r2_score(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        rmse = np.sqrt(mse)

        # Leave-One-Out Cross-validation for small datasets.
        # With one sample per fold, negated 'neg_mean_squared_error' is that
        # sample's squared prediction error. cross_val_score fits clones, so
        # `model` itself stays fitted on the training split.
        loo_sq_errors = -cross_val_score(
            model, X_scaled, y, cv=loo, scoring='neg_mean_squared_error'
        )
        loo_rmse = np.sqrt(loo_sq_errors.mean())
        total_sum_of_squares = float(((y - y.mean()) ** 2).sum())
        if total_sum_of_squares > 0:
            loo_mean = 1.0 - loo_sq_errors.sum() / total_sum_of_squares
        else:
            # Constant target: R² is undefined.
            loo_mean = float('nan')
        # A per-fold spread of R² does not exist for LOO; the key is kept so
        # the shape of `results` is unchanged.
        loo_std = float('nan')

        results[name] = {
            'R²': r2,
            'MSE': mse,
            'MAE': mae,
            'RMSE': rmse,
            'LOO_R²_mean': loo_mean,
            'LOO_R²_std': loo_std,
            'LOO_RMSE': loo_rmse,
            'Model': model,
            'Predictions': y_pred
        }

        print(f"✅ R² Score: {r2:.4f}")
        print(f"   MSE: {mse:.2f}")
        print(f"   RMSE: {rmse:.2f}")
        print(f"   MAE: {mae:.2f}")
        print(f"   Leave-One-Out CV R²: {loo_mean:.4f} (LOO RMSE: {loo_rmse:.2f})")

    except Exception as e:
        print(f"❌ Failed to train {name}: {e}")

# 10. Compare all models
# Everything below runs only when at least one model trained successfully:
# ranking, feature analysis, test-set error table, plots, an example
# prediction, a textual summary, and persisting the winning model.
print("\n=== MODEL COMPARISON ===")
if results:
    comparison_df = pd.DataFrame({
        name: {
            'R²': metrics['R²'],
            'RMSE': metrics['RMSE'],
            'MAE': metrics['MAE'],
            'LOO_CV_R²': metrics['LOO_R²_mean']
        }
        for name, metrics in results.items()
    }).T

    comparison_df = comparison_df.sort_values('R²', ascending=False)
    print("Model Performance (sorted by R²):")
    print(comparison_df.round(4))

    # Get best model
    # NOTE(review): the winner is chosen by its R² on the held-out test split,
    # so the test metrics reported for it are optimistically biased.
    best_model_name = comparison_df.index[0]
    best_metrics = results[best_model_name]
    best_model = best_metrics['Model']
    best_r2 = best_metrics['R²']

    print(f"\n🏆 BEST MODEL: {best_model_name}")
    print(f"   R² Score: {best_r2:.4f}")
    print(f"   RMSE: ${best_metrics['RMSE']:,.2f}")
    print(f"   Leave-One-Out CV: {best_metrics['LOO_R²_mean']:.4f}")

    # 11. Feature Importance Analysis
    print(f"\n=== FEATURE ANALYSIS ===")

    # For linear models, show coefficients
    if hasattr(best_model, 'coef_'):
        feature_importance = pd.DataFrame({
            'Feature': X_scaled.columns,
            'Coefficient': best_model.coef_,
            'Abs_Coefficient': np.abs(best_model.coef_)
        }).sort_values('Abs_Coefficient', ascending=False)

        print(f"Feature Coefficients ({best_model_name}):")
        print(feature_importance)

    # For tree-based models, show feature importance
    elif hasattr(best_model, 'feature_importances_'):
        feature_importance = pd.DataFrame({
            'Feature': X_scaled.columns,
            'Importance': best_model.feature_importances_
        }).sort_values('Importance', ascending=False)

        print(f"Feature Importance ({best_model_name}):")
        print(feature_importance)

    # 12. Detailed Predictions Analysis
    print(f"\n=== DETAILED PREDICTIONS ANALYSIS ===")
    print("Test Set Predictions:")
    y_pred_best = best_metrics['Predictions']
    prediction_errors = y_test.values - y_pred_best
    test_results = pd.DataFrame({
        'Actual': y_test.values,
        'Predicted': y_pred_best,
        'Error': prediction_errors,
        'Error_%': (prediction_errors / y_test.values * 100)
    })
    # Keep the original row labels so rows can be traced back to the dataset.
    test_results.index = y_test.index
    print(test_results.round(2))

    # 13. Visualizations
    print("\n=== CREATING VISUALIZATIONS ===")
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))

    # 1. Actual vs Predicted (the dashed diagonal marks a perfect prediction)
    axes[0, 0].scatter(y_test, y_pred_best, alpha=0.8, s=100, color='blue')
    axes[0, 0].plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
    axes[0, 0].set_xlabel('Actual Salary ($)')
    axes[0, 0].set_ylabel('Predicted Salary ($)')
    axes[0, 0].set_title(f'Actual vs Predicted - {best_model_name}')
    axes[0, 0].grid(True, alpha=0.3)

    # Add point labels
    for actual, pred in zip(y_test, y_pred_best):
        axes[0, 0].annotate(f'${actual:,.0f}\n${pred:,.0f}',
                            (actual, pred), xytext=(5, 5),
                            textcoords='offset points', fontsize=8)

    # 2. Residuals plot
    residuals = y_test - y_pred_best
    axes[0, 1].scatter(y_pred_best, residuals, alpha=0.8, s=100, color='green')
    axes[0, 1].axhline(y=0, color='r', linestyle='--')
    axes[0, 1].set_xlabel('Predicted Salary ($)')
    axes[0, 1].set_ylabel('Residuals ($)')
    axes[0, 1].set_title('Residual Plot')
    axes[0, 1].grid(True, alpha=0.3)

    # 3. Model comparison
    model_names = list(comparison_df.index[:8])  # Top 8 models
    r2_scores = list(comparison_df['R²'][:8])
    colors = plt.cm.viridis(np.linspace(0, 1, len(model_names)))

    bars = axes[1, 0].barh(model_names, r2_scores, color=colors)
    axes[1, 0].set_xlabel('R² Score')
    axes[1, 0].set_title('Model Performance Comparison')
    axes[1, 0].grid(True, alpha=0.3)

    # Add value labels on bars
    for bar, score in zip(bars, r2_scores):
        width = bar.get_width()
        axes[1, 0].text(width + 0.01, bar.get_y() + bar.get_height()/2,
                        f'{score:.3f}', ha='left', va='center', fontsize=8)

    # 4. Salary distribution
    axes[1, 1].hist(y, bins=8, alpha=0.7, color='orange', edgecolor='black')
    axes[1, 1].axvline(y.mean(), color='red', linestyle='--', linewidth=2, label=f'Mean: ${y.mean():,.0f}')
    axes[1, 1].axvline(y.median(), color='blue', linestyle='--', linewidth=2, label=f'Median: ${y.median():,.0f}')
    axes[1, 1].set_xlabel('Salary ($)')
    axes[1, 1].set_ylabel('Frequency')
    axes[1, 1].set_title('Salary Distribution')
    axes[1, 1].legend()
    axes[1, 1].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # 14. Make predictions on new data example
    print("\n=== EXAMPLE PREDICTION ===")

    # Create a sample new employee (using the same encoding)
    sample_data = {
        'role': 'Developer',
        'age': 35,
        'gender': 'Female',
        'alive': 'No',
        'department': 'IT',
        'alone': 'Yes',
        'city': 'New York',
        'state': 'NY',
        'single': 'No'
    }

    # Encode the sample data
    sample_encoded = {}
    for feature, value in sample_data.items():
        if feature not in categorical_features:
            sample_encoded[feature] = value
            continue

        encoder = label_encoders[feature]
        if value in encoder.classes_:
            sample_encoded[feature] = encoder.transform([value])[0]
        else:
            # Handle unseen category by using the most common category
            most_common = X[feature].mode()[0]
            sample_encoded[feature] = encoder.transform([most_common])[0]
            print(f"Warning: '{value}' not seen in training for {feature}. Using '{most_common}' instead.")

    # The model was fitted on X_scaled's columns in that exact order, so the
    # sample row must carry the same columns in the same order; any extra
    # keys in the sample profile are ignored.
    expected_features = list(X_scaled.columns)
    missing_features = [col for col in expected_features if col not in sample_encoded]

    if missing_features:
        print(f"Skipping example prediction: sample profile is missing features {missing_features}")
    else:
        sample_df = pd.DataFrame([sample_encoded], columns=expected_features)

        # Scale numerical features
        if numerical_features:
            sample_df[numerical_features] = scaler.transform(sample_df[numerical_features])

        predicted_salary = best_model.predict(sample_df)[0]
        print(f"\nSample Employee Profile:")
        for key, value in sample_data.items():
            print(f"  {key}: {value}")
        print(f"\nPredicted Salary: ${predicted_salary:,.2f}")

    # 15. Final Summary
    print("\n=== FINAL SUMMARY ===")
    print(f"🎯 Dataset: {data.shape[0]} employees with {len(X.columns)} features")
    print(f"🏆 Best Model: {best_model_name}")
    print(f"📊 Performance Metrics:")
    print(f"   • R² Score: {best_r2:.4f} ({best_r2*100:.1f}% variance explained)")
    print(f"   • RMSE: ${best_metrics['RMSE']:,.2f}")
    print(f"   • MAE: ${best_metrics['MAE']:,.2f}")
    print(f"   • Leave-One-Out CV: {best_metrics['LOO_R²_mean']:.4f}")

    # Performance interpretation
    if best_r2 > 0.8:
        print("\n✅ EXCELLENT: Model performance is very good!")
        print("   The model explains >80% of salary variance.")
    elif best_r2 > 0.6:
        print("\n✅ GOOD: Model performance is acceptable.")
        print("   The model explains >60% of salary variance.")
    elif best_r2 > 0.3:
        print("\n⚠️ MODERATE: Model performance is moderate.")
        print("   Consider feature engineering or more data.")
    else:
        print("\n❌ POOR: Model performance needs improvement.")
        print("   Consider different approach or more data.")

    # Data insights
    # Column-specific lines are printed only when the column exists, so a
    # dataset with a different schema does not abort the summary.
    print(f"\n💡 Key Dataset Insights:")
    print(f"   • Salary range: ${y.min():,} - ${y.max():,}")
    print(f"   • Average salary: ${y.mean():,.0f}")
    if 'age' in X.columns:
        print(f"   • Age range: {X['age'].min()}-{X['age'].max()} years")
    if 'role' in X.columns:
        print(f"   • Job roles: {len(X['role'].unique())} different roles")
    if 'department' in X.columns:
        print(f"   • Departments: {len(X['department'].unique())} departments")
    if 'city' in X.columns and 'state' in X.columns:
        print(f"   • Locations: {len(X['city'].unique())} cities, {len(X['state'].unique())} states")

    # Save the best model
    # The scaler and label encoders are saved next to the model because new
    # rows must be preprocessed with the same fitted objects before predicting.
    import joblib
    model_filename = f'best_salary_model_{best_model_name.replace(" ", "_").replace("(", "").replace(")", "").lower()}.pkl'
    scaler_filename = 'salary_scaler.pkl'
    encoders_filename = 'salary_encoders.pkl'

    joblib.dump(best_model, model_filename)
    joblib.dump(scaler, scaler_filename)
    joblib.dump(label_encoders, encoders_filename)

    print(f"\n💾 Model files saved:")
    print(f"   • {model_filename}")
    print(f"   • {scaler_filename}")
    print(f"   • {encoders_filename}")

    print("\n🚀 Model is ready for deployment!")

else:
    print("❌ No models were successfully trained. Check your data and try again.")