In [5]:
# pip install sqlalchemy pymysql pandas scikit-learn imbalanced-learn joblib fastapi uvicorn pydantic

In [6]:
# pip install pandas sqlalchemy pymysql scikit-learn imbalanced-learn xgboost joblib

In [7]:
# Import necessary libraries
import os
import traceback
import warnings
from urllib.parse import quote_plus

import joblib
import numpy as np
import pandas as pd
import xgboost as xgb
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.svm import SVC
from sqlalchemy import create_engine
warnings.filterwarnings('ignore')

# Database connection parameters.
# They are read from the environment so that no secret is stored in the
# notebook. The non-secret settings keep their previous values as defaults;
# the password has no default and must be supplied through DB_PASS.
DB_USER = os.environ.get("DB_USER", "root")
DB_PASS = os.environ.get("DB_PASS", "")
DB_HOST = os.environ.get("DB_HOST", "localhost")
DB_NAME = os.environ.get("DB_NAME", "test_db")

try:
    # Create database connection. User name and password are percent-encoded
    # because characters such as '@', ':' or '/' would otherwise be parsed as
    # URL delimiters and corrupt the connection string.
    engine = create_engine(
        f"mysql+pymysql://{quote_plus(DB_USER)}:{quote_plus(DB_PASS)}@{DB_HOST}/{DB_NAME}"
    )

    # Fetch data from students table
    query = "SELECT * FROM students"
    df = pd.read_sql(query, engine)
    print("Data loaded successfully. Shape:", df.shape)

except Exception as e:
    # Deliberate best-effort: any failure while connecting or querying is
    # reported and replaced by synthetic data so the rest of the notebook runs.
    print(f"Error connecting to database: {e}")
    # Create sample data for demonstration if database connection fails
    print("Creating sample data for demonstration...")
    np.random.seed(42)  # fixed seed keeps the synthetic rows reproducible
    n_samples = 1000
    df = pd.DataFrame({
        'student_id': range(1, n_samples+1),
        'name': [f'Student_{i}' for i in range(1, n_samples+1)],
        'age': np.random.randint(18, 25, n_samples),
        'gender': np.random.choice(['Male', 'Female'], n_samples),
        'major': np.random.choice(['CS', 'Engineering', 'Business', 'Arts'], n_samples),
        'gpa': np.random.uniform(2.0, 4.0, n_samples),
        'study_hours': np.random.randint(5, 40, n_samples),
        'sleep_hours': np.random.randint(4, 10, n_samples),
        'extracurricular': np.random.choice(['Yes', 'No'], n_samples),
        # Imbalanced target: 70% / 20% / 10% across the three levels
        'stress_level': np.random.choice([0, 1, 2], n_samples, p=[0.7, 0.2, 0.1])
    })

# Quick look at the loaded frame: first rows, schema and missing-value counts.
print("Sample data:")
print(df.head())
print("\nData info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
df.info()
print("\nMissing values:")
print(df.isnull().sum())

# Drop unnecessary columns.
# errors='ignore' already skips labels that are not present, so the list does
# not need to be filtered against df.columns first.
columns_to_drop = ['student_id', 'name']
df = df.drop(columns=columns_to_drop, errors='ignore')
print(f"\nData after dropping columns: {df.shape}")

# Check if stress_level column exists
if 'stress_level' not in df.columns:
    print("Error: 'stress_level' column not found in the dataset!")
    print("Available columns:", df.columns.tolist())
    # Create a sample stress_level column for demonstration.
    # A dedicated seeded generator keeps these placeholder labels reproducible;
    # the global NumPy seed is only set on the synthetic-data path.
    placeholder_rng = np.random.RandomState(42)
    df['stress_level'] = placeholder_rng.choice([0, 1, 2], len(df), p=[0.7, 0.2, 0.1])
    print("Created sample 'stress_level' column for demonstration")

# Check class distribution
print("\nClass distribution:")
class_distribution = df['stress_level'].value_counts()
print(class_distribution)

# If there are too many classes or some classes have very few samples, we might need to bin them
if len(df['stress_level'].unique()) > 5:
    print("Too many stress levels detected. Binning into 3 categories...")
    # Bin into 3 equal-width categories: Low (0), Medium (1), High (2).
    # labels=False returns the bin index as plain numbers instead of a
    # Categorical. With a Categorical target an empty bin would still be
    # listed by value_counts() with a count of 0, which distorts any
    # "smallest class" computation made from those counts.
    if df['stress_level'].max() > 2:
        df['stress_level'] = pd.cut(df['stress_level'], bins=3, labels=False)
    print("New class distribution:")
    print(df['stress_level'].value_counts())

# Separate features and target
X = df.drop('stress_level', axis=1)
y = df['stress_level']

# Identify categorical and numerical columns.
# 'number' matches every numeric width (int32, float32, ...), not only
# int64/float64. Columns that belong to neither list are left out by the
# ColumnTransformer built from these lists, so narrower numeric dtypes
# would otherwise be dropped from the model without notice.
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
numerical_cols = X.select_dtypes(include='number').columns.tolist()

print(f"Categorical columns: {categorical_cols}")
print(f"Numerical columns: {numerical_cols}")

# Make any column that falls outside both groups visible instead of silent.
unassigned_cols = [
    col for col in X.columns
    if col not in categorical_cols and col not in numerical_cols
]
if unassigned_cols:
    print(f"Warning: columns with unsupported dtypes will not be used: {unassigned_cols}")

# Numeric branch: fill gaps with the column median, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numerical_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode into a dense array; categories unseen during fit are ignored.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group to its branch; numeric output comes first.
column_routes = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_routes)

# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions alike in both partitions.
holdout_parts = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = holdout_parts

print(f"\nTraining set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")

# Per-class row counts of each partition.
train_class_dist = pd.Series(y_train).value_counts()
print("Training set class distribution:")
print(train_class_dist)

test_class_dist = pd.Series(y_test).value_counts()
print("Test set class distribution:")
print(test_class_dist)

# Size of the rarest training class; consulted later when a resampler is chosen.
min_class_size = train_class_dist.min()
print(f"Minimum class size in training data: {min_class_size}")

# Candidate classifiers, configured conservatively (limited tree depth,
# explicit regularisation terms) to curb overfitting.
shared_seed = {'random_state': 42}
# Settings common to the two boosting models.
boosting_defaults = {'n_estimators': 100, 'max_depth': 3, 'learning_rate': 0.1}

models = {
    'Logistic Regression': LogisticRegression(C=0.1, max_iter=1000, **shared_seed),
    'Random Forest': RandomForestClassifier(n_estimators=100, max_depth=5, **shared_seed),
    'SVM': SVC(kernel='rbf', C=1.0, probability=True, **shared_seed),
    'Gradient Boosting': GradientBoostingClassifier(**shared_seed, **boosting_defaults),
    'XGBoost': xgb.XGBClassifier(
        reg_alpha=0.1,
        reg_lambda=1.0,
        eval_metric='mlogloss',
        **shared_seed,
        **boosting_defaults,
    ),
}

# Search spaces for hyperparameter tuning, written with the bare estimator
# parameter names. The 'model__' prefix, which addresses the pipeline step
# named 'model', is attached when param_grids is assembled below.
_raw_grids = {
    'Logistic Regression': {
        'C': [0.01, 0.1, 1, 10],
        'solver': ['liblinear', 'lbfgs'],
    },
    'Random Forest': {
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 5, 7, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
    },
    'SVM': {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf'],
        'gamma': ['scale', 'auto'],
    },
    'Gradient Boosting': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 4, 5],
        'subsample': [0.8, 0.9, 1.0],
    },
    'XGBoost': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 4, 5],
        'reg_alpha': [0, 0.1, 0.5],
        'reg_lambda': [0.5, 1.0, 1.5],
        'subsample': [0.8, 0.9, 1.0],
    },
}

# One grid per model name, keyed exactly like the entries of `models`.
param_grids = {
    estimator_name: {f"model__{param}": values for param, values in grid.items()}
    for estimator_name, grid in _raw_grids.items()
}

# Train and evaluate models.
# Every candidate is cross-validated on the training data, refit on the full
# training set and scored on the test set. The winner is chosen by mean
# cross-validated accuracy: choosing it by test accuracy would let the test
# set take part in model selection and bias every test metric reported later.
results = {}
best_accuracy = 0          # test accuracy of the selected model (reporting only)
best_model = None
best_model_name = ""
best_pipeline = None
best_cv_score = float('-inf')  # mean CV accuracy of the selected model

print("\n" + "="*50)
print("MODEL EVALUATION WITH CROSS-VALIDATION")
print("="*50)

# Use stratified k-fold for cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# SMOTE is fitted inside every CV training fold, which holds fewer minority
# samples than the full training set. A stratified test fold takes at most
# ceil(n / k) samples of a class, so this is the smallest minority count any
# training fold can contain.
n_cv_splits = cv.get_n_splits()
largest_test_share = -(-int(min_class_size) // n_cv_splits)  # ceiling division
min_fold_class_size = int(min_class_size) - largest_test_share

for name, model in models.items():
    print(f"\nTraining {name}...")
    try:
        # Pick the resampler from the class sizes: very small classes cannot
        # supply enough neighbours for SMOTE, so they are under-sampled instead.
        if min_class_size <= 5:
            print("Small class sizes detected. Using RandomUnderSampler instead of SMOTE.")
            sampler = RandomUnderSampler(random_state=42)
        else:
            # SMOTE needs k_neighbors + 1 samples of the minority class, and it
            # must hold inside the smallest CV training fold as well.
            n_neighbors = min(5, min_fold_class_size - 1)
            print(f"Using SMOTE with n_neighbors={n_neighbors}")
            sampler = SMOTE(random_state=42, sampling_strategy='not majority', k_neighbors=n_neighbors)

        pipeline = ImbPipeline(steps=[
            ('preprocessor', preprocessor),
            ('sampling', sampler),
            ('model', model)
        ])

        # Perform cross-validation
        cv_scores = cross_val_score(pipeline, X_train, y_train, cv=cv, scoring='accuracy')
        cv_mean = cv_scores.mean()
        cv_std = cv_scores.std()

        # Fit the pipeline on the whole training set and predict the test set
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)

        # Calculate metrics (class-weighted averages; undefined ratios count as 0)
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
        recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
        f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

        results[name] = {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'cv_mean': cv_mean,
            'cv_std': cv_std
        }

        print(f"{name} Results:")
        print(f"CV Accuracy: {cv_mean:.4f} (±{cv_std:.4f})")
        print(f"Test Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}")

        # Check if this is the best model so far (by cross-validated accuracy).
        # A NaN mean means at least one fold failed to fit; such a candidate
        # cannot be ranked and is left out of the selection.
        if np.isnan(cv_mean):
            print(f"{name} skipped for model selection: cross-validation gave no valid score.")
        elif cv_mean > best_cv_score:
            best_cv_score = cv_mean
            best_accuracy = accuracy
            best_model = model
            best_model_name = name
            best_pipeline = pipeline

    except Exception as e:
        # Best-effort: one failing candidate must not stop the comparison.
        print(f"Error training {name}: {e}")
        traceback.print_exc()

# First fallback: no candidate was selected above, so retry every model in a
# plain pipeline without any resampling step.
if best_model is None:
    print("No suitable model found. Trying simple pipeline without sampling...")
    # Try without any sampling
    for name, model in models.items():
        try:
            print(f"Trying {name} without sampling...")
            pipeline = Pipeline(steps=[
                ('preprocessor', preprocessor),
                ('model', model)
            ])

            cv_scores = cross_val_score(pipeline, X_train, y_train, cv=cv, scoring='accuracy')
            pipeline.fit(X_train, y_train)
            y_pred = pipeline.predict(X_test)
            accuracy = accuracy_score(y_test, y_pred)

            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_model = model
                best_model_name = name
                best_pipeline = pipeline

            # Report the cross-validation result as well; it was computed but
            # never shown before.
            print(f"{name} without sampling - CV Accuracy: {cv_scores.mean():.4f} (±{cv_scores.std():.4f})")
            print(f"{name} without sampling - Accuracy: {accuracy:.4f}")

        except Exception as e:
            # Best-effort: keep trying the remaining models.
            print(f"Error training {name} without sampling: {e}")

# Last resort: a default logistic regression. Errors raised here propagate.
if best_model is None:
    print("All models failed. Using simple Logistic Regression as fallback.")
    best_model_name = "Logistic Regression"
    best_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', LogisticRegression(random_state=42))
    ])
    best_pipeline.fit(X_train, y_train)
    # Keep best_model in step with best_pipeline instead of leaving it None.
    best_model = best_pipeline.named_steps['model']
    y_pred = best_pipeline.predict(X_test)
    best_accuracy = accuracy_score(y_test, y_pred)

# Announce which candidate won the comparison and its recorded accuracy.
summary_rule = "=" * 50
print("\n" + summary_rule)
print("SUMMARY")
print(summary_rule)
print(f"Best model: {best_model_name} with accuracy: {best_accuracy:.4f}")

# Hyperparameter tuning for the best model (if we have a parameter grid for it).
# The already fitted best pipeline is the default result; it is replaced only
# when the grid search completes.
final_pipeline = best_pipeline

if best_model_name in param_grids:
    print(f"\nPerforming hyperparameter tuning for {best_model_name}...")

    # Tune the pipeline that won the comparison, with every step it contains
    # (including a resampling step when it has one). Rebuilding it without
    # that step would tune and ship a different model from the one selected.
    # GridSearchCV works on clones, so best_pipeline itself is not modified.
    tuning_pipeline = best_pipeline

    # Get the parameter grid for the best model
    param_grid = param_grids[best_model_name]

    # Perform grid search with cross-validation
    grid_search = GridSearchCV(
        tuning_pipeline,
        param_grid,
        cv=cv,
        scoring='accuracy',
        n_jobs=-1,
        verbose=1
    )

    try:
        # Fit the grid search; with the default refit=True the best parameter
        # combination is refit on the whole training set.
        grid_search.fit(X_train, y_train)
    except Exception as e:
        # Best-effort: a failed search must not discard a working model.
        print(f"Hyperparameter tuning failed, keeping the untuned {best_model_name} pipeline: {e}")
    else:
        # Get the best parameters and score
        best_params = grid_search.best_params_
        best_score = grid_search.best_score_

        print(f"Best parameters for {best_model_name}: {best_params}")
        print(f"Best cross-validation score: {best_score:.4f}")

        # Use the refit estimator as the final model
        final_pipeline = grid_search.best_estimator_

# No extra fit is needed here: both best_pipeline and the grid search's
# best_estimator_ have already been fitted on the training data.

# Score the final pipeline on the held-out test set.
y_pred_final = final_pipeline.predict(X_test)

# Precision, recall and F1 are class-weighted; undefined ratios count as 0.
weighted_avg = {'average': 'weighted', 'zero_division': 0}
final_accuracy = accuracy_score(y_test, y_pred_final)
final_precision = precision_score(y_test, y_pred_final, **weighted_avg)
final_recall = recall_score(y_test, y_pred_final, **weighted_avg)
final_f1 = f1_score(y_test, y_pred_final, **weighted_avg)

print(f"\nFinal model test accuracy: {final_accuracy:.4f}")
print(f"Final model test precision: {final_precision:.4f}")
print(f"Final model test recall: {final_recall:.4f}")
print(f"Final model test F1-score: {final_f1:.4f}")

# Per-class breakdown.
final_report = classification_report(y_test, y_pred_final, zero_division=0)
print("\nClassification Report for Final Model:")
print(final_report)

# Confusion matrix of true labels against predictions.
final_confusion = confusion_matrix(y_test, y_pred_final)
print("\nConfusion Matrix:")
print(final_confusion)

# Save the final pipeline
joblib.dump(final_pipeline, 'stress_level_pipeline.pkl')

# Save the preprocessor separately as well. It is taken from the final
# pipeline so that the file holds the fitted instance that pipeline really
# uses; a pipeline returned by the grid search carries its own copy rather
# than the module-level `preprocessor` object.
joblib.dump(final_pipeline.named_steps['preprocessor'], 'preprocessor.pkl')

print("\n✅ Model pipeline and preprocessor saved successfully!")

# Test the saved pipeline.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load files produced by this notebook or another trusted source.
loaded_pipeline = joblib.load('stress_level_pipeline.pkl')
y_pred_loaded = loaded_pipeline.predict(X_test)
loaded_accuracy = accuracy_score(y_test, y_pred_loaded)

print(f"Loaded pipeline accuracy: {loaded_accuracy:.4f}")

# Confirm the round trip: the reloaded pipeline must predict exactly what the
# in-memory pipeline predicts before success is announced.
if np.array_equal(y_pred_loaded, final_pipeline.predict(X_test)):
    print("✅ Pipeline loaded and tested successfully!")
else:
    print("⚠️ Loaded pipeline predictions differ from the in-memory pipeline!")

# Feature importance for tree-based models.
# Only estimators that expose `feature_importances_` are reported.
final_estimator = final_pipeline.named_steps['model']
if hasattr(final_estimator, 'feature_importances_'):
    print("\nFeature Importances:")
    feature_importances = final_estimator.feature_importances_

    # Get feature names after preprocessing
    try:
        # Ask the preprocessor inside the final pipeline for its output names.
        # That is the fitted instance which produced the model's inputs, and it
        # accounts for one-hot expansion and for an empty column group, which
        # a hand-built name list does not.
        fitted_preprocessor = final_pipeline.named_steps['preprocessor']
        # Output names are prefixed with the transformer name ('num__',
        # 'cat__'); drop that prefix to show the plain feature name.
        all_features = [
            feature_name.split('__', 1)[-1]
            for feature_name in fitted_preprocessor.get_feature_names_out()
        ]

        # Create a DataFrame of feature importances
        importance_df = pd.DataFrame({
            'feature': all_features,
            'importance': feature_importances
        })

        # Sort by importance
        importance_df = importance_df.sort_values('importance', ascending=False)

        print(importance_df.head(10))

    except Exception as e:
        # Fall back to bare values when names cannot be matched to importances.
        print(f"Could not extract feature names: {e}")
        print("Top 10 feature importances (values):")
        print(sorted(feature_importances, reverse=True)[:10])

Data loaded successfully. Shape: (20, 17)
Sample data:
   student_id     name  age  gender  academic_performance  stress_level  \
0           1    Alice   20  Female                     4             6   
1           2      Bob   22    Male                     3             8   
2           3  Charlie   23    Male                     5             4   
3           4    David   21    Male                     2             9   
4           5      Eva   24  Female                     5             3   

   sleep_quality  anxiety_level  exercise_hours  study_hours  social_activity  \
0              3              7             2.5          5.0                2   
1              2              9             1.0          6.0                1   
2              4              5             4.0          7.0                3   
3              2              8             0.5          4.5                0   
4              4              4             5.0          7.5                4   

   fina