# FEMbalance - Model Development

This notebook contains the development and training of ML models for:
1. PCOS Risk Prediction
2. Cycle Prediction
3. Symptom Analysis

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import joblib
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
np.random.seed(42)

## 1. Data Preparation

In [None]:
# Build the demonstration dataset.
# Nothing is read from storage here: every value is drawn from NumPy's global
# RNG. In production this cell would be replaced by a load from your database.

n_samples = 5000
# Re-seed inside the cell so that re-running it on its own regenerates the same data
np.random.seed(42)

# Synthetic feature columns for PCOS prediction.
# Keyword arguments are evaluated left to right and each call advances the
# shared RNG stream, so the order of these entries determines the data.
# NOTE(review): randint's upper bound is exclusive, so stress_level and
# sleep_quality take values 1-4; confirm a 1-5 scale was not intended.
data = dict(
    age=np.random.normal(28, 6, n_samples),
    bmi=np.random.normal(24, 4, n_samples),
    cycle_length=np.random.normal(28, 3, n_samples),
    period_length=np.random.normal(5, 1, n_samples),
    exercise_frequency=np.random.poisson(3, n_samples),
    stress_level=np.random.randint(1, 5, n_samples),
    family_history=np.random.binomial(1, 0.3, n_samples),
    sleep_quality=np.random.randint(1, 5, n_samples),
)

# Risk score: a weighted sum of indicator features plus Gaussian noise.
# Terms are accumulated one at a time, in the same order as a single
# left-to-right sum, so the floating-point result is unchanged.
pcos_risk = 0.1 * (data['bmi'] > 25)
pcos_risk += 0.15 * (data['cycle_length'] > 35)
pcos_risk += 0.2 * data['family_history']
pcos_risk += 0.1 * (data['stress_level'] > 3)
pcos_risk += 0.05 * (data['exercise_frequency'] < 2)
pcos_risk += np.random.normal(0, 0.1, n_samples)

# Binarise the score: anything above 0.3 is labelled as a positive case
data['has_pcos'] = (pcos_risk > 0.3).astype(int)

df = pd.DataFrame(data)

print(f"Dataset shape: {df.shape}")
print(f"PCOS prevalence: {df['has_pcos'].mean():.2%}")
df.head()

## 2. PCOS Risk Prediction Model

In [None]:
# Assemble the PCOS model inputs: feature matrix X and binary target y
feature_columns = [
    'age', 'bmi', 'cycle_length', 'period_length',
    'exercise_frequency', 'stress_level', 'family_history', 'sleep_quality',
]

X = df[feature_columns]
y = df['has_pcos']

# Hold out 20% of the rows for testing; stratifying on y keeps the class
# ratio the same in both partitions, and the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transform to both partitions so no test information leaks into the fit.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Training set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")
print(f"Feature columns: {feature_columns}")