# FEMbalance - Exploratory Data Analysis

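This notebook contains exploratory data analysis for the FEMbalance ML models.
We analyze cycle data and PCOS risk factors to understand patterns and relationships (symptom data is not yet included). **Note:** all data below is synthetically generated for demonstration, so the results illustrate the analysis workflow rather than real-world findings.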

In [None]:
# Import required libraries
import warnings
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
# Suppress every warning category for the rest of the kernel session.
warnings.filterwarnings(action='ignore')

# Plot appearance: seaborn-flavoured matplotlib style with the "husl" colour cycle.
plt.style.use('seaborn-v0_8')
sns.set_palette(palette="husl")

# pandas display: show all columns, and at most 100 rows of any frame.
for option_name, option_value in (('display.max_columns', None),
                                  ('display.max_rows', 100)):
    pd.set_option(option_name, option_value)

## 1. Data Loading and Overview

In [None]:
# Load sample data (replace with actual data loading).
# For demonstration, we create synthetic cycle data: one row per (user, cycle).

np.random.seed(42)
n_users = 1000
n_cycles_per_user = 6

# Fixed anchor date so the generated start dates are identical on every run
# (datetime.now() would differ between runs and even between rows).
reference_date = datetime(2024, 1, 1)

cycle_data = []
for user_id in range(n_users):
    # Per-user baselines around which that user's individual cycles vary.
    base_cycle_length = np.random.normal(28, 3)
    base_period_length = np.random.normal(5, 1)

    # cycle_number 0 is the most recent cycle and starts on the reference date;
    # higher cycle numbers lie further in the past.
    start_date = reference_date

    for cycle_num in range(n_cycles_per_user):
        # Draw, clamp to the plausible range, then round to whole days.
        # Rounding (instead of truncating) avoids biasing lengths ~0.5 day low.
        # The draw order (cycle length, then period length) is kept unchanged.
        cycle_length = int(round(float(
            max(21, min(35, np.random.normal(base_cycle_length, 2))))))
        period_length = int(round(float(
            max(2, min(8, np.random.normal(base_period_length, 0.5))))))

        # An older cycle starts exactly `cycle_length` days before the cycle that
        # follows it, so gaps between start dates match the recorded lengths.
        if cycle_num > 0:
            start_date = start_date - timedelta(days=cycle_length)

        cycle_data.append({
            'user_id': user_id,
            'cycle_number': cycle_num,
            'start_date': start_date,
            'cycle_length': cycle_length,
            'period_length': period_length
        })

cycles_df = pd.DataFrame(cycle_data)
print(f"Cycle data shape: {cycles_df.shape}")
cycles_df.head()

In [None]:
# Synthetic per-user table: demographics plus PCOS risk factors.
def _make_user_record(user_id):
    """Draw one synthetic user profile and return it as a dict.

    The random draws happen in a fixed order (age, height, weight, PCOS flag,
    family history, exercise, stress) so results are stable for a given seed.
    """
    record = {'user_id': user_id}
    record['age'] = np.random.randint(18, 45)
    record['height'] = np.random.normal(165, 8)  # cm
    record['weight'] = np.random.normal(65, 12)  # kg
    # BMI = weight in kg divided by squared height in metres.
    record['bmi'] = record['weight'] / ((record['height'] / 100) ** 2)

    # PCOS risk factors
    record['has_pcos'] = np.random.choice([0, 1], p=[0.85, 0.15])  # 15% prevalence
    record['family_history'] = np.random.choice([0, 1], p=[0.7, 0.3])
    record['exercise_frequency'] = np.random.randint(0, 8)  # days per week
    record['stress_level'] = np.random.randint(1, 5)  # 1-4 scale
    return record


user_data = [_make_user_record(uid) for uid in range(n_users)]

users_df = pd.DataFrame(user_data)
print(f"User data shape: {users_df.shape}")
users_df.head()

## 2. Cycle Data Analysis

In [None]:
# Summary statistics (count, mean, std, quartiles, extremes) for both length columns.
cycle_length_summary = cycles_df['cycle_length'].describe()
period_length_summary = cycles_df['period_length'].describe()

print("Cycle Length Statistics:")
print(cycle_length_summary)
print("\nPeriod Length Statistics:")
print(period_length_summary)

In [None]:
# Visualize the distributions of cycle length and period length side by side.
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# (column, x-axis label, panel title, bar colour) for each panel, left to right.
histogram_panels = [
    ('cycle_length', 'Cycle Length (days)', 'Distribution of Cycle Lengths', 'skyblue'),
    ('period_length', 'Period Length (days)', 'Distribution of Period Lengths', 'lightcoral'),
]

for ax, (column, xlabel, title, color) in zip(axes, histogram_panels):
    values = cycles_df[column]
    mean_value = values.mean()

    # Lengths are whole days, so use one bin per day centred on the integer.
    # A fixed bin count (e.g. 20 bins over 15 distinct values) yields
    # fractional-width bins with artificial gaps and uneven bars.
    first_day, last_day = int(values.min()), int(values.max())
    day_bins = [day - 0.5 for day in range(first_day, last_day + 2)]

    ax.hist(values, bins=day_bins, alpha=0.7, color=color, edgecolor='black')
    ax.axvline(mean_value, color='red', linestyle='--',
               label=f'Mean: {mean_value:.1f} days')
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')
    ax.set_title(title)
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Cycle regularity analysis: per-user spread (std) of cycle length.
cycle_regularity = (
    cycles_df.groupby('user_id')['cycle_length']
    .agg(['std', 'mean', 'count'])
    .reset_index()
)
# Keep users with at least 3 cycles; .copy() so the column added below is
# written to an independent frame rather than a slice of the aggregate.
cycle_regularity = cycle_regularity[cycle_regularity['count'] >= 3].copy()

# Classify regularity by the std of cycle length (days).
# include_lowest=True puts std == 0 (identical cycle lengths) into
# 'Very Regular'; without it the first bin is (0, 2] and those users become NaN.
cycle_regularity['regularity'] = pd.cut(
    cycle_regularity['std'],
    bins=[0, 2, 5, float('inf')],
    labels=['Very Regular', 'Regular', 'Irregular'],
    include_lowest=True,
)

regularity_counts = cycle_regularity['regularity'].value_counts()
print("Cycle Regularity Distribution:")
print(regularity_counts)

# Visualize regularity; drop empty categories so the pie has no zero-width
# wedges with overlapping "0.0%" labels.
pie_counts = regularity_counts[regularity_counts > 0]
plt.figure(figsize=(10, 6))
plt.pie(pie_counts.values, labels=pie_counts.index, autopct='%1.1f%%', startangle=90)
plt.title('Cycle Regularity Distribution')
plt.axis('equal')
plt.show()

## 3. PCOS Risk Factor Analysis

In [None]:
# PCOS prevalence: share of users flagged has_pcos == 1.
pcos_prevalence = users_df['has_pcos'].mean()
print(f"PCOS Prevalence in dataset: {pcos_prevalence:.1%}")

# Risk factors comparison: one boxplot per factor, split by PCOS status.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# (column, panel title, y-axis label), filled row by row across the 2x2 grid.
risk_factor_panels = [
    ('bmi', 'BMI Distribution by PCOS Status', 'BMI'),
    ('age', 'Age Distribution by PCOS Status', 'Age'),
    ('exercise_frequency', 'Exercise Frequency by PCOS Status', 'Exercise Days per Week'),
    ('stress_level', 'Stress Level by PCOS Status', 'Stress Level (1-4)'),
]

for ax, (column, title, ylabel) in zip(axes.flat, risk_factor_panels):
    users_df.boxplot(column=column, by='has_pcos', ax=ax)
    ax.set_title(title)
    ax.set_xlabel('PCOS Status (0=No, 1=Yes)')
    ax.set_ylabel(ylabel)

# DataFrame.boxplot(by=...) adds a figure-level "Boxplot grouped by has_pcos"
# title on every call; clear it because each panel already has its own title.
fig.suptitle('')

plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlations between the user-level risk factors and the PCOS flag.
risk_factor_columns = ['age', 'bmi', 'exercise_frequency', 'stress_level',
                       'family_history', 'has_pcos']
correlation_matrix = users_df[risk_factor_columns].corr()

# Annotated heatmap with a diverging palette centred on zero correlation.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
    ax=heatmap_ax,
)
heatmap_ax.set_title('Correlation Matrix of PCOS Risk Factors')
heatmap_fig.tight_layout()
plt.show()

## 4. Cycle Patterns and PCOS Relationship

In [None]:
# Merge cycle and user data: one row per cycle, with that user's attributes attached.
merged_df = cycles_df.merge(users_df, on='user_id')

# Compare cycle characteristics by PCOS status.
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# (column, panel title, y-axis label) for each panel, left to right.
cycle_panels = [
    ('cycle_length', 'Cycle Length by PCOS Status', 'Cycle Length (days)'),
    ('period_length', 'Period Length by PCOS Status', 'Period Length (days)'),
]

for ax, (column, title, ylabel) in zip(axes, cycle_panels):
    merged_df.boxplot(column=column, by='has_pcos', ax=ax)
    ax.set_title(title)
    ax.set_xlabel('PCOS Status (0=No, 1=Yes)')
    ax.set_ylabel(ylabel)

# DataFrame.boxplot(by=...) adds a figure-level "Boxplot grouped by has_pcos"
# title on every call; clear it because each panel already has its own title.
fig.suptitle('')

plt.tight_layout()
plt.show()

In [None]:
# Statistical test: does mean cycle length differ between PCOS and non-PCOS users?
# `stats` (scipy.stats) is imported with the other libraries at the top of the notebook.

# Each user contributes several cycles, so individual cycles are not independent
# observations. Collapse to one mean cycle length per user before testing;
# otherwise the sample size is inflated and p-values come out too small.
user_mean_cycle = (
    merged_df.groupby(['user_id', 'has_pcos'])['cycle_length']
    .mean()
    .reset_index()
)
pcos_cycles = user_mean_cycle.loc[user_mean_cycle['has_pcos'] == 1, 'cycle_length']
no_pcos_cycles = user_mean_cycle.loc[user_mean_cycle['has_pcos'] == 0, 'cycle_length']

# Welch's t-test (equal_var=False): the two groups can differ a lot in size,
# so equal variances are not assumed.
t_stat, p_value = stats.ttest_ind(pcos_cycles, no_pcos_cycles, equal_var=False)
print("Cycle Length T-test:")
print(f"PCOS group mean: {pcos_cycles.mean():.2f} days")
print(f"No PCOS group mean: {no_pcos_cycles.mean():.2f} days")
print(f"T-statistic: {t_stat:.3f}")
print(f"P-value: {p_value:.3f}")
print(f"Significant difference: {'Yes' if p_value < 0.05 else 'No'}")

## 5. Feature Engineering Insights

In [None]:
# Per-user cycle statistics: mean/std/count of cycle length, mean/std of period length.
user_cycle_stats = (
    cycles_df.groupby('user_id')
    .agg({
        'cycle_length': ['mean', 'std', 'count'],
        'period_length': ['mean', 'std'],
    })
    .round(2)
)

# Flatten the two-level (column, statistic) header into plain feature names.
user_cycle_stats.columns = ['avg_cycle_length', 'cycle_std', 'cycle_count',
                            'avg_period_length', 'period_std']
user_cycle_stats = user_cycle_stats.reset_index()

# Merge with user data (inner join on user_id).
enhanced_users = users_df.merge(user_cycle_stats, on='user_id')

# Create BMI categories with left-closed bins and an open upper bound:
# [0, 18.5) Underweight, [18.5, 25) Normal, [25, 30) Overweight, [30, inf) Obese.
# right=False puts BMI exactly 25 / 30 in the higher category (the usual
# convention), and the infinite top edge keeps BMI >= 50 from becoming NaN.
enhanced_users['bmi_category'] = pd.cut(
    enhanced_users['bmi'],
    bins=[0, 18.5, 25, 30, float('inf')],
    right=False,
    labels=['Underweight', 'Normal', 'Overweight', 'Obese'],
)

# Regularity score in (0, 1]: 1 means no variation in cycle length, and the
# score falls as the std grows. A missing std (NaN, e.g. a single recorded
# cycle) is filled with 0 and therefore scores 1.
enhanced_users['regularity_score'] = 1 / (1 + enhanced_users['cycle_std'].fillna(0))

print("Enhanced features created:")
print(enhanced_users[['user_id', 'bmi_category', 'regularity_score', 'avg_cycle_length']].head())

In [None]:
# Rank features by the strength of their linear (Pearson) correlation with PCOS.
# This is a quick screening view, not a model-based feature importance.
features_for_analysis = ['age', 'bmi', 'exercise_frequency', 'stress_level',
                         'family_history', 'avg_cycle_length', 'cycle_std', 'regularity_score']

# Signed correlation of every feature with the PCOS flag.
signed_correlations = (
    enhanced_users[features_for_analysis + ['has_pcos']]
    .corr()['has_pcos']
    .drop('has_pcos')
)

# Absolute values, weakest first, are used only for ranking and plotting.
feature_correlations = signed_correlations.abs().sort_values(ascending=True)

plt.figure(figsize=(10, 8))
feature_correlations.plot(kind='barh', color='steelblue')
plt.title('Feature Correlation with PCOS (Absolute Values)')
plt.xlabel('Absolute Correlation')
plt.ylabel('Features')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Print the signed values (same order as the plot) so the direction of each
# relationship is not lost in the textual summary.
print("Feature correlations with PCOS:")
for feature in feature_correlations.index:
    print(f"{feature}: {signed_correlations[feature]:.3f}")

## 6. Key Insights and Recommendations

In [None]:
# Gather the headline numbers first, then print the whole report in one go.
avg_cycle_length = cycles_df['cycle_length'].mean()
avg_period_length = cycles_df['period_length'].mean()
very_regular_share = (cycle_regularity['regularity'] == 'Very Regular').mean()
family_history_corr = users_df[['family_history', 'has_pcos']].corr().iloc[0, 1]

# Split users by PCOS status for the group comparisons.
pcos_users = users_df[users_df['has_pcos'] == 1]
no_pcos_users = users_df[users_df['has_pcos'] == 0]
exercise_gap = (no_pcos_users['exercise_frequency'].mean()
                - pcos_users['exercise_frequency'].mean())

# NOTE(review): section 4 is static text, not derived from the numbers above.
# The demo data draws has_pcos independently of the other features, so claims
# such as "strong predictors" should be re-checked once real data is loaded.
report_lines = [
    "=== KEY INSIGHTS FROM EXPLORATORY ANALYSIS ===",
    "",
    "1. CYCLE PATTERNS:",
    f"   - Average cycle length: {avg_cycle_length:.1f} days",
    f"   - Average period length: {avg_period_length:.1f} days",
    f"   - Cycle regularity: {very_regular_share:.1%} very regular",
    "",
    "2. PCOS PREVALENCE:",
    f"   - Overall prevalence: {pcos_prevalence:.1%}",
    f"   - Family history correlation: {family_history_corr:.3f}",
    "",
    "3. RISK FACTORS:",
    f"   - Average BMI (PCOS): {pcos_users['bmi'].mean():.1f}",
    f"   - Average BMI (No PCOS): {no_pcos_users['bmi'].mean():.1f}",
    f"   - Exercise frequency difference: {exercise_gap:.1f} days/week",
    "",
    "4. RECOMMENDATIONS FOR MODEL DEVELOPMENT:",
    "   - Include cycle regularity as a key feature",
    "   - BMI and family history are strong predictors",
    "   - Consider lifestyle factors (exercise, stress)",
    "   - Use ensemble methods for better prediction",
    "   - Implement proper cross-validation",
]
print("\n".join(report_lines))

## 7. Data Quality Assessment

In [None]:
# Data quality checks: record counts, missing values, duplicates, range outliers, coverage.
print("=== DATA QUALITY ASSESSMENT ===")
print()

# Cycle-level checks. The outlier rule flags cycle lengths outside 21-35 days.
cycle_length_outliers = (cycles_df['cycle_length'] > 35) | (cycles_df['cycle_length'] < 21)
print("Cycle Data:")
print(f"- Total records: {len(cycles_df):,}")
print(f"- Missing values: {cycles_df.isnull().sum().sum()}")
print(f"- Duplicate records: {cycles_df.duplicated().sum()}")
print(f"- Outliers (cycle length > 35 or < 21): {cycle_length_outliers.sum()}")
print()

# User-level checks. The outlier rule flags BMI below 15 or above 40.
bmi_outliers = (users_df['bmi'] < 15) | (users_df['bmi'] > 40)
print("User Data:")
print(f"- Total records: {len(users_df):,}")
print(f"- Missing values: {users_df.isnull().sum().sum()}")
print(f"- Duplicate records: {users_df.duplicated().sum()}")
print(f"- BMI outliers (< 15 or > 40): {bmi_outliers.sum()}")
print()

# Coverage: count cycles per user straight from the cycle table and compare
# against ALL users. The regularity table is already filtered to users with
# >= 3 cycles, so measuring coverage on it would always report 100%.
cycles_per_user = cycles_df.groupby('user_id').size()
users_with_enough_cycles = int((cycles_per_user >= 3).sum())
total_user_count = users_df['user_id'].nunique()
coverage_share = users_with_enough_cycles / total_user_count if total_user_count else 0.0

print("Data Coverage:")
print(f"- Users with >= 3 cycles: {users_with_enough_cycles:,} ({coverage_share:.1%})")
print(f"- Age range: {users_df['age'].min()}-{users_df['age'].max()} years")
print(f"- BMI range: {users_df['bmi'].min():.1f}-{users_df['bmi'].max():.1f}")