In [6]:
import pandas as pd
import numpy as np
import openpyxl

# Read the three cleaned workbooks (training, test, real-use), in that order.
train_df, test_df, realuse_df = (
    pd.read_excel(workbook_path)
    for workbook_path in (
        "cleaned_training_data_v3.xlsx",
        "cleaned_test_data_v3.xlsx",
        "cleaned_realuse_data_v3.xlsx",
    )
)

# Stack the three frames row-wise; ignore_index gives the result a fresh
# 0..n-1 index instead of repeating each workbook's own row numbers.
combined_df = pd.concat((train_df, test_df, realuse_df), ignore_index=True)

# Columns that are correlated against the 'injured' and 'illed' flags below.
metrics = [
    'Menstruation_y1n0',
    'Mood state',
    'Energy levels',
    'Muscle readiness',
    'Sleep quality',
    'Sleep duration',
    'sleep_score',
    'total_training_load',
    'ACWR',
    'RTT',
]

# One summary dict per analysed group is appended here.
results = []

# Function to determine correlation strength
def correlation_strength(corr_value):
    """Classify a correlation coefficient by the size of its absolute value.

    Args:
        corr_value: Correlation coefficient; the sign is ignored. May be
            ``None`` or NaN when the correlation could not be computed.

    Returns:
        'Strong' if |r| > 0.5, 'Moderate' if |r| > 0.3, 'Weak' if
        |r| > 0.02, 'No Significant' otherwise, and 'Undefined' when
        ``corr_value`` is missing or NaN.
    """
    # NaN compares False against every threshold, so without this guard an
    # uncomputable correlation would be reported as 'No Significant'.
    if pd.isna(corr_value):
        return 'Undefined'

    magnitude = abs(corr_value)
    if magnitude > 0.5:
        return 'Strong'
    if magnitude > 0.3:
        return 'Moderate'
    if magnitude > 0.02:
        return 'Weak'
    return 'No Significant'

def _group_summary(label, frame):
    """Build one wide summary row for a group of records.

    Args:
        label: Text stored under the 'Group' key.
        frame: DataFrame holding the 'injured' and 'illed' columns plus every
            column named in the module-level ``metrics`` list.

    Returns:
        dict with the sample size, injury/illness rates in percent and, for
        each metric, its correlation with 'injured' and 'illed' together
        with the matching strength labels.
    """
    summary = {
        'Group': label,
        'Sample Size': len(frame),
        'Injury Rate (%)': frame['injured'].mean() * 100,
        'Illness Rate (%)': frame['illed'].mean() * 100,
    }
    for metric in metrics:
        # Compute each correlation once and reuse it for the strength label
        # (off-diagonal entry of the 2x2 correlation matrix).
        injury_corr = frame[[metric, 'injured']].corr().iloc[0, 1]
        illness_corr = frame[[metric, 'illed']].corr().iloc[0, 1]
        summary[f'{metric} (Injury Corr)'] = injury_corr
        summary[f'{metric} (Illness Corr)'] = illness_corr
        summary[f'{metric} (Injury Corr Strength)'] = correlation_strength(injury_corr)
        summary[f'{metric} (Illness Corr Strength)'] = correlation_strength(illness_corr)
    return summary


# Analyze female data (rows where Gender_m2f1 == 1)
female_df = combined_df[combined_df['Gender_m2f1'] == 1].copy()
female_stats = _group_summary('Female Only', female_df)
results.append(female_stats)

# Analyze all-gender data
all_stats = _group_summary('All Genders', combined_df)
results.append(all_stats)

# Turn the collected group summaries into a table: row 0 is the female-only
# group, row 1 is the all-gender group.
comparison_table = pd.DataFrame(results)

female_row = comparison_table.iloc[0]
overall_row = comparison_table.iloc[1]

# Headline figures of the difference row: female value minus all-gender value.
diff_row = {'Group': 'Difference (Female - All)'}
for headline in ('Sample Size', 'Injury Rate (%)', 'Illness Rate (%)'):
    diff_row[headline] = female_row[headline] - overall_row[headline]

numeric_types = (int, float)
for metric in metrics:
    # Correlation columns: subtract only when both sides are numeric,
    # otherwise record None so the cell carries no numeric difference.
    for corr_key in (f'{metric} (Injury Corr)', f'{metric} (Illness Corr)'):
        female_value = female_row[corr_key]
        overall_value = overall_row[corr_key]
        both_numeric = (
            isinstance(female_value, numeric_types)
            and isinstance(overall_value, numeric_types)
        )
        diff_row[corr_key] = female_value - overall_value if both_numeric else None

    # Strength labels are text, so the difference row simply repeats the
    # female group's label.
    for strength_key in (
        f'{metric} (Injury Corr Strength)',
        f'{metric} (Illness Corr Strength)',
    ):
        diff_row[strength_key] = female_row[strength_key]

# Append the difference row beneath the two group rows.
comparison_table = pd.concat(
    [comparison_table, pd.DataFrame([diff_row])],
    ignore_index=True,
)

def _describe_corr_diff(outcome, metric, diff):
    """Return the explanation sentence for one correlation difference.

    Args:
        outcome: Lower-case outcome word used in the sentence
            ('injury' or 'illness').
        metric: Name of the metric the difference belongs to.
        diff: Female-minus-all correlation difference; may be None or NaN
            when no numeric difference is available.

    Returns:
        A sentence calling the difference notable (|diff| > 0.05), minimal,
        or not computable.
    """
    # diff_row stores None when either correlation was not numeric, and the
    # subtraction yields NaN when a correlation is undefined; abs(None) would
    # raise and NaN would otherwise be reported as "minimal".
    if pd.isna(diff):
        return f"The {outcome} correlation difference for {metric} could not be computed between Female and All Genders."
    if abs(diff) > 0.05:
        return f"The {outcome} correlation difference for {metric} indicates a notable change between Female and All Genders."
    return f"The {outcome} correlation difference for {metric} is minimal between Female and All Genders."


# Add explanations for differences (one row per metric and outcome)
explanation_rows = []

for metric in metrics:
    for outcome, label in (('injury', 'Injury'), ('illness', 'Illness')):
        column = f'{metric} ({label} Corr)'
        explanation_rows.append(
            ['Explanation', column, _describe_corr_diff(outcome, metric, diff_row[column])]
        )

# Create a DataFrame for the explanations
explanation_df = pd.DataFrame(explanation_rows, columns=['Group', 'Metric', 'Explanation'])

# Write to Excel with formatting
with pd.ExcelWriter('female_vs_all_clear_comparison_with_explanations.xlsx', engine='openpyxl') as writer:
    # Main comparison table
    comparison_table.to_excel(writer, sheet_name='Main Comparison', index=False)

    # Add explanations in a separate sheet
    explanation_df.to_excel(writer, sheet_name='Explanations', index=False)

    # Access the sheet that was just written so it can be styled
    workbook = writer.book
    worksheet = workbook['Main Comparison']  # Accessing the sheet by name

    # The style objects are the same for every cell, so build them once
    # instead of once per cell.
    styles = openpyxl.styles
    thin_side = styles.Side(border_style="thin", color="000000")
    cell_border = styles.Border(left=thin_side, right=thin_side, top=thin_side, bottom=thin_side)
    highlight_fill = styles.PatternFill(start_color="FFEB9C", end_color="FFEB9C", fill_type="solid")
    highlight_font = styles.Font(bold=True, color="FF0000")

    # Data rows start at worksheet row 2 (row 1 is the header).
    n_rows, n_cols = comparison_table.shape
    for row in worksheet.iter_rows(min_row=2, max_row=n_rows + 1, min_col=1, max_col=n_cols):
        for cell in row:
            cell.border = cell_border
            # Only numeric cells can be compared against the threshold;
            # text and empty cells are left unhighlighted.
            # NOTE(review): this applies to every numeric cell in every row
            # (sample sizes and rates included), not only to the difference
            # row - confirm that this is the intended highlighting.
            if isinstance(cell.value, (int, float)) and abs(cell.value) > 0.05:
                cell.fill = highlight_fill
                cell.font = highlight_font

# Report success only after the with-block has exited: the writer saves the
# workbook on close, so printing inside the block would announce a file that
# has not been written yet (and would still print if the save then failed).
print("Analysis complete. Results saved to 'female_vs_all_clear_comparison_with_explanations.xlsx'")



  combined_df = pd.concat([train_df, test_df, realuse_df], ignore_index=True)


Analysis complete. Results saved to 'female_vs_all_clear_comparison_with_explanations.xlsx'


In [8]:
import pandas as pd
import numpy as np
import openpyxl
from openpyxl.styles import Alignment

# Read the three cleaned workbooks (training, test, real-use), in that order.
train_df, test_df, realuse_df = (
    pd.read_excel(workbook_path)
    for workbook_path in (
        "cleaned_training_data_v3.xlsx",
        "cleaned_test_data_v3.xlsx",
        "cleaned_realuse_data_v3.xlsx",
    )
)

# Stack the three frames row-wise; ignore_index gives the result a fresh
# 0..n-1 index instead of repeating each workbook's own row numbers.
combined_df = pd.concat((train_df, test_df, realuse_df), ignore_index=True)

# Columns that are correlated against the 'injured' and 'illed' flags below.
metrics = [
    'Menstruation_y1n0',
    'Mood state',
    'Energy levels',
    'Muscle readiness',
    'Sleep quality',
    'Sleep duration',
    'sleep_score',
    'total_training_load',
    'ACWR',
    'RTT',
]

# Function to determine correlation strength
def correlation_strength(corr_value):
    """Classify a correlation coefficient by the size of its absolute value.

    Args:
        corr_value: Correlation coefficient; the sign is ignored. May be
            ``None`` or NaN when the correlation could not be computed.

    Returns:
        'Strong' if |r| > 0.5, 'Moderate' if |r| > 0.3, 'Weak' if
        |r| > 0.02, 'No Significant' otherwise, and 'Undefined' when
        ``corr_value`` is missing or NaN.
    """
    # NaN compares False against every threshold, so without this guard an
    # uncomputable correlation would be reported as 'No Significant'.
    if pd.isna(corr_value):
        return 'Undefined'

    magnitude = abs(corr_value)
    if magnitude > 0.5:
        return 'Strong'
    if magnitude > 0.3:
        return 'Moderate'
    if magnitude > 0.02:
        return 'Weak'
    return 'No Significant'

# Function to generate explanation
def generate_explanation(metric, value, corr_type):
    """Describe in one sentence how large a between-group difference is.

    Args:
        metric: Label of the compared quantity. A label containing "Rate" is
            treated as a rate difference (threshold 5); any other label is
            treated as a correlation difference (thresholds 0.2, 0.1, 0.05).
        value: Signed difference between the groups; only its magnitude is
            used. May be ``None`` or NaN when it could not be computed.
        corr_type: Outcome name such as 'Injury' or 'Illness'; it is
            lower-cased before being placed in the sentence.

    Returns:
        The explanation sentence.
    """
    outcome = corr_type.lower()
    missing = pd.isna(value)

    if "Rate" in metric:
        if missing:
            # NaN fails every comparison and would be reported as "Minor".
            return f"Difference in {outcome} rates between groups could not be computed"
        if abs(value) > 5:
            return f"Significant difference in {outcome} rates between groups"
        return f"Minor difference in {outcome} rates between groups"

    # Reduce the label to the bare metric name. Labels arrive either as
    # '<metric> (...)' or as '<metric> <corr_type> Correlation'; without
    # stripping the latter the sentence would end in
    # "... correlation for <metric> Injury Correlation".
    metric_name = metric.split(' (')[0]
    redundant_suffix = f" {corr_type} Correlation"
    if metric_name.endswith(redundant_suffix):
        metric_name = metric_name[:-len(redundant_suffix)]

    if missing:
        # NaN fails every comparison and would be reported as "Minimal".
        return f"Difference in {outcome} correlation for {metric_name} could not be computed"

    magnitude = abs(value)
    if magnitude > 0.1:
        strength = "Strong" if magnitude > 0.2 else "Moderate"
        return f"{strength} difference in {outcome} correlation for {metric_name}"
    if magnitude > 0.05:
        return f"Noticeable difference in {outcome} correlation for {metric_name}"
    return f"Minimal difference in {outcome} correlation for {metric_name}"

# Rows of the long-format ("vertical") report; the headline statistics go
# in first.
vertical_data = []

# One 'Basic Statistics' row per group: the female subset
# (Gender_m2f1 == 1) and the full data set.
female_mask = combined_df['Gender_m2f1'] == 1
for group_name, df in (('Female Only', combined_df[female_mask]), ('All Genders', combined_df)):
    vertical_data.append({
        'Metric': 'Basic Statistics',
        'Group': group_name,
        'Sample Size': len(df),
        'Injury Rate (%)': df['injured'].mean() * 100,
        'Illness Rate (%)': df['illed'].mean() * 100,
        # Explanations are only filled in on the difference row.
        'Injury Rate Explanation': '',
        'Illness Rate Explanation': '',
    })

# Difference row: female value minus all-gender value for each figure.
female_basic, overall_basic = vertical_data[0], vertical_data[1]
sample_size_gap = female_basic['Sample Size'] - overall_basic['Sample Size']
injury_rate_gap = female_basic['Injury Rate (%)'] - overall_basic['Injury Rate (%)']
illness_rate_gap = female_basic['Illness Rate (%)'] - overall_basic['Illness Rate (%)']

diff_basic = {
    'Metric': 'Basic Statistics',
    'Group': 'Difference (Female - All)',
    'Sample Size': sample_size_gap,
    'Injury Rate (%)': injury_rate_gap,
    'Illness Rate (%)': illness_rate_gap,
    'Injury Rate Explanation': generate_explanation('Injury Rate', injury_rate_gap, 'Injury'),
    'Illness Rate Explanation': generate_explanation('Illness Rate', illness_rate_gap, 'Illness'),
}
vertical_data.append(diff_basic)

# Add correlation data for each metric.
# The female subset does not depend on the metric, so filter it once here
# rather than once per loop iteration.
female_only_df = combined_df[combined_df['Gender_m2f1'] == 1]
comparison_groups = (('Female Only', female_only_df), ('All Genders', combined_df))

for metric in metrics:
    # Rows built for this metric, keyed by group name, so the difference row
    # does not have to rely on positions at the end of vertical_data.
    rows_by_group = {}
    for group_name, df in comparison_groups:
        # Off-diagonal entry of the 2x2 correlation matrix.
        corr_injury = df[[metric, 'injured']].corr().iloc[0, 1]
        corr_illness = df[[metric, 'illed']].corr().iloc[0, 1]

        metric_data = {
            'Metric': metric,
            'Group': group_name,
            'Injury Correlation': corr_injury,
            'Illness Correlation': corr_illness,
            'Injury Correlation Strength': correlation_strength(corr_injury),
            'Illness Correlation Strength': correlation_strength(corr_illness),
            # Explanations are only filled in on the difference row.
            'Injury Correlation Explanation': '',
            'Illness Correlation Explanation': ''
        }
        vertical_data.append(metric_data)
        rows_by_group[group_name] = metric_data

    # Add difference row for this metric (female minus all genders); each
    # difference is computed once and reused for value and explanation.
    female_row = rows_by_group['Female Only']
    overall_row = rows_by_group['All Genders']
    injury_diff = female_row['Injury Correlation'] - overall_row['Injury Correlation']
    illness_diff = female_row['Illness Correlation'] - overall_row['Illness Correlation']

    diff_metric = {
        'Metric': metric,
        'Group': 'Difference (Female - All)',
        'Injury Correlation': injury_diff,
        'Illness Correlation': illness_diff,
        # The strength columns of the difference row repeat the female
        # group's labels.
        'Injury Correlation Strength': female_row['Injury Correlation Strength'],
        'Illness Correlation Strength': female_row['Illness Correlation Strength'],
        'Injury Correlation Explanation': generate_explanation(f'{metric} Injury Correlation', injury_diff, 'Injury'),
        'Illness Correlation Explanation': generate_explanation(f'{metric} Illness Correlation', illness_diff, 'Illness')
    }
    vertical_data.append(diff_metric)

# Column layout of the report: identifiers first, then each figure followed
# by its explanation.
column_order = [
    'Metric',
    'Group',
    'Sample Size',
    'Injury Rate (%)',
    'Injury Rate Explanation',
    'Illness Rate (%)',
    'Illness Rate Explanation',
    'Injury Correlation',
    'Injury Correlation Strength',
    'Injury Correlation Explanation',
    'Illness Correlation',
    'Illness Correlation Strength',
    'Illness Correlation Explanation',
]

# Materialise the collected rows as a table and put the columns in the
# layout above (keys a row does not define become missing values).
vertical_df = pd.DataFrame(vertical_data).loc[:, column_order]

# Write to Excel with enhanced formatting
with pd.ExcelWriter('female_vs_all_vertical_comparison.xlsx', engine='openpyxl') as writer:
    vertical_df.to_excel(writer, sheet_name='Comparison', index=False)

    # Access workbook and worksheet for formatting
    workbook = writer.book
    worksheet = workbook['Comparison']

    # Style objects are identical for every cell they are applied to, so
    # they are built once here instead of once per cell.
    header_fill = openpyxl.styles.PatternFill(start_color="4F81BD", end_color="4F81BD", fill_type="solid")
    header_font = openpyxl.styles.Font(color="FFFFFF", bold=True)
    explanation_fill = openpyxl.styles.PatternFill(start_color="FFF2CC", end_color="FFF2CC", fill_type="solid")
    difference_fill = openpyxl.styles.PatternFill(start_color="F2F2F2", end_color="F2F2F2", fill_type="solid")
    thin_side = openpyxl.styles.Side(border_style="thin", color="000000")
    cell_border = openpyxl.styles.Border(left=thin_side, right=thin_side, top=thin_side, bottom=thin_side)
    cell_alignment = Alignment(horizontal='left', vertical='center', wrap_text=True)

    # Format headers and remember which columns hold explanation text.
    # The explanation sentences themselves never contain the word
    # "Explanation", so the columns have to be identified by their header.
    explanation_columns = set()
    for cell in worksheet[1]:
        cell.fill = header_fill
        cell.font = header_font
        if "Explanation" in str(cell.value):
            explanation_columns.add(cell.column)

    # Format data rows (row 1 is the header)
    for row in worksheet.iter_rows(min_row=2, max_row=worksheet.max_row):
        # Column B holds the group label.
        is_difference_row = row[1].value == 'Difference (Female - All)'
        for cell in row:
            cell.border = cell_border
            cell.alignment = cell_alignment

            # A filled-in explanation cell takes the explanation colour, even
            # on a difference row; other difference-row cells are shaded.
            if cell.column in explanation_columns and cell.value not in (None, ''):
                cell.fill = explanation_fill
            elif is_difference_row:
                cell.fill = difference_fill

    # Fixed column widths (not auto-fitted); letters follow column_order.
    col_widths = {
        'A': 20,  # Metric
        'B': 20,  # Group
        'C': 12,  # Sample Size
        'D': 15,  # Injury Rate (%)
        'E': 40,  # Injury Rate Explanation
        'F': 15,  # Illness Rate (%)
        'G': 40,  # Illness Rate Explanation
        'H': 18,  # Injury Correlation
        'I': 22,  # Injury Correlation Strength
        'J': 50,  # Injury Correlation Explanation
        'K': 18,  # Illness Correlation
        'L': 22,  # Illness Correlation Strength
        'M': 50   # Illness Correlation Explanation
    }

    for col, width in col_widths.items():
        worksheet.column_dimensions[col].width = width

    # Freeze the header row
    worksheet.freeze_panes = 'A2'

# Report success only after the with-block has exited: the writer saves the
# workbook on close, so printing inside the block would announce a file that
# has not been written yet (and would still print if the save then failed).
print("Analysis complete. Results saved to 'female_vs_all_vertical_comparison.xlsx'")

  combined_df = pd.concat([train_df, test_df, realuse_df], ignore_index=True)


Analysis complete. Results saved to 'female_vs_all_vertical_comparison.xlsx'
