In [None]:
import os
import re
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
warnings.filterwarnings('ignore')

# Load the dataset
# The export lives at a machine-specific location, so the path can be overridden
# through the COVID_MH_DATA_PATH environment variable; the original location is
# kept as the fallback so existing runs behave exactly as before.
DATA_PATH = os.environ.get(
    "COVID_MH_DATA_PATH",
    "C:\\Users\\roydi\\Downloads\\Dataset for long term Covid-19 mental health research.sav-addmaple-conversion.csv",
)
df = pd.read_csv(DATA_PATH)
print(f"Shape: {df.shape[0]} participants × {df.shape[1]} variables")


In [None]:
# CLEAN COLUMN NAMES
# Collapse each run of whitespace into one space, replace any newline with a
# space, then trim leading/trailing whitespace from every label.
df.columns = (
    df.columns
    .str.replace(r'\s+', ' ', regex=True)
    .str.replace('\n', ' ')
    .str.strip()
)

# PRINT to verify: show the first 15 cleaned labels
print(df.columns[:15])


In [None]:
#VARIABLE INVENTORY BY DOMAIN

In [None]:
# Section banner: blank line, 80-character rule, title, 80-character rule.
print("\n" + "="*80, "VARIABLE INVENTORY BY RESEARCH DOMAIN", "="*80, sep="\n")

def categorize_variables(columns):
    """Sort column labels into research-domain buckets by keyword matching.

    Every label is lower-cased and tested against an ordered list of substring
    rules. The first rule that matches decides the bucket, so each label lands
    in exactly one list. Rule precedence is:

    participant id -> demographics -> mental-health scale -> COVID impact ->
    composite total -> severity -> time point -> auxiliary -> unclassified.

    Parameters
    ----------
    columns : iterable
        Column labels. Non-string labels are matched on their ``str()`` form
        and stored unchanged.

    Returns
    -------
    dict
        Keys ``participant_ids``, ``demographics``, ``composite_scores``,
        ``severity_categories``, ``auxiliary_variables`` and ``unclassified``
        map to lists of labels; ``mental_health_scales``, ``covid_impact`` and
        ``time_points`` map to dicts of sub-bucket name -> list of labels.

    Notes
    -----
    Matching is plain substring containment, so a short keyword can hit an
    unrelated label (for example ``'age'`` is contained in ``'percentage'``).
    Review the ``demographics`` bucket when the column set changes.
    """
    categories = {
        'participant_ids': [],
        'demographics': [],
        'mental_health_scales': {
            'gad_anxiety': [],
            'core_distress': [],
            'perma_wellbeing': [],
            'loneliness': []
        },
        'covid_impact': {
            'academic_stress': [],
            'health_concerns': [],
            'social_impact': [],
            'coping_behaviors': [],
            'media_consumption': []
        },
        'time_points': {
            't1_baseline': [],
            't2_followup': [],
            't3_followup': [],
            't4_latest': []
        },
        'composite_scores': [],
        'severity_categories': [],
        'auxiliary_variables': [],
        'unclassified': []
    }

    # Ordered (sub-bucket, keywords) rules; the first rule with a hit wins.
    demographic_terms = ('age', 'gender', 'student', 'country', 'prolific')
    scale_rules = (
        ('gad_anxiety', ('gad',)),
        ('core_distress', ('core',)),
        ('perma_wellbeing', ('perma',)),
        ('loneliness', ('lone',)),
    )
    covid_rules = (
        ('academic_stress', ('university', 'studies', 'academic')),
        ('health_concerns', ('stressed', 'contracting', 'family', 'friends')),
        # 'media' has to be tested before 'social': a "social media" label
        # contains both words and would otherwise never reach this bucket.
        ('media_consumption', ('media',)),
        ('social_impact', ('social', 'distancing', 'lonely', 'together')),
    )
    time_rules = (
        ('t1_baseline', ('t1',)),
        ('t2_followup', ('t2',)),
        ('t3_followup', ('t3',)),
        ('t4_latest', ('t4',)),
    )
    auxiliary_terms = ('item', 'filter', '(2)', '(3)')

    def contains_any(name, terms):
        """Return True when any keyword in `terms` is a substring of `name`."""
        return any(term in name for term in terms)

    def first_bucket(name, rules, default=None):
        """Return the sub-bucket of the first rule matching `name`, else `default`."""
        for bucket, terms in rules:
            if contains_any(name, terms):
                return bucket
        return default

    for col in columns:
        # str() keeps non-string labels (e.g. integer headers) from raising.
        name = str(col).lower()
        scale = first_bucket(name, scale_rules)

        if 'participant' in name and 'id' in name:
            target = categories['participant_ids']
        elif contains_any(name, demographic_terms):
            target = categories['demographics']
        elif scale is not None:
            target = categories['mental_health_scales'][scale]
        elif 'covid' in name:
            # COVID items that match no specific rule default to coping behaviours.
            covid_bucket = first_bucket(name, covid_rules, default='coping_behaviors')
            target = categories['covid_impact'][covid_bucket]
        elif 'total' in name:
            target = categories['composite_scores']
        elif 'severity' in name:
            # Severity labels get their own bucket instead of being mixed into
            # the composite totals.
            target = categories['severity_categories']
        else:
            time_point = first_bucket(name, time_rules)
            if time_point is not None:
                target = categories['time_points'][time_point]
            elif contains_any(name, auxiliary_terms):
                target = categories['auxiliary_variables']
            else:
                target = categories['unclassified']

        target.append(col)

    return categories

# Perform categorization
variable_inventory = categorize_variables(df.columns)


def _print_bullets(names, limit=None):
    """Print names as indented bullets; with a limit, summarise what was left out."""
    visible = names if limit is None else names[:limit]
    for name in visible:
        print(f"    - {name}")
    if limit is not None and len(names) > limit:
        print(f"    ... and {len(names) - limit} more")


def _print_group_counts(groups, preview=None):
    """Print one count line per non-empty group, optionally followed by a preview."""
    for group_name, members in groups.items():
        if not members:
            continue
        print(f"  {group_name.replace('_', ' ').title()}: {len(members)} variables")
        if preview is not None:
            _print_bullets(members, limit=preview)


# Display inventory
print("PARTICIPANT IDENTIFICATION:")
print(f"  Participant IDs: {len(variable_inventory['participant_ids'])} variables")
_print_bullets(variable_inventory['participant_ids'])

print("\nDEMOGRAPHICS:")
print(f"  Total: {len(variable_inventory['demographics'])} variables")
_print_bullets(variable_inventory['demographics'], limit=5)

# Scales list their first three variables; the COVID and time-point groups
# only report counts.
print("\nMENTAL HEALTH SCALES:")
_print_group_counts(variable_inventory['mental_health_scales'], preview=3)

print("\nCOVID-19 IMPACT ASSESSMENT:")
_print_group_counts(variable_inventory['covid_impact'])

print("\nTIME POINTS:")
_print_group_counts(variable_inventory['time_points'])

print(f"\nCOMPOSITE SCORES: {len(variable_inventory['composite_scores'])} variables")
print(f"AUXILIARY VARIABLES: {len(variable_inventory['auxiliary_variables'])} variables")
print(f"UNCLASSIFIED: {len(variable_inventory['unclassified'])} variables")


In [None]:
#DATA AVAILABILITY MATRIX

In [None]:
# Section banner: blank line, 80-character rule, title, 80-character rule.
print("\n" + "="*80, "DATA AVAILABILITY MATRIX", "="*80, sep="\n")

def create_availability_matrix(df, categories):
    """Summarise data completeness for the whole frame and per variable group.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to inspect; missing cells are those flagged by ``isnull()``.
    categories : dict
        Maps a domain name to either a list of column labels or a dict of
        sub-domain name -> list of column labels. Empty lists are skipped.
        Values of any other type are ignored.

    Returns
    -------
    dict
        ``'overall'`` -> ``total_variables``, ``total_data_points``,
        ``missing_data_points`` and ``data_completeness_pct``.
        Each non-empty list domain is keyed by its domain name and each
        non-empty sub-domain by ``"<domain>_<subdomain>"``; both map to
        ``variables``, ``completeness_pct`` and ``avg_missing_per_var``.
        Percentages are 0.0 when there are no cells to count.

    Raises
    ------
    KeyError
        If a listed column label is not present in ``df``.
    """

    def cell_counts(frame):
        """Return (total cells, missing cells) for `frame` as plain ints."""
        total = frame.shape[0] * frame.shape[1]
        missing = int(frame.isnull().sum().sum())
        return total, missing

    def pct_complete(total, missing):
        """Share of non-missing cells in percent; 0.0 for an empty selection."""
        return (total - missing) / total * 100 if total > 0 else 0.0

    def group_stats(columns):
        """Completeness statistics for one non-empty list of columns."""
        total, missing = cell_counts(df[list(columns)])
        return {
            'variables': len(columns),
            'completeness_pct': pct_complete(total, missing),
            'avg_missing_per_var': missing / len(columns),
        }

    # Overall statistics; guarded like the per-domain figures so that an empty
    # frame reports 0.0 instead of NaN or a ZeroDivisionError.
    total_cells, missing_cells = cell_counts(df)
    availability_stats = {
        'overall': {
            'total_variables': len(df.columns),
            'total_data_points': total_cells,
            'missing_data_points': missing_cells,
            'data_completeness_pct': pct_complete(total_cells, missing_cells),
        }
    }

    # By domain: flat lists are keyed by domain, nested dicts by domain_subdomain.
    for domain, variables in categories.items():
        if isinstance(variables, list):
            if variables:
                availability_stats[domain] = group_stats(variables)
        elif isinstance(variables, dict):
            for subdomain, subvars in variables.items():
                if subvars:
                    availability_stats[f"{domain}_{subdomain}"] = group_stats(subvars)

    return availability_stats

availability_matrix = create_availability_matrix(df, variable_inventory)

# Display availability matrix
print("OVERALL DATA AVAILABILITY:")
overall = availability_matrix['overall']
print(f"  Total variables: {overall['total_variables']:,}")
print(f"  Total data points: {overall['total_data_points']:,}")
print(f"  Missing data points: {overall['missing_data_points']:,}")
print(f"  Overall completeness: {overall['data_completeness_pct']:.1f}%")

print("\nDOMAIN-SPECIFIC AVAILABILITY:")
# Report order. Keys use the "<domain>_<subdomain>" naming built by
# create_availability_matrix; groups with no variables are absent from the
# matrix and are skipped below. Media consumption is listed alongside the
# other COVID sub-domains so that it is reported whenever it has variables.
domain_order = [
    'participant_ids', 'demographics',
    'mental_health_scales_gad_anxiety', 'mental_health_scales_core_distress',
    'mental_health_scales_perma_wellbeing', 'mental_health_scales_loneliness',
    'covid_impact_academic_stress', 'covid_impact_health_concerns',
    'covid_impact_social_impact', 'covid_impact_coping_behaviors',
    'covid_impact_media_consumption',
    'time_points_t1_baseline', 'time_points_t2_followup',
    'time_points_t3_followup', 'time_points_t4_latest'
]

# Group prefixes stripped from the printed label (after '_' becomes ' ').
label_prefixes = ('mental health scales ', 'covid impact ', 'time points ')

for domain in domain_order:
    if domain in availability_matrix:
        stats = availability_matrix[domain]
        domain_name = domain.replace('_', ' ')
        for prefix in label_prefixes:
            domain_name = domain_name.replace(prefix, '')
        domain_name = domain_name.title()
        print(f"  {domain_name}: {stats['completeness_pct']:.1f}% complete ({stats['variables']} variables)")

In [None]:
#CRITICAL VARIABLE ASSESSMENT

In [None]:
# Section banner: blank line, 80-character rule, title, 80-character rule.
print("\n" + "="*80, "CRITICAL VARIABLE ASSESSMENT", "="*80, sep="\n")

def assess_critical_variables(df, categories):
    """Check that research-critical variables exist and how complete they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to inspect.
    categories : dict
        Not read by this function; kept so existing callers keep working.

    Returns
    -------
    dict
        Four sections, each a dict keyed by the variable or term looked up:

        ``primary_outcomes``
            Exact column names. Present entries hold ``present``,
            ``sample_size``, ``missing_pct`` and ``data_range`` (a
            ``"min - max"`` string, ``"No data"`` when every value is
            missing, ``"Non-numeric"`` when the values cannot be formatted
            as numbers).
        ``key_predictors``
            Present entries hold ``present``, ``variable_name``,
            ``sample_size`` and ``missing_pct``.
        ``essential_demographics``
            Present entries hold ``present``, ``variable_name``,
            ``sample_size`` and ``unique_values``.
        ``time_structure``
            One entry per time point (T1-T4) that has at least one column
            containing that tag, with ``variables_available``,
            ``participants_with_data`` and ``avg_completeness``.

        Entries for anything not found are ``{'present': False}``.

    Notes
    -----
    Predictor and demographic look-ups are case-insensitive and prefer, in
    order, an exactly named column, a column containing the term as a whole
    word, and finally the first column containing it as a substring. Without
    that ordering a term such as ``'age'`` can resolve to an unrelated column
    like ``'Percentage ...'``. Time-point detection is plain substring
    matching on the lower-cased tag.
    """
    n_rows = len(df)
    lowered_columns = [(col, str(col).lower()) for col in df.columns]

    def pct(part, whole):
        """`part` as a percentage of `whole`; 0.0 when `whole` is zero."""
        return part / whole * 100 if whole else 0.0

    def find_column(term):
        """Return the best-matching column for `term`, or None if nothing matches."""
        needle = term.lower()
        for col, low in lowered_columns:
            if low == needle:
                return col
        # Whole-word hit: the term is not embedded in a longer alphanumeric token.
        whole_word = re.compile(r'(?<![a-z0-9])' + re.escape(needle) + r'(?![a-z0-9])')
        for col, low in lowered_columns:
            if whole_word.search(low):
                return col
        for col, low in lowered_columns:
            if needle in low:
                return col
        return None

    def describe_range(series):
        """Format min/max to one decimal, tolerating empty or non-numeric data."""
        if not series.notna().any():
            return "No data"
        try:
            return f"{series.min():.1f} - {series.max():.1f}"
        except (TypeError, ValueError):
            # e.g. a text column: min()/max() work but cannot be formatted as floats.
            return "Non-numeric"

    critical_assessment = {
        'primary_outcomes': {},
        'key_predictors': {},
        'essential_demographics': {},
        'time_structure': {}
    }

    # Primary mental health outcomes (exact column names).
    for outcome in ('GADTotalT4', 'CORETotalT4', 'GADTotalT1', 'CORETotalT1'):
        if outcome in df.columns:
            series = df[outcome]
            data_quality = {
                'present': True,
                'sample_size': int(series.notna().sum()),
                'missing_pct': pct(int(series.isnull().sum()), n_rows),
                'data_range': describe_range(series)
            }
        else:
            data_quality = {'present': False}
        critical_assessment['primary_outcomes'][outcome] = data_quality

    # Key predictor variables.
    for predictor in ('COVID1T4', 'COVID2T4', 'LoneT4', 'What is your age? (4)', 'GenderT4'):
        col = find_column(predictor)
        if col is not None:
            series = df[col]
            data_quality = {
                'present': True,
                'variable_name': col,
                'sample_size': int(series.notna().sum()),
                'missing_pct': pct(int(series.isnull().sum()), n_rows)
            }
        else:
            data_quality = {'present': False}
        critical_assessment['key_predictors'][predictor] = data_quality

    # Essential demographics.
    for demo in ('age', 'gender', 'student', 'country'):
        col = find_column(demo)
        if col is not None:
            series = df[col]
            data_quality = {
                'present': True,
                'variable_name': col,
                'sample_size': int(series.notna().sum()),
                'unique_values': int(series.nunique())
            }
        else:
            data_quality = {'present': False}
        critical_assessment['essential_demographics'][demo] = data_quality

    # Time structure: every column whose label contains the time-point tag.
    for tp in ('T1', 'T2', 'T3', 'T4'):
        tag = tp.lower()
        tp_vars = [col for col, low in lowered_columns if tag in low]
        if not tp_vars:
            continue
        observed = df[tp_vars].notna()
        critical_assessment['time_structure'][tp] = {
            'variables_available': len(tp_vars),
            # Participants with at least one non-missing value at this time point.
            'participants_with_data': int(observed.any(axis=1).sum()),
            'avg_completeness': pct(int(observed.sum().sum()), observed.size)
        }

    return critical_assessment

critical_vars = assess_critical_variables(df, variable_inventory)


def _report_section(title, entries, describe):
    """Print a section title, then each entry's details or a NOT FOUND line."""
    print(title)
    for label, info in entries.items():
        if info['present']:
            describe(label, info)
        else:
            print(f"  {label}: NOT FOUND")


def _describe_outcome(label, info):
    """Two-line summary for a primary outcome: availability, then value range."""
    print(f"  {label}: Available (n={info['sample_size']}, {info['missing_pct']:.1f}% missing)")
    print(f"    Range: {info['data_range']}")


def _describe_predictor(label, info):
    """Two-line summary for a predictor: matched column, then sample size and missingness."""
    print(f"  {label}: Available as '{info['variable_name']}'")
    print(f"    n={info['sample_size']}, {info['missing_pct']:.1f}% missing")


def _describe_demographic(label, info):
    """Two-line summary for a demographic: matched column, then sample size and cardinality."""
    print(f"  {label}: Available as '{info['variable_name']}'")
    print(f"    n={info['sample_size']}, {info['unique_values']} unique values")


# Display critical variable assessment
_report_section("PRIMARY MENTAL HEALTH OUTCOMES:",
                critical_vars['primary_outcomes'], _describe_outcome)
_report_section("\nKEY PREDICTOR VARIABLES:",
                critical_vars['key_predictors'], _describe_predictor)
_report_section("\nESSENTIAL DEMOGRAPHICS:",
                critical_vars['essential_demographics'], _describe_demographic)

# Time points carry no 'present' flag: only the ones that were found are listed.
print("\nLONGITUDINAL STRUCTURE:")
for wave, summary in critical_vars['time_structure'].items():
    print(f"  {wave}: {summary['variables_available']} variables")
    print(f"    {summary['participants_with_data']} participants with data ({summary['avg_completeness']:.1f}% complete)")