In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Load the dataset (raw string keeps the Windows path backslashes literal)
df = pd.read_csv(
    r"C:\Users\roydi\Downloads\Dataset for long term Covid-19 mental health research.sav-addmaple-conversion.csv"
)

# Rows are reported as participants and columns as variables
n_participants, n_variables = df.shape
print(f"Shape: {n_participants} participants × {n_variables} variables")


In [None]:
# CLEAN COLUMN NAMES: collapse every run of whitespace (spaces, tabs,
# newlines) into a single space, then trim both ends. The \s+ pattern
# already matches '\n', so a separate newline replacement would be a no-op.
df.columns = df.columns.str.replace(r'\s+', ' ', regex=True).str.strip()

# PRINT to verify
print(df.columns[:15])


In [None]:
# VARIABLE INVENTORY BY DOMAIN

In [None]:
# Section banner: blank line, rule, title, rule (one item per output line)
print("\n" + "=" * 80, "VARIABLE INVENTORY BY RESEARCH DOMAIN", "=" * 80, sep="\n")

def _starts_token(text, term):
    """Return True if `term` occurs in `text` at a spot not preceded by a letter.

    A plain substring test is too greedy for short terms: 'core' is inside
    'score' and 'age' is inside 'average'. Requiring the start of the string
    or a non-letter before the match rejects those, while still accepting
    forms such as 'core10', 't1_core' or 'age_group'.
    """
    start = text.find(term)
    while start != -1:
        if start == 0 or not text[start - 1].isalpha():
            return True
        start = text.find(term, start + 1)
    return False


def _covid_subcategory(col_lower):
    """Return the 'covid_impact' sub-key for a lower-cased COVID column name."""
    if any(term in col_lower for term in ('university', 'studies', 'academic')):
        return 'academic_stress'
    if any(term in col_lower for term in ('stressed', 'contracting', 'family', 'friends')):
        return 'health_concerns'
    # Media is tested before the social terms: "social media" contains
    # "social", which would otherwise claim every social-media column.
    if 'media' in col_lower:
        return 'media_consumption'
    if any(term in col_lower for term in ('social', 'distancing', 'lonely', 'together')):
        return 'social_impact'
    return 'coping_behaviors'


def _domain_path(col_lower):
    """Return the key path (tuple of dict keys) of the bucket for a column.

    Rules are tried in order and the first match wins.
    """
    if 'participant' in col_lower and 'id' in col_lower:
        return ('participant_ids',)

    # 'age' must start a token so that e.g. "average" is not a demographic
    if _starts_token(col_lower, 'age') or any(
        term in col_lower for term in ('gender', 'student', 'country', 'prolific')
    ):
        return ('demographics',)

    if 'gad' in col_lower:
        return ('mental_health_scales', 'gad_anxiety')
    # 'core' must start a token so that "... score" columns are not captured
    if _starts_token(col_lower, 'core'):
        return ('mental_health_scales', 'core_distress')
    if 'perma' in col_lower:
        return ('mental_health_scales', 'perma_wellbeing')
    if 'lone' in col_lower:
        return ('mental_health_scales', 'loneliness')

    if 'covid' in col_lower:
        return ('covid_impact', _covid_subcategory(col_lower))

    if 'total' in col_lower:
        return ('composite_scores',)
    # Severity columns get their own bucket instead of being folded into
    # the composite scores, so 'severity_categories' is actually populated.
    if 'severity' in col_lower:
        return ('severity_categories',)

    # Time-point tags only apply to columns no earlier rule claimed
    time_tags = (
        ('t1', 't1_baseline'),
        ('t2', 't2_followup'),
        ('t3', 't3_followup'),
        ('t4', 't4_latest'),
    )
    for tag, key in time_tags:
        if tag in col_lower:
            return ('time_points', key)

    if any(term in col_lower for term in ('item', 'filter', '(2)', '(3)')):
        return ('auxiliary_variables',)

    return ('unclassified',)


def categorize_variables(columns):
    """Systematically categorize all variables by research domain.

    Matching is case-insensitive and keyword based. Each column lands in
    exactly one bucket; rules are tried in a fixed order (participant IDs,
    demographics, mental-health scales, COVID impact, composite/severity
    scores, time points, auxiliary) and anything left over is 'unclassified'.

    Args:
        columns: Iterable of column labels. Each label is converted with
            str() for matching, so non-string labels are accepted; the
            original label object is what gets stored.

    Returns:
        A dict whose values are either lists of column labels or, for
        'mental_health_scales', 'covid_impact' and 'time_points', a dict of
        sub-category name to list of column labels. Labels keep their input
        order within each list.
    """
    categories = {
        'participant_ids': [],
        'demographics': [],
        'mental_health_scales': {
            'gad_anxiety': [],
            'core_distress': [],
            'perma_wellbeing': [],
            'loneliness': [],
        },
        'covid_impact': {
            'academic_stress': [],
            'health_concerns': [],
            'social_impact': [],
            'coping_behaviors': [],
            'media_consumption': [],
        },
        'time_points': {
            't1_baseline': [],
            't2_followup': [],
            't3_followup': [],
            't4_latest': [],
        },
        'composite_scores': [],
        'severity_categories': [],
        'auxiliary_variables': [],
        'unclassified': [],
    }

    for col in columns:
        # Walk the key path down to the target list, then record the label
        bucket = categories
        for key in _domain_path(str(col).lower()):
            bucket = bucket[key]
        bucket.append(col)

    return categories

# Categorize every column label of the loaded DataFrame
variable_inventory = categorize_variables(df.columns)


def _print_names(names, limit):
    """Print at most `limit` names as bullet lines, then how many were left out."""
    for name in names[:limit]:
        print(f"    - {name}")
    hidden = len(names) - limit
    if hidden > 0:
        print(f"    ... and {hidden} more")


def _print_groups(groups, preview=0):
    """Print a count line per non-empty group, plus up to `preview` sample names."""
    for label, names in groups.items():
        if not names:
            continue
        print(f"  {label.replace('_', ' ').title()}: {len(names)} variables")
        if preview:
            _print_names(names, preview)


# Display inventory
_id_columns = variable_inventory['participant_ids']
print("PARTICIPANT IDENTIFICATION:")
print(f"  Participant IDs: {len(_id_columns)} variables")
_print_names(_id_columns, len(_id_columns))  # IDs are listed in full

_demographic_columns = variable_inventory['demographics']
print(f"\nDEMOGRAPHICS:")
print(f"  Total: {len(_demographic_columns)} variables")
_print_names(_demographic_columns, 5)

print(f"\nMENTAL HEALTH SCALES:")
_print_groups(variable_inventory['mental_health_scales'], preview=3)

# COVID and time-point groups show counts only, no sample names
print(f"\nCOVID-19 IMPACT ASSESSMENT:")
_print_groups(variable_inventory['covid_impact'])

print(f"\nTIME POINTS:")
_print_groups(variable_inventory['time_points'])

print(f"\nCOMPOSITE SCORES: {len(variable_inventory['composite_scores'])} variables")
print(f"AUXILIARY VARIABLES: {len(variable_inventory['auxiliary_variables'])} variables")
print(f"UNCLASSIFIED: {len(variable_inventory['unclassified'])} variables")
