## Formatting: Datasets 5–7

Includes:
- Anxiety Disorders Treatment Gap
- Depressive Symptoms Across US Population
- Number of Countries with Primary Data on Prevalence of Mental Illness

In [26]:
import os

# List the entries of the raw-data directory so the exact file names
# (including any spacing after the numeric prefix) can be checked before loading.
os.listdir('Dataset/MentalHealth')


['5- anxiety-disorders-treatment-gap.csv',
 '7- number-of-countries-with-primary-data-on-prevalence-of-mental-illnesses-in-the-global-burden-of-disease-study.csv',
 '6- depressive-symptoms-across-us-population.csv',
 '2- burden-disease-from-each-mental-illness(1).csv',
 '1- mental-illnesses-prevalence.csv',
 '4- adult-population-covered-in-primary-data-on-the-prevalence-of-mental-illnesses.csv',
 '.ipynb_checkpoints',
 '3- adult-population-covered-in-primary-data-on-the-prevalence-of-major-depression.csv']

In [7]:
from pathlib import Path

import pandas as pd

# Directory holding the raw CSV files used in this notebook.
DATA_DIR = Path('Dataset/MentalHealth')

# Expected file names for datasets 5, 6, and 7 (in that order).
DATASET_FILENAMES = [
    '5-anxiety-disorders-treatment-gap.csv',
    '6-depressive-symptoms-across-us-population.csv',
    '7-number-of-countries-with-primary-data-on-prevalence-of-mental-illnesses-in-the-global-burden-of-disease-study.csv',
]

# Column renames applied AFTER the generic clean-up (keys are the normalised,
# lower-case/underscored names), one mapping per dataset in the same order.
COLUMN_RENAMES = [
    {
        'potentially_adequate_treatment,_conditional': 'Adequate_Treatment',
        'other_treatments,_conditional': 'Other_Treatments',
        'untreated,_conditional': 'Untreated',
    },
    {
        'nearly_every_day': 'Severe_Symptoms',
        'more_than_half_the_days': 'Moderate_Symptoms',
        'several_days': 'Mild_Symptoms',
        'not_at_all': 'No_Symptoms',
    },
    {
        'number_of_countries_with_primary_data_on_prevalence_of_mental_disorders': 'Countries_With_Data',
    },
]

# Destination files for the cleaned datasets (written to the working directory).
output_paths = ['cleaned_df5.csv', 'cleaned_df6.csv', 'cleaned_df7.csv']


def resolve_dataset_path(filename, data_dir=DATA_DIR):
    """Return the on-disk path of a raw dataset file.

    The directory listing shown earlier in this notebook has a space after the
    numeric prefix ('5- anxiety...'), whereas the names used here do not
    ('5-anxiety...'). The exact name is tried first; the spaced variant is the
    fallback, so the cell works with either naming.

    Parameters
    ----------
    filename : str
        File name of the form '<number>-<rest>.csv'.
    data_dir : pathlib.Path
        Directory to look in.

    Returns
    -------
    pathlib.Path
        The first candidate path that exists.

    Raises
    ------
    FileNotFoundError
        If neither naming variant exists in ``data_dir``.
    """
    prefix, _, rest = filename.partition('-')
    candidates = [data_dir / filename, data_dir / f"{prefix}- {rest}"]
    for candidate in candidates:
        if candidate.exists():
            return candidate
    tried = ', '.join(repr(str(c)) for c in candidates)
    raise FileNotFoundError(f"Dataset file not found; tried: {tried}")


def clean_frame(raw):
    """Return a cleaned copy of ``raw`` without modifying the input.

    Steps: normalise column names (lower-case, trimmed, spaces -> underscores),
    trim whitespace in text columns, coerce ``year`` to a numeric dtype, and
    drop rows in which every value is missing.
    """
    cleaned = raw.copy()
    cleaned.columns = cleaned.columns.str.lower().str.strip().str.replace(' ', '_')
    for col in cleaned.select_dtypes(include='object'):
        # Strip only genuine strings: `.str.strip()` would silently turn any
        # non-string cell in a mixed-type column into NaN.
        cleaned[col] = cleaned[col].map(
            lambda value: value.strip() if isinstance(value, str) else value
        )
    if 'year' in cleaned.columns:
        cleaned['year'] = pd.to_numeric(cleaned['year'], errors='coerce', downcast='integer')
    # Drop rows where *all* values are NaN
    return cleaned.dropna(how='all')


# Resolve, load, clean, and rename each dataset in one pass.
file_paths = [str(resolve_dataset_path(name)) for name in DATASET_FILENAMES]
dfs = [
    clean_frame(pd.read_csv(path)).rename(columns=renames)
    for path, renames in zip(file_paths, COLUMN_RENAMES)
]

# Unpack into named DataFrames (same objects as the entries of `dfs`)
df5, df6, df7 = dfs

# Save cleaned datasets
for df, path in zip(dfs, output_paths):
    df.to_csv(path, index=False)


In [62]:
# Print a compact report for each cleaned dataset; numbering starts at 5 so
# the labels line up with datasets 5-7.
for i, df in enumerate(dfs, start=5):
    print(f"{'='*60}")
    print(f"📊 DataFrame {i} Summary")
    print(f"{'='*60}")

    # Show basic info
    print(f"Shape: {df.shape}")
    print(f"\nColumns:\n{list(df.columns)}")

    # Count nulls once and keep only the columns that actually have some.
    null_counts = df.isnull().sum()
    missing = null_counts[null_counts > 0]
    # An empty Series renders as "Series([], )", which reads like an error,
    # so report "None" explicitly when nothing is missing.
    missing_report = "None" if missing.empty else missing.to_string()
    print(f"\nMissing Values:\n{missing_report}")

    # Show preview
    print("\nPreview (first 5 rows):")
    display(df.head())


📊 DataFrame 5 Summary
Shape: (26, 6)

Columns:
['entity', 'code', 'year', 'Adequate_Treatment', 'Other_Treatments', 'Untreated']

Missing Values:
code    7

Preview (first 5 rows):


Unnamed: 0,entity,code,year,Adequate_Treatment,Other_Treatments,Untreated
0,Argentina,ARG,2015,12.0,18.0,70.0
1,"Beijing/Shanghai, China",,2005,8.8,8.5,82.7
2,Belgium,BEL,2002,11.2,24.5,64.3
3,Bulgaria,BGR,2006,7.3,14.3,78.4
4,Colombia,COL,2012,3.2,10.0,86.8


📊 DataFrame 6 Summary
Shape: (10, 7)

Columns:
['entity', 'code', 'year', 'Severe_Symptoms', 'Moderate_Symptoms', 'Mild_Symptoms', 'No_Symptoms']

Missing Values:
code    10

Preview (first 5 rows):


Unnamed: 0,entity,code,year,Severe_Symptoms,Moderate_Symptoms,Mild_Symptoms,No_Symptoms
0,Appetite change,,2014,4.6,5.1,15.5,74.8
1,Average across symptoms,,2014,4.4,4.3,15.0,76.3
2,Depressed mood,,2014,3.6,3.9,16.8,75.7
3,Difficulty concentrating,,2014,3.5,3.6,10.9,82.1
4,Loss of interest,,2014,4.4,5.4,16.3,73.8


📊 DataFrame 7 Summary
Shape: (15, 4)

Columns:
['entity', 'code', 'year', 'Countries_With_Data']

Missing Values:
code    15

Preview (first 5 rows):


Unnamed: 0,entity,code,year,Countries_With_Data
0,Alcohol use disorders,,2019,58
1,Amphetamine use disorders,,2019,58
2,Anorexia nervosa,,2019,27
3,Anxiety disorders,,2019,58
4,Attention-deficit hyperactivity disorder,,2019,172


In [8]:
# Each header is paired with the frame it introduces; the first header has no
# leading newline because nothing is printed before it.
cleaned_previews = (
    ("✅ Cleaned df5:", df5),
    ("\n✅ Cleaned df6:", df6),
    ("\n✅ Cleaned df7:", df7),
)

# For every cleaned frame: header, rich preview of the first rows, then shape.
for header, frame in cleaned_previews:
    print(header)
    display(frame.head())
    print(f"Shape: {frame.shape}")




✅ Cleaned df5:


Unnamed: 0,entity,code,year,Adequate_Treatment,Other_Treatments,Untreated
0,Argentina,ARG,2015,12.0,18.0,70.0
1,"Beijing/Shanghai, China",,2005,8.8,8.5,82.7
2,Belgium,BEL,2002,11.2,24.5,64.3
3,Bulgaria,BGR,2006,7.3,14.3,78.4
4,Colombia,COL,2012,3.2,10.0,86.8


Shape: (26, 6)

✅ Cleaned df6:


Unnamed: 0,entity,code,year,Severe_Symptoms,Moderate_Symptoms,Mild_Symptoms,No_Symptoms
0,Appetite change,,2014,4.6,5.1,15.5,74.8
1,Average across symptoms,,2014,4.4,4.3,15.0,76.3
2,Depressed mood,,2014,3.6,3.9,16.8,75.7
3,Difficulty concentrating,,2014,3.5,3.6,10.9,82.1
4,Loss of interest,,2014,4.4,5.4,16.3,73.8


Shape: (10, 7)

✅ Cleaned df7:


Unnamed: 0,entity,code,year,Countries_With_Data
0,Alcohol use disorders,,2019,58
1,Amphetamine use disorders,,2019,58
2,Anorexia nervosa,,2019,27
3,Anxiety disorders,,2019,58
4,Attention-deficit hyperactivity disorder,,2019,172


Shape: (15, 4)
