# Investigating how Mental Health Issues affect adults

In [2]:
import os 
import pandas as pd

In [3]:
folder_path = 'dataset'

files = os.listdir(folder_path)

In [4]:
# Filter the list to include only CSV files
csv_files = [file for file in files if file.endswith('.csv')]

# Load each CSV file into a DataFrame and store them in a list
dataframes = []
for file in csv_files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_csv(file_path)
    dataframes.append(df)

# Example: Concatenate all DataFrames into a single DataFrame
combined_df = pd.concat(dataframes, ignore_index=True)



In [5]:
combined_df.describe()

Unnamed: 0,Year,Major depression,Bipolar disorder,Eating disorders,Dysthymia,Anxiety disorders,DALYs (rate) - Sex: Both - Age: Age-standardized - Cause: Depressive disorders,DALYs (rate) - Sex: Both - Age: Age-standardized - Cause: Schizophrenia,DALYs (rate) - Sex: Both - Age: Age-standardized - Cause: Bipolar disorder,DALYs (rate) - Sex: Both - Age: Age-standardized - Cause: Eating disorders,...,Not at all,Number of countries with primary data on prevalence of mental disorders,"Potentially adequate treatment, conditional","Other treatments, conditional","Untreated, conditional",Schizophrenia disorders (share of population) - Sex: Both - Age: Age-standardized,Depressive disorders (share of population) - Sex: Both - Age: Age-standardized,Anxiety disorders (share of population) - Sex: Both - Age: Age-standardized,Bipolar disorders (share of population) - Sex: Both - Age: Age-standardized,Eating disorders (share of population) - Sex: Both - Age: Age-standardized
count,13355.0,44.0,22.0,22.0,22.0,22.0,6840.0,6840.0,6840.0,6840.0,...,10.0,15.0,26.0,26.0,26.0,6420.0,6420.0,6420.0,6420.0,6420.0
mean,2004.539948,28.995455,15.931818,14.577273,17.372727,34.181818,652.215475,171.090876,137.930619,42.392972,...,76.31,56.733333,8.653846,16.365385,74.980769,0.266604,3.767036,4.10184,0.636968,0.195664
std,8.648371,32.841373,28.352431,25.455918,29.174127,35.82587,183.643326,26.234514,51.197175,29.39438,...,13.162949,46.17276,5.073557,6.570415,10.238087,0.039383,0.925286,1.050543,0.233391,0.13838
min,1990.0,0.0,0.0,0.0,0.0,0.0,243.09784,119.91338,39.438133,9.671199,...,49.1,2.0,0.0,6.9,54.9,0.188416,1.522333,1.879996,0.181667,0.04478
25%,1997.0,1.3,0.0,0.0,0.0,0.925,506.857413,155.950035,112.140244,20.837689,...,74.05,31.0,3.425,11.1,68.8,0.242267,3.080036,3.425846,0.520872,0.096416
50%,2005.0,15.25,2.75,0.0,0.85,23.9,640.09915,175.1151,124.228445,31.430651,...,76.0,41.0,9.8,15.25,76.4,0.273477,3.636772,3.939547,0.579331,0.14415
75%,2012.0,47.0,14.1,18.2,23.05,63.925,765.84291,183.999005,184.43812,55.850353,...,82.475,63.0,11.975,20.575,82.55,0.286575,4.366252,4.564164,0.844406,0.251167
max,2019.0,100.0,100.0,89.8,100.0,100.0,1427.4236,291.1001,325.1528,218.70439,...,96.6,172.0,16.1,34.6,91.8,0.462045,7.645899,8.624634,1.50673,1.031688


In [6]:
# Checking the shape of the data
print(combined_df.shape)


(13355, 27)


In [7]:
#displaying column names
display(combined_df.columns.values)

array(['Entity', 'Code', 'Year', 'Major depression', 'Bipolar disorder',
       'Eating disorders', 'Dysthymia', 'Schizophrenia',
       'Anxiety disorders',
       'DALYs (rate) - Sex: Both - Age: Age-standardized - Cause: Depressive disorders',
       'DALYs (rate) - Sex: Both - Age: Age-standardized - Cause: Schizophrenia',
       'DALYs (rate) - Sex: Both - Age: Age-standardized - Cause: Bipolar disorder',
       'DALYs (rate) - Sex: Both - Age: Age-standardized - Cause: Eating disorders',
       'DALYs (rate) - Sex: Both - Age: Age-standardized - Cause: Anxiety disorders',
       'Nearly every day', 'More than half the days', 'Several days',
       'Not at all',
       'Number of countries with primary data on prevalence of mental disorders',
       'Potentially adequate treatment, conditional',
       'Other treatments, conditional', 'Untreated, conditional',
       'Schizophrenia disorders (share of population) - Sex: Both - Age: Age-standardized',
       'Depressive disorders (

In [8]:
#checking the number of unique values in columns
print(combined_df.nunique())

Entity                                                                                287
Code                                                                                  205
Year                                                                                   30
Major depression                                                                       18
Bipolar disorder                                                                       14
Eating disorders                                                                       11
Dysthymia                                                                              14
Schizophrenia                                                                          14
Anxiety disorders                                                                      18
DALYs (rate) - Sex: Both - Age: Age-standardized - Cause: Depressive disorders       6835
DALYs (rate) - Sex: Both - Age: Age-standardized - Cause: Schizophrenia              6834
DALYs (rat

In [9]:
# Display basic statistics for the depression and anxiety columns
print(combined_df[['Major depression', 'Anxiety disorders']].describe())

       Major depression  Anxiety disorders
count         44.000000          22.000000
mean          28.995455          34.181818
std           32.841373          35.825870
min            0.000000           0.000000
25%            1.300000           0.925000
50%           15.250000          23.900000
75%           47.000000          63.925000
max          100.000000         100.000000


## Expoloratory Data Analysis and Data Cleaning

In [10]:
print (combined_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13355 entries, 0 to 13354
Data columns (total 27 columns):
 #   Column                                                                             Non-Null Count  Dtype  
---  ------                                                                             --------------  -----  
 0   Entity                                                                             13355 non-null  object 
 1   Code                                                                               12321 non-null  object 
 2   Year                                                                               13355 non-null  int64  
 3   Major depression                                                                   44 non-null     float64
 4   Bipolar disorder                                                                   22 non-null     float64
 5   Eating disorders                                                                   22 non-null     flo

In [11]:
# Display the combined DataFrame
print(combined_df.head(10))
print(combined_df['Major depression'].dtype)

                       Entity Code  Year  Major depression  Bipolar disorder  \
0        Andean Latin America  NaN  2008               0.0               0.0   
1                Asia Pacific  NaN  2008              80.8               3.8   
2                 Australasia  NaN  2008             100.0             100.0   
3                   Caribbean  NaN  2008               9.1               0.0   
4                Central Asia  NaN  2008               0.0               0.0   
5              Central Europe  NaN  2008              16.0               0.0   
6       Central Latin America  NaN  2008              49.8              34.5   
7  Central Sub-Saharan Africa  NaN  2008               0.0               0.0   
8                   East Asia  NaN  2008              12.2               8.4   
9     East Sub-Saharan Africa  NaN  2008               1.3               0.9   

   Eating disorders  Dysthymia Schizophrenia  Anxiety disorders  \
0               0.0        0.0             0        

In [12]:
#displaying the last 10 
print(combined_df.tail(10))

         Entity Code  Year  Major depression  Bipolar disorder  \
13345  Zimbabwe  ZWE  2010               NaN               NaN   
13346  Zimbabwe  ZWE  2011               NaN               NaN   
13347  Zimbabwe  ZWE  2012               NaN               NaN   
13348  Zimbabwe  ZWE  2013               NaN               NaN   
13349  Zimbabwe  ZWE  2014               NaN               NaN   
13350  Zimbabwe  ZWE  2015               NaN               NaN   
13351  Zimbabwe  ZWE  2016               NaN               NaN   
13352  Zimbabwe  ZWE  2017               NaN               NaN   
13353  Zimbabwe  ZWE  2018               NaN               NaN   
13354  Zimbabwe  ZWE  2019               NaN               NaN   

       Eating disorders  Dysthymia Schizophrenia  Anxiety disorders  \
13345               NaN        NaN           NaN                NaN   
13346               NaN        NaN           NaN                NaN   
13347               NaN        NaN           NaN            

## Cleaning the code and filling in the missing values
when running df.info(), the code variable has some NaN, I am going to clean the data and fill in the missing values

In [13]:
combined_df.isnull().sum

<bound method DataFrame.sum of        Entity   Code   Year  Major depression  Bipolar disorder  \
0       False   True  False             False             False   
1       False   True  False             False             False   
2       False   True  False             False             False   
3       False   True  False             False             False   
4       False   True  False             False             False   
...       ...    ...    ...               ...               ...   
13350   False  False  False              True              True   
13351   False  False  False              True              True   
13352   False  False  False              True              True   
13353   False  False  False              True              True   
13354   False  False  False              True              True   

       Eating disorders  Dysthymia  Schizophrenia  Anxiety disorders  \
0                 False      False          False              False   
1                 Fa

In [15]:
#filling the empty cells in Major Depression by first calculating the mean
mean = combined_df['Major depression'].mean()

#filling the empty cells in Major depression with the calculated mean
combined_df['Major depression'].fillna(mean, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined_df['Major depression'].fillna(mean, inplace=True)


In [16]:
#filling the empty cells in Anxiety disorders by first calculating the mean
mean = combined_df['Anxiety disorders'].mean()

#filling the empty cells in Anxiety disorders with the calculated mean
combined_df['Anxiety disorders'].fillna(mean, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined_df['Anxiety disorders'].fillna(mean, inplace=True)


In [17]:
combined_df.isnull().sum()

Entity                                                                                   0
Code                                                                                  1034
Year                                                                                     0
Major depression                                                                         0
Bipolar disorder                                                                     13333
Eating disorders                                                                     13333
Dysthymia                                                                            13333
Schizophrenia                                                                        13333
Anxiety disorders                                                                        0
DALYs (rate) - Sex: Both - Age: Age-standardized - Cause: Depressive disorders        6515
DALYs (rate) - Sex: Both - Age: Age-standardized - Cause: Schizophrenia               6515