In [1]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt

In [2]:
# Load the raw survey export and report what kind of object we got and its size.
mhsdata = pd.read_csv('Mental Illness Survey.csv')
for label, value in (("Data type : ", type(mhsdata)), ("Data dims : ", mhsdata.shape)):
    print(label, value)

Data type :  <class 'pandas.core.frame.DataFrame'>
Data dims :  (335, 40)


335 records
40 variables

In [3]:
# Print the dtype pandas inferred for every column of the raw frame.
print(mhsdata.dtypes)

Respondent ID                                                   float64
Collector ID                                                    float64
Start Date                                                       object
End Date                                                         object
IP Address                                                      float64
Email Address                                                   float64
First Name                                                      float64
Last Name                                                       float64
Custom Data 1                                                    object
I identify as having a mental illness                            object
Education                                                        object
I have my own computer separate from a smart phone               object
I have been hospitalized before for my mental illness            object
How many days were you hospitalized for your mental illness     

In [4]:
# Work on a copy so the raw frame `mhsdata` is left untouched, and give the
# three long survey questions short names in a single rename pass.
mhsdata_clean = mhsdata.copy().rename(columns={
    'I identify as having a mental illness': 'MentallyIll',
    'I am unemployed': 'Unemployed',
    'Annual income (including any social welfare programs) in USD': 'Income',
})

# Normalise every header: upper-case, with spaces turned into underscores.
# NOTE(review): in the displayed frame two headers still show a space-like
# character ("..._IN MONTHS." and "..._FOLLOWING ISSUES_...") - presumably a
# non-breaking space in the raw CSV header; confirm before relying on them.
mhsdata_clean.columns = mhsdata_clean.columns.str.upper().str.replace(" ", "_")

In [5]:
# Show every column when a frame is displayed (no column truncation).
pd.options.display.max_columns = None

In [6]:
# Remove the columns that are not used further on.
# They are dropped by name rather than by position ([0, 9]) so the cell is
# idempotent: re-running a positional, in-place drop would silently remove
# two *more* columns each time. errors='ignore' makes a second run a no-op.
# NOTE(review): position 9 is 'MENTALLYILL', the column renamed in the
# previous cell - confirm that dropping it (and not another column) is intended.
mhsdata_clean = mhsdata_clean.drop(
    columns=['RESPONDENT_ID', 'MENTALLYILL'],
    errors='ignore',
)

In [7]:
# Encode the yes/no answers as 1/0.
mhsdata_clean = mhsdata_clean.replace(to_replace=['No', 'Yes'], value=[0, 1])

# Missing answers are assumed to mean "no", so every NaN becomes 0.
mhsdata_clean = mhsdata_clean.fillna(0)

# Remove the first data row (the row right after the header line; presumably
# the survey export's secondary header - confirm against the raw CSV).
# The row is dropped by its index label (0, from read_csv's default index)
# instead of `.iloc[1:]`, so re-running this cell does not strip one more
# record on every execution; errors='ignore' makes a second run a no-op.
mhsdata_clean = mhsdata_clean.drop(index=0, errors='ignore')
display(mhsdata_clean)

Unnamed: 0,COLLECTOR_ID,START_DATE,END_DATE,IP_ADDRESS,EMAIL_ADDRESS,FIRST_NAME,LAST_NAME,CUSTOM_DATA_1,EDUCATION,I_HAVE_MY_OWN_COMPUTER_SEPARATE_FROM_A_SMART_PHONE,I_HAVE_BEEN_HOSPITALIZED_BEFORE_FOR_MY_MENTAL_ILLNESS,HOW_MANY_DAYS_WERE_YOU_HOSPITALIZED_FOR_YOUR_MENTAL_ILLNESS,I_AM_CURRENTLY_EMPLOYED_AT_LEAST_PART-TIME,I_AM_LEGALLY_DISABLED,I_HAVE_MY_REGULAR_ACCESS_TO_THE_INTERNET,I_LIVE_WITH_MY_PARENTS,I_HAVE_A_GAP_IN_MY_RESUME,TOTAL_LENGTH_OF_ANY_GAPS_IN_MY_RESUME_IN MONTHS.,INCOME,UNEMPLOYED,I_READ_OUTSIDE_OF_WORK_AND_SCHOOL,ANNUAL_INCOME_FROM_SOCIAL_WELFARE_PROGRAMS,I_RECEIVE_FOOD_STAMPS,I_AM_ON_SECTION_8_HOUSING,HOW_MANY_TIMES_WERE_YOU_HOSPITALIZED_FOR_YOUR_MENTAL_ILLNESS,I_HAVE_ONE_OF_THE_FOLLOWING ISSUES_IN_ADDITION_TO_MY_ILLNESS,UNNAMED:_28,UNNAMED:_29,UNNAMED:_30,UNNAMED:_31,UNNAMED:_32,UNNAMED:_33,UNNAMED:_34,AGE,GENDER,HOUSEHOLD_INCOME,REGION,DEVICE_TYPE
1,168522804.0,01/15/2018 03:45:16 AM,01/15/2018 03:48:24 AM,0.0,0.0,0.0,0.0,06f645d7ea5af372d50a62bd17,High School or GED,0,0,0,0,0,1,0,1,24,35,1,1,0,0,0,0,Lack of concentration,Anxiety,Depression,Obsessive thinking,0,Panic attacks,0,0,30-44,Male,"$25,000-$49,999",Mountain,Android Phone / Tablet
2,168522804.0,01/15/2018 03:17:52 AM,01/15/2018 03:18:57 AM,0.0,0.0,0.0,0.0,abca2776418ff1fe24bb85e21f,Some Phd,1,0,0,1,0,1,0,0,1,22,0,1,0,0,0,0,Lack of concentration,Anxiety,Depression,0,0,Panic attacks,0,Tiredness,18-29,Male,"$50,000-$74,999",East South Central,MacOS Desktop / Laptop
3,168522804.0,01/15/2018 03:10:28 AM,01/15/2018 03:12:49 AM,0.0,0.0,0.0,0.0,3800088cf4e55278b38bbe67f3,Completed Undergraduate,1,0,0,1,0,1,0,0,0,100,0,1,0,0,0,0,0,0,0,0,0,0,0,0,30-44,Male,"$150,000-$174,999",Pacific,MacOS Desktop / Laptop
4,168522804.0,01/15/2018 02:11:16 AM,01/15/2018 02:12:33 AM,0.0,0.0,0.0,0.0,84585803a3cec189f89fe43d44,Some Undergraduate,1,0,0,0,0,1,1,1,11,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,30-44,Male,"$25,000-$49,999",New England,Windows Desktop / Laptop
5,168522804.0,01/15/2018 01:24:12 AM,01/15/2018 01:26:34 AM,0.0,0.0,0.0,0.0,4b4faaaa7eaff01549233044bd,Completed Undergraduate,1,1,35,1,1,1,0,1,33,32,0,1,30,0,0,4,Lack of concentration,Anxiety,Depression,Obsessive thinking,Mood swings,Panic attacks,Compulsive behavior,Tiredness,30-44,Male,"$25,000-$49,999",East North Central,iOS Phone / Tablet
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
330,168522804.0,01/14/2018 03:34:38 AM,01/14/2018 03:36:44 AM,0.0,0.0,0.0,0.0,87711d94384b2bfbcd7cdbe52a,High School or GED,1,0,0,0,1,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,Tiredness,45-60,Female,Prefer not to answer,Mountain,Android Phone / Tablet
331,168522804.0,01/14/2018 03:34:38 AM,01/14/2018 03:36:09 AM,0.0,0.0,0.0,0.0,9e9969a3f4eb0950af94407466,Some Undergraduate,1,0,0,1,0,1,1,0,0,62,0,1,0,0,0,0,0,0,0,0,0,0,0,0,18-29,Male,"$50,000-$74,999",Pacific,Windows Desktop / Laptop
332,168522804.0,01/14/2018 03:27:20 AM,01/14/2018 03:33:01 AM,0.0,0.0,0.0,0.0,558859a6e130ae0f1817fc0ac7,Some Undergraduate,1,0,0,1,0,1,0,0,0,0,0,1,1,1,0,1,0,0,0,0,0,0,0,0,> 60,Female,"$10,000-$24,999",West North Central,Windows Desktop / Laptop
333,168522804.0,01/14/2018 03:29:31 AM,01/14/2018 03:32:45 AM,0.0,0.0,0.0,0.0,34d6b882e9f4ca7712b0b44541,Some Undergraduate,0,1,1,0,1,1,1,1,30,12,1,1,12,1,0,3,Lack of concentration,Anxiety,Depression,Obsessive thinking,Mood swings,Panic attacks,Compulsive behavior,Tiredness,18-29,Female,"$0-$9,999",West South Central,Android Phone / Tablet
