In [None]:
import os

def scale_input_data(scale_factor, file_bases=('./input/survey',)):
  """Write a row-scaled copy of each input CSV next to the original.

  For every base path ``b`` in ``file_bases`` the file ``b + '.csv'`` is read and
  ``b + '.scaled.csv'`` is written:

  * ``scale_factor == 1.0``: the file is copied byte-for-byte.
  * ``scale_factor < 1.0``: only the first ``int(scale_factor * n_rows)`` rows are kept.
  * ``scale_factor > 1.0``: rows are repeated cyclically from the top until there are
    ``int(scale_factor * n_rows)`` rows.

  Args:
    scale_factor: Non-negative multiplier applied to the row count.
    file_bases: Paths without the ``.csv`` suffix. Defaults to the survey file.

  Raises:
    ValueError: If ``scale_factor`` is negative. A negative row count would
      otherwise be interpreted as "drop rows from the end" by the slice below.
  """
  # Imported locally (once, not per file): this cell runs before the
  # notebook's own import cell.
  import shutil
  import pandas as pd

  if scale_factor < 0:
    raise ValueError('scale_factor must be non-negative, got %r' % (scale_factor,))

  for file_base in file_bases:
    source_path = file_base + '.csv'
    scaled_path = file_base + '.scaled.csv'
    if scale_factor == 1.0:
      shutil.copyfile(source_path, scaled_path)
      continue
    df_to_scale = pd.read_csv(source_path)
    new_num_rows = int(scale_factor * len(df_to_scale))
    if scale_factor < 1.0:
      df_to_scale = df_to_scale.iloc[:new_num_rows]
    else:
      # Each pass appends at most a full copy of the current frame, so the
      # frame doubles until the target is reached; an empty input never loops.
      while len(df_to_scale) < new_num_rows:
        missing = new_num_rows - len(df_to_scale)
        df_to_scale = pd.concat([df_to_scale, df_to_scale.iloc[:missing]],
                                ignore_index=True)
    df_to_scale.to_csv(scaled_path, index=False)

# Rescale the input only when the environment explicitly requests a factor.
if os.environ.get('INPUT_SCALE_FACTOR') is not None:
  scale_input_data(float(os.environ['INPUT_SCALE_FACTOR']))

This notebook cleans the survey data and attempts to classify respondents who are at risk.

In [1]:
import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# NOTE(review): this executes whatever Python source is stored in the
# IREWR_IMPORTS environment variable and raises KeyError if it is unset.
# Presumably it is what binds `pd` (the direct pandas import above is
# commented out) -- confirm. Only run this notebook in an environment where
# that variable is trusted.
exec(os.environ['IREWR_IMPORTS'])
# FIRST-AUTHOR: remove plotting, path printing
# import seaborn as sns
# %pylab inline
# from subprocess import check_output
# print(check_output(["ls", "../input"]).decode("utf8"))


In [2]:
# Columns queued for removal before modelling; later cells append to this list.
to_drop = ['Timestamp']

# Load the scaled survey file and print its schema.
df = pd.read_csv('./input/survey.scaled.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1259 entries, 0 to 1258
Data columns (total 27 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Timestamp                  1259 non-null   object
 1   Age                        1259 non-null   int64 
 2   Gender                     1259 non-null   object
 3   Country                    1259 non-null   object
 4   state                      744 non-null    object
 5   self_employed              1241 non-null   object
 6   family_history             1259 non-null   object
 7   treatment                  1259 non-null   object
 8   work_interfere             995 non-null    object
 9   no_employees               1259 non-null   object
 10  remote_work                1259 non-null   object
 11  tech_company               1259 non-null   object
 12  benefits                   1259 non-null   object
 13  care_options               1259 non-null   object
 14  wellness

In [3]:
# List the distinct raw answers given for gender.
df['Gender'].unique()

array(['Female', 'M', 'Male', 'male', 'female', 'm', 'Male-ish', 'maile',
       'Trans-female', 'Cis Female', 'F', 'something kinda male?',
       'Cis Male', 'Woman', 'f', 'Mal', 'Male (CIS)', 'queer/she/they',
       'non-binary', 'Femake', 'woman', 'Make', 'Nah', 'All', 'Enby',
       'fluid', 'Genderqueer', 'Female ', 'Androgyne', 'Agender',
       'cis-female/femme', 'Guy (-ish) ^_^', 'male leaning androgynous',
       'Male ', 'Man', 'Trans woman', 'msle', 'Neuter', 'Female (trans)',
       'queer', 'Female (cis)', 'Mail', 'cis male', 'A little about you',
       'Malr', 'p', 'femail', 'Cis Man',
       'ostensibly male, unsure what that really means'], dtype=object)

In [4]:
# Gender was entered as free text, so there are many spellings to clean up.
# Start with how often each distinct answer occurs.
df['Gender'].value_counts()

Male                                              615
male                                              206
Female                                            121
M                                                 116
female                                             62
F                                                  38
m                                                  34
f                                                  15
Make                                                4
Male                                                3
Woman                                               3
Cis Male                                            2
Man                                                 2
Female (trans)                                      2
Female                                              2
Trans woman                                         1
msle                                                1
male leaning androgynous                            1
Neuter                      

In [5]:
# Normalise the free-text gender answers: lower-case them, then expand the
# one-letter answers. Series.replace matches whole values here, so only the
# exact strings 'm' and 'f' are rewritten.
df.Gender = df.Gender.str.lower()
df.Gender = df.Gender.replace('m', 'male')
df.Gender = df.Gender.replace('f', 'female')

# Derive indicator columns from keyword matches.
# The male keywords are anchored to the start of a word (\b): unanchored,
# 'male' / 'mal' / 'man' / 'make' also match inside 'female', 'woman' and
# 'femake', which flagged every female respondent as HasMale.
# 'mail' is included because 'Mail' occurs in the raw answers.
df['HasMale'] = df.Gender.str.contains(
    r'\b(?:male|man|guy|maile|mail|malr|mal|make|msle|androgyne)')
df['HasFemale'] = df.Gender.str.contains('female|woman|femail|androgyne|femake')
# 'all' and 'p' must match as whole words only; as bare substrings they hit
# ordinary words (e.g. 'really' contains 'all'). 'queer' stays unanchored so
# that 'genderqueer' still matches.
df['HasNB'] = df.Gender.str.contains(
    r'non-binary|enby|queer|fluid|agender|neuter|\ball\b|\bp\b')
# That's gender cleaned up; the raw column is no longer needed for modelling.
to_drop.append('Gender')
# Moving on: summarise the remaining string columns.
df.describe(include=['O'])

Unnamed: 0,Timestamp,Gender,Country,state,self_employed,family_history,treatment,work_interfere,no_employees,remote_work,...,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence,comments
count,1259,1259,1259,744,1241,1259,1259,995,1259,1259,...,1259,1259,1259,1259,1259,1259,1259,1259,1259,164
unique,1246,41,48,45,2,2,2,4,6,2,...,5,3,3,3,3,3,3,3,2,160
top,2014-08-27 12:44:51,male,United States,CA,No,No,Yes,Sometimes,6-25,No,...,Don't know,No,No,Some of them,Yes,No,Maybe,Don't know,No,* Small family business - YMMV.
freq,2,971,751,138,1095,767,637,465,290,883,...,563,490,925,774,516,1008,557,576,1075,5


In [6]:
# Country is handled next: list its distinct values to see if it needs cleaning.
df['Country'].unique()

array(['United States', 'Canada', 'United Kingdom', 'Bulgaria', 'France',
       'Portugal', 'Netherlands', 'Switzerland', 'Poland', 'Australia',
       'Germany', 'Russia', 'Mexico', 'Brazil', 'Slovenia', 'Costa Rica',
       'Austria', 'Ireland', 'India', 'South Africa', 'Italy', 'Sweden',
       'Colombia', 'Latvia', 'Romania', 'Belgium', 'New Zealand',
       'Zimbabwe', 'Spain', 'Finland', 'Uruguay', 'Israel',
       'Bosnia and Herzegovina', 'Hungary', 'Singapore', 'Japan',
       'Nigeria', 'Croatia', 'Norway', 'Thailand', 'Denmark',
       'Bahamas, The', 'Greece', 'Moldova', 'Georgia', 'China',
       'Czech Republic', 'Philippines'], dtype=object)

In [7]:
# The country names are already clean, so add one 0/1 indicator column per
# country (in alphabetical order) and queue the raw column for removal.
for country in sorted(df['Country'].unique()):
    df[f'Country_{country}'] = df['Country'].eq(country).astype(int)
to_drop.append('Country')

In [8]:
# State has too many missing values to ignore, so first find out where they are.
# Suspicion: a state was only recorded for US respondents.
# Per country, the fraction of rows whose state is missing:
df['state'].isnull().groupby(df['Country']).mean()

Country
Australia                 1.000000
Austria                   1.000000
Bahamas, The              0.000000
Belgium                   1.000000
Bosnia and Herzegovina    1.000000
Brazil                    1.000000
Bulgaria                  0.750000
Canada                    1.000000
China                     1.000000
Colombia                  1.000000
Costa Rica                1.000000
Croatia                   1.000000
Czech Republic            1.000000
Denmark                   1.000000
Finland                   1.000000
France                    1.000000
Georgia                   1.000000
Germany                   1.000000
Greece                    1.000000
Hungary                   1.000000
India                     1.000000
Ireland                   1.000000
Israel                    0.800000
Italy                     1.000000
Japan                     1.000000
Latvia                    0.000000
Mexico                    1.000000
Moldova                   1.000000
Netherlands 

In [9]:
# As we can see, most countries have no state data.
# It's just easier to leave the NA's as they are and one-hot them too.
for st in list(df.state.unique()):
    if pd.isnull(st):
        # NaN never compares equal to anything (itself included), so an
        # equality test would leave the 'state_nan' indicator all zeros.
        df['state_'+str(st)] = df.state.isnull().astype(int)
    else:
        df['state_'+str(st)] = (df.state == st).astype(int)
to_drop.append('state')

# All the columns which are binary in nature: make them 0/1 based.
# Fill by assignment rather than a chained `inplace=True` call, which acts on
# a temporary (and leaves `df` unchanged) under pandas copy-on-write.
df['self_employed'] = df['self_employed'].fillna(df['self_employed'].mode()[0])
for col in df.select_dtypes(include=['object']):
    values = list(df[col].unique())
    u_count = len(values)
    if u_count < 2:
        to_drop.append(col)
        print('adding ', col, 'to drop list as no variation')
    elif u_count == 2:
        # Encode 'Yes' as 1 whenever it is one of the two values, so the
        # polarity does not depend on which answer happens to appear last in
        # the file. Other two-valued columns keep the last-seen value as 1.
        positive = 'Yes' if 'Yes' in values else values[-1]
        df[col] = (df[col] == positive).astype(int)
        print('converted', col)

converted self_employed
converted family_history
converted treatment
converted remote_work
converted tech_company
converted obs_consequence


In [10]:
# Summarise the columns that would remain once the queued ones are removed.
df.drop(columns=to_drop).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1259 entries, 0 to 1258
Columns: 120 entries, Age to state_ME
dtypes: bool(3), int64(101), object(16)
memory usage: 1.1+ MB


In [11]:
# For now we drop everything else.
# List the distinct answers for work_interfere.
df['work_interfere'].unique()

array(['Often', 'Rarely', 'Never', 'Sometimes', nan], dtype=object)

In [12]:
# Impute missing work_interfere answers with the most frequent one.
# Assign the result instead of using a chained `inplace=True` call, which acts
# on a temporary (and leaves `df` unchanged) under pandas copy-on-write.
df['work_interfere'] = df['work_interfere'].fillna(df['work_interfere'].mode().values[0])
# Map the answers onto an ordinal 0-3 frequency scale.
df['work_interfere'] = df['work_interfere'].map({'Never': 0, 'Rarely': 1,
                                                 'Sometimes': 2, 'Often': 3})
df.no_employees.unique()

array(['6-25', 'More than 1000', '26-100', '100-500', '1-5', '500-1000'],
      dtype=object)

In [13]:
# Replace each company-size bucket with its lower bound
# ('More than 1000' becomes 1000), giving a numeric, ordered column.
df['no_employees'] = df['no_employees'].map({
    '1-5': 1,
    '6-25': 6,
    '26-100': 26,
    '100-500': 100,
    '500-1000': 500,
    'More than 1000': 1000,
})
df['benefits'].unique()

array(['Yes', "Don't know", 'No'], dtype=object)

In [14]:
# Many remaining columns are three-way answers: Yes, No, and one neutral option.
# Encode them as 1 / -1 / 0.
option_map = {'Yes': 1, 'No': -1, "Don't know": 0,
              'Not sure': 0, 'Maybe': 0, 'Some of them': 0}
ynns = {'Yes': 1, 'No': -1, 'Not sure': 0}  # not referenced by the loop below
for col in df.select_dtypes(include=['object']):
    uniques = set(df[col].unique())
    # Only columns whose values are exactly Yes, No and one neutral answer qualify.
    if uniques not in ({'Yes', 'No', "Don't know"},
                       {'Yes', 'No', 'Not sure'},
                       {'Yes', 'No', 'Maybe'},
                       {'Yes', 'No', 'Some of them'}):
        continue
    print('encoding', col, 'To -1, 0, 1')
    df[col] = df[col].map(option_map)

encoding benefits To -1, 0, 1
encoding care_options To -1, 0, 1
encoding wellness_program To -1, 0, 1
encoding seek_help To -1, 0, 1
encoding anonymity To -1, 0, 1
encoding mental_health_consequence To -1, 0, 1
encoding phys_health_consequence To -1, 0, 1
encoding coworkers To -1, 0, 1
encoding supervisor To -1, 0, 1
encoding mental_health_interview To -1, 0, 1
encoding phys_health_interview To -1, 0, 1
encoding mental_vs_physical To -1, 0, 1


In [15]:
# Summarise the columns that are still stored as strings.
df.describe(include=['object'])

Unnamed: 0,Timestamp,Gender,Country,state,leave,comments
count,1259,1259,1259,744,1259,164
unique,1246,41,48,45,5,160
top,2014-08-27 12:44:51,male,United States,CA,Don't know,* Small family business - YMMV.
freq,2,971,751,138,563,5


In [16]:
# List the distinct answers for how easy it is to take leave.
df['leave'].unique()

array(['Somewhat easy', "Don't know", 'Somewhat difficult',
       'Very difficult', 'Very easy'], dtype=object)

In [17]:
# Encode ease of taking leave on an ordinal 0-4 scale, with "Don't know" in the middle.
df['leave'] = df['leave'].map({
    'Very easy': 0,
    'Somewhat easy': 1,
    "Don't know": 2,
    'Somewhat difficult': 3,
    'Very difficult': 4,
})
# That leaves comments as the only string data. Few rows have one, so we'll drop it.

In [18]:
# Queue the free-text comments column for removal, then re-check the schema.
to_drop.extend(['comments'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1259 entries, 0 to 1258
Columns: 124 entries, Timestamp to state_ME
dtypes: bool(3), int64(116), object(5)
memory usage: 1.2+ MB


In [19]:
# Cleaning is done: show what gets removed, build the modelling frame, and
# print its schema. Now we can try predicting things.
print(to_drop)
data = df.drop(columns=to_drop)
data.info()

['Timestamp', 'Gender', 'Country', 'state', 'comments']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1259 entries, 0 to 1258
Columns: 119 entries, Age to state_ME
dtypes: bool(3), int64(116)
memory usage: 1.1 MB


# Since the dataset has no predefined target to classify or regress, we choose some ourselves

In [20]:
# Target: those who have sought treatment
y = data['treatment']
x = data.drop(columns='treatment')

# FIRST-AUTHOR: remove ML code
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import cross_val_score
# model = RandomForestClassifier(n_jobs=-1, n_estimators=200, class_weight='balanced')
# scores = cross_val_score(model, x, y, scoring='roc_auc', cv=5)
# print(scores.mean())

In [21]:
# Target: family history
y = data['family_history']
x = data.drop(columns='family_history')
# FIRST-AUTHOR: remove ML code
# model = RandomForestClassifier(n_jobs=-1, n_estimators=200, class_weight='balanced')
# scores = cross_val_score(model, x, y, scoring='roc_auc', cv=5)
# print(scores.mean())

We leave it at that for now.