In [115]:
import pandas as pd
import numpy as np
import re

from matplotlib import pyplot as plt
import matplotlib as mpl
import seaborn as sns

### Read in the pre- and post-workshop data for all years available 

In [104]:
# Load each survey export via its relative path. The files are read in the
# order listed and each frame is bound to the name matching its file.
pre2014, post2014, pre2015, pre2016 = map(
    pd.read_csv,
    ("pre2014.csv", "post2014.csv", "pre2015.csv", "pre2016.csv"),
)

### Some initial preprocessing steps

The first thing to do is combine all three of the pre-workshop intake forms into one file. Each of these three files uses slightly different column names for most of the same features, so they need to all be renamed to a standardized column name before merging. 

Note also that many rows are missing data for the `Date` field, and the 2016 data includes only the year in the `Date` field. We will drop this feature when merging the datasets and instead just aggregate by year. 

In [105]:
# Header translation tables, one per intake year: raw column header ->
# standardized column name. Headers not listed here keep their original name.

# 2014 intake form
col_mappings_2014 = {
    "Zip code": "ZipCode",
    "Interested in receiving more information": "MoreInfo",
    "Status/FInancial Situation": "Status",
    "Age Range": "Age",
}

# 2015 intake form
col_mappings_2015 = {
    "Zip code": "ZipCode",
    "How did you hear about us": "Source",
    "Interested in receiving more information": "MoreInfo",
    "Age Range": "Age",
}

# 2016 intake form
col_mappings_2016 = {
    "Zip Code:": "ZipCode",
    "Race:": "Race",
    "Gender:": "Gender",
    "How did you hear about this workshop?": "Source",
    "Current Status (please select one):": "Status",
    "Age Range (please select one):": "Age",
    "Interested in receiving more information": "MoreInfo",
    "Comments / Please Help me With": "Comments",
    "Partner (If Applicable)": "Partner",
}

# Tag every frame with its workshop year, then apply that year's renames.
pre2014 = pre2014.assign(Year=2014).rename(columns=col_mappings_2014)
pre2015 = pre2015.assign(Year=2015).rename(columns=col_mappings_2015)
pre2016 = pre2016.assign(Year=2016).rename(columns=col_mappings_2016)

# Stack the three years row-wise, keeping the union of their columns, and
# drop the `Date` and `Partner` columns from the combined frame.
df = (
    pd.concat([pre2014, pre2015, pre2016], join="outer", ignore_index=True)
    .drop(columns=["Date", "Partner"])
)

# Preview the first rows of the combined dataframe
df.head()

Unnamed: 0,Workshop,USERID,ZipCode,MoreInfo,Status,Age,Comments,Year,Location,Race,Gender,Source
0,Resume Workshop,14875,20748,Yes,Employed but seeking better opportunities,41-50,I'm not sure at this time,2014,,,,
1,Dream Building,14876,20020,,"Not fulfilled, but not happy",30-34,,2014,,,,
2,Resume Workshop,14877,20746,Yes,Unemployed,51-60,job search,2014,,,,
3,Dream Building,14878,20020,Yes,Fulfilled in life,25-29,,2014,,,,
4,Resume Workshop,14879,20748,,Unemployed,18-25,,2014,,,,


### Additional Cleaning

* The `MoreInfo` column contains multiple possible responses which should be standardized: "Yes", "Y", and "Yes," should all map to "Yes", and "No, thank you.", "N", "No, thank you", and a blank response should all map to "No".
* The `ZipCode` column contains some 5-digit ZIP codes and some 9-digit ZIP codes which should be reduced to 5-digit ZIP codes; all values should be converted to strings and not integer/float (for comparison of categorical variables).
* In some years, '61+' was an option in `Age`, while in other years, '61 & up' was the option instead; these should both be changed to '61+'.
* The `Race` column includes lots of different options for the same three outcomes (e.g. "Black", "African American", "black", etc. all exist in this column); these responses should be more standardized.
* The `Workshop` names vary from year to year and should be standardized a bit more.

In [109]:
# Standardize `MoreInfo`: the affirmative spellings become 'Yes'; every other
# value, including a blank (missing) response, becomes 'No'.
affirmative = ['Yes', 'Y', 'Yes,']
df['MoreInfo'] = df['MoreInfo'].isin(affirmative).map({True: 'Yes', False: 'No'})

# Reduce every ZIP code to its first five characters, stored as a string.
# na_action='ignore' keeps missing ZIP codes as NaN: stringifying them would
# produce a literal 'nan' ZIP code that looks like a real category.
df['ZipCode'] = df['ZipCode'].map(lambda z: str(z)[:5], na_action='ignore')

# Combine the responses "61 & up" and "61+" so both read "61+" in `Age`
df['Age'] = df['Age'].replace('61 & up', '61+')

# Collapse free-text `Race` responses onto the three observed categories.
# Patterns are case-insensitive and applied in order (first match wins);
# responses matching none of them, including missing ones, are left as-is.
# 'latin' (rather than 'latin.') also catches a bare "Latin" response.
race_patterns = {
    'Black': r'black|african',
    'White': r'white',
    'Hispanic': r'hispanic|latin',
}
race = df['Race']
for label, pattern in race_patterns.items():
    # astype(str) lets non-string entries (e.g. NaN) be searched safely.
    matches = race.astype(str).str.contains(pattern, case=False, regex=True)
    race = race.mask(matches, label)
df['Race'] = race

# Convert workshop titles to a more standardized set of titles; titles that
# are not listed here are kept unchanged.
workshop_names = {'Resume Workshop': 'Resume',
                  'Budget Workshop': 'Budget',
                  'Creating a Budget': 'Budget',
                  'Landing the Job - Interview': 'Interview',
                  'Resume Building': 'Resume',
                  'Understanding Credit': 'Credit'}

df['Workshop'] = df['Workshop'].replace(workshop_names)

df.head()

Unnamed: 0,Workshop,USERID,ZipCode,MoreInfo,Status,Age,Comments,Year,Location,Race,Gender,Source
0,Resume,14875,20748,Yes,Employed but seeking better opportunities,41-50,I'm not sure at this time,2014,,,,
1,Dream Building,14876,20020,No,"Not fulfilled, but not happy",30-34,,2014,,,,
2,Resume,14877,20746,Yes,Unemployed,51-60,job search,2014,,,,
3,Dream Building,14878,20020,Yes,Fulfilled in life,25-29,,2014,,,,
4,Resume,14879,20748,No,Unemployed,18-25,,2014,,,,
