In [1]:
# Std Lib
import re

# 3rd Party
import pandas as pd

In [2]:
# Shell escape: list the working directory to see which files sit next to the notebook.
!ls

Baltimore_City_Employee_Salaries_FY2015.csv
README.md
Untitled.ipynb


## Data Cleaning

In [3]:
# Load the FY2015 salary export that sits next to the notebook.
salaries_csv = 'Baltimore_City_Employee_Salaries_FY2015.csv'
data = pd.read_csv(salaries_csv)

In [4]:
# Preview the first rows of the freshly loaded frame.
data.head()

Unnamed: 0,name,JobTitle,AgencyID,Agency,HireDate,AnnualSalary,GrossPay
0,"Aaron,Patricia G",Facilities/Office Services II,A03031,OED-Employment Dev (031),10/24/1979,$55314.00,$53626.04
1,"Aaron,Petra L",ASSISTANT STATE'S ATTORNEY,A29045,States Attorneys Office (045),09/25/2006,$74000.00,$73000.08
2,"Abaineh,Yohannes T",EPIDEMIOLOGIST,A65026,HLTH-Health Department (026),07/23/2009,$64500.00,$64403.84
3,"Abbene,Anthony M",POLICE OFFICER,A99005,Police Department (005),07/24/2013,$46309.00,$59620.16
4,"Abbey,Emmanuel",CONTRACT SERV SPEC II,A40001,M-R Info Technology (001),05/01/2013,$60060.00,$54059.60


In [5]:
# Check the dtype pandas inferred for each column before any conversion.
data.dtypes

name            object
JobTitle        object
AgencyID        object
Agency          object
HireDate        object
AnnualSalary    object
GrossPay        object
dtype: object

In [6]:
# HireDate arrives as month/day/year strings (e.g. "10/24/1979"). Passing the
# format explicitly avoids per-value format guessing and makes a malformed
# date raise instead of being silently reinterpreted.
data['HireDate'] = pd.to_datetime(data['HireDate'], format='%m/%d/%Y')

In [7]:
# For now, simply drop every row that has a missing value in any column.
# NOTE(review): the original note mentioned only GrossPay, but dropna() with
# no `subset` checks all columns -- confirm that dropping on every column is intended.
data = data.dropna()

In [8]:
# Convert the currency-formatted pay columns (e.g. "$55314.00") to numbers.
def _currency_to_number(column):
    """Parse a Series of currency strings into a numeric Series.

    Dollar signs, thousands separators and surrounding whitespace are removed
    before parsing. A column that is already numeric is returned unchanged so
    the cell can safely be re-run.
    """
    if pd.api.types.is_numeric_dtype(column):
        return column
    cleaned = column.str.replace(r'[$,]', '', regex=True).str.strip()
    return pd.to_numeric(cleaned)


for money_col in ['AnnualSalary', 'GrossPay']:
    data[money_col] = _currency_to_number(data[money_col])

In [9]:
# Preview the frame after the date and currency conversions.
data.head()

Unnamed: 0,name,JobTitle,AgencyID,Agency,HireDate,AnnualSalary,GrossPay
0,"Aaron,Patricia G",Facilities/Office Services II,A03031,OED-Employment Dev (031),1979-10-24,55314.0,53626.04
1,"Aaron,Petra L",ASSISTANT STATE'S ATTORNEY,A29045,States Attorneys Office (045),2006-09-25,74000.0,73000.08
2,"Abaineh,Yohannes T",EPIDEMIOLOGIST,A65026,HLTH-Health Department (026),2009-07-23,64500.0,64403.84
3,"Abbene,Anthony M",POLICE OFFICER,A99005,Police Department (005),2013-07-24,46309.0,59620.16
4,"Abbey,Emmanuel",CONTRACT SERV SPEC II,A40001,M-R Info Technology (001),2013-05-01,60060.0,54059.6


In [10]:
# Confirm that HireDate is now a datetime column and the pay columns are numeric.
data.dtypes

name                    object
JobTitle                object
AgencyID                object
Agency                  object
HireDate        datetime64[ns]
AnnualSalary           float64
GrossPay               float64
dtype: object

## Feature Engineering

In [11]:
# Tenure is measured against a fixed reference date rather than the current date.
# pd.Timestamp replaces the deprecated pd.datetime alias.
today = pd.Timestamp(2015, 5, 30)
data['tenure'] = today - data.HireDate

# Whole years of service, truncated toward zero. The 'Y' Timedelta unit is no
# longer accepted by pandas; it stood for an average Gregorian year of
# 365.2425 days (31,556,952 seconds), which is spelled out here so the
# resulting values stay the same.
average_year = pd.Timedelta(seconds=31556952)
data['tenure_years'] = (data.tenure / average_year).astype('int64')

In [12]:
# Preview the new tenure and tenure_years columns.
data.head()

Unnamed: 0,name,JobTitle,AgencyID,Agency,HireDate,AnnualSalary,GrossPay,tenure,tenure_years
0,"Aaron,Patricia G",Facilities/Office Services II,A03031,OED-Employment Dev (031),1979-10-24,55314.0,53626.04,13002 days,35
1,"Aaron,Petra L",ASSISTANT STATE'S ATTORNEY,A29045,States Attorneys Office (045),2006-09-25,74000.0,73000.08,3169 days,8
2,"Abaineh,Yohannes T",EPIDEMIOLOGIST,A65026,HLTH-Health Department (026),2009-07-23,64500.0,64403.84,2137 days,5
3,"Abbene,Anthony M",POLICE OFFICER,A99005,Police Department (005),2013-07-24,46309.0,59620.16,675 days,1
4,"Abbey,Emmanuel",CONTRACT SERV SPEC II,A40001,M-R Info Technology (001),2013-05-01,60060.0,54059.6,759 days,2


In [13]:
# Split AgencyID (e.g. "A03031") into two parts: everything before the last
# three characters is the agency code, the last three characters are the sub-code.
agency_ids = data.AgencyID
data['AgencyCode'] = agency_ids.str.slice(stop=-3)
data['AgencySubCode'] = agency_ids.str.slice(start=-3)

In [16]:
# Derive a coarse agency category by deleting qualifiers from each Agency
# label. The pattern removes any of the following:
#   * a space plus a parenthesised word/hyphen token, optionally followed by
#     further parentheses around a word,
#   * "-Health" (the character class also tolerates the l/t being swapped)
#     and everything after it,
#   * a space followed by "(" and everything after it,
#   * a space followed by a run of digits.
agency_qualifier = re.compile(r' \([\w-]+\) *\(*[\w]*\)*|-Hea[lt]{2}h.*| \(.*| \d+')
data['AgencyCat'] = data.Agency.map(lambda label: agency_qualifier.sub('', label))

In [17]:
# Preview the derived AgencyCode, AgencySubCode and AgencyCat columns.
data.head()

Unnamed: 0,name,JobTitle,AgencyID,Agency,HireDate,AnnualSalary,GrossPay,tenure,tenure_years,AgencyCode,AgencySubCode,AgencyCat
0,"Aaron,Patricia G",Facilities/Office Services II,A03031,OED-Employment Dev (031),1979-10-24,55314.0,53626.04,13002 days,35,A03,31,OED-Employment Dev
1,"Aaron,Petra L",ASSISTANT STATE'S ATTORNEY,A29045,States Attorneys Office (045),2006-09-25,74000.0,73000.08,3169 days,8,A29,45,States Attorneys Office
2,"Abaineh,Yohannes T",EPIDEMIOLOGIST,A65026,HLTH-Health Department (026),2009-07-23,64500.0,64403.84,2137 days,5,A65,26,HLTH
3,"Abbene,Anthony M",POLICE OFFICER,A99005,Police Department (005),2013-07-24,46309.0,59620.16,675 days,1,A99,5,Police Department
4,"Abbey,Emmanuel",CONTRACT SERV SPEC II,A40001,M-R Info Technology (001),2013-05-01,60060.0,54059.6,759 days,2,A40,1,M-R Info Technology


In [145]:
# For every (AgencyCode, AgencyCat) pair, count the distinct raw Agency labels
# and the distinct employee names.
data.pivot_table(
    values=['Agency', 'name'],
    index=['AgencyCode', 'AgencyCat'],
    aggfunc=pd.Series.nunique,
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Agency,name
AgencyCode,AgencyCat,Unnamed: 2_level_1,Unnamed: 3_level_1
A01,Mayor's Office,1,10
A01,Mayors Office,2,83
A02,City Council,3,93
A03,OED-Employment Dev,17,159
A04,R&P-Recreation,9,139
A06,Housing & Community Dev,30,383
A08,M-R Human Services,12,149
A09,Liquor License Board,2,24
A12,FIN-Acct & Payroll,3,51
A14,FIN-Collections,8,131
