## Data set-up and cleaning

In [1]:
import pandas as pd
import numpy as np
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
import psycopg2 as pg
%matplotlib inline
import seaborn as sns
import dbcreds

In [2]:
conn = pg.connect(dbname = dbcreds.database, host=dbcreds.host, user=dbcreds.user, password = dbcreds.password)
apps_df = pd.read_sql('select * from cascais.application', con=conn)
move_df = pd.read_sql('select * from cascais.movement', con=conn)

In [3]:
class color:
   PURPLE = '\033[95m'
   CYAN = '\033[96m'
   DARKCYAN = '\033[36m'
   BLUE = '\033[94m'
   GREEN = '\033[92m'
   YELLOW = '\033[93m'
   RED = '\033[91m'
   BOLD = '\033[1m'
   UNDERLINE = '\033[4m'
   END = '\033[0m'

In [4]:
# print overview stats
print len(apps_df)
print len(move_df)

125029
870061


In [5]:
move_df.loc[:, 'year_month_yr'] = move_df.loc[:, 'year_month'].apply(lambda x: int(str(x)[0:4]))
move_df.loc[:, 'year_month_mth'] = move_df.loc[:, 'year_month'].apply(lambda x: int(str(x)[4:6]))

move_df.loc[:, 'movement_date_yr'] = move_df.loc[:, 'movement_date'].apply(lambda x: None if x=='' else int(str(x)[0:4]))
move_df.loc[:, 'movement_date_mth'] = move_df.loc[:, 'movement_date'].apply(lambda x: None if x=='' else int(str(x)[5:7]))

In [6]:
move_df.loc[:, 'year_diff'] = move_df.loc[:, 'year_month_yr']-move_df.loc[:, 'movement_date_yr']

In [7]:
move_df.head()

Unnamed: 0,ute_id,movement_date,year_month,application_id,movement_type,movement_subtype,movement_result,movement_index,year_month_yr,year_month_mth,movement_date_yr,movement_date_mth,year_diff
0,677,,200701,-1,convocation,GERAL UTENTE,COMPARECEU,3,2007,1,,,
1,677,,200702,-1,convocation,INTERVENÃÃO TÃCNICA,COMPARECEU,2752,2007,2,,,
2,677,2007-09-26 00:00:00,200709,-1,cancellation,FALTA AO CONTROLO,,7180,2007,9,2007.0,9.0,0.0
3,710,2016-12-06 00:00:00,201612,120674,application,DESEMPREGADO-NOVO EMPREGO,,120674,2016,12,2016.0,12.0,0.0
4,710,,201701,120674,convocation,INTERVENÃÃO TÃCNICA,COMPARECEU,248206,2017,1,,,


In [8]:
# examine movements of ute_id 135175
move_df[move_df['ute_id']==135175]

Unnamed: 0,ute_id,movement_date,year_month,application_id,movement_type,movement_subtype,movement_result,movement_index,year_month_yr,year_month_mth,movement_date_yr,movement_date_mth,year_diff
3907,135175,2007-01-24 00:00:00,200701,8,application,DESEMPREGADO-NOVO EMPREGO,,8,2007,1,2007.0,1.0,0.0
3908,135175,,200702,8,interview,,RECUSA DE ENTIDADE EMPREGADORA - DESAJUSTAMENT...,1150,2007,2,,,
3909,135175,,200705,8,convocation,SESSÃO COLECTIVA,ANULADA,10141,2007,5,,,
3910,135175,2007-05-22 00:00:00,200705,8,cancellation,"COLOCAÃÃO POR MEIOS PRÃPRIOS, POR CONTA DE ...",,3661,2007,5,2007.0,5.0,0.0
3911,135175,2008-02-12 00:00:00,200802,10689,application,DESEMPREGADO-NOVO EMPREGO,,10689,2008,2,2008.0,2.0,0.0
3912,135175,,200803,10689,convocation,SESSÃO COLECTIVA,COMPARECEU,31908,2008,3,,,
3913,135175,,200804,10689,convocation,INTERVENÃÃO TÃCNICA,COMPARECEU,33393,2008,4,,,
3914,135175,,200805,10689,intervention,EFA - FORMAÃÃO CONTINUA - NÃVEL SECUNDÃRIO,,5601,2008,5,,,
3915,135175,2008-05-29 16:35:25,200805,10689,cancellation,FREQUÃNCIA DE FORMAÃÃO DO IEFP (ENTRADA SGFOR),,13707,2008,5,2008.0,5.0,0.0
3916,135175,2009-11-19 00:00:00,200911,30893,application,DESEMPREGADO-NOVO EMPREGO,,30893,2009,11,2009.0,11.0,0.0


In [9]:
# filter out movements that don't start with applications
# filter out movements that don't belong to an application date between Jan 2007 - April 2015
# Remove applications that had "phantom" movements after an exit
# Remove intermediary exits for those with a positive exit

In [10]:
# create datetime objects for date of applications
from datetime import datetime, timedelta
apps_df.loc[:,'date']= apps_df.loc[:,'candidatura_data'].apply(
    lambda x: None if x=='' else datetime(year=int(x[0:4]),
                       month=int(x[5:7]),
                       day=int(x[8:10])))

In [11]:
# Identify application IDs that are associated with applications submitted after
# April 2015 - to allow for 2 years of follow-up data
apps_not_late = apps_df[apps_df['date']<datetime(2015, 4, 1)].loc[:,'table_index'].tolist()
print len(apps_not_late)

apps_df = apps_df[apps_df['table_index'].isin(apps_not_late)]
move_df = move_df[move_df['application_id'].isin(apps_not_late)]

print len(apps_df)
print len(move_df)

99659
99659
684311


## Variables for applications / demographics

In [12]:
# create english education levels
eng_edu = {'11': '11', '12': '12', '04': '04', '06': '06', '09':'09',
          'BM': 'bachelor', 'DT': 'doctoral', 'PS': 'post-secondary',
          'SL': 'grade school', 'LC': 'licensed',
          'MT': 'masters', 'NS': 'cannot read/write'}
    
apps_df.loc[:, 'education'] = apps_df['chabilitacao_escolar'].map(eng_edu)

# create english categories
eng_category = {'DESEMPREGADO-NOVO EMPREGO': 'Unemployed - new job',
          'DESEMPREGADO-1Âº EMPREGO': 'Unemployed - first job',
          'EMPREGADO': 'Employed',
          'EMPREGADO A TEMPO PARCIAL':'Part-time employed'}
apps_df.loc[:, 'category'] = apps_df['dcategoria'].map(eng_category)

# create english nationality
eng_nat = {'PORTUGAL':'PORTUGAL','BRASIL': 'BRAZIL',
           'GUINÃ-BISSAU':'GUINEA-BISSAU','CABO VERDE': 'CAPE VERDE'}

apps_df.loc[:,'nationality'] = apps_df['dnacionalidade'].apply(
lambda x: eng_nat[x] if x in eng_nat.keys() else 'OTHER')

#convert dependents to integer
apps_df.loc[:,'all_dependents'] = apps_df.loc[:,'ute_nr_pessoas_cargo'].apply(
    lambda x: None if x is None else float(x))
apps_df.loc[:,'descendants'] = apps_df.loc[:,'ute_nr_descendentes_cargo'].apply(
    lambda x: None if x is None else float(x))

#create age buckets
def age_bucket(x):
    if x<30:
        return "<30"
    elif x<50:
        return "30-49"
    elif x<65:
        return "50-64"
    else:
        return "65+"

apps_df.loc[:, 'age_bucket'] = apps_df.loc[:, 'ute_idade'].apply(lambda x: age_bucket(x))

In [13]:
# create dictionary of applicaiton dates
app_date_dict = dict(zip(apps_df['table_index'], apps_df['date']))

In [14]:
# create dictionary of next application date
apps_df = apps_df.sort_values(['ute_id', 'date'])
apps_df = apps_df.reset_index(drop=True)

next_app_list=[]
for index in range(len(apps_df)-1):
    if apps_df.loc[index,'ute_id'] == apps_df.loc[index+1, 'ute_id']:
        next_app_list.append(apps_df.loc[index+1,'date'])
    else:
        next_app_list.append(np.nan)
next_app_list.append(np.nan)

apps_df.loc[:, 'next_app_date'] = next_app_list

next_app_date_dict = dict(zip(apps_df['table_index'], apps_df['next_app_date']))

In [15]:
# check that bachelors and licensed are exchangable - they are NOT
# apps_df.loc[:,'year']=apps_df.loc[:,'anomes'].apply(lambda x: int(str(x)[0:4]))
# DataFrame(apps_df[apps_df['education'].isin(['licensed', 'bachelor'])].groupby(['education', 'year']).ute_id.count()).unstack()

## Variables for movements

In [16]:
# create datetime objects for date of applications / cancellations
# from datetime import datetime, timedelta
# move_df.loc[:,'date']= move_df.loc[:,'movement_date'].apply(
#     lambda x: None if x=='' else datetime(year=int(x[0:4]),
#                        month=int(x[5:7]),
#                        day=int(x[8:10])))

In [17]:
# create datetime objects for date of applications / cancellations
# create datetime objects placeholder for interviews

def to_datetime(x):
    if x=='':
        return None
    else:
        return datetime(year=int(x[0:4]), month=int(x[5:7]), day=int(x[8:10]))

dates = []
for index in move_df.index:
    if move_df.loc[index,'movement_type'] in ['application', 'cancellation']:
        dates.append(to_datetime(move_df.loc[index, 'movement_date']))
    elif move_df.loc[index,'movement_type'] =='interview':
        dates.append(datetime(
            year = int(str(move_df.loc[index,'year_month'])[0:4]),
            month = int(str(move_df.loc[index,'year_month'])[4:6]),
            day = 15))
    else:
        dates.append(np.nan)
    
move_df.loc[:, 'date'] = dates

In [18]:
# identify exits from system and the job path they took
positive_cancellations = ['COLOCAÃÃO POR MEIOS PRÃPRIOS, POR CONTA DE OUTREM',
                         'COLOCAÃÃO POR MEIOS PRÃPRIOS, POR CONTA PRÃPRIA',
                         'COLOCAÃÃO - CANDIDATURA EXTERNA'
                         'COLOCAÃÃO - CANDIDATURA INTERNA']
exit_reason_list=[]
job_path_list=[]

def identify_exits(df):
    for index in df.index:
        if df['movement_result'][index] =='ADMITIDO / COLOCADO':
            exit_reason_list.append('found_job')
            job_path_list.append('through_IEFP')
        elif df['movement_subtype'][index] in positive_cancellations: #df['movement_type'][index] == 'cancellation':
            exit_reason_list.append('found_job')
            job_path_list.append('outside')
        elif df['movement_type'][index] == 'cancellation':
            exit_reason_list.append('non_job_cancel')
            job_path_list.append(np.nan)
        else:
            exit_reason_list.append('not_exit')
            job_path_list.append(np.nan)
    df.loc[:,'exit_reason']=exit_reason_list
    df.loc[:,'job_path']=job_path_list
    
identify_exits(move_df)

In [19]:
# add time since application
move_df.loc[:, 'app_date'] = move_df.loc[:, 'application_id'].map(app_date_dict)
move_df.loc[:, 'time_since_app'] = move_df.loc[:, 'date'] - move_df.loc[:, 'app_date']

In [20]:
# add time until next application
move_df.loc[:, 'next_app_date'] = move_df.loc[:, 'application_id'].map(next_app_date_dict)
move_df.loc[:, 'time_until_next_app'] = move_df.loc[:, 'next_app_date'] - move_df.loc[:,'date']

In [21]:
len(move_df[(move_df['exit_reason']=='found_job')])

25783

In [22]:
# identify positive exits within 12 months
short_term = move_df[(move_df['exit_reason']=='found_job') & (move_df['time_since_app']<=timedelta(days=365))]

# if application ID has multiple exits, take the last one
short_term = short_term.groupby('application_id').last().reset_index(level='application_id')

In [23]:
short_term.loc[:, 'placement_length'] = short_term.loc[:, 'time_until_next_app'].apply(lambda x: "returned" if x<timedelta(days=365) else "held_on")

In [24]:
# create lists of applications in each label
ST_returned = short_term[short_term['placement_length']=='returned'].application_id.unique().tolist()
ST_held_on = short_term[short_term['placement_length']=='held_on'].application_id.unique().tolist()
print len(ST_returned)
print len(ST_held_on)

7459
11741


In [25]:
# number of applications that were short-term unemployed

print color.BOLD+'\nUnique applications that found a job within 12 months'+color.END
print len(short_term)
print short_term.application_id.nunique()
print float(len(short_term))/float(len(apps_df))

print color.BOLD+'\nUnique applications ST by path'+color.END
print short_term.groupby(['job_path']).application_id.nunique()

print color.BOLD+'\nUnique applications ST by held_on'+color.END
print short_term.groupby(['placement_length']).application_id.nunique()

print color.BOLD+'\nUnique applications ST by path and placement_length'+color.END
print short_term.groupby(['job_path','placement_length']).application_id.nunique()

[1m
Unique applications that found a job within 12 months[0m
19200
19200
0.192656960234
[1m
Unique applications ST by path[0m
job_path
outside         15781
through_IEFP     3419
Name: application_id, dtype: int64
[1m
Unique applications ST by held_on[0m
placement_length
held_on     11741
returned     7459
Name: application_id, dtype: int64
[1m
Unique applications ST by path and placement_length[0m
job_path      placement_length
outside       held_on             10013
              returned             5768
through_IEFP  held_on              1728
              returned             1691
Name: application_id, dtype: int64


In [26]:
# identify non job exits within 12 months, and whether they reappear
non_job_cancel = move_df[(move_df.loc[:,'exit_reason']=='non_job_cancel')]# & (move_df['time_since_app']<=timedelta(days=365))]
non_job_cancel.loc[:,'reappears']=non_job_cancel.loc[:,'time_until_next_app'].apply(lambda x: 'reappears' if x<=timedelta(days=365) else 'no')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [27]:
#only take last cancellation for each app id
non_job_cancel = non_job_cancel.groupby('application_id').last().reset_index(level='application_id')

In [28]:
print color.BOLD+'Unique applications that canceled within 12 months w/o finding a job'+color.END
print non_job_cancel.application_id.nunique()

print color.BOLD+'\nUnique applications that canceled within 12 months w/o finding a job, by reappearance'+color.END
print non_job_cancel.groupby('reappears').application_id.nunique()
# print len(short_term)
# print short_term.application_id.nunique()
# print float(len(short_term))/float(len(apps_df))

[1mUnique applications that canceled within 12 months w/o finding a job[0m
70543
[1m
Unique applications that canceled within 12 months w/o finding a job, by reappearance[0m
reappears
no           51113
reappears    19430
Name: application_id, dtype: int64


In [29]:
# label cancellations with neutral reasons
neutral_cancellations = ['EMIGRAÃÃO, AUSÃNCIA DO PAÃS',
                        'INCAPACIDADE PROLONGADA / PERMANENTE PARA O TRABALHO']

In [30]:
DataFrame(move_df[move_df['movement_type']=='cancellation'].movement_subtype.value_counts(ascending=False))

Unnamed: 0,movement_subtype
FALTA A CONVOCATÃRIA,27473
FALTA AO CONTROLO,26593
"COLOCAÃÃO POR MEIOS PRÃPRIOS, POR CONTA DE OUTREM",19045
"EMIGRAÃÃO, AUSÃNCIA DO PAÃS",4107
TRANSFERÃNCIA DE CENTRO DE EMPREGO,2683
FREQUÃNCIA DE FORMAÃÃO DO IEFP (ENTRADA SGFOR),1649
REFORMOU-SE,1508
"COLOCAÃÃO POR MEIOS PRÃPRIOS, POR CONTA PRÃPRIA",1053
INSERÃÃO NOUTROS PROGRAMAS/MEDIDAS EMPREGO (SIEF),979
DESISTIU,823


## Explore category changes

In [31]:
# map category changes codes to description
cat_translation = {1.0: 'Unemployed - 1st job', 2.0: 'Unemployed - new job',
                  3: 'Employed', 4: 'Part-time employed', 5: 'Busy',
                  6: 'Unavailable - 1st job', 7: 'Unavailable - new job',
                  8: 'Unavailable - employed'}
# cat_changes_df['previous_cat_int'] = cat_changes_df['candidatura_categoria_anterior'].apply(lambda x: float(x) if pd.notnull(x) else None)
# cat_changes_df['previous_cat'] = cat_changes_df['previous_cat_int'].apply(lambda x: cat_translation[x] if pd.notnull(x) else np.nan)
# cat_changes_df['new_cat'] = cat_changes_df['ccategoria'].apply(lambda x: cat_translation[x] if pd.notnull(x) else np.nan)

In [32]:
# move_df[:, 'previous_category'] = move_df['movement_subtype'].map(cat_translation)
# move_df[:, 'new_category'] = move_df['movement_result'].map(cat_translation)

In [33]:
move_df[move_df['movement_type']=='category_change'].head(20)

Unnamed: 0,ute_id,movement_date,year_month,application_id,movement_type,movement_subtype,movement_result,movement_index,year_month_yr,year_month_mth,movement_date_yr,movement_date_mth,year_diff,date,exit_reason,job_path,app_date,time_since_app,next_app_date,time_until_next_app
28,820,,201105,22603,category_change,,3,24326,2011,5,,,,NaT,not_exit,,2009-03-20,NaT,2012-05-17,NaT
54,836,,200907,8464,category_change,,3,13118,2009,7,,,,NaT,not_exit,,2007-11-05,NaT,2011-02-15,NaT
60,836,,201111,50118,category_change,2.0,5,27866,2011,11,,,,NaT,not_exit,,2011-07-27,NaT,NaT,NaT
62,836,,201204,50118,category_change,5.0,2,32168,2012,4,,,,NaT,not_exit,,2011-07-27,NaT,NaT,NaT
167,923,,201405,66570,category_change,2.0,5,67300,2014,5,,,,NaT,not_exit,,2012-10-30,NaT,NaT,NaT
172,923,,201406,66570,category_change,5.0,2,69108,2014,6,,,,NaT,not_exit,,2012-10-30,NaT,NaT,NaT
177,923,,201408,66570,category_change,2.0,3,72300,2014,8,,,,NaT,not_exit,,2012-10-30,NaT,NaT,NaT
184,932,,201106,40942,category_change,,2,25021,2011,6,,,,NaT,not_exit,,2010-09-14,NaT,NaT,NaT
185,932,,201106,40942,category_change,,7,25022,2011,6,,,,NaT,not_exit,,2010-09-14,NaT,NaT,NaT
186,932,,201107,40942,category_change,,2,25510,2011,7,,,,NaT,not_exit,,2010-09-14,NaT,NaT,NaT


In [34]:
apps_found_job = move_df[(move_df['exit_reason']=='found_job')].application_id.unique().tolist()