# Data set-up and cleaning

In [1]:
import pandas as pd
import numpy as np
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
import psycopg2 as pg
%matplotlib inline
import seaborn as sns
import dbcreds

In [2]:
conn = pg.connect(dbname = dbcreds.database, host=dbcreds.host, user=dbcreds.user, password = dbcreds.password)

In [3]:
class color:
   PURPLE = '\033[95m'
   CYAN = '\033[96m'
   DARKCYAN = '\033[36m'
   BLUE = '\033[94m'
   GREEN = '\033[92m'
   YELLOW = '\033[93m'
   RED = '\033[91m'
   BOLD = '\033[1m'
   UNDERLINE = '\033[4m'
   END = '\033[0m'

In [4]:
events_df = pd.read_sql('select * from cascais.movement', con=conn)

In [5]:
#print overall counts for dataset - ORIGINAL dataset
print len(events_df)
print events_df.application_id.nunique()
print events_df.ute_id.nunique()

870061
125030
74791


In [34]:
# identify first event for each app ID
first_events = events_df.groupby("application_id").first().movement_type

# identify list of app IDs that do not start with application
apps_to_remove=[]
for app_id in range(1,len(first_events)+1):
    if first_events[app_id] != "application":
        apps_to_remove.append(app_id)
        
#remove all application ids that does not start with an application
events_df = events_df[~events_df['application_id'].isin(apps_to_remove)]

KeyError: 125030

In [None]:
#print overall counts for dataset - REMOVE sets not starting with application
print len(events_df)
print events_df.application_id.nunique()
print events_df.ute_id.nunique()

In [None]:
# Identify application IDs that are associated with applications submitted after
# April 2015 - to allow for 2 years of follow-up data
late_apps = []
for i, row in events_df.iterrows():
    if row['movement_type']=='application' and row['date']>201604:
        late_apps.append(row['application_id'])

In [None]:
#remove late application IDs
events_df = events_df[~events_df['application_id'].isin(late_apps)]


In [None]:
#print overall counts for dataset - REMOVE starting w/o app, & apps w/o
# 2 years of follow-up
print len(events_df)
print events_df.application_id.nunique()
print events_df.ute_id.nunique()

In [None]:
# create datetime objects
from datetime import datetime, timedelta
events_df.loc[:,'datetime']= events_df.loc[:,'date'].apply(
    lambda x: datetime(year=int(str(x)[0:4]), month=int(str(x)[4:6]), day=1))

# add time since application
application_dates_series = events_df.groupby('application_id').first().datetime
events_df.loc[:, 'application_date'] = events_df.loc[:, 'application_id'].apply(lambda x: application_dates_series[x])
events_df.loc[:, 'time_since_app'] = events_df.loc[:, 'datetime']-events_df.loc[:,'application_date']

In [None]:
# identify exits from system 
positive_cancellations = ['COLOCAÃÃO POR MEIOS PRÃPRIOS, POR CONTA DE OUTREM',
                         'COLOCAÃÃO POR MEIOS PRÃPRIOS, POR CONTA PRÃPRIA',
                         'COLOCAÃÃO - CANDIDATURA EXTERNA'
                         'COLOCAÃÃO - CANDIDATURA INTERNA']
is_exit_list=[]
def is_exit(df):
    for index in df.index:
        if df['movement_result'][index] =='ADMITIDO / COLOCADO' or df['movement_subtype'][index] in positive_cancellations: #df['movement_type'][index] == 'cancellation':
            is_exit_list.append('yes')
        else:
            is_exit_list.append('no')
    df.loc[:,'is_exit']=is_exit_list
    
is_exit(events_df)

In [None]:
#print overall counts
print color.BOLD+'Number of unique applications'+color.END
print events_df.application_id.nunique()

print color.BOLD+'\nNumber of unique users'+color.END
print events_df.ute_id.nunique()

print color.BOLD+'\nTotal number of placements through an interview'+color.END
print len(events_df[events_df['movement_result']=='ADMITIDO / COLOCADO'])

In [None]:
# count how many supposed positive exits end up with a non-application as a next event
def next_event(df, index, step):
    if index+step == len(df):
        return 'end'
    elif df.loc[index+step, 'ute_id'] != df.loc[index, 'ute_id']:
        return 'new_user'
    elif df.loc[index+step,'datetime'] == df.loc[index, 'datetime']:
        step+=1
        return next_event(df, index, step)
    else:
        return (df.loc[index+step, 'movement_type'])

def add_next_event(df):
    next_event_list = []
    for index in range(len(df)-1):
        next_event_list.append(next_event(df, index, 1))
    df['next_movement_type']=pd.Series(next_event_list)

In [None]:
events_df = events_df.reset_index(drop=True)
add_next_event(events_df)

In [None]:
events_df[events_df['ute_id']==832]

In [None]:
events_df[events_df['is_exit']=='yes'].groupby('next_movement_type').ute_id.count()

In [None]:
# remove applications that had "phantom" movements after a positive exit
# need to check if there was an application in the same month


# phantom_movements = ['cancellation', 'category_change', 'convocation', 'interventions', 'interviews']
# apps_phantom = []
# positive_exits = events_df[events_df['is_exit']=='yes'].reset_index(drop=True)

# for index in range(len(positive_exits)-1):
#     if positive_exits.loc[index, 'next_movement_type'] in phantom_movements:
#         apps_phantom.append(positive_exits.loc[index,'application_id'])
        
# events_df = events_df[~events_df['application_id'].isin(apps_phantom)]

In [None]:
#print overall counts for dataset - REMOVE starting w/o app, & apps w/o
# 2 years of follow-up, & apps w/ "phantom" movements
print len(events_df)
print events_df.application_id.nunique()
print events_df.ute_id.nunique()

# Data exploration / validation

In [None]:
# show applications that had multiple admitido / colocado placements through IEFP
#events_df[events_df['movement_result']=='ADMITIDO / COLOCADO'].groupby(
#    'application_id').application_id.count().sort_values(ascending=False)

In [None]:
# show example timeline of a user who had multiple placements from one application
# events_df[events_df['application_id']==39862]

In [None]:
# show types of cancellations
#events_df[events_df['movement_type']=='cancellation'].groupby(
#    'movement_subtype').application_id.count().sort_values(ascending=False)

# Finding values for classification flowchart

In [None]:
# add columns about job found

job_path_list=[]
def job_placement_path(df):
    for index in df.index:
        if df['movement_result'][index] =='ADMITIDO / COLOCADO':
            job_path_list.append('through_IEFP')
        elif df['movement_subtype'][index] in positive_cancellations:
            job_path_list.append('outside')
        else:
            job_path_list.append(np.nan)
    df.loc[:,'job_path']=job_path_list
    return df.head()

job_placement_path(events_df)

In [None]:
# unique applications that found a job
print color.BOLD+'Unique applications that found a job ever'+color.END
print events_df.groupby('job_path').application_id.nunique()

print color.BOLD+'\nUnique users that found a job ever'+color.END
print events_df.groupby('job_path').ute_id.nunique()

In [35]:
#found a job within 12 months
events_within_year = events_df[events_df['time_since_app']<=timedelta(days=365)]

print color.BOLD+'Unique applications that found a job within 12 months'+color.END
print events_within_year.groupby('job_path').application_id.nunique()

print color.BOLD+'\nUnique users that found a job within 12 months'+color.END
print events_within_year.groupby('job_path').ute_id.nunique()

KeyError: 'time_since_app'

In [36]:
# add time until next event, if not a user change & not the same month
events_df = events_df.reset_index(drop=True)

time_until_next_list = []
for index in range(len(events_df)-1):
    if events_df.loc[index+1, 'ute_id'] != events_df.loc[index, 'ute_id']:
        time_until_next_list.append(np.nan)
    elif events_df.loc[index+1, 'datetime'] == events_df.loc[index, 'datetime']:
        time_until_next_list.append(np.nan)
    else:
        time_until_next_list.append(events_df.loc[index+1, 'datetime'] - events_df.loc[index,'datetime'])
    
events_df['time_until_next']=pd.Series(time_until_next_list)

KeyError: 'the label [datetime] is not in the [index]'

In [None]:
#found a job within 12 months & didn't return in 12 months
found_job_df = events_df[events_df.loc[:,'job_path'].notnull()]
found_job_df.loc[:,'found_within_year'] = events_df.loc[:,'time_since_app'].apply(lambda x: x<=timedelta(days=365))
found_job_df.loc[:,'next_within_year'] = events_df.loc[:,'time_until_next'].apply(lambda x: x<=timedelta(days=365))

#remove intermediary exits
app_ids = []
exit_count = []
for index in found_job_df.index:
    app_id = found_job_df.loc[index,'application_id']
    app_ids.append(app_id)
    exit_count.append(app_ids.count(app_id))
found_job_df.loc[:,'exit_number']=exit_count

In [37]:
# show example timelines of users that appear both in and out of next within year
#events_df[events_df['application_id']==465]
#events_df[events_df['ute_id']==131234]
events_df[events_df['application_id']==1897]
#found_job_within_year[found_job_within_year['application_id']==1897]

Unnamed: 0,ute_id,date,application_id,movement_type,movement_subtype,movement_result,movement_index
160275,1628253,200703,1897,application,DESEMPREGADO-NOVO EMPREGO,,1897
160276,1628253,200704,1897,interview,,RECUSA ENTIDADE EMPREGADORA- OUTROSMOTIVOS,3610
160277,1628253,200705,1897,convocation,INTERVENÃÃO TÃCNICA,NÃO COMPARECEU INJUSTIFICADAMENTE,10820
160278,1628253,200705,1897,interview,,RECUSA DE ENTIDADE EMPREGADORA - DESAJUSTAMENT...,4705
160279,1628253,200706,1897,convocation,OFERTA,APRESENTADO,14430
160280,1628253,200706,1897,interview,,RECUSA DE ENTIDADE EMPREGADORA - DESAJUSTAMENT...,6309
160281,1628253,200708,1897,convocation,SESSÃO COLECTIVA,COMPARECEU,18555
160282,1628253,200709,1897,convocation,OFERTA,APRESENTADO,20433
160283,1628253,200710,1897,interview,,ADMITIDO / COLOCADO,11397
160284,1628253,200710,1897,intervention,RVCC - OUTRAS ENTIDADES,,2903


In [38]:
# find ute_ids that appear both in and out of next within year

# found_job_within_year = found_job_df[found_job_df['found_within_year']==True]
# app_relapse = found_job_within_year[found_job_within_year['next_within_year']==True].application_id.unique()
# app_perm = found_job_within_year[found_job_within_year['next_within_year']==False].application_id.unique()
# print len(app_relapse)
# print len(app_perm)
# app_intersect = set(app_relapse).intersection(app_perm)
# app_intersect

In [39]:
print color.BOLD+'Unique applications that found a job within 12 months and did not return in 12 months'+color.END
print found_job_df.groupby(['found_within_year','job_path','next_within_year']).application_id.nunique()
print color.BOLD+'\nUnique users that found a job within 12 months and did not return in 12 months'+color.END
print found_job_df.groupby(['found_within_year','job_path','next_within_year']).ute_id.nunique()

[1mUnique applications that found a job within 12 months and did not return in 12 months[0m


NameError: name 'found_job_df' is not defined

In [40]:
found_job_df.head(100)

NameError: name 'found_job_df' is not defined

In [41]:
# label cancellations with neutral reasons
cancel_df = events_df[events_df['movement_type']=='cancellation']

In [42]:
neutral_cancellations = ['EMIGRAÃÃO, AUSÃNCIA DO PAÃS',
                        'INCAPACIDADE PROLONGADA / PERMANENTE PARA O TRABALHO']

In [43]:
cancel_df['cancel_label'] = 

SyntaxError: invalid syntax (<ipython-input-43-83ef2d7ea17a>, line 1)