# Data set-up and cleaning

In [1]:
import pandas as pd
import numpy as np
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
import psycopg2 as pg
%matplotlib inline
import seaborn as sns
import dbcreds

In [2]:
conn = pg.connect(dbname = dbcreds.database, host=dbcreds.host, user=dbcreds.user, password = dbcreds.password)

In [3]:
class color:
   PURPLE = '\033[95m'
   CYAN = '\033[96m'
   DARKCYAN = '\033[36m'
   BLUE = '\033[94m'
   GREEN = '\033[92m'
   YELLOW = '\033[93m'
   RED = '\033[91m'
   BOLD = '\033[1m'
   UNDERLINE = '\033[4m'
   END = '\033[0m'

In [4]:
events_df = pd.read_sql('select * from cascais.movements', con=conn)

In [5]:
#print overall counts for dataset - ORIGINAL dataset
print len(events_df)
print events_df.application_id.nunique()
print events_df.ute_id.nunique()

870061
140714
74791


In [6]:
# identify first event for each app ID
first_events = events_df.groupby("application_id").first().movement_type

# identify list of app IDs that do not start with application
apps_to_remove=[]
for app_id in range(1,len(first_events)+1):
    if first_events[app_id] != "application":
        apps_to_remove.append(app_id)
        
#remove all application ids that does not start with an application
events_df = events_df[~events_df['application_id'].isin(apps_to_remove)]

In [7]:
#print overall counts for dataset - REMOVE sets not starting with application
print len(events_df)
print events_df.application_id.nunique()
print events_df.ute_id.nunique()

808166
125029
65523


In [8]:
# Identify application IDs that are associated with applications submitted after
# April 2015 - to allow for 2 years of follow-up data
late_apps = []
for i, row in events_df.iterrows():
    if row['movement_type']=='application' and row['date']>201604:
        late_apps.append(row['application_id'])

In [9]:
#remove late application IDs
events_df = events_df[~events_df['application_id'].isin(late_apps)]


In [10]:
#print overall counts for dataset - REMOVE starting w/o app, & apps w/o
# 2 years of follow-up
print len(events_df)
print events_df.application_id.nunique()
print events_df.ute_id.nunique()

765659
113249
61403


In [11]:
# create datetime objects
from datetime import datetime, timedelta
events_df.loc[:,'datetime']= events_df.loc[:,'date'].apply(
    lambda x: datetime(year=int(str(x)[0:4]), month=int(str(x)[4:6]), day=1))

# add time since application
application_dates_series = events_df.groupby('application_id').first().datetime
events_df.loc[:, 'application_date'] = events_df.loc[:, 'application_id'].apply(lambda x: application_dates_series[x])
events_df.loc[:, 'time_since_app'] = events_df.loc[:, 'datetime']-events_df.loc[:,'application_date']

In [12]:
# identify exits from system 
positive_cancellations = ['COLOCAÃÃO POR MEIOS PRÃPRIOS, POR CONTA DE OUTREM',
                         'COLOCAÃÃO POR MEIOS PRÃPRIOS, POR CONTA PRÃPRIA',
                         'COLOCAÃÃO - CANDIDATURA EXTERNA'
                         'COLOCAÃÃO - CANDIDATURA INTERNA']
is_exit_list=[]
def is_exit(df):
    for index in df.index:
        if df['movement_result'][index] =='ADMITIDO / COLOCADO' or df['movement_subtype'][index] in positive_cancellations: #df['movement_type'][index] == 'cancellation':
            is_exit_list.append('yes')
        else:
            is_exit_list.append('no')
    df.loc[:,'is_exit']=is_exit_list
    
is_exit(events_df)

In [13]:
#print overall counts
print color.BOLD+'Number of unique applications'+color.END
print events_df.application_id.nunique()

print color.BOLD+'\nNumber of unique users'+color.END
print events_df.ute_id.nunique()

print color.BOLD+'\nTotal number of placements through an interview'+color.END
print len(events_df[events_df['movement_result']=='ADMITIDO / COLOCADO'])

[1mNumber of unique applications[0m
113249
[1m
Number of unique users[0m
61403
[1m
Total number of placements through an interview[0m
6776


In [None]:
# count how many supposed positive exits end up with a non-application as a next event
def next_event(df, step):
    df = df.reset_index(drop=True)
    next_event_list = []
    for index in range(len(df)-1):
        if df.loc[index+step, 'ute_id'] != df.loc[index, 'ute_id']:
            next_event_list.append('new_user')
        elif df.loc[index+step,'datetime'] == df.loc[index, 'datetime']:
            step+=1
            next_event(df, step)
            #next_event_list.append('same_month')
        else:
            next_event_list.append(events_df.loc[index+1, 'movement_type'])
    df['next_movement_type']=pd.Series(next_event_list)

#events_df[events_df['is_exit']=='yes'].groupby('next_movement_type').ute_id.count()

In [None]:
next_event(events_df)

In [15]:
# remove applications that had "phantom" movements after a positive exit
phantom_movements = ['cancellation', 'category_change', 'convocation', 'interventions', 'interviews']
apps_phantom = []
positive_exits = events_df[events_df['is_exit']=='yes'].reset_index(drop=True)

for index in range(len(positive_exits)-1):
    if positive_exits.loc[index, 'next_movement_type'] in phantom_movements:
        apps_phantom.append(positive_exits.loc[index,'application_id'])

In [16]:
events_df = events_df[~events_df['application_id'].isin(apps_phantom)]

In [17]:
#print overall counts for dataset - REMOVE starting w/o app, & apps w/o
# 2 years of follow-up, & apps w/ "phantom" movements
print len(events_df)
print events_df.application_id.nunique()
print events_df.ute_id.nunique()

753130
111510
61102


# Data exploration / validation

In [18]:
# show applications that had multiple admitido / colocado placements through IEFP
#events_df[events_df['movement_result']=='ADMITIDO / COLOCADO'].groupby(
#    'application_id').application_id.count().sort_values(ascending=False)

In [19]:
# show example timeline of a user who had multiple placements from one application
# events_df[events_df['application_id']==39862]

In [20]:
# show types of cancellations
#events_df[events_df['movement_type']=='cancellation'].groupby(
#    'movement_subtype').application_id.count().sort_values(ascending=False)

# Finding values for classification flowchart

In [21]:
# add columns about job found

job_path_list=[]
def job_placement_path(df):
    for index in df.index:
        if df['movement_result'][index] =='ADMITIDO / COLOCADO':
            job_path_list.append('through_IEFP')
        elif df['movement_subtype'][index] in positive_cancellations:
            job_path_list.append('outside')
        else:
            job_path_list.append(np.nan)
    df.loc[:,'job_path']=job_path_list
    return df.head()

job_placement_path(events_df)

Unnamed: 0,ute_id,date,application_id,movement_type,movement_subtype,movement_result,movement_index,datetime,application_date,time_since_app,is_exit,next_movement_type,job_path
0,818,201302,3,application,DESEMPREGADO-NOVO EMPREGO,,70865,2013-02-01,2013-02-01,0 days,no,convocation,
1,818,201304,3,convocation,OFERTA,APRESENTADO,119828,2013-04-01,2013-02-01,59 days,no,interviews,
2,818,201305,3,interviews,,RECUSA DA ENTIDADE EMPREGADORA - NÃO MARCAÃÃ...,85298,2013-05-01,2013-02-01,89 days,no,convocation,
3,818,201306,3,convocation,OFERTA,NÃO APRESENTADO,125724,2013-06-01,2013-02-01,120 days,no,convocation,
4,818,201403,3,convocation,OFERTA,NÃO COMPARECEU INJUSTIFICADAMENTE,147876,2014-03-01,2013-02-01,393 days,no,convocation,


In [22]:
# unique applications that found a job
print color.BOLD+'Unique applications that found a job ever'+color.END
print events_df.groupby('job_path').application_id.nunique()

print color.BOLD+'\nUnique users that found a job ever'+color.END
print events_df.groupby('job_path').ute_id.nunique()

[1mUnique applications that found a job ever[0m
job_path
outside         20634
through_IEFP     6058
Name: application_id, dtype: int64
[1m
Unique users that found a job ever[0m
job_path
outside         15406
through_IEFP     5386
Name: ute_id, dtype: int64


In [23]:
#found a job within 12 months
events_within_year = events_df[events_df['time_since_app']<=timedelta(days=365)]

print color.BOLD+'Unique applications that found a job within 12 months'+color.END
print events_within_year.groupby('job_path').application_id.nunique()

print color.BOLD+'\nUnique users that found a job within 12 months'+color.END
print events_within_year.groupby('job_path').ute_id.nunique()

[1mUnique applications that found a job within 12 months[0m
job_path
outside         16746
through_IEFP     4664
Name: application_id, dtype: int64
[1m
Unique users that found a job within 12 months[0m
job_path
outside         12489
through_IEFP     4126
Name: ute_id, dtype: int64


In [24]:
# add time until next event, if not a user change & not the same month
events_df = events_df.reset_index(drop=True)

time_until_next_list = []
for index in range(len(events_df)-1):
    if events_df.loc[index+1, 'ute_id'] != events_df.loc[index, 'ute_id']:
        time_until_next_list.append(np.nan)
    elif events_df.loc[index+1, 'datetime'] == events_df.loc[index, 'datetime']:
        time_until_next_list.append(np.nan)
    else:
        time_until_next_list.append(events_df.loc[index+1, 'datetime'] - events_df.loc[index,'datetime'])
    
events_df['time_until_next']=pd.Series(time_until_next_list)

In [25]:
#found a job within 12 months & didn't return in 12 months
found_job_df = events_df[events_df.loc[:,'job_path'].notnull()]
found_job_df.loc[:,'found_within_year'] = events_df.loc[:,'time_since_app'].apply(lambda x: x<=timedelta(days=365))
found_job_df.loc[:,'next_within_year'] = events_df.loc[:,'time_until_next'].apply(lambda x: x<=timedelta(days=365))

#remove intermediary exits
app_ids = []
exit_count = []
for index in found_job_df.index:
    app_id = found_job_df.loc[index,'application_id']
    app_ids.append(app_id)
    exit_count.append(app_ids.count(app_id))
found_job_df.loc[:,'exit_number']=exit_count

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [26]:
# show example timelines of users that appear both in and out of next within year
#events_df[events_df['application_id']==465]
#events_df[events_df['ute_id']==131234]
events_df[events_df['application_id']==1897]
#found_job_within_year[found_job_within_year['application_id']==1897]

Unnamed: 0,ute_id,date,application_id,movement_type,movement_subtype,movement_result,movement_index,datetime,application_date,time_since_app,is_exit,next_movement_type,job_path,time_until_next
11174,273106,201507,1897,application,DESEMPREGADO-NOVO EMPREGO,,102552,2015-07-01,2015-07-01,0 days,no,same_month,,NaT
11175,273106,201507,1897,cancellation,"COLOCAÃÃO POR MEIOS PRÃPRIOS, POR CONTA DE ...",,92320,2015-07-01,2015-07-01,0 days,yes,same_month,outside,NaT
11176,273106,201507,1897,category_change,OCUPADO,,90430,2015-07-01,2015-07-01,0 days,no,same_month,,NaT
11177,273106,201507,1897,category_change,DESEMPREGADO-NOVO EMPREGO,,90431,2015-07-01,2015-07-01,0 days,no,same_month,,NaT
11178,273106,201507,1897,convocation,GERAL UTENTE,ANULADA,200461,2015-07-01,2015-07-01,0 days,no,same_month,,NaT
11179,273106,201507,1897,interventions,FORMAÃÃO - VIDA ATIVA,,107767,2015-07-01,2015-07-01,0 days,no,same_month,,NaT
11180,273106,201507,1897,interventions,EFA - N2 - FORMAÃÃO PROFISSIONAL,,107768,2015-07-01,2015-07-01,0 days,no,cancellation,,123 days
11181,273106,201511,1897,cancellation,"COLOCAÃÃO POR MEIOS PRÃPRIOS, POR CONTA DE ...",,96453,2015-11-01,2015-07-01,123 days,yes,application,outside,92 days


In [27]:
found_job_within_year[found_job_within_year['application_id']==1897]

NameError: name 'found_job_within_year' is not defined

In [None]:
# find ute_ids that appear both in and out of next within year

found_job_within_year = found_job_df[found_job_df['found_within_year']==True]
app_relapse = found_job_within_year[found_job_within_year['next_within_year']==True].application_id.unique()
app_perm = found_job_within_year[found_job_within_year['next_within_year']==False].application_id.unique()
print len(app_relapse)
print len(app_perm)
app_intersect = set(app_relapse).intersection(app_perm)
app_intersect

In [None]:
print color.BOLD+'Unique applications that found a job within 12 months and did not return in 12 months'+color.END
print found_job_df.groupby(['found_within_year','job_path','next_within_year']).application_id.nunique()
print color.BOLD+'\nUnique users that found a job within 12 months and did not return in 12 months'+color.END
print found_job_df.groupby(['found_within_year','job_path','next_within_year']).ute_id.nunique()

In [None]:
found_job_df.head(100)