In [18]:
import pandas as pd
import numpy as np
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from datetime import datetime#, timedelta
from utils import db_utils
from utils import iefp_data_utils

In [19]:
conn = db_utils.connect_to_db()

In [20]:
movements = db_utils.read_table(conn,'cascais','movement')

In [21]:
applications = db_utils.read_table(conn,'cascais','application')

In [22]:
clean_apps = iefp_data_utils.clean_applications(applications,'2016-04-30')

In [23]:
clean_movs = iefp_data_utils.clean_movements(movements, clean_apps['table_index'])
clean_movs.head()

Unnamed: 0,ute_id,movement_date,year_month,application_id,movement_type,movement_subtype,movement_result,movement_index,date
9,818,2013-02-18 00:00:00,201302,70845,application,DESEMPREGADO-NOVO EMPREGO,,70845,2013-02-01
10,818,,201304,70845,convocation,OFERTA,APRESENTADO,119828,2013-04-01
11,818,,201305,70845,interview,,RECUSA DA ENTIDADE EMPREGADORA - NÃO MARCAÃÃ...,85260,2013-05-01
12,818,,201306,70845,convocation,OFERTA,NÃO APRESENTADO,125724,2013-06-01
13,818,,201403,70845,convocation,OFERTA,NÃO COMPARECEU INJUSTIFICADAMENTE,147876,2014-03-01


In [24]:
clean_data = pd.merge(clean_movs,clean_apps,how='inner', left_on = ['ute_id','application_id'], right_on=['ute_id','table_index'])

In [25]:
clean_data.head()

Unnamed: 0,ute_id,movement_date,year_month,application_id,movement_type,movement_subtype,movement_result,movement_index,date,table_index,...,conjuge_estado_civil,conjuge_categoria,conjuge_estado,conjuge_motivo_indisponibilidade,candidatura_categoria_anterior,candidatura_estado_anterior,ute_nr_pessoas_cargo,ute_nr_descendentes_cargo,candidatura_data_ppe,app_start_date
0,818,2013-02-18 00:00:00,201302,70845,application,DESEMPREGADO-NOVO EMPREGO,,70845,2013-02-01,70845,...,,,,,,,2.0,2.0,,2013-02-01
1,818,,201304,70845,convocation,OFERTA,APRESENTADO,119828,2013-04-01,70845,...,,,,,,,2.0,2.0,,2013-02-01
2,818,,201305,70845,interview,,RECUSA DA ENTIDADE EMPREGADORA - NÃO MARCAÃÃ...,85260,2013-05-01,70845,...,,,,,,,2.0,2.0,,2013-02-01
3,818,,201306,70845,convocation,OFERTA,NÃO APRESENTADO,125724,2013-06-01,70845,...,,,,,,,2.0,2.0,,2013-02-01
4,818,,201403,70845,convocation,OFERTA,NÃO COMPARECEU INJUSTIFICADAMENTE,147876,2014-03-01,70845,...,,,,,,,2.0,2.0,,2013-02-01


In [26]:
mov_counts = clean_data.groupby(['ute_id','application_id']).movement_type.value_counts().unstack().reset_index()
mov_counts.head()

movement_type,ute_id,application_id,application,cancellation,category_change,convocation,intervention,interview
0,818,70845,1.0,1.0,,9.0,2.0,2.0
1,820,22603,1.0,1.0,1.0,,2.0,1.0
2,820,61013,1.0,1.0,,3.0,1.0,1.0
3,832,1930,1.0,,,4.0,1.0,1.0
4,832,9174,1.0,1.0,,1.0,,


In [27]:
#Add column to show how many months after the application the movement occurred
def difftime_in_months(timeA,timeB):
    return (timeA-timeB)/np.timedelta64(1, 'M')

clean_data['months_after_app'] = difftime_in_months(clean_data['date'],clean_data['app_start_date'])
clean_data.head()

Unnamed: 0,ute_id,movement_date,year_month,application_id,movement_type,movement_subtype,movement_result,movement_index,date,table_index,...,conjuge_categoria,conjuge_estado,conjuge_motivo_indisponibilidade,candidatura_categoria_anterior,candidatura_estado_anterior,ute_nr_pessoas_cargo,ute_nr_descendentes_cargo,candidatura_data_ppe,app_start_date,months_after_app
0,818,2013-02-18 00:00:00,201302,70845,application,DESEMPREGADO-NOVO EMPREGO,,70845,2013-02-01,70845,...,,,,,,2.0,2.0,,2013-02-01,0.0
1,818,,201304,70845,convocation,OFERTA,APRESENTADO,119828,2013-04-01,70845,...,,,,,,2.0,2.0,,2013-02-01,1.938438
2,818,,201305,70845,interview,,RECUSA DA ENTIDADE EMPREGADORA - NÃO MARCAÃÃ...,85260,2013-05-01,70845,...,,,,,,2.0,2.0,,2013-02-01,2.924085
3,818,,201306,70845,convocation,OFERTA,NÃO APRESENTADO,125724,2013-06-01,70845,...,,,,,,2.0,2.0,,2013-02-01,3.942586
4,818,,201403,70845,convocation,OFERTA,NÃO COMPARECEU INJUSTIFICADAMENTE,147876,2014-03-01,70845,...,,,,,,2.0,2.0,,2013-02-01,12.911969


In [28]:
#Check whether application was cancelled before 12 months
def has_cancellation_before_n_months(x,n=12):
    return np.any((x['movement_type'].isin(['cancellation'])) & (x['months_after_app'] < n))

#Check whether application was placement before 12 months
def has_placement_before_n_months(x,n=12):
    return np.any((x['movement_result'].isin(['ADMITIDO / COLOCADO'])) & (x['months_after_app'] < n))

#Label application as LTU/Non-LTU using LTU criteria
def is_ltu(app_ltu_crit):
    return (~((app_ltu_crit['cancelled_before_12mo']) | (app_ltu_crit['placed_before_12mo'])))

#Compute LTU criteria for application
def get_ltu_criteria(application_movs):
    return pd.Series({'cancelled_before_12mo': has_cancellation_before_n_months(application_movs),
                      'placed_before_12mo':has_placement_before_n_months(application_movs)})

apps_ltu = clean_data.groupby(['ute_id','application_id']).apply(get_ltu_criteria).reset_index()
apps_ltu['ltu'] = is_ltu(apps_ltu)
apps_ltu.head()

Unnamed: 0,ute_id,application_id,cancelled_before_12mo,placed_before_12mo,ltu
0,818,70845,False,False,True
1,820,22603,False,False,True
2,820,61013,True,False,False
3,832,1930,False,True,False
4,832,9174,True,False,False


In [29]:
apps_ltu.head()
mov_counts.head(100)
model_matrix = apps_ltu.merge(mov_counts, on=['ute_id','application_id']).merge(clean_apps,left_on=['ute_id','application_id'], right_on=['ute_id','table_index'])
model_matrix.columns

Index([                             u'ute_id',
                            u'application_id',
                     u'cancelled_before_12mo',
                        u'placed_before_12mo',
                                       u'ltu',
                               u'application',
                              u'cancellation',
                           u'category_change',
                               u'convocation',
                              u'intervention',
                                 u'interview',
                               u'table_index',
                                    u'anomes',
                           u'ctipo_movimento',
                           u'dtipo_movimento',
                                      u'sexo',
                      u'chabilitacao_escolar',
                      u'dhabilitacao_escolar',
                              u'cdeficiencia',
                              u'ddeficiencia',
                            u'cnacionalidade',
             

In [30]:
def findSeason(x):
    month=int(str(x)[4:6])
    if month in [12,1,2]:
        return "Winter"
    if month in [3,4,5]:
        return "Spring"
    if month in [6,7,8]:
        return "Summer"
    if month in [9,10,11]:
        return "Autumn"
    
    

In [31]:
def isPortuguese(x):
    if x=="PT":
        return 1
    else:
        return 0

In [32]:
def lookingForFirstJob(x):
    if "NOVO" in x:
        return 1
    else:
        return 0

In [91]:
#here the feature that can be taken directly is added
feature_matrix=pd.DataFrame()
#model_matrix.head()
#feature_matrix.head()
#feature_matrix["ltu"]=model_matrix["ltu"]
feature_matrix[["ltu","age","gender","is_re_registriation","soc_ben"]]=model_matrix[["ltu","ute_idade","sexo","candidatura_rinsc","sub_rsi"]]
#find season
feature_matrix["season"]=model_matrix["anomes"].apply(findSeson)
#find if from portogal
#how one can check if 
#model_matrix["cnacionalidade"].isnull().value_counts()
#model_matrix["cnacionalidade"].head()
#model_matrix["cancionalidade"].head()
feature_matrix["nationality"]=model_matrix["cnacionalidade"].apply(isPortuguese)
#model_matrix["dcategoria"].value_counts().plot(kind="barh")
model_matrix["dcategoria"].value_counts()
feature_matrix["is_re_registriation"].replace(('S', 'N'), (1, 0), inplace=True)
feature_matrix.head()



Unnamed: 0,ltu,age,gender,is_re_registriation,soc_ben,season,nationality
0,True,50,F,0,N,Winter,1
1,True,39,F,0,N,Spring,1
2,False,42,F,1,N,Spring,1
3,False,32,M,0,N,Spring,1
4,False,33,M,1,N,Winter,1


In [92]:
def createEducationBuckets(x):
    HS=["12"]
    #SL means  able to write, but no school degree,NS means not able to write
    NR=["NS"]
    U6=["SL","04","06"]
    U11=["09","11"]
    
    #The rest is different kinds of higher degree
    if x in HS:
        return "HS"
    if x in NR:
        return "NR"
    if x in U6:
        return "U6"
    if x in U11:
        return "U11"
    else:
        return "MHS"
    
    

In [67]:
def civilStatus(x):
    if x=="S":
        return "S"
    if x=="C":
        return "M"
    else :
        return "O"

In [93]:
feature_matrix["education"]=model_matrix["chabilitacao_escolar"].apply(createEducationBuckets)
#model_matrix[["dhabilitacao_escolar","chabilitacao_escolar"]].head()
#model_matrix["dhabilitacao_escolar"].value_counts().plot(kind="barh")
#model_matrix["chabilitacao_escolar"].value_counts()
model_matrix["ltu"].isnull().value_counts()

feature_matrix["first_job"]=model_matrix["dcategoria"].apply(lookingForFirstJob)
#feature_matrix["first_job"].mean()

model_matrix[["candidatura_rinsc","reinscricao_ult_saida_data"]].head(100)
    #clean_apps['app_start_date'] = pd.to_datetime(clean_apps['anomes'],format="%Y%m")

#model_matrix[["candidatura_data","reinscricao_ult_saida_data"]].head()
feature_matrix["time_since_exit"]=(pd.to_datetime(model_matrix["candidatura_data"])-pd.to_datetime(model_matrix["reinscricao_ult_saida_data"])).dt.days
feature_matrix["time_since_exit"]=pd.to_numeric(feature_matrix["time_since_exit"])

#(feature_matrix["time_since_exit"]/np.timedelta64(1, 'D'))
feature_matrix.head(100)

feature_matrix["is_disabled"]=model_matrix["cdeficiencia"].apply(lambda x: x if x==0 else 1)
feature_matrix.head(100)
model_matrix["ute_estado_civil"].value_counts()
feature_matrix["civil_status"]=model_matrix["ute_estado_civil"].apply(civilStatus)
feature_matrix.head()
#feature_matrix["is_disabled"].value_counts()
#model_matrix["cdeficiencia"].value_counts()
#feature_matrix["time_since_exit"]=timeSinceLastCancellation(model_matrix["candidatura_rinsc"],model_matrix["reinscricao_ult_saida_data"])
#feature_matrix["time_since_exit"]=timeSinceLastCancellation(model_matrx,model_matrix["reinscricao_ult_saida_data"])
#model_matrix["reinscricao_ult_saida_data"]
#model_matrix["reinscricao_ult_saida_data"].head()
#Reinscricao Ult Saida Data
#feature_matrix.head(100)

Unnamed: 0,ltu,age,gender,is_re_registriation,soc_ben,season,nationality,education,first_job,time_since_exit,is_disabled,civil_status
0,True,50,F,0,N,Winter,1,U11,1,,0,M
1,True,39,F,0,N,Spring,1,MHS,1,,0,S
2,False,42,F,1,N,Spring,1,MHS,1,170.0,0,S
3,False,32,M,0,N,Spring,1,U6,1,,0,S
4,False,33,M,1,N,Winter,1,U6,1,24.0,0,S


In [85]:
#add course area and profesional training area
#course area CArea Curso-Tabela em Activo
#professional training area DArea Formacao-Tabela em Activo
model_matrix["darea_curso_tabela_em_activo"].value_counts(normalize=True,dropna=False)
feature_matrix["has_course_area"]=model_matrix["darea_curso_tabela_em_activo"]#.apply(lambda x: 0 if x.isnan() else 1)
feature_matrix["has_course_area"].fillna(0,inplace=True)
feature_matrix["has_course_area"]=feature_matrix["has_course_area"].apply(lambda x: 1 if x==0 else 0)
feature_matrix.head()

Unnamed: 0,ltu,age,gender,is_re_registriation,season,nationality,education,first_job,time_since_exit,is_disabled,civil_status,has_prof_area,has_course_area
0,True,50,F,0,Winter,1,U11,1,,0,M,1,1
1,True,39,F,0,Spring,1,MHS,1,,0,S,0,0
2,False,42,F,1,Spring,1,MHS,1,170.0,0,S,0,0
3,False,32,M,0,Spring,1,U6,1,,0,S,1,1
4,False,33,M,1,Winter,1,U6,1,24.0,0,S,1,1


In [104]:
model_matrix["darea_formacao_tabela_em_activo"].value_counts(normalize=True,dropna=False)
feature_matrix["has_prof_area"]=model_matrix["darea_formacao_tabela_em_activo"]#.apply(lambda x: 0 if x.isnan() else 1)
feature_matrix["has_prof_area"].fillna(0,inplace=True)
feature_matrix["has_prof_area"]=feature_matrix["has_prof_area"].apply(lambda x: 1 if x==0 else 0)
feature_matrix.head()
#model_matrix["candidatura_categoria_anterior"].head(1000)
#Candidatura-Categoria Anterior
#model_matrix["dcnp_pretendida"].value_counts()
#model_matrix["dcpp_pretendida"].value_counts()
# u'cnp_pretendida',
#                           u'dcnp_pretendida',
#                            u'cpp_pretendida',
 #                          u'dcpp_pretendida',








Unnamed: 0,ltu,age,gender,is_re_registriation,soc_ben,season,nationality,education,first_job,time_since_exit,is_disabled,civil_status,has_prof_area
0,True,50,F,0,N,Winter,1,U11,1,,0,M,1
1,True,39,F,0,N,Spring,1,MHS,1,,0,S,0
2,False,42,F,1,N,Spring,1,MHS,1,170.0,0,S,0
3,False,32,M,0,N,Spring,1,U6,1,,0,S,1
4,False,33,M,1,N,Winter,1,U6,1,24.0,0,S,1


In [108]:
def ageBucket(x):
    if x<30:
        return "age<30"
    elif x<50:
        return "30<age<50"
    else:
        return "age>50"

In [126]:
feature_matrix["age_category"]=feature_matrix["age"].apply(ageBucket)
feature_matrix.head(100)
feature_matrix["number_dependence"]=model_matrix["ute_nr_pessoas_cargo"]
feature_matrix.head(100)
feature_matrix.shape
model_matrix["candidatura_estado_anterior"].value_counts(dropna=False)

model_matrix["candidatura_prof_pret_tempo_pratica"].value_counts(dropna=False)
feature_matrix["experience_indended_prof"]=model_matrix["candidatura_prof_pret_tempo_pratica"]
feature_matrix["experience_prev_prof"]=model_matrix["sit_anterior_prof_tempo_pratica"]
feature_matrix.head(100)

#Candidatura-Prof Pret-Tempo Pratica
#Sit Anterior-Prof-Tempo Pratica
#candidatura_prof_pret_tempo_pratica',
#sit_anterior_prof_tempo_pratica',





#Ute-Nr Pessoas Cargo

Unnamed: 0,ltu,age,gender,is_re_registriation,soc_ben,season,nationality,education,first_job,time_since_exit,is_disabled,civil_status,has_prof_area,age_category,number_dependence,experience_indended_prof,experience_prev_prof
0,True,50,F,0,N,Winter,1,U11,1,,0,M,1,age>50,2.0,156.0,156.0
1,True,39,F,0,N,Spring,1,MHS,1,,0,S,0,30<age<50,2.0,144.0,144.0
2,False,42,F,1,N,Spring,1,MHS,1,170.0,0,S,0,30<age<50,2.0,144.0,10.0
3,False,32,M,0,N,Spring,1,U6,1,,0,S,1,30<age<50,1.0,30.0,30.0
4,False,33,M,1,N,Winter,1,U6,1,24.0,0,S,1,30<age<50,1.0,30.0,30.0
5,False,32,F,1,N,Autumn,1,U11,1,1224.0,0,O,1,30<age<50,0.0,48.0,48.0
6,False,35,F,1,N,Winter,1,U11,1,1016.0,0,O,1,30<age<50,0.0,48.0,36.0
7,True,36,F,1,N,Summer,1,HS,1,41.0,0,O,1,30<age<50,0.0,48.0,36.0
8,True,40,F,1,N,Winter,1,HS,1,181.0,0,O,1,30<age<50,0.0,48.0,36.0
9,False,36,M,0,N,Winter,1,HS,1,,0,S,1,30<age<50,0.0,115.0,115.0


In [129]:
feature_matrix.to_csv('/mnt/data/shared/workingData/feature_matrix/data_matrix.csv', index=False, encoding='utf8')