In [1]:
import pandas as pd
import numpy as np
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
import psycopg2 as pg
%matplotlib inline
import seaborn as sns
from sqlalchemy import create_engine
from datetime import datetime#, timedelta
from utils import db_utils
from utils import iefp_data_utils

In [2]:
#Connect to database
conn = db_utils.connect_to_db()

In [3]:
#Read movements data from database as a dataframe
movements = db_utils.read_table(conn,'cascais_v2','movement')

In [4]:
#Read Application Data
applications = db_utils.read_table(conn,'cascais_v2','application')

In [5]:
#Initial Data
print "Initial Data:"
print "Movements: ", movements.shape
print "Applications: ", applications.shape
print "Users: ", applications.ute_id.unique().shape

Initial Data:
Movements:  (1244698, 7)
Applications:  (125029, 61)
Users:  (65523,)


In [12]:
#Applications data filtering
applications['app_start_date'] = pd.to_datetime(applications['candidatura_data'])
apps_within_date_limit = applications[(applications['app_start_date'] < '1980-01-01') | (applications['app_start_date'] > '2015-04-30')]
print "Applications which started after 04-30-2015 (leaving 2 years of follow up): ", apps_within_date_limit.shape[0], "(", apps_within_date_limit.shape[0]/float(applications.shape[0]), "%)"

apps_from_employed = applications[applications['dcategoria'].isin(['EMPREGADO', 'EMPREGADO A TEMPO PARCIAL'])] 
print "Applications of people who are (part-time)employed: ", apps_from_employed.shape[0], "(", apps_from_employed.shape[0]/float(applications.shape[0]), "%)"

Applications which started after 04-30-2015 (leaving 2 years of follow up):  24378 ( 0.194978764927 %)
Applications of people who are (part-time)employed:  5103 ( 0.0408145310288 %)


In [13]:
clean_apps = iefp_data_utils.clean_applications(applications,'1980-01-01','2015-04-30')

print "Clean applications left: ", clean_apps.shape[0], "(", clean_apps.shape[0]/float(applications.shape[0]), "%)"
print "Number of unique users: ", clean_apps.ute_id.unique().shape[0], "(", clean_apps.ute_id.unique().shape[0]/float(applications.ute_id.unique().shape[0]), "%)"

Clean applications left:  96388 ( 0.770925145366 %)
Number of unique users:  55375 ( 0.84512308655 %)


In [14]:
#Movements data filtering
movements['date'] = pd.to_datetime(movements['movement_event_date'])
movs_no_preceding_app = movements[movements['application_id'] == -1] #Removing movements that don't start with an application
print "Movements which do not have a preceding application: ", movs_no_preceding_app.shape[0], "(", movs_no_preceding_app.shape[0]/float(movements.shape[0]), "%)"

Movements which do not have a preceding application:  99421 ( 0.0798756003464 %)


In [16]:
clean_movs = iefp_data_utils.clean_movements(movements, '1980-01-01','2017-04-30',clean_apps['table_index'])
print "Clean movements left: ", clean_movs.shape[0], "(", clean_movs.shape[0]/float(movements.shape[0]), "%)"

Clean movements left:  934232 ( 0.75056921438 %)


In [17]:
#Removing "clean" movements which do not match clean applications
clean_data = pd.merge(clean_movs,clean_apps,how='inner', left_on = 'application_id', right_on='table_index')
print "Clean movements left: ", clean_data.shape[0], "(", clean_data.shape[0]/float(movements.shape[0]), "%)"

Clean movements left:  934232 ( 0.75056921438 %)


In [18]:
#Removing movements which do not match clean applications
clean_data2 = pd.merge(movements,clean_apps,how='inner', left_on = 'application_id', right_on='table_index')
print "Clean movements left: ", clean_data2.shape[0], "(", clean_data2.shape[0]/float(movements.shape[0]), "%)"

Clean movements left:  934244 ( 0.750578855273 %)
