# Detect the participants who have only been screened

> Run the eligibility notebook

> Run the survey data notebook



In [2]:
# Execute the two sibling notebooks inside this kernel. The cells below use
# `eligibility` and `survey_data`, which are not defined anywhere in this
# notebook (presumably these runs create them; confirm).
%run "../eligibility.ipynb"
%run "../survey_data/survey_data.ipynb"

In [3]:
# Ask IPython to display the value of every expression statement in a cell,
# not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# (rows, columns) of each frame.
# NOTE(review): only one shape appears in the saved output, so the setting
# above seems to apply only to cells executed after this one; confirm.
survey_data.shape
eligibility.shape

(37748, 31)

In [4]:
# List the column names of both frames to see which fields they have in common.
survey_data.columns
eligibility.columns

Index(['case_id', 'organisation', 'first_name', 'last_name', 'dob',
       'interview_date', 'adress', 'commune', 'commune_section',
       'interviewer_firstname', 'interviewer_lastname', 'is_your_parent_alive',
       'mothers_name', 'fathers_name', 'who_is_your_law_parent', 'total',
       'is_eligible_for_dep_hiv', 'age', 'already_in_a_group'],
      dtype='object')

Index(['case_id', 'dreams_code', 'organisation', 'id', 'phone',
       'not_selectable', 'first_name', 'last_name', 'dob', 'interview_date',
       'adress', 'commune', 'commune_section', 'interviewer_firstname',
       'interviewer_lastname', 'is_your_parent_alive', 'mothers_name',
       'fathers_name', 'total', 'who_is_your_law_parent', 'group_name',
       'id_group', 'hub_name', 'id_hub', 'eske_ou_lekol_deja',
       'depi_kile_ou_pa_al_lekol_with_label',
       'c61_depi_ki_l_ou_pa_al_lekl_ank', 'c6_ske_ou_te_oblije_double_deja',
       'kiyes_ki_peye_lekol_pou_ou', 'age', 'kpi_score'],
      dtype='object')

## Eligibility

In [5]:
# Mark records without a DREAMS code with the sentinel string "null" so they
# can be selected with a plain equality test later on.
# Assign the filled column back instead of calling fillna(inplace=True) on
# `eligibility.dreams_code`: the in-place call on an attribute-accessed column
# is chained assignment, which emits a FutureWarning in pandas 2.x and leaves
# the frame unchanged once copy-on-write is active.
eligibility["dreams_code"] = eligibility["dreams_code"].fillna("null")

In [6]:
def tranche_age_classique(age):
    """Return the standard five-year age band label for ``age``.

    The bands are inclusive on both ends: "10-14", "15-19", "20-24" and
    "25-29". Any value that falls in none of them (including values between
    two bands or NaN) yields "not_valid_age".
    """
    bands = (
        (10, 14, "10-14"),
        (15, 19, "15-19"),
        (20, 24, "20-24"),
        (25, 29, "25-29"),
    )
    for lower, upper, label in bands:
        if age >= lower and age <= upper:
            return label
    return "not_valid_age"
        
def tranche_age_mineur_majeur(age):
    """Return the age band label that splits minors from adults for ``age``.

    The bands are inclusive on both ends: "10-17", "18-19", "20-24" and
    "25-29". Any value that falls in none of them (including values between
    two bands or NaN) yields "not_valid_age".
    """
    bands = (
        (10, 17, "10-17"),
        (18, 19, "18-19"),
        (20, 24, "20-24"),
        (25, 29, "25-29"),
    )
    for lower, upper, label in bands:
        if age >= lower and age <= upper:
            return label
    return "not_valid_age"

In [7]:
# Derive two age-band columns from `age`: the five-year bands (age_range) and
# the variant that splits 10-17 from 18-19 (newage_range).
eligibility['age_range'] = eligibility.age.map(tranche_age_classique)
eligibility['newage_range'] = eligibility.age.map(tranche_age_mineur_majeur)

In [8]:
import pandas as pd
from datetime import datetime

In [9]:
# Parse interview_date into datetimes; the date helpers defined below read
# .year/.month and compare against datetime objects.
eligibility["date_entevyou"] = pd.to_datetime( eligibility.interview_date)

In [10]:
def fiscalYear21(date):
    """Return the FY21 quarter label for ``date``.

    ``date`` must expose ``.year`` and ``.month``. October-December 2020 is
    "FY21Q1", January-March 2021 is "FY21Q2", April-June 2021 is "FY21Q3" and
    July-September 2021 is "FY21Q4". Anything else yields "not_valid_fy".
    """
    quarters = (
        (2020, 10, 12, "FY21Q1"),
        (2021, 1, 3, "FY21Q2"),
        (2021, 4, 6, "FY21Q3"),
        (2021, 7, 9, "FY21Q4"),
    )
    for year, first_month, last_month, label in quarters:
        if date.year == year and date.month >= first_month and date.month <= last_month:
            return label
    return "not_valid_fy"
    

    

def validTimeOnSystem(date):
    """Tell whether ``date`` lies between 2020-04-01 and the current moment.

    Both bounds are inclusive. Returns "required_Time_on" when the date is
    inside the window and "not_valid_time_on" otherwise.
    """
    earliest = datetime.strptime("2020-04-01", "%Y-%m-%d")
    # The upper bound is read at call time, so the window grows as time passes.
    in_window = date >= earliest and date <= datetime.now()
    return "required_Time_on" if in_window else "not_valid_time_on"


In [11]:
# Label every interview date with its FY21 quarter and with whether it falls
# inside the accepted time window (2020-04-01 up to now).
eligibility["fiscal_year"] = eligibility.date_entevyou.map(fiscalYear21)
eligibility["timeOn_system"] = eligibility.date_entevyou.map(validTimeOnSystem)

In [12]:
# Keep the records that have no DREAMS code (the "null" sentinel), whose
# interview date is inside the accepted window, and whose age band is neither
# "25-29" nor invalid.
almost_eligibility = eligibility.loc[
    eligibility["dreams_code"].eq("null")
    & eligibility["timeOn_system"].eq("required_Time_on")
    & ~eligibility["age_range"].isin(["25-29", "not_valid_age"])
]

In [13]:
# Every kept row has dreams_code == "null", so this non-null count equals the
# number of rows that passed the filter.
almost_eligibility.dreams_code.count()

3791

In [14]:
def firstlastdob(df):
    """Build a "first_name last_name dob" string for one record.

    ``df`` is any object exposing ``first_name``, ``last_name`` and ``dob``
    attributes (for example a DataFrame row passed by ``apply(axis=1)``).
    The three values are joined with single spaces.
    """
    key_parts = (df.first_name, df.last_name, df.dob)
    return "{} {} {}".format(*key_parts)

In [15]:
# Add the name/dob key with .assign(), which returns a new frame, rather than
# writing a column into a filtered slice of `eligibility`. This avoids pandas'
# SettingWithCopyWarning, so no output needs to be suppressed for this cell.
almost_eligibility = almost_eligibility.assign(
    unduplicate_me=almost_eligibility.apply(firstlastdob, axis=1)
)
# Keep the first row for each first_name/last_name/dob key.
clean_eligibility = almost_eligibility.drop_duplicates(subset=["unduplicate_me"])


In [16]:
# Rows left after dropping duplicate first_name/last_name/dob keys.
clean_eligibility.dreams_code.count()

3773

## Survey data

In [17]:
# Add the "first_name last_name dob" key to every survey row; it is the
# column used further down to drop duplicate people.
survey_data["remove_duplicate"] = survey_data.apply(firstlastdob, axis=1)

In [18]:
# Same derived columns as on the eligibility frame: parsed interview date,
# FY21 quarter, time-window flag and the two age-band variants.
survey_data["date_entevyou"] = pd.to_datetime( survey_data.interview_date)
survey_data["fiscal_year"] = survey_data.date_entevyou.map(fiscalYear21)
survey_data["timeOn_system"] = survey_data.date_entevyou.map(validTimeOnSystem)
survey_data['age_range'] = survey_data.age.map(tranche_age_classique)
survey_data['newage_range'] = survey_data.age.map(tranche_age_mineur_majeur)

In [19]:
# Keep the survey rows that are not in a group yet, whose interview date is
# inside the accepted window, whose age band is neither "25-29" nor invalid,
# and whose total is at least 14.
almost_survey_data = survey_data.loc[
    survey_data["already_in_a_group"].eq("no")
    & survey_data["timeOn_system"].eq("required_Time_on")
    & ~survey_data["age_range"].isin(["25-29", "not_valid_age"])
    & survey_data["total"].ge(14)
]

In [20]:
# Number of non-null case_id values among the filtered survey rows.
almost_survey_data.case_id.count()

3790

In [21]:
# Keep the first row for each first_name/last_name/dob key.
clean_survey_data =  almost_survey_data.drop_duplicates(subset=["remove_duplicate"])

In [22]:
# Number of non-null case_id values left after removing duplicates.
clean_survey_data.case_id.count()

3772

## What we want: the updated screening list

In [23]:
# Keep only the join key (case_id) and the group flag from the survey side.
id_form_survey_data = clean_survey_data[["case_id","already_in_a_group"]]

In [24]:
# Right join on case_id: every row of clean_eligibility is kept, and
# already_in_a_group is filled in where the case_id also exists in the cleaned
# survey data (missing otherwise).
# NOTE(review): a case_id repeated on the survey side would multiply rows
# here; uniqueness of case_id is not checked in this notebook.
update_screening_data = id_form_survey_data.merge(clean_eligibility, on="case_id",how='right')

In [25]:
# Number of non-null case_id values in the merged result.
update_screening_data.case_id.count()

3773

In [26]:
# Write the merged list to an Excel file in the current working directory,
# without the index column; missing values are written as the text "NULL".
update_screening_data.to_excel("update_screened.xlsx",index=False,na_rep="NULL")

In [27]:
# Earliest and latest interview dates in the exported list, as YYYY-MM-DD.
update_screening_data.date_entevyou.min().strftime("%Y-%m-%d")
update_screening_data.date_entevyou.max().strftime("%Y-%m-%d")

'2020-05-25'

'2021-06-27'

## Verification against the 4360 Mastersheet records

In [28]:
# Load the earlier screening list for comparison.
# NOTE(review): relative path, so the file must sit in the notebook's working
# directory.
old_data = pd.read_excel("old_screened_4360.xlsx")

In [39]:
# Rows of the new list whose case_id also appears in the old list.
how_much_isin_old_data = update_screening_data[update_screening_data.case_id.isin(old_data.case_id)]

In [41]:
# Rows of the new list whose case_id does not appear in the old list.
how_much_isnotin_old_data  = update_screening_data[~update_screening_data.case_id.isin(old_data.case_id)]

3501

In [43]:
# Number of case_ids that are new relative to the old list.
how_much_isnotin_old_data.case_id.count()

272

## Counts and interview date ranges

In [44]:
# Old list: non-null case_id count, then earliest and latest interview dates.
old_data.case_id.count()
old_data.date_entevyou.min().strftime("%Y-%m-%d")
old_data.date_entevyou.max().strftime("%Y-%m-%d")

4360

'2020-04-18'

'2021-06-13'

In [45]:
# Records present in both lists: count, then earliest and latest interview dates.
how_much_isin_old_data.case_id.count()
how_much_isin_old_data.date_entevyou.min().strftime("%Y-%m-%d")
how_much_isin_old_data.date_entevyou.max().strftime("%Y-%m-%d")

3501

'2020-05-25'

'2021-06-13'

In [46]:
# Records only in the new list: count, then earliest and latest interview dates.
how_much_isnotin_old_data.case_id.count()
how_much_isnotin_old_data.date_entevyou.min().strftime("%Y-%m-%d")
how_much_isnotin_old_data.date_entevyou.max().strftime("%Y-%m-%d")

272

'2020-06-16'

'2021-06-27'