# Detect the ones who are only screened 

> run eligibility notebook

> run survey data notebook



In [39]:
%run "../eligibility.ipynb"
%run "../survey_data/survey_data.ipynb"

In [40]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


In [41]:
survey_data.shape
eligibility.shape

(55349, 19)

(42062, 31)

In [42]:
survey_data.columns
eligibility.columns

Index(['case_id', 'organisation', 'first_name', 'last_name', 'dob',
       'interview_date', 'adress', 'commune', 'commune_section',
       'interviewer_firstname', 'interviewer_lastname', 'is_your_parent_alive',
       'mothers_name', 'fathers_name', 'who_is_your_law_parent', 'total',
       'is_eligible_for_dep_hiv', 'age', 'already_in_a_group'],
      dtype='object')

Index(['case_id', 'dreams_code', 'organisation', 'id', 'phone',
       'not_selectable', 'first_name', 'last_name', 'dob', 'interview_date',
       'adress', 'commune', 'commune_section', 'interviewer_firstname',
       'interviewer_lastname', 'is_your_parent_alive', 'mothers_name',
       'fathers_name', 'total', 'who_is_your_law_parent', 'group_name',
       'id_group', 'hub_name', 'id_hub', 'eske_ou_lekol_deja',
       'depi_kile_ou_pa_al_lekol_with_label',
       'c61_depi_ki_l_ou_pa_al_lekl_ank', 'c6_ske_ou_te_oblije_double_deja',
       'kiyes_ki_peye_lekol_pou_ou', 'age', 'kpi_score'],
      dtype='object')

In [43]:
#survey_data[survey_data.organisation=="CARIS"]

## Eligibility

In [44]:
# Replace missing DREAMS codes with the sentinel string "null" so rows without
# a code can be selected with a plain equality test.
# The result is assigned back to the column: calling fillna(inplace=True) on an
# attribute-accessed column is chained assignment, which pandas deprecates and
# which leaves the frame unchanged when Copy-on-Write is enabled.
eligibility["dreams_code"] = eligibility["dreams_code"].fillna("null")

In [45]:
def tranche_age_classique(age):
    """Return the five-year age band label that contains ``age``.

    The bands are inclusive on both ends: "10-14", "15-19", "20-24" and
    "25-29". A value that falls in none of them (including one that sits
    between two bands, such as 14.5) yields "not_valid_age".
    """
    bands = (
        (10, 14, "10-14"),
        (15, 19, "15-19"),
        (20, 24, "20-24"),
        (25, 29, "25-29"),
    )
    for lower, upper, label in bands:
        if age >= lower and age <= upper:
            return label
    return "not_valid_age"
        
def tranche_age_mineur_majeur(age):
    """Return the age band label for ``age`` using a split at 18.

    The bands are inclusive on both ends: "10-17", "18-19", "20-24" and
    "25-29". A value that falls in none of them yields "not_valid_age".
    """
    bands = (
        (10, 17, "10-17"),
        (18, 19, "18-19"),
        (20, 24, "20-24"),
        (25, 29, "25-29"),
    )
    for lower, upper, label in bands:
        if age >= lower and age <= upper:
            return label
    return "not_valid_age"

In [46]:
# Label every eligibility row with its age band under both groupings
# (five-year bands, and the variant that splits at 18).
eligibility["age_range"] = eligibility["age"].map(tranche_age_classique)
eligibility["newage_range"] = eligibility["age"].map(tranche_age_mineur_majeur)

In [47]:
import pandas as pd
from datetime import datetime

In [48]:
# Parse interview_date into datetime values and store them in a separate
# column, leaving the interview_date column itself untouched.
eligibility["date_entevyou"] = pd.to_datetime( eligibility.interview_date)

In [49]:
def fiscalYear21(date):
    """Map ``date`` to its FY21 quarter label.

    ``date`` must expose ``year`` and ``month`` attributes.

    * October-December 2020  -> "FY21Q1"
    * January-March 2021     -> "FY21Q2"
    * April-June 2021        -> "FY21Q3"
    * July-September 2021    -> "FY21Q4"

    Any other date gets the fallback label "Q3fy21-Q4fy21".
    """
    if date.year == 2020:
        if 10 <= date.month <= 12:
            return "FY21Q1"
    elif date.year == 2021:
        if 1 <= date.month <= 3:
            return "FY21Q2"
        if 4 <= date.month <= 6:
            return "FY21Q3"
        if 7 <= date.month <= 9:
            return "FY21Q4"
    # Everything outside October 2020 - September 2021 ends up here.
    return "Q3fy21-Q4fy21"
    

    

def validTimeOnSystem(date):
    """Tell whether ``date`` lies between 1 October 2020 and the current time.

    Both bounds are inclusive. Returns "required_Time_on" when the date is
    inside that window and "not_valid_time_on" otherwise.
    """
    window_start = datetime(2020, 10, 1)
    # The upper bound is read at call time, and only when the lower bound holds.
    in_window = date >= window_start and date <= datetime.now()
    return "required_Time_on" if in_window else "not_valid_time_on"


In [50]:
# Derive, from the parsed interview date, the FY21 quarter label and the flag
# telling whether the interview falls inside the accepted time window.
eligibility["fiscal_year"] = eligibility["date_entevyou"].map(fiscalYear21)
eligibility["timeOn_system"] = eligibility["date_entevyou"].map(validTimeOnSystem)

In [51]:
# Keep the rows that have no DREAMS code (sentinel "null"), whose interview
# date is inside the accepted window, and whose age band is neither "25-29"
# nor "not_valid_age".
# .copy() makes the selection an independent frame: a column is added to it
# later on, and writing into a filtered slice raises SettingWithCopyWarning
# (and may not reach the intended object).
almost_eligibility = eligibility[
    (eligibility.dreams_code == "null")
    & (eligibility.timeOn_system == "required_Time_on")
    & (eligibility.age_range != "25-29")
    & (eligibility.age_range != "not_valid_age")
].copy()

In [52]:
almost_eligibility.dreams_code.count()

3080

In [53]:
def firstlastdob(df):
    """Build a "first_name last_name dob" string from a single record.

    ``df`` is any object exposing ``first_name``, ``last_name`` and ``dob``
    attributes; in this notebook it is the row passed in by
    ``DataFrame.apply(..., axis=1)``. The three values are joined with single
    spaces using their default string formatting.
    """
    return "{} {} {}".format(df.first_name, df.last_name, df.dob)

In [54]:
%%capture
# Build the de-duplication key ("first_name last_name dob") with assign(),
# which returns a new frame, instead of writing a column into a filtered
# selection of `eligibility` (that pattern raises SettingWithCopyWarning).
almost_eligibility = almost_eligibility.assign(
    unduplicate_me=almost_eligibility.apply(firstlastdob, axis=1)
)
# Keep only the first row for each key.
clean_eligibility = almost_eligibility.drop_duplicates(subset=["unduplicate_me"])


In [55]:
clean_eligibility.dreams_code.count()

3040

## Survey data

In [56]:
# De-duplication key for each survey row: "first_name last_name dob".
survey_data["remove_duplicate"] = survey_data.apply(firstlastdob, axis=1)

In [57]:
# Add to the survey data the same derived columns as on the eligibility data.
# The parsed interview date comes first because the next two columns read it.
survey_data["date_entevyou"] = pd.to_datetime(survey_data["interview_date"])
survey_data["fiscal_year"] = survey_data["date_entevyou"].map(fiscalYear21)
survey_data["timeOn_system"] = survey_data["date_entevyou"].map(validTimeOnSystem)

# Age bands under both groupings.
survey_data["age_range"] = survey_data["age"].map(tranche_age_classique)
survey_data["newage_range"] = survey_data["age"].map(tranche_age_mineur_majeur)

In [58]:
# Survey rows that are not yet in a group, were interviewed inside the accepted
# window, have an age band other than "25-29" / "not_valid_age", and a total
# of at least 14.
survey_is_candidate = (
    survey_data["already_in_a_group"].eq("no")
    & survey_data["timeOn_system"].eq("required_Time_on")
    & ~survey_data["age_range"].isin(["25-29", "not_valid_age"])
    & survey_data["total"].ge(14)
)
almost_survey_data = survey_data[survey_is_candidate]

In [59]:
almost_survey_data.case_id.count()

3080

In [60]:
clean_survey_data =  almost_survey_data.drop_duplicates(subset=["remove_duplicate"])

In [61]:
clean_survey_data.case_id.count()

3040

## what we want

In [62]:
id_form_survey_data = clean_survey_data[["case_id","already_in_a_group"]]

In [63]:
# Right join on case_id: every row of clean_eligibility is kept, and
# already_in_a_group is taken from the survey rows where the case_id matches
# (it is missing where there is no match).
update_screening_data = id_form_survey_data.merge(clean_eligibility, on="case_id",how='right')

In [64]:
update_screening_data.case_id.count()

3040

In [65]:
update_screening_data.to_excel("update_screened.xlsx",index=False,na_rep="NULL")

In [66]:
update_screening_data.date_entevyou.min().strftime("%Y-%m-%d")
update_screening_data.date_entevyou.max().strftime("%Y-%m-%d")

'2020-10-03'

'2021-09-14'

## Verification against the 4360 Mastersheets

In [67]:
old_data = pd.read_excel("old_screened_4360.xlsx")

In [68]:
how_much_isin_old_data = update_screening_data[update_screening_data.case_id.isin(old_data.case_id)]

In [69]:
how_much_isnotin_old_data  = update_screening_data[~update_screening_data.case_id.isin(old_data.case_id)]

## count and date

In [70]:
old_data.case_id.count()
old_data.date_entevyou.min().strftime("%Y-%m-%d")
old_data.date_entevyou.max().strftime("%Y-%m-%d")

4360

'2020-04-18'

'2021-06-13'

In [71]:
how_much_isin_old_data.case_id.count()
how_much_isin_old_data.date_entevyou.min().strftime("%Y-%m-%d")
how_much_isin_old_data.date_entevyou.max().strftime("%Y-%m-%d")

689

'2020-10-03'

'2021-06-12'

In [72]:
how_much_isnotin_old_data.case_id.count()
how_much_isnotin_old_data.date_entevyou.min().strftime("%Y-%m-%d")
how_much_isnotin_old_data.date_entevyou.max().strftime("%Y-%m-%d")

2351

'2020-11-14'

'2021-09-14'

## Verification against the newest list (July)

In [73]:
onejuly = pd.read_excel("old_new_enroled_July.xlsx")
onejuly.case_id.count()
onejuly.date_entevyou.min().strftime("%Y-%m-%d")
onejuly.date_entevyou.max().strftime("%Y-%m-%d")

1328

'2020-06-14'

'2021-07-09'

In [74]:
how_much_isin_onejuly = how_much_isnotin_old_data[how_much_isnotin_old_data.case_id.isin(onejuly.case_id)]
how_much_isin_onejuly.case_id.count()
how_much_isin_onejuly.date_entevyou.min().strftime("%Y-%m-%d")
how_much_isin_onejuly.date_entevyou.max().strftime("%Y-%m-%d")

499

'2020-11-14'

'2021-07-03'

In [75]:
how_much_isnotin_onejuly = how_much_isnotin_old_data[~how_much_isnotin_old_data.case_id.isin(onejuly.case_id)]
how_much_isnotin_onejuly.case_id.count()
how_much_isnotin_onejuly.date_entevyou.min().strftime("%Y-%m-%d")
how_much_isnotin_onejuly.date_entevyou.max().strftime("%Y-%m-%d")

1852

'2020-11-14'

'2021-09-14'

## SAVE and PICTURE

In [76]:
#how_much_isin_old_data.to_excel("evolution_des_4360.xlsx",na_rep="NULL", index=False)
#how_much_isin_onejuly.to_excel("evolution_des_1328.xlsx",na_rep="NULL",index=False)
#how_much_isnotin_onejuly.to_excel("new_screened_after_1328.xlsx",na_rep="NULL",index=False)
#how_much_isnotin_old_data.to_excel("new_screened_after_4360.xlsx",na_rep="NULL",index=False)
how_much_isnotin_old_data.to_excel("LesfillesScreeners.xlsx",na_rep="NULL",index=False)




# One summary row per list / subset: its stage label and the number of
# non-null case_id values it contains.
stage_frames = [
    ("Liste_des_4360", old_data),
    ("Evolution_liste_4360", how_much_isin_old_data),
    ("new_enroles_4360", how_much_isnotin_old_data),
    ("Liste_des_1328", onejuly),
    ("Evolution_liste_1328", how_much_isin_onejuly),
    ("new_enroles_1328", how_much_isnotin_onejuly),
]
df = pd.DataFrame(
    {
        "stage": [stage for stage, _ in stage_frames],
        "data": [frame.case_id.count() for _, frame in stage_frames],
    }
)

df

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# The seaborn style has to be active before the figure is created: axes only
# pick up the style parameters in effect at creation time, so setting it
# afterwards leaves this figure in the default style on a fresh kernel.
sns.set_style("darkgrid")
fig, ax = plt.subplots(figsize=(16, 8))

# Bar chart of the count per stage, drawn on the explicit axes.
splot = sns.barplot(x="stage", y="data", data=df, ax=ax)
ax.set_xlabel("")
ax.set_ylabel("")
fig.suptitle("La situation des formulaires d'enrolement sur le systeme")

# Write each bar's value in white at the centre of the bar.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.text(
        bar.get_x() + bar.get_width() / 2,
        bar.get_y() + 0.5 * bar_height,
        "{:1.0f}".format(bar_height),
        ha="center",
        va="center",
        # The font size is given once; passing both size= and fontsize= supplies
        # the same property through two aliases.
        fontdict=dict(color="white", fontsize=15),
    )

# Source note, placed below the bottom-left corner of the axes.
ax.annotate(
    "source: HIVHaiti", (0, 0), (-80, -20), fontsize=10,
    xycoords="axes fraction", textcoords="offset points", va="top",
)
fig.savefig("Evolution_screening", dpi=400)