In [2]:
from jupyter_dash import JupyterDash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Output, Input
import os 
import pandas as pd
import numpy as np
from plotly import graph_objects as go

In [1]:
pwd

'c:\\Users\\HP\\source\\repos\\STARS_dashboard\\stars-dashboard'

In [3]:
os.chdir(r"C:\Users\HP\source\repos\STARS_dashboard")
os.getcwd()

'C:\\Users\\HP\\source\\repos\\STARS_dashboard'

In [2]:
# app instantiation
app = JupyterDash(__name__)

In [4]:
# app layout
app.layout = html.Div([
    dcc.Dropdown(
        options=[{"label" : color,
                 "value" : color }
                 for color in ["blue","green","yellow"]
    ]),
    html.Div()
])

In [None]:
# run the app
if __name__ == "__main__":
    app.run_server(mode = "inline",port = "3333")

In [None]:
#using callbacks

# app instantiation
app = JupyterDash(__name__)

# app layout
app.layout = html.Div([
    dcc.Dropdown(id= "color_dropdown",
                 options = [
                    {"label": color, 
                    "value": color}
                    for color in ["Blue","Black", "Green"]
                ]),
    html.Br(),
    html.Div(id="display_dropdown")
])

# callback functions
@app.callback(Output(component_id = "display_dropdown",
                     component_property= "children"),
              Input(component_id = "color_dropdown",
                    component_property = "value"))

def display_selected_color(color):
    if color is None:
        color = "Nothing"
    return "The color is " + color 

# run the app
if __name__ == "__main__":
    app.run_server(mode = "inline", port = 2)

In [None]:
# app instantiation
app = JupyterDash(__name__)

# information on the activities
timeline = ["first year", "second year"]
short_description = {"first year":"The data collection activities spanned 10 districts while gathering data on 343 schools where 100 schools were in the\
                     control group while 243 schools were in the treatment group",
                     "second year":"489 schools were visited across 10 districts to conduct STARS data collection activities."}

# app layout
app.layout = html.Div([
    dcc.Dropdown(id = "id_dropdown",
                 options = [{"label" : time,
                            "value": time}
                            for time in timeline]),
    html.Br(),
    html.Div(id = "output_dropdown")
])

# callback functions
@app.callback(Output(component_id = "output_dropdown",
                     component_property= "children"),
              Input(component_id = "id_dropdown",
                    component_property = "value"))

def activity_info(timeline):
    if timeline is None:
        return "STARS project is being implemented by Innovation for Poverty Actions in partnership with Georgetown University, MINEDUC, REB, and NESA"
    return [html.H3(timeline), f'For the  {timeline}, {short_description[timeline]}']

# run the app
if __name__ == "__main__":
    app.run_server(mode = "inline", port = 5)

## Working with Plotly's Figure Objects

### Understanding the Figure Object

In [None]:
go.Figure()

In [None]:
fig = go.Figure()
fig.add_scatter(x=[1,2,3],y=[4,2,3])
fig.show()

In [None]:
fig.add_scatter(x = [3,5,2,4], y = [5,2,6,0])

#### Layout Attribute

In [None]:
fig.layout.title = "Example 0"
fig.layout.xaxis.title = "x axis"
fig.layout.yaxis.title = "y axis"
fig.show()

In [None]:
# exploring figure objects
fig.show("json")

In [None]:
fig.show()

In [None]:
fig.show(config = {"displaylogo" : False,
                   "modeBarButtonsToAdd" : ["drawrect",
                                            "drawcircle",
                                            "eraseshape"]})

#### Converting figures

In [9]:
# To Html
fig.write_html("html_plot.html",
               config = {"toImageButtonOptions" : {"format" : "svg"}})

In [None]:
# to images
fig.write_image("apps/images/example_image.svg",
                height = 800, width = 900)

# plotting using real data

In [4]:
# import datasets to use

# we start by using data entry data from year 1

data_entry_team_1 = pd.read_stata(r"data\ipa-data\learning_assessment\data_entry\year1\clean\first_entry\Final Assessment First Entry Clean.dta")
data_entry_team_2 = pd.read_stata(r"data\ipa-data\learning_assessment\data_entry\year1\clean\second_entry\Final Assessment Second Entry Clean.dta")
data_entry_reconciliation = pd.read_stata(r"data\ipa-data\learning_assessment\data_entry\year1\clean\reconciliation\reconciled_clean.dta")

In [None]:
data_entry_team_1.head(5)

In [None]:
data_entry_team_1.dropna(axis=1,how="all").describe(include = [np.number]).T

In [None]:
data_entry_team_1.dropna(axis=1,how="all").describe(include="all",datetime_is_numeric=True).T

In [8]:
data_entry_team_1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 56906 entries, 0 to 56905
Columns: 456 entries, deviceid to v454
dtypes: category(407), datetime64[ns](5), float64(10), object(34)
memory usage: 43.9+ MB


In [5]:
# check columns that are entirely empty
empty_cols = [col for col in data_entry_team_1 if  data_entry_team_1[col].isna().all()]
print(empty_cols)

['kin1marks_p6_kin1p6_kin_q021p6_k', 'kin1marks_p6_kin1p6_kin_q061p6_k', 'kin1marks_p6_kin1p6_kin_q101p6_k', 'v453', 'kin1marks_p5_kin1p5_kin_q141p5_k', 'v454']


In [6]:
#drop empty cols
df_team_1 = data_entry_team_1.copy()
df_team_1 = df_team_1.drop(empty_cols,axis = 1)

# check columns that are entirely empty
empty_cols0 = [col for col in df_team_1 if  df_team_1[col].isna().all()]
print(empty_cols0)

[]


In [7]:
# let's check categorical columns
#cat_cols = [col for col in df_team_1 if df_team_1[col].dtypes.name == "category"]
#print(cat_cols)

cat_cols0 = df_team_1.select_dtypes(["category"]).columns
print(cat_cols0)

Index(['spv', 'fo', 'eng_participation', 'p1_eng_q01_sub1', 'p1_eng_q01_sub2',
       'p1_eng_q02_sub1', 'p1_eng_q02_sub2', 'p1_eng_q02_sub3',
       'p1_eng_q03_sub1', 'p1_eng_q03_sub2',
       ...
       'p2_leg_q01_sub1', 'p2_leg_q02_sub1', 'p2_leg_q03_sub1',
       'p2_leg_q04_sub1', 'p2_leg_q05_sub1', 'p3_leg_q01_sub1',
       'p3_leg_q02_sub1', 'p3_leg_q03_sub1', 'p3_leg_q04_sub1',
       'p3_leg_q05_sub1'],
      dtype='object', length=407)


we have 407 columns that are categorical, this makes sense where, except for svp and fo, the columns represent participation or marks received by students with 3 choices: correct, incorrect, or left blank

In [None]:
# let's visualize categorical columns
print(df_team_1[cat_cols0])

let's check for one of the categorical columns the choices available:

In [29]:
#print(df_team_1["p1_eng_q02_sub3"].cat.codes.unique())
#print(df_team_1["p1_eng_q02_sub3"].cat.categories.unique())
print(dict(enumerate(df_team_1["p1_eng_q02_sub3"].cat.categories)))
print(dict(enumerate(df_team_1["eng_participation"].cat.categories)))
print(dict(enumerate(df_team_1["spv"].cat.categories)))
print(dict(enumerate(df_team_1["fo"].cat.categories)))

{0: 'Left blank', 1: 'No', 2: 'Yes'}
{0: 'No', 1: 'Yes'}
{0: 'Shema Christian', 1: 'NAHIMANA Reverien   ', 2: 'Havugimana Saidi', 3: 'Uwimana  Pauline', 4: 'NTAREMBA George   ', 5: 'Hatangimana Gisele'}
{0: 'Rushambara Alexis', 1: 'NSHIMIYIMANA PASCAL', 2: 'Habonimana Gabriel', 3: 'Nirere Sandrine', 4: 'NIYIBIZI Leandre', 5: 'UWASE Francine\xa0', 6: 'Uwizeyimana Viateur', 7: 'Rusangwa Adolphe', 8: 'IRADUKUNDA GAHIRE ADOLPHE', 9: 'Ingabire Emelyne', 10: 'MUSHIMIRE Clarisse', 11: 'Mudahogora Placidie', 12: 'TUYISENGE ANICK', 13: 'Bagirishyaka Fulgence', 14: 'Fidele Iragena', 15: 'MUKANKOMEJE  Chantal', 16: 'NABAGIZE JUSTINE', 17: 'NTAWUSIGIRYAYO  Eric ', 18: 'IRAMBONA Valens', 19: 'Shingiro John', 20: 'UMUBYEYI MARIE GRACE', 21: 'MURAGIJIMANA Obadia', 22: 'MASENGESHO Samuel', 23: 'UMUBYEYI Claudine', 24: 'DUFATANYE Devota', 25: 'NSHIMIYIMANA Bernard', 26: 'MUKANSANGA Jacqueline', 27: 'UWIMANA Jeannette', 28: 'NAHAYO Jean Damascene', 29: 'DUSENGUMUREMYI Yvonne', 30: 'NSENGUMUREMYI Felix',

In [8]:
enumerators = dict(enumerate(df_team_1["fo"].cat.categories))
enumerators = pd.DataFrame(enumerators.items(),columns=["codes", "names"])
enumerators.head(20)

Unnamed: 0,codes,names
0,0,Rushambara Alexis
1,1,NSHIMIYIMANA PASCAL
2,2,Habonimana Gabriel
3,3,Nirere Sandrine
4,4,NIYIBIZI Leandre
5,5,UWASE Francine
6,6,Uwizeyimana Viateur
7,7,Rusangwa Adolphe
8,8,IRADUKUNDA GAHIRE ADOLPHE
9,9,Ingabire Emelyne


In [9]:
#we are going to convert categorical variables to their numeric values
df_team_1[cat_cols0] = df_team_1[cat_cols0].apply(lambda x: x.cat.codes)
#check the results
print(df_team_1.select_dtypes(["category"]).columns)
print(df_team_1["p1_eng_q02_sub3"].unique())
print(df_team_1["eng_participation"].unique())
print(df_team_1["spv"].unique())
print(df_team_1["fo"].unique())

Index([], dtype='object')
[-1  2  1  0]
[ 1  0 -1]
[0 1 4 2 5 3]
[110 109   2  89 103  99  36 112  13  98   5 137  40  48 119  26   8 140
 105  76  42  74  14  59  71 134  91 127  17   4  95  37 116 128  31  52
  61 121  51  96  77 102  54  55 122  97 113 139  21 136  35  62  90 108
  85 124  79  49  75 138  33  86  68  83 126  64  28  44  30  81   6  88
  23  41  11 107  67  65 120 100  15 123  34  10  43  47  57  94   9   3
  12  50  70  22  69 131 125 101  20  38  87  16  73  63  92   7 133  46
  45 104  80  78  84 114  32   0  72  66  53 117 130 118  24  93  18 106
  82  39  27  60 129 141  19  56  29  58 115   1 111 132  25 135]


In [37]:
df_team_1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 56906 entries, 0 to 56905
Columns: 450 entries, deviceid to enddate
dtypes: datetime64[ns](5), float64(4), int16(1), int8(406), object(34)
memory usage: 41.2+ MB


we have successfully converted categorical variables to numeric variables

In [10]:
#check object type variables
object_cols = df_team_1.select_dtypes(["object"]).columns
print(object_cols)

Index(['deviceid', 'devicephonenum', 'username', 'device_info', 'duration',
       'caseid', 'district', 'sector', 'school', 'grade', 'student_code',
       'student_code_other', 'student_code_name', 'time_check_eng_1',
       'time_check_eng_2', 'time_check_eng_3', 'time_check_eng_4',
       'time_check_eng_5', 'time_check_eng_6', 'time_check_math_1',
       'time_check_math_2', 'time_check_math_3', 'time_check_math_4',
       'time_check_math_5', 'time_check_math_6', 'time_check_kiny_4',
       'time_check_kiny_5', 'time_check_kiny_6', 'time_check_kiny_1',
       'time_check_kiny_2', 'time_check_kiny_3', 'time_check_end',
       'instancename', 'key'],
      dtype='object')


In [None]:
# let's view the object type variables
df_team_1[object_cols].head()

In [19]:
#drop unuseful columns
df_team_1 = df_team_1.drop(['deviceid', 'devicephonenum', 'username', 'caseid','device_info','instancename', 'key'], axis=1)

In [20]:
#check object type variables again
object_cols0 = df_team_1.select_dtypes(["object"]).columns
print(object_cols0)

Index(['duration', 'district', 'sector', 'school', 'grade', 'student_code',
       'student_code_other', 'student_code_name', 'time_check_eng_1',
       'time_check_eng_2', 'time_check_eng_3', 'time_check_eng_4',
       'time_check_eng_5', 'time_check_eng_6', 'time_check_math_1',
       'time_check_math_2', 'time_check_math_3', 'time_check_math_4',
       'time_check_math_5', 'time_check_math_6', 'time_check_kiny_4',
       'time_check_kiny_5', 'time_check_kiny_6', 'time_check_kiny_1',
       'time_check_kiny_2', 'time_check_kiny_3', 'time_check_end',
       'student_code_unsure'],
      dtype='object')


In [None]:
print(df_team_1["school"].unique())

In [None]:
print(df_team_1["student_code"].unique())
print(df_team_1["student_code_name"].unique())
print(df_team_1["student_code_other"].unique())
print(df_team_1["time_check_eng_1"].unique())
print(df_team_1["time_check_end"].unique())

In [None]:
df_team_1[["student_code","student_code_other"]][df_team_1["student_code_other"]!="."]

In [53]:
df_team_1["student_code_other"][41] == df_team_1["student_code"][41] 

True

In [21]:
#Assigning values to student_code_other based on student_code variable
df_team_1.loc[df_team_1["student_code_other"] == df_team_1["student_code"], "student_code_other"] = "."
print(df_team_1.student_code_other.unique())

['.' '-555' '555' '540712210384' '21108002025' '555555555555'
 '510712002010' '540712210425' '421807055001' '4305061550' '5707122003003'
 '4211020003025' '211001230088' '421102210191' '42111204027'
 '320028002001']


In [None]:
df_team_1.loc[df_team_1["student_code_other"] != ".", ["student_code","student_code_other"]]

In [17]:
df_team_1.loc[df_team_1["student_code_other"] != ".", "student_code_other"].unique()

array(['-555', '555', '540712210384', '21108002025', '555555555555',
       '510712002010', '540712210425', '421807055001', '4305061550',
       '5707122003003', '4211020003025', '211001230088', '421102210191',
       '42111204027', '320028002001'], dtype=object)

In [None]:
df_team_1.loc[df_team_1["student_code_other"].isin(['540712210384', '21108002025',
       '510712002010', '540712210425', '421807055001', '4305061550',
       '5707122003003', '4211020003025', '211001230088', '421102210191',
       '42111204027', '320028002001']), ["student_code","student_code_other"]]

let's create a column that will help us track these inconsistencies

In [22]:
df_team_1["student_code_unsure"] = np.where(df_team_1["student_code_other"].isin(['540712210384', '21108002025',
       '510712002010', '540712210425', '421807055001', '4305061550',
       '5707122003003', '4211020003025', '211001230088', '421102210191',
       '42111204027', '320028002001']),"flagged","passed")


In [16]:
print(df_team_1["student_code_unsure"].value_counts())

passed     56894
flagged       12
Name: student_code_unsure, dtype: int64


In [16]:
# check if student_code column is unique and doesn't have duplicates
df_team_1["student_code"].is_unique

False

It seems like we have duplicates in the student code variable.
Let us count how many duplicates we have

In [17]:
# check how many dups we have
df_team_1["student_code"].duplicated().sum()

451

we have around 451 duplicates

In [None]:
#let us view the duplicates
df_team_1[df_team_1["student_code"].isin(df_team_1["student_code"][df_team_1["student_code"].duplicated()])].sort_values(["student_code","submissiondate"])

In [23]:
df_team_1_unique = df_team_1.sort_values(["student_code","submissiondate"]).drop_duplicates("student_code",keep="last")
# check how many dups we have
print(df_team_1_unique["student_code"].duplicated().sum())

0


Now that we have treated student_code duplicates, we can move on with our feature cleaning activities

In [24]:
#changing column types to numeric, Datetime, and string

#numeric
df_team_1_unique[["duration","school","student_code"]] = df_team_1_unique[["duration","school","student_code"]].apply(pd.to_numeric)

In [25]:
#datetime
# select columns that starts with time_check
time_cols = [col for col in df_team_1_unique if col.startswith("time_check")]

df_team_1_unique[time_cols] = df_team_1_unique[time_cols].apply(pd.to_datetime)

In [None]:
# Let us look at how the datetime transformation was executed
df_team_1_unique[time_cols]

In [24]:
# let's check how many object type columns are left
print(df_team_1_unique.select_dtypes("object").columns)

Index(['district', 'sector', 'grade', 'student_code_other',
       'student_code_name', 'student_code_unsure'],
      dtype='object')


In [26]:
# first we drop unwanted columns
unwanted_cols = ['student_code_other', 'student_code_name']
df_team_1_unique = df_team_1_unique.drop(unwanted_cols, axis=1)


In [27]:
# string
string_cols = ['district', 'sector', 'grade', 'student_code_unsure']

df_team_1_unique[string_cols] = df_team_1_unique[string_cols].apply(lambda col: col.map(repr)) #interesting
print(df_team_1_unique[string_cols].head())

         district        sector grade student_code_unsure
41776  'Kicukiro'     'Gikondo'  'P2'            'passed'
32697  'Kicukiro'     'Kanombe'  'P3'            'passed'
26680  'Kicukiro'    'Kigarama'  'P6'            'passed'
26635    'Gasabo'    'Rusororo'  'P5'            'passed'
28737  'Kicukiro'  'Nyarugunga'  'P3'            'passed'


In [None]:
# let's check how many object type columns are left
print(df_team_1_unique.select_dtypes("object").columns)

It turns out that string columns are represented as object type!
we can now move on to the next stage in our data cleaning/engineering process

In [28]:
# Let's view the columns we have in our dataset
columns_df = df_team_1_unique.columns
print(*columns_df)

duration spv fo district sector school grade student_code eng_participation time_check_eng_1 p1_eng_q01_sub1 p1_eng_q01_sub2 p1_eng_q02_sub1 p1_eng_q02_sub2 p1_eng_q02_sub3 p1_eng_q03_sub1 p1_eng_q03_sub2 p1_eng_q03_sub3 p1_eng_q04_sub1 p1_eng_q04_sub2 p1_eng_q04_sub3 p1_eng_q05_sub1 p1_eng_q05_sub2 p1_eng_q05_sub3 p1_eng_q05_sub4 p1_eng_q05_sub5 p1_eng_q06_sub1 p1_eng_q06_sub2 p1_eng_q06_sub3 p1_eng_q07_sub1 p1_eng_q07_sub2 p1_eng_q07_sub3 p1_eng_q08_sub1 p1_eng_q08_sub2 p1_eng_q08_sub3 time_check_eng_2 p2_eng_q01_sub1 p2_eng_q01_sub2 p2_eng_q01_sub3 p2_eng_q01_sub4 p2_eng_q02_sub1 p2_eng_q02_sub2 p2_eng_q02_sub3 p2_eng_q03_sub1 p2_eng_q04_sub1 p2_eng_q05_sub1 p2_eng_q06_sub1 p2_eng_q07_sub1 p2_eng_q07_sub2 p2_eng_q08_sub1 p2_eng_q08_sub2 p2_eng_q08_sub3 p2_eng_q09_sub1 p2_eng_q09_sub2 p2_eng_q09_sub3 p2_eng_q10_sub1 p2_eng_q10_sub2 p2_eng_q10_sub3 p2_eng_q10_sub4 p2_eng_q11_sub1 time_check_eng_3 p3_eng_q01_sub1 p3_eng_q01_sub2 p3_eng_q02_sub1 p3_eng_q03_sub1 p3_eng_q04_sub1 p3_eng_q0

We are going to remove columns that are not important for the following steps

In [29]:
cols_remove = ['formdef_version',  'submissiondate', 'starttime', 'endtime',  'enddate']
df_long_1 = df_team_1_unique.copy()
df_long_1 = df_long_1.drop(cols_remove,axis=1)

##### Reshaping the dataset from Wide to Long

In [30]:
# from wide to long
timing_cols = [col for col in df_long_1 if col.startswith("time_check")]
time_math = [col for col in df_long_1 if col.startswith("time_check_math")]
time_eng = [col for col in df_long_1 if col.startswith("time_check_eng")]
time_kiny = [col for col in df_long_1 if col.startswith("time_check_kiny")]
id_variables = ['duration', 'spv', 'fo', 'district', 'sector', 'school', 'grade', 
                   'student_code', 'subdate', 'student_code_unsure'] + timing_cols
df_long_1 = pd.melt(df_long_1, id_vars= id_variables, 
                   var_name="questions", value_name="answers").reset_index()

In [31]:
# subset data that we can use to compare with team 2 data to check for discrepancies
df_long_sub_1 = df_long_1.drop(timing_cols + ["index","duration"], axis=1)
print(df_long_sub_1.columns)

Index(['spv', 'fo', 'district', 'sector', 'school', 'grade', 'student_code',
       'subdate', 'student_code_unsure', 'questions', 'answers'],
      dtype='object')
