In [1]:
from jupyter_dash import JupyterDash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Output, Input
import os 
import pandas as pd
import numpy as np
from plotly import graph_objects as go

In [2]:
pwd

'c:\\Users\\HP\\source\\repos\\STARS_dashboard\\stars-dashboard'

In [3]:
os.chdir(r"C:\Users\HP\source\repos\STARS_dashboard")
os.getcwd()

'C:\\Users\\HP\\source\\repos\\STARS_dashboard'

In [2]:
# app instantiation
app = JupyterDash(__name__)

In [4]:
# app layout
app.layout = html.Div([
    dcc.Dropdown(
        options=[{"label" : color,
                 "value" : color }
                 for color in ["blue","green","yellow"]
    ]),
    html.Div()
])

In [None]:
# run the app
if __name__ == "__main__":
    app.run_server(mode = "inline",port = "3333")

In [None]:
#using callbacks

# app instantiation
app = JupyterDash(__name__)

# app layout
app.layout = html.Div([
    dcc.Dropdown(id= "color_dropdown",
                 options = [
                    {"label": color, 
                    "value": color}
                    for color in ["Blue","Black", "Green"]
                ]),
    html.Br(),
    html.Div(id="display_dropdown")
])

# callback functions
@app.callback(Output(component_id = "display_dropdown",
                     component_property= "children"),
              Input(component_id = "color_dropdown",
                    component_property = "value"))

def display_selected_color(color):
    if color is None:
        color = "Nothing"
    return "The color is " + color 

# run the app
if __name__ == "__main__":
    app.run_server(mode = "inline", port = 2)

In [None]:
# app instantiation
app = JupyterDash(__name__)

# information on the activities
timeline = ["first year", "second year"]
short_description = {"first year":"The data collection activities spanned 10 districts while gathering data on 343 schools where 100 schools were in the\
                     control group while 243 schools were in the treatment group",
                     "second year":"489 schools were visited across 10 districts to conduct STARS data collection activities."}

# app layout
app.layout = html.Div([
    dcc.Dropdown(id = "id_dropdown",
                 options = [{"label" : time,
                            "value": time}
                            for time in timeline]),
    html.Br(),
    html.Div(id = "output_dropdown")
])

# callback functions
@app.callback(Output(component_id = "output_dropdown",
                     component_property= "children"),
              Input(component_id = "id_dropdown",
                    component_property = "value"))

def activity_info(timeline):
    if timeline is None:
        return "STARS project is being implemented by Innovation for Poverty Actions in partnership with Georgetown University, MINEDUC, REB, and NESA"
    return [html.H3(timeline), f'For the  {timeline}, {short_description[timeline]}']

# run the app
if __name__ == "__main__":
    app.run_server(mode = "inline", port = 5)

## Working with Plotly's Figure Objects

### Understanding the Figure Object

In [None]:
go.Figure()

In [None]:
fig = go.Figure()
fig.add_scatter(x=[1,2,3],y=[4,2,3])
fig.show()

In [None]:
fig.add_scatter(x = [3,5,2,4], y = [5,2,6,0])

#### Layout Attribute

In [None]:
fig.layout.title = "Example 0"
fig.layout.xaxis.title = "x axis"
fig.layout.yaxis.title = "y axis"
fig.show()

In [None]:
# exploring figure objects
fig.show("json")

In [None]:
fig.show()

In [None]:
fig.show(config = {"displaylogo" : False,
                   "modeBarButtonsToAdd" : ["drawrect",
                                            "drawcircle",
                                            "eraseshape"]})

#### Converting figures

In [9]:
# To Html
fig.write_html("html_plot.html",
               config = {"toImageButtonOptions" : {"format" : "svg"}})

In [None]:
# to images
fig.write_image("apps/images/example_image.svg",
                height = 800, width = 900)

# plotting using real data

In [4]:
# import datasets to use

# we start by using data entry data from year 1

data_entry_team_1 = pd.read_stata(r"data\ipa-data\learning_assessment\data_entry\year1\clean\first_entry\Final Assessment First Entry Clean.dta")
data_entry_team_2 = pd.read_stata(r"data\ipa-data\learning_assessment\data_entry\year1\clean\second_entry\Final Assessment Second Entry Clean.dta")
data_entry_reconciliation = pd.read_stata(r"data\ipa-data\learning_assessment\data_entry\year1\clean\reconciliation\reconciled_clean.dta")

In [5]:
data_entry_team_1.head(5)

Unnamed: 0,deviceid,devicephonenum,username,device_info,duration,caseid,spv,fo,district,sector,...,kin1marks_p6_kin1p6_kin_q061p6_k,kin1marks_p6_kin1p6_kin_q101p6_k,v453,kin1marks_p5_kin1p5_kin_q141p5_k,submissiondate,starttime,endtime,subdate,enddate,v454
0,E0D3C54C-EFFE-4185-8F00-80D1CE7B79E6,,collect,Apple|iPhone|15.7.5|SurveyCTO Collect 2.80 (27...,180,,Shema Christian,Ndagijimana Samuel,Nyanza,Busoro,...,,,,,2023-08-07 17:02:03,2023-08-07 15:07:01,2023-08-07 15:10:01,2023-08-07,2023-08-07,
1,e9129293624d72e5,,collect,samsung|SM-T225|13|SurveyCTO Collect 2.80.2 (2...,217,,NAHIMANA Reverien,NDABANANIYE David,Musanze,Busogo,...,,,,,2023-07-28 16:33:29,2023-07-28 15:28:16,2023-07-28 15:31:52,2023-07-28,2023-07-28,
2,d73df25fad2cc33b,,collect,samsung|SM-T225|13|SurveyCTO Collect 2.80.2 (2...,575,,Shema Christian,Habonimana Gabriel,Gakenke,Kivuruga,...,,,,,2023-07-24 15:34:11,2023-07-24 09:05:49,2023-07-24 09:15:25,2023-07-24,2023-07-24,
3,d6ee4c5c7759f9a6,,collect,samsung|SM-T290|11|SurveyCTO Collect 2.80 (9b6...,401,,NTAREMBA George,IRASUBIZA Zawadi,Musanze,Gacaca,...,,,,,2023-07-31 11:31:03,2023-07-31 10:53:15,2023-07-31 10:59:56,2023-07-31,2023-07-31,
4,87c4ab681ae1d513,,collect (not yet authenticated),samsung|SM-T505N|10|SurveyCTO Collect 2.80.2 (...,429,,NAHIMANA Reverien,KWIHANGANA Marc,Nyanza,Busoro,...,,,,,2023-08-07 09:32:26,2023-08-07 09:18:01,2023-08-07 09:25:10,2023-08-07,2023-08-07,


In [6]:
data_entry_team_1.dropna(axis=1,how="all").describe(include = [np.number]).T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
p1_leg_cwpm_sub1,1667.0,10.94301,9.928665,0.0,0.0,10.0,20.0,27.0
p2_leg_cwpm_sub1,1675.0,23.56716,12.97464,0.0,15.0,25.0,35.0,41.0
p3_leg_cwpm_sub1,1651.0,35.86978,14.85797,0.0,27.0,37.0,47.0,59.0
formdef_version,56906.0,2402470000.0,21227120.0,2307212000.0,2407201000.0,2407201000.0,2407201000.0,2407201000.0


In [7]:
data_entry_team_1.dropna(axis=1,how="all").describe(include="all",datetime_is_numeric=True).T

Unnamed: 0,count,unique,top,freq,mean,min,25%,50%,75%,max,std
deviceid,56906,169,4345ef039ad8e2b5,1209,,,,,,,
devicephonenum,56906,5,,55777,,,,,,,
username,56906,5,collect,54311,,,,,,,
device_info,56906,57,samsung|SM-T225|13|SurveyCTO Collect 2.80.2 (2...,36882,,,,,,,
duration,56906,1456,201,190,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
submissiondate,56906,,,,2023-08-02 08:10:02.905159168,2023-07-22 12:42:41,2023-07-27 13:56:55.249999872,2023-08-01 17:32:54,2023-08-07 15:58:51.750000128,2023-08-25 18:56:57,
starttime,56906,,,,2023-08-02 04:46:56.341000704,2023-03-17 02:43:34,2023-07-27 10:05:07,2023-08-01 16:17:46,2023-08-07 13:14:34.249999872,2023-08-25 18:33:15,
endtime,56906,,,,2023-08-02 04:58:39.913945856,2023-07-22 08:27:26,2023-07-27 10:13:17.500000,2023-08-01 16:23:44,2023-08-07 13:19:46.500000,2023-08-25 18:40:53,
subdate,56906,,,,2023-08-01 16:50:58.854953984,2023-07-22 00:00:00,2023-07-27 00:00:00,2023-08-01 00:00:00,2023-08-07 00:00:00,2023-08-25 00:00:00,


In [8]:
data_entry_team_1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 56906 entries, 0 to 56905
Columns: 456 entries, deviceid to v454
dtypes: category(407), datetime64[ns](5), float64(10), object(34)
memory usage: 43.9+ MB


In [5]:
# check columns that are entirely empty
empty_cols = [col for col in data_entry_team_1 if  data_entry_team_1[col].isna().all()]
print(empty_cols)

['kin1marks_p6_kin1p6_kin_q021p6_k', 'kin1marks_p6_kin1p6_kin_q061p6_k', 'kin1marks_p6_kin1p6_kin_q101p6_k', 'v453', 'kin1marks_p5_kin1p5_kin_q141p5_k', 'v454']


In [6]:
#drop empty cols
df_team_1 = data_entry_team_1.copy()
df_team_1 = df_team_1.drop(empty_cols,axis = 1)

# check columns that are entirely empty
empty_cols0 = [col for col in df_team_1 if  df_team_1[col].isna().all()]
print(empty_cols0)

[]


In [7]:
# let's check categorical columns
cat_cols = [col for col in df_team_1 if df_team_1[col].dtypes.name == "category"]
print(cat_cols)

cat_cols0 = df_team_1.select_dtypes(["category"]).columns
print(cat_cols0)

['spv', 'fo', 'eng_participation', 'p1_eng_q01_sub1', 'p1_eng_q01_sub2', 'p1_eng_q02_sub1', 'p1_eng_q02_sub2', 'p1_eng_q02_sub3', 'p1_eng_q03_sub1', 'p1_eng_q03_sub2', 'p1_eng_q03_sub3', 'p1_eng_q04_sub1', 'p1_eng_q04_sub2', 'p1_eng_q04_sub3', 'p1_eng_q05_sub1', 'p1_eng_q05_sub2', 'p1_eng_q05_sub3', 'p1_eng_q05_sub4', 'p1_eng_q05_sub5', 'p1_eng_q06_sub1', 'p1_eng_q06_sub2', 'p1_eng_q06_sub3', 'p1_eng_q07_sub1', 'p1_eng_q07_sub2', 'p1_eng_q07_sub3', 'p1_eng_q08_sub1', 'p1_eng_q08_sub2', 'p1_eng_q08_sub3', 'p2_eng_q01_sub1', 'p2_eng_q01_sub2', 'p2_eng_q01_sub3', 'p2_eng_q01_sub4', 'p2_eng_q02_sub1', 'p2_eng_q02_sub2', 'p2_eng_q02_sub3', 'p2_eng_q03_sub1', 'p2_eng_q04_sub1', 'p2_eng_q05_sub1', 'p2_eng_q06_sub1', 'p2_eng_q07_sub1', 'p2_eng_q07_sub2', 'p2_eng_q08_sub1', 'p2_eng_q08_sub2', 'p2_eng_q08_sub3', 'p2_eng_q09_sub1', 'p2_eng_q09_sub2', 'p2_eng_q09_sub3', 'p2_eng_q10_sub1', 'p2_eng_q10_sub2', 'p2_eng_q10_sub3', 'p2_eng_q10_sub4', 'p2_eng_q11_sub1', 'p3_eng_q01_sub1', 'p3_eng_q01_sub

we have 407 columns that are categorical, this makes sense where, except for svp and fo, the columns represent participation or marks received by students with 3 choices: correct, incorrect, or left blank

In [21]:
# let's visualize categorical columns
print(df_team_1[cat_cols])

                        spv                        fo eng_participation  \
0           Shema Christian        Ndagijimana Samuel               Yes   
1      NAHIMANA Reverien            NDABANANIYE David               Yes   
2           Shema Christian        Habonimana Gabriel               Yes   
3        NTAREMBA George             IRASUBIZA Zawadi               Yes   
4      NAHIMANA Reverien              KWIHANGANA Marc               Yes   
...                     ...                       ...               ...   
56901      Uwimana  Pauline  UTAZIRUBANDA NOEL Roudge               Yes   
56902  NAHIMANA Reverien               MWIZA Florence               Yes   
56903    NTAREMBA George              KWIHANGANA Marc               Yes   
56904       Shema Christian        Habonimana Gabriel               Yes   
56905  NAHIMANA Reverien              UWASE Francine                Yes   

      p1_eng_q01_sub1 p1_eng_q01_sub2 p1_eng_q02_sub1 p1_eng_q02_sub2  \
0                 NaN     

let's check for one of the categorical columns the choices available:

In [29]:
#print(df_team_1["p1_eng_q02_sub3"].cat.codes.unique())
#print(df_team_1["p1_eng_q02_sub3"].cat.categories.unique())
print(dict(enumerate(df_team_1["p1_eng_q02_sub3"].cat.categories)))
print(dict(enumerate(df_team_1["eng_participation"].cat.categories)))
print(dict(enumerate(df_team_1["spv"].cat.categories)))
print(dict(enumerate(df_team_1["fo"].cat.categories)))

{0: 'Left blank', 1: 'No', 2: 'Yes'}
{0: 'No', 1: 'Yes'}
{0: 'Shema Christian', 1: 'NAHIMANA Reverien   ', 2: 'Havugimana Saidi', 3: 'Uwimana  Pauline', 4: 'NTAREMBA George   ', 5: 'Hatangimana Gisele'}
{0: 'Rushambara Alexis', 1: 'NSHIMIYIMANA PASCAL', 2: 'Habonimana Gabriel', 3: 'Nirere Sandrine', 4: 'NIYIBIZI Leandre', 5: 'UWASE Francine\xa0', 6: 'Uwizeyimana Viateur', 7: 'Rusangwa Adolphe', 8: 'IRADUKUNDA GAHIRE ADOLPHE', 9: 'Ingabire Emelyne', 10: 'MUSHIMIRE Clarisse', 11: 'Mudahogora Placidie', 12: 'TUYISENGE ANICK', 13: 'Bagirishyaka Fulgence', 14: 'Fidele Iragena', 15: 'MUKANKOMEJE  Chantal', 16: 'NABAGIZE JUSTINE', 17: 'NTAWUSIGIRYAYO  Eric ', 18: 'IRAMBONA Valens', 19: 'Shingiro John', 20: 'UMUBYEYI MARIE GRACE', 21: 'MURAGIJIMANA Obadia', 22: 'MASENGESHO Samuel', 23: 'UMUBYEYI Claudine', 24: 'DUFATANYE Devota', 25: 'NSHIMIYIMANA Bernard', 26: 'MUKANSANGA Jacqueline', 27: 'UWIMANA Jeannette', 28: 'NAHAYO Jean Damascene', 29: 'DUSENGUMUREMYI Yvonne', 30: 'NSENGUMUREMYI Felix',

In [8]:
enumerators = dict(enumerate(df_team_1["fo"].cat.categories))
enumerators = pd.DataFrame(enumerators.items(),columns=["codes", "names"])
enumerators.head(20)

Unnamed: 0,codes,names
0,0,Rushambara Alexis
1,1,NSHIMIYIMANA PASCAL
2,2,Habonimana Gabriel
3,3,Nirere Sandrine
4,4,NIYIBIZI Leandre
5,5,UWASE Francine
6,6,Uwizeyimana Viateur
7,7,Rusangwa Adolphe
8,8,IRADUKUNDA GAHIRE ADOLPHE
9,9,Ingabire Emelyne


In [9]:
#we are going to convert categorical variables to their numeric values
df_team_1[cat_cols0] = df_team_1[cat_cols0].apply(lambda x: x.cat.codes)
#check the results
print(df_team_1.select_dtypes(["category"]).columns)
print(df_team_1["p1_eng_q02_sub3"].unique())
print(df_team_1["eng_participation"].unique())
print(df_team_1["spv"].unique())
print(df_team_1["fo"].unique())

Index([], dtype='object')
[-1  2  1  0]
[ 1  0 -1]
[0 1 4 2 5 3]
[110 109   2  89 103  99  36 112  13  98   5 137  40  48 119  26   8 140
 105  76  42  74  14  59  71 134  91 127  17   4  95  37 116 128  31  52
  61 121  51  96  77 102  54  55 122  97 113 139  21 136  35  62  90 108
  85 124  79  49  75 138  33  86  68  83 126  64  28  44  30  81   6  88
  23  41  11 107  67  65 120 100  15 123  34  10  43  47  57  94   9   3
  12  50  70  22  69 131 125 101  20  38  87  16  73  63  92   7 133  46
  45 104  80  78  84 114  32   0  72  66  53 117 130 118  24  93  18 106
  82  39  27  60 129 141  19  56  29  58 115   1 111 132  25 135]


In [37]:
df_team_1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 56906 entries, 0 to 56905
Columns: 450 entries, deviceid to enddate
dtypes: datetime64[ns](5), float64(4), int16(1), int8(406), object(34)
memory usage: 41.2+ MB


we have successfully converted categorical variables to numeric variables

In [10]:
#check object type variables
object_cols = df_team_1.select_dtypes(["object"]).columns
print(object_cols)

Index(['deviceid', 'devicephonenum', 'username', 'device_info', 'duration',
       'caseid', 'district', 'sector', 'school', 'grade', 'student_code',
       'student_code_other', 'student_code_name', 'time_check_eng_1',
       'time_check_eng_2', 'time_check_eng_3', 'time_check_eng_4',
       'time_check_eng_5', 'time_check_eng_6', 'time_check_math_1',
       'time_check_math_2', 'time_check_math_3', 'time_check_math_4',
       'time_check_math_5', 'time_check_math_6', 'time_check_kiny_4',
       'time_check_kiny_5', 'time_check_kiny_6', 'time_check_kiny_1',
       'time_check_kiny_2', 'time_check_kiny_3', 'time_check_end',
       'instancename', 'key'],
      dtype='object')


In [40]:
# let's view the object type variables
df_team_1[object_cols].head()

Unnamed: 0,deviceid,devicephonenum,username,device_info,duration,caseid,district,sector,school,grade,...,time_check_math_6,time_check_kiny_4,time_check_kiny_5,time_check_kiny_6,time_check_kiny_1,time_check_kiny_2,time_check_kiny_3,time_check_end,instancename,key
0,E0D3C54C-EFFE-4185-8F00-80D1CE7B79E6,,collect,Apple|iPhone|15.7.5|SurveyCTO Collect 2.80 (27...,180,,Nyanza,Busoro,210207,P3,...,2023-Aug-7 15:09:59,,,,2023-Aug-7 15:10:01,2023-Aug-7 15:10:01,2023-Aug-7 15:10:01,2023-Aug-7 15:10:01,student_id210207190062,uuid:0000201e-8b84-40fa-8e37-05be1b8fac9b
1,e9129293624d72e5,,collect,samsung|SM-T225|13|SurveyCTO Collect 2.80.2 (2...,217,,Musanze,Busogo,430114,P3,...,2023-Jul-28 15:31:48,,,,2023-Jul-28 15:31:51,2023-Jul-28 15:31:51,2023-Jul-28 15:31:51,2023-Jul-28 15:31:51,student_id430101190753,uuid:0000a616-72b4-485a-a7e3-e57ad1b48f16
2,d73df25fad2cc33b,,collect,samsung|SM-T225|13|SurveyCTO Collect 2.80.2 (2...,575,,Gakenke,Kivuruga,420915,P5,...,2023-Jul-24 09:12:33,2023-Jul-24 09:13:07,2023-Jul-24 09:13:07,2023-Jul-24 09:15:24,,,,2023-Jul-24 09:15:24,,uuid:00027ad6-86ea-49c2-aca1-eaf59f3eebc3
3,d6ee4c5c7759f9a6,,collect,samsung|SM-T290|11|SurveyCTO Collect 2.80 (9b6...,401,,Musanze,Gacaca,430324,P6,...,2023-Jul-31 10:55:21,2023-Jul-31 10:56:50,2023-Jul-31 10:56:50,2023-Jul-31 10:56:50,,,,2023-Jul-31 10:59:55,student_id430311200702,uuid:00046f63-696f-44ba-b660-6395263d17eb
4,87c4ab681ae1d513,,collect (not yet authenticated),samsung|SM-T505N|10|SurveyCTO Collect 2.80.2 (...,429,,Nyanza,Busoro,210209,P5,...,2023-Aug-7 09:22:47,2023-Aug-7 09:22:50,2023-Aug-7 09:22:50,2023-Aug-7 09:25:09,,,,2023-Aug-7 09:25:09,student_id210209182748,uuid:00056471-339f-4339-8532-1944427947f9


In [11]:
#drop unuseful columns
df_team_1 = df_team_1.drop(['deviceid', 'devicephonenum', 'username', 'caseid','device_info','instancename', 'key'], axis=1)

In [12]:
#check object type variables again
object_cols0 = df_team_1.select_dtypes(["object"]).columns
print(object_cols0)

Index(['duration', 'district', 'sector', 'school', 'grade', 'student_code',
       'student_code_other', 'student_code_name', 'time_check_eng_1',
       'time_check_eng_2', 'time_check_eng_3', 'time_check_eng_4',
       'time_check_eng_5', 'time_check_eng_6', 'time_check_math_1',
       'time_check_math_2', 'time_check_math_3', 'time_check_math_4',
       'time_check_math_5', 'time_check_math_6', 'time_check_kiny_4',
       'time_check_kiny_5', 'time_check_kiny_6', 'time_check_kiny_1',
       'time_check_kiny_2', 'time_check_kiny_3', 'time_check_end'],
      dtype='object')


In [None]:
print(df_team_1["school"].unique())

In [None]:
print(df_team_1["student_code"].unique())
print(df_team_1["student_code_name"].unique())
print(df_team_1["student_code_other"].unique())
print(df_team_1["time_check_eng_1"].unique())
print(df_team_1["time_check_end"].unique())

In [None]:
df_team_1[["student_code","student_code_other"]][df_team_1["student_code_other"]!="."]

In [53]:
df_team_1["student_code_other"][41] == df_team_1["student_code"][41] 

True

In [13]:
df_team_1.loc[df_team_1["student_code_other"] == df_team_1["student_code"], "student_code_other"] = "."
print(df_team_1.student_code_other.unique())

['.' '-555' '555' '540712210384' '21108002025' '555555555555'
 '510712002010' '540712210425' '421807055001' '4305061550' '5707122003003'
 '4211020003025' '211001230088' '421102210191' '42111204027'
 '320028002001']


In [14]:
df_team_1.loc[df_team_1["student_code_other"] != ".", ["student_code","student_code_other"]]

Unnamed: 0,student_code,student_code_other
100,420311001002,-555
671,420311001003,-555
1268,421512190377,555
1443,420806001007,555
2433,421512190802,555
...,...,...
52350,420903001015,555
52646,421207003001,555
53804,421512190417,555
53823,421512002004,555


In [60]:
df_team_1.loc[df_team_1["student_code_other"] != ".", "student_code_other"].unique()

array(['-555', '555', '540712210384', '21108002025', '555555555555',
       '510712002010', '540712210425', '421807055001', '4305061550',
       '5707122003003', '4211020003025', '211001230088', '421102210191',
       '42111204027', '320028002001'], dtype=object)

In [61]:
df_team_1.loc[df_team_1["student_code_other"].isin(['540712210384', '21108002025',
       '510712002010', '540712210425', '421807055001', '4305061550',
       '5707122003003', '4211020003025', '211001230088', '421102210191',
       '42111204027', '320028002001']), ["student_code","student_code_other"]]

Unnamed: 0,student_code,student_code_other
2800,540712210382,540712210384
5634,421108002025,21108002025
20610,540712002010,510712002010
21698,540712001110,540712210425
23344,421807005001,421807055001
24300,430506160550,4305061550
27585,540712003003,5707122003003
31098,421102003025,4211020003025
31101,211001002110,211001230088
33039,421102001100,421102210191


let's create a column that will help us track these inconsistencies

In [15]:
df_team_1["student_code_unsure"] = np.where(df_team_1["student_code_other"].isin(['540712210384', '21108002025',
       '510712002010', '540712210425', '421807055001', '4305061550',
       '5707122003003', '4211020003025', '211001230088', '421102210191',
       '42111204027', '320028002001']),"flagged","passed")


In [16]:
print(df_team_1["student_code_unsure"].value_counts())

passed     56894
flagged       12
Name: student_code_unsure, dtype: int64


In [67]:
# check if student_code column is unique and doesn't have duplicates
df_team_1["student_code"].is_unique

False

It seems like we have duplicates in the student code variable.
Let us count how many duplicates we have

In [68]:
# check how many dups we have
df_team_1["student_code"].duplicated().sum()

451

we have around 451 duplicates

In [None]:
#let us view the duplicates
df_team_1[df_team_1["student_code"].isin(df_team_1["student_code"][df_team_1["student_code"].duplicated()])].sort_values(["student_code","submissiondate"])

In [17]:
df_team_1_unique = df_team_1.sort_values(["student_code","submissiondate"]).drop_duplicates("student_code",keep="last")
# check how many dups we have
print(df_team_1_unique["student_code"].duplicated().sum())

0


Now that we have treated student_code duplicates, we can move on with our feature cleaning activities

In [22]:
#changing column types to numeric, Datetime, and string

#numeric
df_team_1_unique[["duration","school","student_code"]] = df_team_1_unique[["duration","school","student_code"]].apply(pd.to_numeric)

In [25]:
#datetime
# select columns that starts with time_check
time_cols = [col for col in df_team_1_unique if col.startswith("time_check")]

df_team_1_unique[time_cols] = df_team_1_unique[time_cols].apply(pd.to_datetime)

In [None]:
# Let us look at how the datetime transformation was executed
df_team_1_unique[time_cols]

Unnamed: 0,time_check_eng_1,time_check_eng_2,time_check_eng_3,time_check_eng_4,time_check_eng_5,time_check_eng_6,time_check_math_1,time_check_math_2,time_check_math_3,time_check_math_4,time_check_math_5,time_check_math_6,time_check_kiny_4,time_check_kiny_5,time_check_kiny_6,time_check_kiny_1,time_check_kiny_2,time_check_kiny_3,time_check_end
41776,2023-08-19 17:18:23,2023-08-19 17:18:23,2023-08-19 17:21:22,2023-08-19 17:21:22,2023-08-19 17:21:22,2023-08-19 17:21:22,2023-08-19 17:21:25,2023-08-19 17:21:25,2023-08-19 17:23:35,2023-08-19 17:23:35,2023-08-19 17:23:35,2023-08-19 17:23:35,NaT,NaT,NaT,2023-08-19 17:23:51,2023-08-19 17:23:51,2023-08-19 17:23:51,2023-08-19 17:23:51
32697,2023-08-01 14:21:04,2023-08-01 14:21:04,2023-08-01 14:21:04,2023-08-01 14:24:00,2023-08-01 14:24:00,2023-08-01 14:24:00,2023-08-01 14:24:04,2023-08-01 14:24:04,2023-08-01 14:24:04,2023-08-01 14:25:19,2023-08-01 14:25:19,2023-08-01 14:25:19,NaT,NaT,NaT,2023-08-01 14:25:23,2023-08-01 14:25:23,2023-08-01 14:25:23,2023-08-01 14:25:23
26680,2023-08-01 10:57:57,2023-08-01 10:57:57,2023-08-01 10:57:57,2023-08-01 10:57:57,2023-08-01 10:57:57,2023-08-01 10:57:57,2023-08-01 10:59:47,2023-08-01 10:59:47,2023-08-01 10:59:47,2023-08-01 10:59:47,2023-08-01 10:59:47,2023-08-01 10:59:47,2023-08-01 11:01:30,2023-08-01 11:01:30,2023-08-01 11:01:30,NaT,NaT,NaT,2023-08-01 11:03:33
26635,2023-08-03 10:04:47,2023-08-03 10:04:47,2023-08-03 10:04:47,2023-08-03 10:04:47,2023-08-03 10:04:47,2023-08-03 10:06:16,2023-08-03 10:06:22,2023-08-03 10:06:22,2023-08-03 10:06:22,2023-08-03 10:06:22,2023-08-03 10:06:22,2023-08-03 10:07:54,2023-08-03 10:08:05,2023-08-03 10:08:05,2023-08-03 10:09:37,NaT,NaT,NaT,2023-08-03 10:09:37
28737,2023-08-02 13:31:55,2023-08-02 13:31:55,2023-08-02 13:31:55,2023-08-02 13:33:13,2023-08-02 13:33:13,2023-08-02 13:33:13,2023-08-02 13:33:26,2023-08-02 13:33:26,2023-08-02 13:33:26,2023-08-02 13:34:45,2023-08-02 13:34:45,2023-08-02 13:34:45,NaT,NaT,NaT,2023-08-02 13:34:48,2023-08-02 13:34:48,2023-08-02 13:34:48,2023-08-02 13:34:48
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41335,2023-08-03 15:26:13,2023-08-03 15:26:13,2023-08-03 15:26:13,2023-08-03 15:27:41,2023-08-03 15:27:41,2023-08-03 15:27:41,2023-08-03 15:27:44,2023-08-03 15:27:44,2023-08-03 15:27:44,2023-08-03 15:28:55,2023-08-03 15:28:55,2023-08-03 15:28:55,NaT,NaT,NaT,2023-08-03 15:28:57,2023-08-03 15:28:57,2023-08-03 15:28:57,2023-08-03 15:28:57
40225,2023-07-25 11:23:42,2023-07-25 11:23:42,2023-07-25 11:23:42,2023-07-25 11:23:42,2023-07-25 11:23:42,2023-07-25 11:26:36,2023-07-25 11:26:40,2023-07-25 11:26:40,2023-07-25 11:26:40,2023-07-25 11:26:40,2023-07-25 11:26:40,2023-07-25 11:29:26,2023-07-25 11:29:31,2023-07-25 11:29:31,2023-07-25 11:32:35,NaT,NaT,NaT,2023-07-25 11:32:35
48235,2023-08-02 14:42:20,2023-08-02 14:42:20,2023-08-02 14:42:20,2023-08-02 14:44:10,2023-08-02 14:44:10,2023-08-02 14:44:10,2023-08-02 14:44:13,2023-08-02 14:44:13,2023-08-02 14:44:13,2023-08-02 14:45:40,2023-08-02 14:45:40,2023-08-02 14:45:40,NaT,NaT,NaT,2023-08-02 14:45:57,2023-08-02 14:45:57,2023-08-02 14:45:57,2023-08-02 14:45:57
2363,2023-07-31 11:50:53,2023-07-31 11:50:53,2023-07-31 11:50:53,2023-07-31 11:50:53,2023-07-31 11:50:53,2023-07-31 11:50:53,2023-07-31 11:52:32,2023-07-31 11:52:32,2023-07-31 11:52:32,2023-07-31 11:52:32,2023-07-31 11:52:32,2023-07-31 11:52:32,2023-07-31 11:54:50,2023-07-31 11:54:50,2023-07-31 11:54:50,NaT,NaT,NaT,2023-07-31 11:56:22


In [None]:
# string
# let's check how many object type columns are left
