# Preprocessing

In [1]:
import numpy as np
import pandas as pd
import sqlite3 

Create a class that loads the student attendance data and extracts the session details

In [2]:
class PreprocessStudentAttendance:
    '''
    Load one module attendance register from a csv file and tidy it up.

    The register holds one row per student and one column per session;
    the column names carry the session details (week, date, time, ...).
    '''
    def __init__(self, folder_name, file_name):
        '''
        Input:
            folder_name: folder (relative to the working directory)
                that contains the csv file, str type
            file_name: name of the csv file without ".csv", str type
        '''
        self.folder = folder_name
        self.file = file_name

    @staticmethod
    def get_session_details(clm):
        '''
        Extract details of sessions from the column names
        Input:
            clm: the column names, list type. Each name is expected to
                hold six newline-separated fields:
                "S<semester>.W<week>", "<day> (<date>)",
                "<start> - <end>", lecture type, room id and a
                trailing empty field.
        Return:
            df_details: the session details, dataframe type, with the
                columns session, date, lecture_type, room_id, semester,
                week, day, start_time and end_time (all strings except
                the 1-based session number)
        '''
        # declared static so that it can be called on the class as well
        # as on an instance (it never used any instance state)

        # separate details: one row per session, one column per field
        df_details = pd.DataFrame(clm, columns=["details"])
        df_details = df_details["details"].str.split("\n", expand=True)
        df_details.columns = [
            "semester_week", "date", "time",
            "lecture_type", "room_id", "EMPTY"
        ]

        # further separate columns by delimiters
        # (source column, resulting columns, delimiter)
        compound_fields = [
            ("semester_week", ["semester", "week"], "."),
            ("date", ["day", "date"], " "),
            ("time", ["start_time", "end_time"], "-"),
        ]
        for source, targets, delim in compound_fields:
            df_details[targets] = df_details[source].str.split(
                delim,
                expand=True
            )

        # remove the decoration around the values, e.g. "(03-10-2022)",
        # "S1", "W1" and "CC012..."
        decorations = [
            ("date", "("), ("date", ")"),
            ("semester", "S"), ("week", "W"),
            ("room_id", "..."),
        ]
        for column, chars in decorations:
            df_details[column] = df_details[column].str.strip(chars)

        # format datetime columns as "<date> <hh:mm>:00" strings
        date = df_details["date"]
        for column in ("start_time", "end_time"):
            df_details[column] = (
                date + " " + df_details[column].str.strip() + ":00"
            )

        # add session number column (used to join later)
        df_details.insert(
            loc=0, column="session", value=df_details.index + 1
        )

        # remove redundant columns
        return df_details.drop(columns=["semester_week", "time", "EMPTY"])

    def get_student_attendance_csv(self):
        '''
        Read all the attendance data from the csv file
        "./<folder>/<file>.csv"
        Return two dataframes:
            + Details of sessions
            + Student attendance per session (columns renumbered from 0,
              in the same order as the sessions)
        '''
        df_student_attendance = pd.read_csv(
            f"./{self.folder}/{self.file}.csv",
            index_col=0
        )
        clm = df_student_attendance.columns.to_list()
        df_sessions = self.get_session_details(clm)

        # the long column names now live in df_sessions; keep only
        # their position here
        df_student_attendance.columns = np.arange(
            len(df_student_attendance.columns)
        )
        return (df_sessions, df_student_attendance)

    def clean_student_attendance(self, df_student_atten):
        '''
        Involves:
            + Increment all columns by 1 to represent session number
            + Replace the original strings with boolean or None
              ('GPS' -> True, 'X' -> False, 'Ex' -> None)
            + Remove rows and columns that are entirely empty
        Input:
            df_student_atten: attendance per session with integer
                column labels, dataframe type. It is left unmodified.
        Return:
            the cleaned attendance as a new dataframe
        '''
        # Build a new frame instead of editing the argument in place:
        # otherwise a second call on the same frame would shift the
        # session numbers once more.
        # renames index (student id = "sid")
        df_clean = df_student_atten.rename_axis("sid")
        df_clean.columns = df_clean.columns + 1

        # A dict is used on purpose: replace('Ex', None) with a scalar
        # was interpreted as a pad-fill by older pandas versions.
        df_clean = df_clean.replace({'Ex': None, 'GPS': True, 'X': False})

        # drop rows, then columns, that contain no value at all
        return (
            df_clean
            .dropna(axis=0, how='all')
            .dropna(axis=1, how='all')
        )

In [8]:
# Testing: load one module register, clean it and inspect both results
folder = "cop504cwdata"
file = "22COA122ModuleRegister"

pp = PreprocessStudentAttendance(folder, file)
df_sessions, df_stu_atten = pp.get_student_attendance_csv()
df_clean_stu_atten = pp.clean_student_attendance(df_stu_atten)

# Only the last expression of a cell is rendered automatically, so the
# cleaned attendance has to be displayed explicitly (a preview is enough).
display(df_clean_stu_atten.head())
df_sessions
# TODO: repeat the check for every register, e.g. "22COA111ModuleRegister"
# and "22COA122ModuleRegister", by creating one PreprocessStudentAttendance
# per file.

Unnamed: 0,session,date,lecture_type,room_id,semester,week,day,start_time,end_time
0,1,03-10-2022,Lecture,CC012,1,1,Monday,03-10-2022 14:00:00,03-10-2022 16:00:00
1,2,06-10-2022,Computer Lab,N001,1,1,Thursday,06-10-2022 09:00:00,06-10-2022 11:00:00
2,3,06-10-2022,Computer Lab,N001,1,1,Thursday,06-10-2022 11:00:00,06-10-2022 13:00:00
3,4,10-10-2022,Lecture,CC012,1,2,Monday,10-10-2022 14:00:00,10-10-2022 16:00:00
4,5,13-10-2022,Computer Lab,N001,1,2,Thursday,13-10-2022 09:00:00,13-10-2022 11:00:00
5,6,13-10-2022,Computer Lab,N001,1,2,Thursday,13-10-2022 11:00:00,13-10-2022 13:00:00
6,7,17-10-2022,Lecture,CC012,1,3,Monday,17-10-2022 14:00:00,17-10-2022 16:00:00
7,8,20-10-2022,Computer Lab,N001,1,3,Thursday,20-10-2022 09:00:00,20-10-2022 11:00:00
8,9,20-10-2022,Computer Lab,N001,1,3,Thursday,20-10-2022 11:00:00,20-10-2022 13:00:00
9,10,24-10-2022,Lecture,CC012,1,4,Monday,24-10-2022 14:00:00,24-10-2022 16:00:00


Create a data preprocessing function

In [4]:
# clean_dataframe(df_stu_atten)

Create a database operations class

In [5]:
''' 
Note: 
Changing the type in Python does not guarantee 
that the type is maintained in SQLite.
Therefore it was changed within the 
'''
#==============================================================================

' \nNote: \nChanging the type in python does not guarentee \nthe type is maintainted in SQLite.\nTherefore it was changed within the \n'

Create a function that adds a student to the database tables

# Testing

Run code (functions)

In [6]:
# Write test into a function

Add new entry

Re-run code