# Preprocessing

#### Stage 1: Import relevant modules.

In [None]:
import pandas as pd
import sqlite3

#### Stage 2: Load the data files (.csv) into the environment as dataframes.


In [None]:
# Read each module's attendance register (.csv) into its own dataframe.
# The paths are relative to the notebook's working directory.
dfCOA111, dfCOA122 = (
    pd.read_csv(csvPath)
    for csvPath in (
        'cop504cwdata/22COA111ModuleRegister.csv',
        'cop504cwdata/22COA122ModuleRegister.csv',
    )
)

In [None]:
# Preview the COA111 register exactly as it was read from the CSV.
dfCOA111

In [None]:
# Preview the COA122 register exactly as it was read from the CSV.
dfCOA122

In [None]:
# Build the frames used to derive the session details: each register minus its
# student-ID column. drop() returns a new frame, so the registers as loaded
# from the CSV files are left untouched.
# NOTE(review): the ID column is labelled 'sid_x' in one register and 'sid' in
# the other — confirm against the CSV headers.
dfCOA111Copy = dfCOA111.drop(columns='sid_x')
dfCOA122Copy = dfCOA122.drop(columns='sid')

#### Stage 3: Create dfCOA122Sessions and dfCOA111Sessions dataframes.

In [None]:
############################# FUNCTIONS #############################

def splitColumnHeading(df, deliminator, initColumnName, newColumns):
    """Turn the column headings of `df` into a dataframe with one row per heading.

    Every heading is split on `deliminator`; the pieces of a heading become the
    values of its row, and the output columns are labelled with `newColumns`.

    Parameters
    ----------
    df : dataframe whose column headings are split (its data is not read).
    deliminator : separator handed to `str.split`.
    initColumnName : label of the intermediate single column holding the split lists.
    newColumns : labels for the output columns, one per piece of a heading.

    Returns
    -------
    A new dataframe; `df` itself is left unmodified.
    """
    headingParts = df.columns.str.split(deliminator)
    # One column of lists first, then fan every list out into its own columns.
    listColumn = pd.DataFrame(headingParts, columns=[initColumnName])[initColumnName]
    expanded = listColumn.apply(pd.Series)
    expanded.columns = newColumns
    return expanded

def splitColumnInfo(df, newColumns, columnToSplit, deliminator):
    """Split one string column of `df` into several columns, in place.

    Parameters
    ----------
    df : dataframe to modify.
    newColumns : labels that receive the pieces, in order. A label that already
        exists in `df` is overwritten.
    columnToSplit : label of the string column to split.
    deliminator : separator handed to `str.split`.

    Returns
    -------
    The same `df` object, now carrying the new columns.
    """
    pieces = df[columnToSplit].str.split(deliminator, expand = True)
    df[newColumns] = pieces
    return df

def removeChars(df, columnName, deliminator):
    """Strip unwanted leading/trailing characters from a string column, in place.

    `deliminator` is passed to `str.strip`, so it is treated as a set of
    characters to peel off both ends of each value, not as a literal substring.

    Returns the same `df` object.
    """
    stripped = df[columnName].str.strip(deliminator)
    df[columnName] = stripped
    return df

def formatForDatetime(df, date, times): # NOTE: 'times' list is in '%H:%M' format
    """Rewrite each time column as '<date> <time>:00', in place.

    Parameters
    ----------
    df : dataframe to modify.
    date : label of the column holding the date string.
    times : labels of the time columns; surrounding whitespace in their values
        is stripped before the date is prefixed and ':00' is appended.

    Returns
    -------
    The same `df` object.
    """
    for timeColumn in times:
        trimmedTime = df[timeColumn].str.strip()
        df[timeColumn] = df[date] + ' ' + trimmedTime + ':00'
    return df

######################################################################

    3.a Store the details of each session (found in the column heading) into new dataframes.

In [None]:
# Each heading is split on newlines into these fields; 'TO_DELETE' takes the
# final piece, which is dropped again in step 3.e.
initColumn = 'SessionInfo'
delim = '\n'
newSessionColumns = ['Semester_Week', 'Date', 'Time', 'Lecture_Type', 'Room_ID', 'TO_DELETE']

# One sessions frame per module, built from that module's column headings.
dfCOA111Sessions, dfCOA122Sessions = [
    splitColumnHeading(register, delim, initColumn, newSessionColumns)
    for register in (dfCOA111Copy, dfCOA122Copy)
]

    3.b Split the column values into more distinct columns.

In [None]:
# The three lists are parallel: oldColumns[i] is split on delims[i] into newColumns[i].
newColumns = [['Semester','Week'], ['Day','Date'], ['Start_Time', 'End_Time']]
oldColumns = ['Semester_Week', 'Date', 'Time']
delims = ['.', ' ', '-']

for targetColumns, sourceColumn, separator in zip(newColumns, oldColumns, delims):
    for sessions in (dfCOA111Sessions, dfCOA122Sessions):
        splitColumnInfo(sessions, targetColumns, sourceColumn, separator)

    3.c Remove delimiters/redundant characters from column values.

In [None]:
# Parallel lists: the characters in delims[i] are stripped from the ends of columns[i].
columns = ['Date', 'Date', 'Semester', 'Week']
delims = ['(', ')', 'S', 'W']

for columnName, unwantedChars in zip(columns, delims):
    for sessions in (dfCOA111Sessions, dfCOA122Sessions):
        removeChars(sessions, columnName, unwantedChars)

# COA122 only: the room IDs also carry a trailing '...' that has to be removed.
removeChars(dfCOA122Sessions, 'Room_ID', '...')

    3.d Reformat 'Start_Time' and 'End_Time' into datetime format.
    

In [None]:
# Prefix both time columns with the session date so each holds '<date> <HH:MM>:00'.
dateColumn = 'Date'
oldFormatColumns = ['Start_Time', 'End_Time']

dfCOA111Sessions, dfCOA122Sessions = [
    formatForDatetime(sessions, dateColumn, oldFormatColumns)
    for sessions in (dfCOA111Sessions, dfCOA122Sessions)
]

    3.e Add a 'Session_Number' column and remove redundant columns.

In [None]:
# Columns that were only needed as input to the splits in step 3.b.
toRemove = ['Semester_Week', 'Time', 'TO_DELETE']

for sessions in (dfCOA111Sessions, dfCOA122Sessions):
    # Session numbers are 1-based, whereas the row index starts at 0.
    sessions['Session_Number'] = sessions.index + 1
    sessions.drop(columns = toRemove, inplace = True)

    3.f Display final dfCOA111Sessions and dfCOA122Sessions

In [None]:
# COA111 session details after the Stage 3 splitting, stripping and renumbering.
dfCOA111Sessions

In [None]:
# COA122 session details after the Stage 3 splitting, stripping and renumbering.
dfCOA122Sessions

#### Stage 4: Clean dfCOA111 and dfCOA122 to produce dfCOA111Clean and dfCOA122Clean.

    4.a Create a copy of the original dataframes.

In [None]:
# Clean independent copies so the registers as loaded from CSV stay unchanged.
dfCOA111Clean, dfCOA122Clean = dfCOA111.copy(), dfCOA122.copy()

    4.b Rename the columns of copied data frames with a unique session id/number.

In [None]:
# Relabel the columns as 'SID' followed by 'S1'..'Sn', one label per session.
# The labels are assigned through `.columns` because `set_axis(..., inplace=True)`
# was deprecated in pandas 1.5 and removed in pandas 2.0, where it raises
# TypeError. As before, a register whose width does not match the expected
# session count fails here with a length-mismatch error.
dfCOA111Clean.columns = ['SID'] + [f'S{session}' for session in range(1, 15)]  # S1..S14
dfCOA122Clean.columns = ['SID'] + [f'S{session}' for session in range(1, 20)]  # S1..S19

    4.c Replace the following: 
        - 'Ex' with a null value
        - 'GPS' with True (boolean)
        - 'X' with False (boolean)

In [None]:
# Recode the register's attendance markers in a single pass:
#   'Ex' -> null, 'GPS' -> True, 'X' -> False.
# NaN is used as the null instead of None because, on several pandas releases,
# replace(<scalar>, None) is treated as "no replacement value given" and
# forward-fills from the previous row rather than inserting a null. An explicit
# mapping behaves the same way on every version.
attendanceCodes = {'Ex': float('nan'), 'GPS': True, 'X': False}

# COA111
dfCOA111Clean = dfCOA111Clean.replace(attendanceCodes)

# COA122
dfCOA122Clean = dfCOA122Clean.replace(attendanceCodes)

    4.d Remove rows and columns which consist entirely of NULL values.

In [None]:
 # drops rows where all values (aside from the student id column) are NULL
for attendance in (dfCOA111Clean, dfCOA122Clean):
    # Every column after the first (the student ID) is a session column.
    sessionColumns = attendance.columns[1:]
    # Rows first: drop rows whose session columns are all NULL.
    attendance.dropna(axis = 0, how = 'all', subset = sessionColumns, inplace = True)
    # Then columns: drop any column whose values are all NULL.
    attendance.dropna(axis = 1, how = 'all', inplace = True)

In [None]:
# COA111 attendance after the column renaming, recoding and null removal in steps 4.b-4.d.
dfCOA111Clean

In [None]:
# COA122 attendance after the column renaming, recoding and null removal in steps 4.b-4.d.
dfCOA122Clean

    4.e Create a database in SQLite and save it as CWDatabase.db.
    Then create tables in sql and store the dataframes* into these tables.
    (*dfCOA111Sessions, dfCOA111Clean, dfCOA122Sessions and dfCOA122Clean)

In [None]:
# Column layout of every '<module>Sessions' table, in the order the sessions
# dataframes carry their columns: (column name, declared SQL type).
sessionColumnTypes = (
    ('Date', 'DATE'),
    ('Lecture_Type', 'VARCHAR'),
    ('Room_ID', 'VARCHAR'),
    ('Semester', 'INTEGER'),
    ('Week', 'INTEGER'),
    ('Day', 'VARCHAR'),
    ('Start_Time', 'TIMESTAMP'),
    ('End_Time', 'TIMESTAMP'),
    ('Session_Number', 'INTEGER'),
)


def _printTableInfo(cursor, table, label):
    """Print the (column name, declared type) pair of every column in `table`."""
    cursor.execute(f'PRAGMA table_info ({table})')
    print(f'------- {label} {table}: Column Name, Data Type -------')
    for result in cursor:
        print(result[1:3])  # fields 1 and 2 of a table_info row: name, type


def _retypeSessionsTable(cursor, table):
    """Rebuild `table` with the declared types from `sessionColumnTypes`.

    The existing table is renamed, a correctly typed table is created under the
    original name, the rows are copied across and the renamed table is dropped.
    All of this runs in one transaction. `table` is interpolated into the SQL,
    so it must be a trusted, hard-coded identifier.
    """
    columnDefs = ', '.join(f'{name} {sqlType}' for name, sqlType in sessionColumnTypes)
    columnNames = ', '.join(name for name, _ in sessionColumnTypes)

    cursor.execute('PRAGMA foreign_keys = off;')
    cursor.execute('BEGIN TRANSACTION;')
    cursor.execute(f'ALTER TABLE {table} RENAME TO {table}_Old;')
    cursor.execute(f'CREATE TABLE {table} ({columnDefs});')
    cursor.execute(
        f'INSERT INTO {table} ({columnNames}) SELECT {columnNames} FROM {table}_Old;'
    )
    cursor.execute(f'DROP TABLE {table}_Old;')  # the renamed table is no longer needed
    cursor.execute('COMMIT;')
    cursor.execute('PRAGMA foreign_keys = on;')


conn = None
try:
    conn = sqlite3.connect('CWDatabase.db') # creates the database file if it does not exist yet
    cursor = conn.cursor()

    for module, attendance, sessions in (
        ('COA111', dfCOA111Clean, dfCOA111Sessions),
        ('COA122', dfCOA122Clean, dfCOA122Sessions),
    ):
        sessionsTable = f'{module}Sessions'

        # store the cleaned attendance frame and the sessions frame as tables
        attendance.to_sql(module, conn, if_exists = 'replace', index = False)
        sessions.to_sql(sessionsTable, conn, if_exists = 'replace', index = False)

        # show the column types pandas chose for both tables
        _printTableInfo(cursor, module, 'Original')
        _printTableInfo(cursor, sessionsTable, 'Original')

        # correct the sessions table's types, then confirm the result for this
        # module's own table (the COA122 check used to query COA111Sessions)
        _retypeSessionsTable(cursor, sessionsTable)
        _printTableInfo(cursor, sessionsTable, 'Corrected')

except sqlite3.Error as error:
    print('Failed to insert table into the database', error)

finally:
    # Always release the connection, including after a failure; closing also
    # discards any transaction that was left uncommitted by an error.
    if conn is not None:
        conn.close()

# Testing 

In [None]:
# Test case (1)