# Preprocessing

#### Stage 1: Import relevant modules.

In [134]:
import os 
import pandas as pd

#### Stage 2: Upload data files (.csv) into the environment as a dataframe.


In [137]:
# gets absolute path of current directory
curr_directory = os.getcwd()

# reads .csv file data then stores it as a dataframe
dfCOA122 = pd.read_csv(
    curr_directory + '\\cop504cwdata\\22COA111ModuleRegister.csv', 
    index_col = [False]
)

dfCOA122

Unnamed: 0_level_0,S1.W1\nMonday (03-10-2022)\n09:00 - 10:00\nLecture\nU020\n,S1.W1\nWednesday (05-10-2022)\n10:00 - 11:00\nLecture\nSMB014\n,S1.W2\nWednesday (12-10-2022)\n10:00 - 11:00\nLecture\nSMB014\n,S1.W2\nWednesday (12-10-2022)\n12:00 - 13:00\nTutorial\n\n,S1.W2\nThursday (13-10-2022)\n14:00 - 15:00\nLecture\nCC011\n,S1.W3\nWednesday (19-10-2022)\n10:00 - 11:00\nLecture\nSMB014\n,S1.W3\nWednesday (19-10-2022)\n12:00 - 13:00\nTutorial\n\n,S1.W3\nThursday (20-10-2022)\n14:00 - 15:00\nLecture\nCC011\n,S1.W4\nWednesday (26-10-2022)\n10:00 - 11:00\nLecture\nSMB014\n,S1.W4\nWednesday (26-10-2022)\n12:00 - 13:00\nTutorial\n\n,S1.W4\nThursday (27-10-2022)\n14:00 - 15:00\nLecture\nCC011\n,S1.W5\nWednesday (02-11-2022)\n10:00 - 11:00\nLecture\nSMB014\n,S1.W5\nWednesday (02-11-2022)\n12:00 - 13:00\nTutorial\n\n,S1.W5\nThursday (03-11-2022)\n14:00 - 15:00\nLecture\nCC011\n
sid_x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,GPS,GPS,X,,GPS,GPS,,GPS,GPS,,X,X,,X
1,GPS,GPS,GPS,,GPS,GPS,,GPS,GPS,,GPS,GPS,,GPS
2,GPS,GPS,GPS,,X,X,,GPS,X,,GPS,X,,X
3,,,GPS,,GPS,X,,GPS,X,,GPS,X,,GPS
4,GPS,X,X,,GPS,X,,GPS,X,,GPS,X,,GPS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
238,X,X,GPS,,GPS,X,,GPS,GPS,,GPS,GPS,,GPS
239,GPS,X,GPS,,GPS,GPS,,GPS,GPS,,GPS,GPS,,X
240,GPS,GPS,GPS,,GPS,X,,GPS,GPS,,GPS,GPS,,GPS
241,GPS,GPS,GPS,,GPS,GPS,,GPS,GPS,,GPS,GPS,,GPS


#### Stage 3: Store the details of each session (found in the column heading) into a new dataframe.

In [138]:
# MODULE COA122
def splitColumnInfo(df, listOfColumns, columnToSplit, deliminator):
    df[listOfColumns] = df[columnToSplit].str.split(deliminator, expand = True)
    return df

def removeDeliminator(df, columnName, deliminator):
    df[columnName] = df[columnName].str.strip(deliminator)
    return df

# 1. splits column heading by newline
sessionInfo = dfCOA122.columns.str.split('\n')

# 2. stores the list of column information into a new data frame
dfSessionInfo = pd.DataFrame(sessionInfo, columns= ['SessionInfo'])

# 3. creates new columns based on each value in the list
tempDf = dfSessionInfo.SessionInfo.apply(pd.Series)
tempDf.columns = ['Semester_Week', 'Date', 'Time', 'Lecture_Type', 'Room_ID', 'TO_DELETE']
tempDf

# 4. splits the 'Semester_Week' column by '.' and the 'Date' column  by ' '.
splitColumnInfo(tempDf, ['Semester','Week'], 'Semester_Week', '.')
splitColumnInfo(tempDf, ['Day','Date'], 'Date', ' ')

# 5. strips 'Date' column of brackets
removeDeliminator(tempDf, 'Date', '(')
removeDeliminator(tempDf, 'Date', ')')

# 6. splits 'Time' by '-' to get start and end time
splitColumnInfo(tempDf, ['Start_Time', 'End_Time'], 'Time', '-')

# 7. append time and date to 'Start_Time' and 'End_Time' then change to datetime format 
tempDf['Start_Time'] = tempDf['Date'] + ' ' + tempDf['Start_Time'].str.strip() + ':00'
tempDf['End_Time'] = tempDf['Date'] + ' ' + tempDf['End_Time'].str.strip() + ":00"

# 8. converts string to datetime date type
tempDf['Start_Time'] = pd.to_datetime(tempDf['Start_Time'])
tempDf['End_Time'] = pd.to_datetime(tempDf['End_Time'])

# 9. converts 'Semester' or 'Week' to integer data type
removeDeliminator(tempDf, 'Semester', 'S')
removeDeliminator(tempDf, 'Week', 'W')

tempDf['Semester'] = pd.to_numeric(tempDf['Semester'], downcast='integer')
tempDf['Week'] = pd.to_numeric(tempDf['Week'], downcast='integer')
tempDf

# 10. Convert all relevant columns to correct datatype

# 11. removes redundant columns: 'Date', 'Time', 'Semester_Week', 'TO_DELETE'

# 12. Check datatype of all columns
# print(tempDf.dtypes)


Unnamed: 0,Semester_Week,Date,Time,Lecture_Type,Room_ID,TO_DELETE,Semester,Week,Day,Start_Time,End_Time
0,S1.W1,03-10-2022,09:00 - 10:00,Lecture,U020,,1,1,Monday,2022-03-10 09:00:00,2022-03-10 10:00:00
1,S1.W1,05-10-2022,10:00 - 11:00,Lecture,SMB014,,1,1,Wednesday,2022-05-10 10:00:00,2022-05-10 11:00:00
2,S1.W2,12-10-2022,10:00 - 11:00,Lecture,SMB014,,1,2,Wednesday,2022-12-10 10:00:00,2022-12-10 11:00:00
3,S1.W2,12-10-2022,12:00 - 13:00,Tutorial,,,1,2,Wednesday,2022-12-10 12:00:00,2022-12-10 13:00:00
4,S1.W2,13-10-2022,14:00 - 15:00,Lecture,CC011,,1,2,Thursday,2022-10-13 14:00:00,2022-10-13 15:00:00
5,S1.W3,19-10-2022,10:00 - 11:00,Lecture,SMB014,,1,3,Wednesday,2022-10-19 10:00:00,2022-10-19 11:00:00
6,S1.W3,19-10-2022,12:00 - 13:00,Tutorial,,,1,3,Wednesday,2022-10-19 12:00:00,2022-10-19 13:00:00
7,S1.W3,20-10-2022,14:00 - 15:00,Lecture,CC011,,1,3,Thursday,2022-10-20 14:00:00,2022-10-20 15:00:00
8,S1.W4,26-10-2022,10:00 - 11:00,Lecture,SMB014,,1,4,Wednesday,2022-10-26 10:00:00,2022-10-26 11:00:00
9,S1.W4,26-10-2022,12:00 - 13:00,Tutorial,,,1,4,Wednesday,2022-10-26 12:00:00,2022-10-26 13:00:00


# Testing 
Number of test cases: 5  
Passed:  
Failed:   

In [None]:
# Test case (1)