# Preprocessing

#### Stage 1: Import relevant modules.

In [1]:
import pandas as pd #  To install (Windows): 'python pip install pandas==1.1.5'
import sqlite3 # Already included in Python's standard library

In [2]:
def getFile(folder, filename):
    fileExtension = '.csv'
    df = pd.read_csv(folder + '/' + filename + fileExtension)
    df.name = 'df' + filename[2:8]
    return df

In [None]:
folder = 'cop504cwdata'
filenames = ['22COA111ModuleRegister', '22COA122ModuleRegister']
dfs = []
dfsCopy = []

for f in filenames:
    df = getFile(folder= folder, filename= f)
    dfs.append(df)
    # copy to extract columns
    dfsCopy.append(df.drop(df.columns[0], axis= 1, inplace= False)) 

In [3]:
class extractInfo:
    def __init__(self, df):
        self.df = df
    
    def splitInfo(self, deliminator):
        info = self.df.columns.str.split(deliminator)
        dfInfo = pd.DataFrame(info, columns=['info'])
        dfNew = dfInfo['info'].apply(pd.Series)
        dfNew.columns = ['Semester_Week', 'Date', 'Time',
         'Lecture_Type', 'Room_ID', 'TO_DELETE'] 

        # Further separate columns
        dfNew[['Semester','Week']] = dfNew['Semester_Week'].str.split(
            '.', expand = True)
        dfNew[['Day','Date']] = dfNew['Date'].str.split(' ', expand = True)
        dfNew[['Start_Time', 'End_Time']] = dfNew['Time'].str.split(
            '-', expand = True)
        return dfNew

    def formatInfo(self, dfNew):
        columns = ['Date', 'Date', 'Semester', 'Week', 'Room_ID']
        delims = ['(', ')', 'S', 'W', '...']
        for col in range(len(columns)):
            dfNew[columns[col]] = dfNew[columns[col]].str.strip(delims[col])

        # format datetime columns 
        dfNew['Start_Time'] = dfNew['Date']+' '+dfNew['Start_Time'].str.strip()+':00'
        dfNew['End_Time'] = dfNew['Date']+ ' '+dfNew['End_Time'].str.strip()+':00'

        # add session number column 
        dfNew['Session_Number'] = dfNew.index + 1

        # remove redundant columns 
        dfNew.drop(['Semester_Week', 'Time', 'TO_DELETE'], axis = 1, inplace = True)
        return dfNew

#### Stage 2: Upload data files (.csv) into the environment as a dataframe.


In [4]:


# # reads .csv file(s) data then stores it as a dataframe
# dfCOA111 = pd.read_csv('cop504cwdata/22COA111ModuleRegister.csv')
# dfCOA122 = pd.read_csv('cop504cwdata/22COA122ModuleRegister.csv')

# # creates copy to make module sessions data frame 
# dfCOA111Copy = dfCOA111.copy() 
# dfCOA122Copy = dfCOA122.copy()

# # removes student ID for new 'dfCOA122Sessions' and 'dfCOA111Sessions' dataframe
# dfCOA111Copy.drop(dfCOA111.columns[0], axis = 1, inplace = True)
# dfCOA122Copy.drop(dfCOA122.columns[0], axis = 1, inplace = True) 

# dfCOA111Copy

# extract information from columns

#### Stage 3: Create dfCOA122Sessions and dfCOA111Sessions dataframes.

In [5]:
# ############################# FUNCTIONS #############################
# ## Put this into Data Cleaning Class? ##
# def splitColumnHeading(df, deliminator, initColumnName, newColumns):
#     sessionInfo = df.columns.str.split(deliminator) # splits column headings by newline
#     dfSessionInfo = pd.DataFrame(sessionInfo, columns=[initColumnName]) # stores the list of column information into a new data frame

#     dfNew = dfSessionInfo[initColumnName].apply(pd.Series) 
#     dfNew.columns = newColumns # creates new columns based on each value in the list 
#     return dfNew

# def splitColumnInfo(df, newColumns, columnToSplit, deliminator):
#     df[newColumns] = df[columnToSplit].str.split(deliminator, expand = True)
#     return df

# def removeChars(df, columnName, deliminator):
#     df[columnName] = df[columnName].str.strip(deliminator)
#     return df

# def formatForDatetime(df, date, times): # NOTE: 'times' list is in '%H:%M' format
#     for time in times:
#         df[time] = df[date] + ' ' + df[time].str.strip() + ':00'
#     return df 



# ######################################################################

    3.a Store the details of each session (found in the column heading) into new dataframes.

In [6]:


# newSessionColumns = ['Semester_Week', 'Date', 'Time', 'Lecture_Type', 'Room_ID', 'TO_DELETE']
# initColumn = 'SessionInfo'
# delim = '\n'

# dfCOA111Sessions =  splitColumnHeading(dfCOA111Copy, delim, initColumn, newSessionColumns)
# dfCOA122Sessions = splitColumnHeading(dfCOA122Copy,  delim, initColumn, newSessionColumns)

# dfSessionInfo = pd.DataFrame(dfsCopy[0].columns.str.split('\n')).apply(pd.Series)
# dfSessionInfo



# dfCOA111Sessions

# newColumns = [['Semester','Week'], ['Day','Date'], ['Start_Time', 'End_Time']]
# oldColumns = ['Semester_Week', 'Date', 'Time']
# delims = ['.', ' ', '-'] 

# for col in range(len(oldColumns)):
#     splitColumnInfo(dfCOA111Sessions, newColumns[col], oldColumns[col], delims[col])
#     splitColumnInfo(dfCOA122Sessions, newColumns[col], oldColumns[col], delims[col])

# columns = ['Date', 'Date', 'Semester', 'Week', 'Room_ID']
# delims = ['(', ')', 'S', 'W', '...']

# for col in range(len(columns)):
#     removeChars(dfCOA111Sessions, columns[col], delims[col])
#     removeChars(dfCOA122Sessions, columns[col], delims[col])

# oldFormatColumns = ['Start_Time', 'End_Time']
# dateColumn = 'Date'

# dfCOA111Sessions = formatForDatetime(dfCOA111Sessions, dateColumn, oldFormatColumns)
# dfCOA122Sessions = formatForDatetime(dfCOA122Sessions, dateColumn, oldFormatColumns)

# dfCOA111Sessions['Session_Number'] = dfCOA111Sessions.index + 1 # Since session numbers do not start from 0
# dfCOA122Sessions['Session_Number'] = dfCOA122Sessions.index + 1

# toRemove = ['Semester_Week', 'Time', 'TO_DELETE']
# dfCOA111Sessions.drop(toRemove, axis = 1, inplace = True) 
# dfCOA122Sessions.drop(toRemove, axis = 1, inplace = True) 


    3.f Display final dfCOA111Sessions and dfCOA122Sessions

In [7]:
class CleanDataframe:
    def __init__(self, df, dfTranspose):
        self.df = df
        self.dfTranspose = dfTranspose
        
    def assignIDs(self):
        id = 0
        for col in self.df.columns[0:]:
            if id == 0:
                # first column is always student id
                self.df.rename(columns={col:'SID'}, inplace=True)  
                self.dfTranspose.rename(columns={col:'SID'}, inplace=True)
            else:
                name = 'S' + str(id)
                self.df.rename(columns={col:name}, inplace=True)
                self.dfTranspose.rename(columns={col:id}, inplace=True) 
            id += 1   
        return self.df, self.dfTranspose
            
    def replaceAttendanceVals(self):
        oldVals = ['Ex', 'GPS', 'X']
        newVals = [None, True, False]
        
        for val in range(len(oldVals)):
            self.df.replace(oldVals[val], newVals[val], inplace = True)
            self.dfTranspose.replace(oldVals[val], newVals[val], inplace = True)
        return self.df, self.dfTranspose

    def removeNullRowsColumns(self):
        self.df.dropna(axis = 0, how = 'all',
         subset = self.df.columns[1:len(self.df.columns)],
         inplace = True) # drops rows with NULL
        self.df.dropna(axis = 1, how = 'all', inplace = True) # drops columns with NULL

        self.dfTranspose.dropna(axis = 0, how = 'all',
         subset = self.dfTranspose.columns[1:len(self.dfTranspose.columns)],
         inplace = True) 
        self.dfTranspose.dropna(axis = 1, how = 'all', inplace = True)

    def customTranspose(self):
        dfNew = self.dfTranspose.transpose() 
        dfNew.drop(['SID'], axis = 0, inplace = True)
        dfNew['Session_Number'] = dfNew.index # column used to join to dfCOA111Clean table
        dfNew = dfNew.add_prefix('Student')
        dfNew.rename (
            columns = {
                'StudentSession_Number':'Session_Number'
            },
            inplace = True
        )
        return dfNew

In [8]:
class WriteToSQL:
    def __init__(self, database):
        self.database = database
        self.conn = sqlite3.connect(self.database)
        self.cur = self.conn.cursor()

    def insertToDatabase(self, df):
        # gets the name of the dataframe
        name = df.name[2:] # removes 'df' prefix
        if 'Clean' in name:
            name = name[:6] # removes 'df' and 'Clean'
        try:
            df.to_sql(name, self.conn, if_exists= 'replace', index= False)
        except sqlite3.Error as error:
            print('Failed to insert table into the database', error)

    def checkDatatypes(self, df): # optional function call
        name = df.name[2:] 
        if 'Clean' in name:
            name = name[:6]
        self.cur.execute('PRAGMA table_info (' + name + ')')
        print('====== ' + name + ' Column Name, Data Type ======')
        for result in self.cur:
            print(result[1:3])

    def correctDatatypes(self, df):
        name = df.name[2:]
        if 'Clean' in name:
            name = name[:6] 
        self.cur.execute('PRAGMA foreign_keys = off;')
        self.cur.execute('BEGIN TRANSACTION;')
        self.cur.execute('ALTER TABLE ' + name + ' RENAME TO ' + name + '_Old;')
        self.cur.execute(
            'CREATE TABLE ' + name + ''' (
            Date DATE,
            Lecture_Type VARCHAR, 
            Room_ID VARCHAR, 
            Semester INTEGER, 
            Week INTEGER, 
            Day VARCHAR, 
            Start_Time TIMESTAMP, 
            End_Time TIMESTAMP, 
            Session_Number INTEGER); 
            '''
        )
        self.cur.execute(
            'INSERT INTO ' + name + ''' (
            Date, 
            Lecture_Type, 
            Room_ID, 
            Semester, 
            Week, 
            Day, 
            Start_Time, 
            End_Time, 
            Session_Number) 
            SELECT Date, Lecture_Type, Room_ID, Semester, Week, Day,
            Start_Time, End_Time, Session_Number 
            FROM ''' + name + '_Old;'
        )
        self.cur.execute('DROP TABLE ' + name + '_Old;') # drop old table to 'free-up' resources
        self.cur.execute('COMMIT;')
        self.cur.execute('PRAGMA foreign_keys = on;')

    def closeDatabase(self):
        self.conn.close() 
        print('\nThe sqlite connection is now closed.')

# Testing 

### Stage 1: Extract column information into new dataframes

In [9]:
for i in range(len(dfsCopy)):
    test = extractInfo(dfsCopy[i])
    dfsCopy[i] = test.splitInfo('\n')
    dfsCopy[i] = test.formatInfo(dfsCopy[i])

In [16]:
dfCOA111Sessions = dfsCopy[0]
dfCOA111Sessions.name = 'dfCOA111Sessions'
dfCOA122Sessions.name

'dfCOA122Sessions'

In [17]:
dfCOA122Sessions = dfsCopy[1] 
dfCOA122Sessions.name = 'dfCOA122Sessions'
dfCOA122Sessions.name

'dfCOA122Sessions'

### Stage 2: Clean dataframes 

In [18]:
# Create copies of the data frames
dfCOA111Clean, dfCOA111CleanTranspose = dfs[0].copy(), dfs[0].copy()
dfCOA122Clean, dfCOA122CleanTranspose = dfs[1].copy(), dfs[1].copy()

testCOA111 = CleanDataframe(df=dfCOA111Clean, dfTranspose=dfCOA111CleanTranspose)
testCOA122 = CleanDataframe(df=dfCOA122Clean, dfTranspose=dfCOA122CleanTranspose)

# Assign unique student ids
testCOA111.assignIDs()
testCOA122.assignIDs()

# Replace attendance values for each dataframe 
testCOA111.replaceAttendanceVals()
testCOA122.replaceAttendanceVals()

# Remove null columns and rows (Note: Rows with all NULL except the student id)
testCOA111.removeNullRowsColumns()
testCOA122.removeNullRowsColumns()

# Tranpose dfCOA111CleanTranspose and dfCOA122CleanTranspose
# These dataframes will be used to answer Section 1.3.
dfCOA111CleanTranspose = testCOA111.customTranspose()
dfCOA122CleanTranspose = testCOA122.customTranspose()

In [19]:
dfCOA111Clean

Unnamed: 0,SID,S1,S2,S3,S5,S6,S8,S9,S11,S12,S14
0,0,True,True,False,True,True,True,True,False,False,False
1,1,True,True,True,True,True,True,True,True,True,True
2,2,True,True,True,False,False,True,False,True,False,False
3,3,,,True,True,False,True,False,True,False,True
4,4,True,False,False,True,False,True,False,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...
231,238,False,False,True,True,False,True,True,True,True,True
232,239,True,False,True,True,True,True,True,True,True,False
233,240,True,True,True,True,False,True,True,True,True,True
234,241,True,True,True,True,True,True,True,True,True,True


In [20]:
dfCOA122Clean

Unnamed: 0,SID,S1,S2,S3,S4,S5,S6,S7,S8,S9,S10,S11,S12,S13,S14,S17,S18,S19
0,0,True,True,,True,False,,False,False,,True,False,,False,False,True,,False
1,1,True,True,,True,True,,True,True,,True,True,,True,True,True,,True
2,2,True,True,,True,True,,True,True,,True,False,,False,False,True,,True
3,3,,,,,False,False,False,,True,False,,True,True,True,,True,True
4,4,True,True,,True,False,,True,False,,False,,,True,True,False,,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
231,238,False,False,,True,True,,True,True,,True,False,,True,True,True,,True
232,239,True,True,,True,True,,True,False,,False,True,,True,True,True,,False
233,240,True,,True,True,,True,True,,True,True,,True,True,True,,True,True
234,241,True,True,,True,True,,True,True,,True,True,,True,True,False,,True


### Stage 3: Write the dataframes in 'CWDatabase.db'

In [21]:
test = WriteToSQL(database='CWDatabase.db')  

dfsCOA111 = [dfCOA111Clean, dfCOA111CleanTranspose, dfCOA111Sessions] 
dfsCOA122 = [dfCOA122Clean, dfCOA122CleanTranspose, dfCOA122Sessions] 

# for df in range(len(dfsCOA111)):
#     test.checkDatatypes(dfsCOA111[df]) 
#     test.checkDatatypes(dfsCOA122[df])
'''
After a check (see commented code above)
the following dataframes required data type corrections:
- dfCOA111Sessions
- dfCOA122Sessions
'''
test.correctDatatypes(dfsCOA111[2])
test.correctDatatypes(dfsCOA122[2]) 

for df in range(len(dfsCOA111)):
    test.insertToDatabase(dfsCOA111[df])
    test.insertToDatabase(dfsCOA122[df])
        
test.closeDatabase()

AttributeError: 'DataFrame' object has no attribute 'name'