# Data Preprocessing

In [198]:
import pandas as pd
import numpy as np


In [199]:
# Data collection: read the raw records from 'bookcsv.csv', a file whose
# fields are separated by ';' rather than ','.
raw_csv_data = pd.read_csv('bookcsv.csv', sep = ';')

In [200]:
# Preview the first five rows to confirm the file was parsed into the expected columns.
raw_csv_data.head()

Unnamed: 0,name,grade,age,date,subject,teacher,absence,reason
0,aaa,4,9,03/01/2022,F,F1,1,1
1,bbb,4,9,03/01/2022,A,A1,1,2
2,ccc,5,10,04/01/2022,M,M1,0,0
3,ddd,4,8,04/01/2022,M,M1,0,0
4,eee,5,10,05/01/2022,M,M1,0,0


In [201]:
# Checkpoint: preprocess a copy so raw_csv_data itself is left unmodified.
df = raw_csv_data.copy()

In [202]:
# Preview the working copy.
df.head()

Unnamed: 0,name,grade,age,date,subject,teacher,absence,reason
0,aaa,4,9,03/01/2022,F,F1,1,1
1,bbb,4,9,03/01/2022,A,A1,1,2
2,ccc,5,10,04/01/2022,M,M1,0,0
3,ddd,4,8,04/01/2022,M,M1,0,0
4,eee,5,10,05/01/2022,M,M1,0,0


In [203]:
# Drop the 'name' column; every other column is kept as-is.
df = df.drop(columns=['name'])
df.head()

Unnamed: 0,grade,age,date,subject,teacher,absence,reason
0,4,9,03/01/2022,F,F1,1,1
1,4,9,03/01/2022,A,A1,1,2
2,5,10,04/01/2022,M,M1,0,0
3,4,8,04/01/2022,M,M1,0,0
4,5,10,05/01/2022,M,M1,0,0


### Splitting the "reason" column into 4 dummy columns

In [None]:
# For now we assume there are 4 possible reasons for an absence (sickness, unjustified absence, etc.).

# We want to split the "reason" column into 4 binary (0/1) columns, one per reason.

In [204]:
# One-hot encode 'reason': one indicator column per distinct reason code.
# The dtype is pinned to uint8 so the indicators are 0/1 integers on every
# pandas version (pandas >= 2.0 otherwise defaults to boolean True/False
# columns, which would not match the integer output recorded below).
reason_columns = pd.get_dummies(df['reason'], dtype='uint8')
reason_columns.head()

Unnamed: 0,0,1,2,3,4
0,0,1,0,0,0
1,0,0,1,0,0
2,1,0,0,0,0
3,1,0,0,0,0
4,1,0,0,0,0


In [205]:
# Pull the indicator columns labelled 1, 2, 3 and 4 out of the dummy frame as
# separate Series. The column labelled 0 is deliberately not selected here.
reasons_type_1, reasons_type_2, reasons_type_3, reasons_type_4 = (
    reason_columns.loc[:, reason_code] for reason_code in (1, 2, 3, 4)
)

In [206]:
# Spot-check the first indicator Series (first five values).
reasons_type_1.head() 

0    1
1    0
2    0
3    0
4    0
Name: 1, dtype: uint8

In [207]:
# Remove the original 'reason' column from the working frame.
df = df.drop(columns=['reason'])

In [208]:
# Append the four reason indicator Series to the frame as additional columns
# (column-wise concatenation).
df = pd.concat(
    [df, reasons_type_1, reasons_type_2, reasons_type_3, reasons_type_4],
    axis='columns',
)
df.head()

Unnamed: 0,grade,age,date,subject,teacher,absence,1,2,3,4
0,4,9,03/01/2022,F,F1,1,1,0,0,0
1,4,9,03/01/2022,A,A1,1,0,1,0,0
2,5,10,04/01/2022,M,M1,0,0,0,0,0
3,4,8,04/01/2022,M,M1,0,0,0,0,0
4,5,10,05/01/2022,M,M1,0,0,0,0,0


In [209]:
# Column labels to apply to the frame: the six original fields followed by
# one label per reason indicator (Reasons_1 .. Reasons_4).
column_names = ['grade', 'age', 'date', 'subject', 'teacher', 'absence']
column_names += ['Reasons_' + str(reason_code) for reason_code in range(1, 5)]


In [210]:
# Relabel every column positionally: column_names must contain exactly one
# label per existing column, in the frame's current column order.
df.columns = column_names

In [211]:
# Confirm the new column labels.
df.head()

Unnamed: 0,grade,age,date,subject,teacher,absence,Reasons_1,Reasons_2,Reasons_3,Reasons_4
0,4,9,03/01/2022,F,F1,1,1,0,0,0
1,4,9,03/01/2022,A,A1,1,0,1,0,0
2,5,10,04/01/2022,M,M1,0,0,0,0,0
3,4,8,04/01/2022,M,M1,0,0,0,0,0
4,5,10,05/01/2022,M,M1,0,0,0,0,0


### Extracting month from date

In [212]:
# Checkpoint: do the date processing on a separate copy of df.
df_reason_mod = df.copy()

In [213]:
# Inspect the Python type of the 'date' value at index label 0 before conversion.
type(df_reason_mod['date'][0])

str

In [214]:
# Parse the 'date' strings into pandas datetimes. The explicit format is
# day/month/four-digit-year, so '03/01/2022' is read as 3 January 2022.
df_reason_mod['date'] = pd.to_datetime(df_reason_mod['date'], format = '%d/%m/%Y')

In [215]:
# Check the converted column (values and dtype).
df_reason_mod['date'].head()

0   2022-01-03
1   2022-01-03
2   2022-01-04
3   2022-01-04
4   2022-01-05
Name: date, dtype: datetime64[ns]

In [216]:
# Inspect the Python type of the 'date' value at index label 0 after conversion.
type(df_reason_mod['date'][0])

pandas._libs.tslibs.timestamps.Timestamp

In [217]:
# Month number (1-12) of every date, taken with the vectorized .dt accessor
# instead of a per-row Python loop. Unlike label lookups of the form
# ['date'][i], this does not require the index to be the default 0..n-1 range.
# .tolist() keeps `months` a plain Python list of ints, in row order.
months = df_reason_mod['date'].dt.month.tolist()

In [218]:
# Add the extracted month numbers as a new 'Month Value' column (one value per row, in row order).
df_reason_mod['Month Value'] = months

In [219]:
# Confirm the new 'Month Value' column.
df_reason_mod.head()

Unnamed: 0,grade,age,date,subject,teacher,absence,Reasons_1,Reasons_2,Reasons_3,Reasons_4,Month Value
0,4,9,2022-01-03,F,F1,1,1,0,0,0,1
1,4,9,2022-01-03,A,A1,1,0,1,0,0,1
2,5,10,2022-01-04,M,M1,0,0,0,0,0,1
3,4,8,2022-01-04,M,M1,0,0,0,0,0,1
4,5,10,2022-01-05,M,M1,0,0,0,0,0,1


### Extracting weekday from date

In [220]:
# Day of the week for every date (Monday = 0 ... Sunday = 6), taken with the
# vectorized .dt accessor instead of a per-row Python loop. Unlike label
# lookups of the form ['date'][i], this does not require the index to be the
# default 0..n-1 range.
# .tolist() keeps `days` a plain Python list of ints, in row order.
days = df_reason_mod['date'].dt.weekday.tolist()

In [221]:
# Add the extracted weekday numbers as a new 'day of the week' column (one value per row, in row order).
df_reason_mod['day of the week'] = days

In [222]:
# Confirm the new 'day of the week' column.
df_reason_mod.head()

Unnamed: 0,grade,age,date,subject,teacher,absence,Reasons_1,Reasons_2,Reasons_3,Reasons_4,Month Value,day of the week
0,4,9,2022-01-03,F,F1,1,1,0,0,0,1,0
1,4,9,2022-01-03,A,A1,1,0,1,0,0,1,0
2,5,10,2022-01-04,M,M1,0,0,0,0,0,1,1
3,4,8,2022-01-04,M,M1,0,0,0,0,0,1,1
4,5,10,2022-01-05,M,M1,0,0,0,0,0,1,2


In [228]:
# Checkpoint: take an independent copy, like the earlier checkpoints do.
# A plain assignment would only bind a second name to the same object, so
# relabelling df's columns afterwards would also relabel df_reason_mod.
df = df_reason_mod.copy()
# Preview the checkpointed frame (the recorded output of this cell is this table).
df.head()

Unnamed: 0,grade,age,date,subject,teacher,absence,Reasons_1,Reasons_2,Reasons_3,Reasons_4,Month Value,day of the week
0,4,9,2022-01-03,F,F1,1,1,0,0,0,1,0
1,4,9,2022-01-03,A,A1,1,0,1,0,0,1,0
2,5,10,2022-01-04,M,M1,0,0,0,0,0,1,1
3,4,8,2022-01-04,M,M1,0,0,0,0,0,1,1
4,5,10,2022-01-05,M,M1,0,0,0,0,0,1,2


In [229]:
# Presentation labels, listed in the frame's current column order: the six
# base fields, the four reason indicators, then the two derived date fields.
column_names = [
    'Grade', 'Age', 'Date', 'Subject', 'Teacher', 'Absent',
    *('Reason ' + str(reason_number) for reason_number in range(1, 5)),
    'Month', 'Weekday',
]

In [230]:
# Relabel every column positionally: column_names must contain exactly one
# label per existing column, in the frame's current column order.
df.columns = column_names

In [231]:
# Desired column order: the date-derived fields (Month, Weekday) are moved
# directly after Date; the reason indicators stay at the end.
column_names_reordered = [
    'Grade', 'Age', 'Date', 'Month', 'Weekday', 'Subject', 'Teacher', 'Absent',
]
column_names_reordered.extend(
    'Reason ' + str(reason_number) for reason_number in range(1, 5)
)

In [232]:
# Reorder the columns by selecting them in the desired order; a label in the
# list that is not a column of df raises a KeyError.
df= df[column_names_reordered]

In [233]:
# Final check of the preprocessed frame: new labels and new column order.
df.head()

Unnamed: 0,Grade,Age,Date,Month,Weekday,Subject,Teacher,Absent,Reason 1,Reason 2,Reason 3,Reason 4
0,4,9,2022-01-03,1,0,F,F1,1,1,0,0,0
1,4,9,2022-01-03,1,0,A,A1,1,0,1,0,0
2,5,10,2022-01-04,1,1,M,M1,0,0,0,0,0
3,4,8,2022-01-04,1,1,M,M1,0,0,0,0,0
4,5,10,2022-01-05,1,2,M,M1,0,0,0,0,0
