In [37]:
import numpy as np
import pandas as pd


data = pd.read_csv(r'..\data\processed\cleaned-data.csv')
data.head()

Unnamed: 0,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
0,F,2016-04-28 07:35:20+00:00,2016-05-04 00:00:00+00:00,76,FORTE SÃO JOÃO,0,0,0,0,1,1,No
1,F,2016-05-05 07:17:25+00:00,2016-05-11 00:00:00+00:00,67,ROMÃO,0,0,0,0,1,1,No
2,F,2016-05-12 07:31:37+00:00,2016-05-18 00:00:00+00:00,39,ROMÃO,0,0,0,0,1,0,No
3,M,2016-05-19 08:12:01+00:00,2016-05-25 00:00:00+00:00,76,FORTE SÃO JOÃO,0,1,0,0,1,1,Yes
4,F,2016-04-28 07:35:33+00:00,2016-05-04 00:00:00+00:00,43,BONFIM,1,0,0,0,1,1,No


- We cannot use `ScheduledDay` & `AppointmentDay` columns together because this is in raw formate, that's we can extract other meaningfull insights from it like `waiting time` which is difference between appointment day and schedule day.

In [38]:
data['ScheduledDay'] = pd.to_datetime(data['ScheduledDay'])
data['AppointmentDay'] = pd.to_datetime(data['AppointmentDay'])

data['WaitingTime'] = (data['AppointmentDay'] - data['ScheduledDay']).dt.days

- Also we can create another column `Appoint Day of Week` which will show how many people are no-show on a particular day, because generally most of the people are absent on weekends.

In [39]:
data['AppointmentDayOfWeek'] = data['AppointmentDay'].dt.dayofweek  # 0 means monday, 6 sunday

- And, with the help of these columns, we will extract another feature `Wait Groups` which is categorical column and help model for best split because this is a classification problem.

In [None]:
data['WaitGroup'] = pd.cut(data['WaitingTime'],
                         bins=[-1, 0, 3, 7, 30, 120],
                         labels=['SameDay', 'Short', 'Mid', 'Long', 'VeryLong'])

- We can also create chronic status by checking the patient's `hipertension`, `diabetes` and `alcoholism`. If the patient has all these three points, then the chances of that show of pateints increases.

In [41]:
data['ChronicCount'] = data['Hipertension'] + data['Diabetes'] + data['Alcoholism']

In [42]:
data['ChronicGroup'] = pd.cut(data['ChronicCount'],
                            bins=[-1, 0, 1, 3],
                            labels=['None', 'Single', 'Multiple'])

- And lastly we will Label Encode all these columns, because machine learning models only understand numerical data. 

In [None]:
from sklearn.preprocessing import LabelEncoder

label_cols = ['Gender', 'Neighbourhood', 'AppointmentDayOfWeek', 'WaitGroup', 'ChronicGroup']
encoders = {}  # encoders['Gender'].transform(['Male'])  # Output:1 

for col in label_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    encoders[col] = le

#### We save all the encoders for encoding input feature while predicting.

In [44]:
encoders

{'Gender': LabelEncoder(),
 'Neighbourhood': LabelEncoder(),
 'AppointmentDayOfWeek': LabelEncoder(),
 'WaitGroup': LabelEncoder(),
 'ChronicGroup': LabelEncoder()}

In [45]:
data.drop(columns=['ScheduledDay', 'AppointmentDay', 'ChronicCount'], inplace=True)

In [46]:
data.head()

Unnamed: 0,Gender,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show,WaitingTime,AppointmentDayOfWeek,WaitGroup,ChronicGroup
0,0,76,23,0,0,0,0,1,1,No,5,2,1,1
1,0,67,60,0,0,0,0,1,1,No,5,2,1,1
2,0,39,60,0,0,0,0,1,0,No,5,2,1,1
3,1,76,23,0,1,0,0,1,1,Yes,5,2,1,2
4,0,43,8,1,0,0,0,1,1,No,5,2,1,1


# Exporting Processed Data

In [47]:
data.to_csv('../data/processed/processed_data.csv', index=False, encoding='utf-8')