In [None]:
# Import libraries and install the requirements
!pip install -r requirements.txt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load the training and test splits from CSV files in the working directory
train, test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

In [None]:
# Class balance of the target: absolute counts (left panel) and shares (right panel)
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(14, 7))
count_ax, pie_ax = axs

sns.countplot(data=train, y='No-show', ax=count_ax)
count_ax.set_title("Frequency of No-show values")

no_show_counts = train['No-show'].value_counts()
no_show_counts.plot.pie(ax=pie_ax, autopct='%1.2f%%')
pie_ax.set_title("Percentage of No-show values")

plt.show()

In [None]:
# Remove the identifier columns from both splits
ID_COLUMNS = ['PatientId', 'AppointmentID']
train = train.drop(columns=ID_COLUMNS)
test = test.drop(columns=ID_COLUMNS)

In [None]:
# Cast the flag-like columns to object dtype, in train first and then in test
FLAG_COLUMNS = ['Scholarship', 'Hipertension', 'Diabetes', 'Alcoholism', 'Handcap', 'SMS_received']
for frame in (train, test):
    for flag_column in FLAG_COLUMNS:
        frame[flag_column] = frame[flag_column].astype('object')

In [None]:
# Parse both date columns and keep only the calendar date (time of day is dropped,
# the result is stored as datetime64[ns] at midnight)
for frame in (train, test):
    for date_column in ('ScheduledDay', 'AppointmentDay'):
        parsed = pd.to_datetime(frame[date_column])
        frame[date_column] = parsed.dt.date.astype('datetime64[ns]')

In [None]:
# Sanity check of the type changes: print the column dtypes and non-null counts
# of the test split
test.info()

In [None]:
# Sanity check: a patient with Age <= 0 is a baby and should not be flagged with
# hypertension, diabetes or alcoholism. Display any train rows that contradict this.
age_not_positive = train.Age <= 0
flagged_hipertension = train.Hipertension.astype(int) == 1
flagged_diabetes = train.Diabetes.astype(int) == 1
flagged_alcoholism = train.Alcoholism.astype(int) == 1
train[age_not_positive & (flagged_hipertension | flagged_diabetes | flagged_alcoholism)]

In [None]:
# Weekday name of the scheduling date and of the appointment date, for both splits
WEEKDAY_FEATURES = {'WeekDayOfSchedule': 'ScheduledDay', 'WeekDayOfAppointment': 'AppointmentDay'}
for frame in (train, test):
    for weekday_column, date_column in WEEKDAY_FEATURES.items():
        frame[weekday_column] = frame[date_column].dt.day_name()

In [None]:
# Waiting time, in whole days, between the scheduling date and the appointment date
train['WaitingDays'] = (train['AppointmentDay'] - train['ScheduledDay']).dt.days
test['WaitingDays'] = (test['AppointmentDay'] - test['ScheduledDay']).dt.days

In [None]:
# Time travel not allowed: discard rows whose appointment precedes the scheduling date
train_negative_wait = train[train.WaitingDays < 0].index
test_negative_wait = test[test.WaitingDays < 0].index
train = train.drop(index=train_negative_wait)
test = test.drop(index=test_negative_wait)

In [None]:
# Give the full-date columns clearer "...Date" names; the "...Day" names are
# reused further down for the day-of-month columns
DATE_COLUMN_RENAMES = {"ScheduledDay": "ScheduledDate", "AppointmentDay": "AppointmentDate"}
for frame in (train, test):
    frame.rename(columns=DATE_COLUMN_RENAMES, inplace=True)

In [None]:
# Split each train date into year / month / day-of-month columns, then drop the
# date column itself. Column creation order is kept (Scheduled* before Appointment*).
for prefix in ('Scheduled', 'Appointment'):
    date_column = prefix + 'Date'
    date_parts = train[date_column].dt
    train[prefix + 'Year'] = date_parts.year
    train[prefix + 'Month'] = date_parts.month
    train[prefix + 'Day'] = date_parts.day
    train.drop(columns=[date_column], inplace=True)


In [None]:
# Split each test date into year / month / day-of-month columns, then drop the
# date column itself. Column creation order is kept (Scheduled* before Appointment*).
for prefix in ('Scheduled', 'Appointment'):
    date_column = prefix + 'Date'
    date_parts = test[date_column].dt
    test[prefix + 'Year'] = date_parts.year
    test[prefix + 'Month'] = date_parts.month
    test[prefix + 'Day'] = date_parts.day
    test.drop(columns=[date_column], inplace=True)


In [None]:
# Features correlation (numeric columns only)
# `train` still holds string columns at this point (e.g. the weekday names produced
# by day_name()), and DataFrame.corr() raises on non-numeric data in pandas >= 2.0.
# Selecting the numeric columns first works on old and new pandas alike; the
# object-dtype flag columns are left out, as older pandas did implicitly.
fig, ax = plt.subplots(figsize=[12,9])
cor = train.select_dtypes(include='number').corr()
# Boolean mask hiding the upper triangle (diagonal included): the matrix is
# symmetric, so the lower triangle already shows every pair once.
mask = np.triu(np.ones_like(cor, dtype=bool))
cmap = sns.diverging_palette(220, 10, as_cmap=True)
ax = sns.heatmap(cor, xticklabels=cor.columns, yticklabels=cor.columns,
            annot=True, cmap=cmap, mask=mask, ax=ax)
ax.set_title("Correlation between numeric features");

In [None]:
# Age division into categories for train
# NOTE(review): this whole cell is commented out, so no "AgeGroup" column is created.
# If re-enabled, it would map Age to a life-stage label (labels are in Italian),
# one row at a time via DataFrame.apply. A negative Age matches no branch and
# would yield None.
#def age_div_train(train) :
#    if train["Age"] == 0 :
#        return "Neonati"
#    elif (train["Age"] > 0) & (train["Age"] <= 6 ):
#        return "Seconda infanzia"
#    elif (train["Age"] > 6) & (train["Age"] <= 12 ):
#        return "Terza infanzia"
#    elif (train["Age"] > 12) & (train["Age"] <= 22):
#        return "Adolescenza"
#    elif (train["Age"] > 22) & (train["Age"] <= 39):
#        return "Prima età adulta"
#    elif (train["Age"] > 39) & (train["Age"] <= 59):
#        return "Seconda età adulta"
#    elif (train["Age"] > 59) & (train["Age"] <= 75):
#        return "Terza età"
#    elif (train["Age"] > 75) & (train["Age"] <= 90):
#        return "Quarta età"
#    elif train["Age"] > 90:
#        return "Quinta età"
#train["AgeGroup"] = train.apply(lambda train:age_div_train(train),axis = 1)

In [None]:
# Age division into categories for test
# NOTE(review): this whole cell is commented out, so no "AgeGroup" column is created.
# If re-enabled, it would map Age to a life-stage label (labels are in Italian),
# one row at a time via DataFrame.apply. A negative Age matches no branch and
# would yield None.
#def age_div_test(test) :
#    if test["Age"] == 0 :
#        return "Neonati"
#    elif (test["Age"] > 0) & (test["Age"] <= 6 ):
#        return "Seconda infanzia"
#    elif (test["Age"] > 6) & (test["Age"] <= 12 ):
#        return "Terza infanzia"
#    elif (test["Age"] > 12) & (test["Age"] <= 22):
#        return "Adolescenza"
#    elif (test["Age"] > 22) & (test["Age"] <= 39):
#        return "Prima età adulta"
#    elif (test["Age"] > 39) & (test["Age"] <= 59):
#        return "Seconda età adulta"
#    elif (test["Age"] > 59) & (test["Age"] <= 75):
#        return "Terza età"
#    elif (test["Age"] > 75) & (test["Age"] <= 90):
#        return "Quarta età"
#    elif test["Age"] > 90:
#        return "Quinta età"
#test["AgeGroup"] = test.apply(lambda test:age_div_test(test),axis = 1)

In [None]:
# Drop Age
# NOTE(review): disabled, so Age stays in both frames as a feature. Presumably
# meant to run only together with the commented-out AgeGroup cells; confirm
# before re-enabling.
#train.drop(['Age'], axis=1, inplace=True)
#test.drop(['Age'], axis=1, inplace=True)

In [None]:
# Encoding train and test with a shared mapping
# One LabelEncoder per column is fitted on the values of BOTH splits, so a given
# category always receives the same integer code in train and in test. Fitting a
# separate encoder on each split assigns codes from that split's own sorted
# categories, which silently shifts the codes whenever a category (for example a
# rare neighbourhood or weekday) is missing from one of the splits; the model
# would then be evaluated on features whose codes mean something else.
ENCODED_COLUMNS = ['Gender', 'Neighbourhood', 'WeekDayOfSchedule', 'WeekDayOfAppointment', 'No-show']
for column in ENCODED_COLUMNS:
    # The variable name `labelEcnoder` (sic) is kept unchanged in case other
    # cells refer to it; after the loop it holds the 'No-show' encoder.
    labelEcnoder = LabelEncoder()
    labelEcnoder.fit(pd.concat([train[column], test[column]]))
    train[column] = labelEcnoder.transform(train[column])
    test[column] = labelEcnoder.transform(test[column])

#train['AgeGroup'] = labelEcnoder.fit_transform(train['AgeGroup'])
#test['AgeGroup'] = labelEcnoder.fit_transform(test['AgeGroup'])

In [None]:
# Split into independent variables (X) and the dependent variable (y)
TARGET = 'No-show'
X_train, y_train = train.drop(columns=[TARGET]), train[TARGET]
X_test, y_test = test.drop(columns=[TARGET]), test[TARGET]

# Random Forest

L'algoritmo Random Forest sfrutta il "bagging" come metodo principale di ensemble, che rappresenta una tecnica semplice, ma potente, per combinare più previsioni di algoritmi di apprendimento automatico così da avere risultati più accurati rispetto al modello preso singolarmente. Come modello individuale il RF si avvale dell'albero decisionale e in effetti una "random forest" combina più alberi decisionali in un unico modello, così da avere delle previsioni che in media si avvicinano più al risultato. Il risultato finale restituito da questo algoritmo varia in base al suo utilizzo: se si tratta di un problema di regressione, il risultato è la media del risultato numerico restituito dai diversi alberi, se invece si tratta di un problema di classificazione, sarà la classe restituita dal maggior numero di alberi.
Intuitivamente il RF non è altro che un approccio molto adottato nella vita reale, cioè il far affidamento su più fonti diverse: una foresta (molte fonti) è meglio di un singolo albero (singola fonte).

In [None]:
# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Forest of 100 trees (n_estimators), each limited to a depth of 14 (max_depth);
# random_state pins the seed so repeated runs give the same forest
model = RandomForestClassifier(n_estimators=100, max_depth=14, random_state=0)
y_pred = model.fit(X_train, y_train).predict(X_test)

# Percentage of test rows whose prediction matches the true label
# NOTE(review): accuracy_score is imported above but not used by this cell
accuracy_pct = np.mean((y_pred==y_test)*100)
print('Using Random Forest we have an accuracy of', accuracy_pct ,'%')

In [None]:
# Importance features graph: the 18 largest importances of the fitted forest,
# drawn as horizontal bars labelled with the feature names
feature_importances = pd.Series(model.feature_importances_, index=X_train.columns)
feature_importances.nlargest(18).plot(kind='barh')