In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score
from datetime import datetime
from sklearn.linear_model import LogisticRegression

Lettura dei dati del Dataset di Train e Test

In [4]:
train_Data = pd.read_csv('train.csv')

In [5]:
test_Data = pd.read_csv('test.csv')

Effettuo controllo dei dati di Train e Test

In [6]:
train_Data.isnull().sum(axis=0).reset_index()

Unnamed: 0,index,0
0,No-show,0
1,PatientId,0
2,AppointmentID,0
3,Gender,0
4,ScheduledDay,0
5,AppointmentDay,0
6,Age,0
7,Neighbourhood,0
8,Scholarship,0
9,Hipertension,0


In [7]:
test_Data.isnull().sum(axis=0).reset_index()

Unnamed: 0,index,0
0,No-show,0
1,PatientId,0
2,AppointmentID,0
3,Gender,0
4,ScheduledDay,0
5,AppointmentDay,0
6,Age,0
7,Neighbourhood,0
8,Scholarship,0
9,Hipertension,0


Effettuo ulteriore controllo a TrainData e TestData

Vedendo il Dataset mi accorgo che ScheduledDay e AppointmentDay sono correlate con la feature No-show, creo una colonna chiamata "Differenzagiorni" che dovrà contenere i giorni che il paziente deve aspettare prima di essere visitato, inoltre verranno cancellate PatientId e AppointmentID perchè poco importanti al contesto

In [8]:
train_Data[['ScheduledDay', 'AppointmentDay']] = train_Data[['ScheduledDay', 'AppointmentDay']].apply(pd.to_datetime)

In [9]:
train_Data['Differenzagiorni'] = train_Data["AppointmentDay"].sub(train_Data["ScheduledDay"], axis=0)

In [10]:
train_Data = train_Data.drop(columns=['ScheduledDay', 'AppointmentDay','PatientId','AppointmentID'])

In [11]:
#Converto la differenza dei giorni in tipo intero
train_Data["Differenzagiorni"] = (train_Data["Differenzagiorni"]).abs().dt.days

In [12]:
train_Data

Unnamed: 0,No-show,Gender,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,Differenzagiorni
0,No,F,60,SÃO CRISTÓVÃO,0,1,0,0,0,0,27
1,No,F,80,SANTA HELENA,0,0,1,0,0,0,0
2,No,M,61,JARDIM DA PENHA,0,1,0,0,0,0,0
3,No,F,5,ANDORINHAS,0,0,0,0,0,0,27
4,No,M,60,CENTRO,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
66310,No,M,89,ITARARÉ,0,1,0,0,0,0,0
66311,Yes,F,40,RESISTÊNCIA,1,0,0,0,0,1,28
66312,No,F,47,JARDIM DA PENHA,0,0,0,0,0,0,0
66313,No,F,58,MARIA ORTIZ,0,0,0,0,0,0,1


In [13]:
test_Data[['ScheduledDay', 'AppointmentDay']] = test_Data[['ScheduledDay', 'AppointmentDay']].apply(pd.to_datetime)

In [14]:
test_Data['Differenzagiorni'] = test_Data["AppointmentDay"].sub(test_Data["ScheduledDay"], axis=0)

In [15]:
test_Data = test_Data.drop(columns=['ScheduledDay', 'AppointmentDay','PatientId','AppointmentID'])

In [16]:
#Converto la differenza dei giorni in tipo intero
test_Data["Differenzagiorni"] = (test_Data["Differenzagiorni"]).abs().dt.days

In [17]:
test_Data

Unnamed: 0,No-show,Gender,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,Differenzagiorni
0,No,F,37,MARIA ORTIZ,0,0,0,0,0,1,36
1,No,M,7,GURIGICA,0,0,0,0,0,1,21
2,Yes,F,54,CARATOÍRA,0,1,0,0,0,0,1
3,Yes,M,30,ITARARÉ,0,0,0,0,0,1,21
4,Yes,M,36,DA PENHA,0,0,0,0,0,1,6
...,...,...,...,...,...,...,...,...,...,...,...
22101,No,M,0,DO MOSCOSO,0,0,0,0,0,0,0
22102,Yes,M,48,RESISTÊNCIA,0,1,0,0,0,0,22
22103,No,F,52,JARDIM DA PENHA,0,1,0,0,0,1,8
22104,No,M,62,JABOUR,0,1,0,0,0,1,27


Il dataset contiene valori non numerici, bisogna trasformare tutte le features in valori numerici per poter applicare gli algoritmi di classificazione.

In [18]:
s=(train_Data.dtypes == 'object')
object_cols = list(s[s].index)

print (object_cols)

['No-show', 'Gender', 'Neighbourhood']


In [19]:
label_train_data=train_Data.copy()
label_encoder=LabelEncoder()
for k in object_cols:
    label_train_data[k]=label_encoder.fit_transform(label_train_data[k])

In [20]:
label_train_data

Unnamed: 0,No-show,Gender,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,Differenzagiorni
0,0,0,60,74,0,1,0,0,0,0,27
1,0,0,80,62,0,0,1,0,0,0,0
2,0,1,61,39,0,1,0,0,0,0,0
3,0,0,5,1,0,0,0,0,0,0,27
4,0,1,60,10,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
66310,0,1,89,36,0,1,0,0,0,0,0
66311,1,0,40,58,1,0,0,0,0,1,28
66312,0,0,47,39,0,0,0,0,0,0,0
66313,0,0,58,43,0,0,0,0,0,0,1


In [21]:
s=(test_Data.dtypes == 'object')
object_cols = list(s[s].index)

print (object_cols)

['No-show', 'Gender', 'Neighbourhood']


In [22]:
label_test_data=test_Data.copy()
label_encoder=LabelEncoder()
for k in object_cols:
    label_test_data[k]=label_encoder.fit_transform(label_test_data[k])

In [23]:
label_test_data

Unnamed: 0,No-show,Gender,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,Differenzagiorni
0,0,0,37,42,0,0,0,0,0,1,36
1,0,1,7,26,0,0,0,0,0,1,21
2,1,0,54,8,0,1,0,0,0,0,1
3,1,1,30,35,0,0,0,0,0,1,21
4,1,1,36,14,0,0,0,0,0,1,6
...,...,...,...,...,...,...,...,...,...,...,...
22101,0,1,0,17,0,0,0,0,0,0,0
22102,1,1,48,58,0,1,0,0,0,0,22
22103,0,0,52,38,0,1,0,0,0,1,8
22104,0,1,62,36,0,1,0,0,0,1,27


Inizializzo il Modello

In [24]:
Train_x = label_train_data.drop(columns = ['No-show'])

In [25]:
Test_x = label_test_data.drop(columns = ['No-show'])

In [26]:
train_y = label_train_data["No-show"]

In [27]:
test_y = label_test_data["No-show"]

In [28]:
#Definisco il modello 
model=LogisticRegression(max_iter=1000, C=100)
#Alleno il modello
model.fit(Train_x, train_y)

LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [29]:
Training_predict = model.predict(Train_x)
Test_predict = model.predict(Test_x)

In [30]:
Train_Accurancy=accuracy_score(train_y,Training_predict.round())*100
Train_Accurancy

79.6516625197919

In [31]:
Test_Accurancy=accuracy_score(test_y,Test_predict.round())*100
Test_Accurancy

78.93332127024337