In [28]:
from tpot import TPOTRegressor
from tpot import TPOTClassifier
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [22]:
# Load the data file
file_path = '../data/patient_no_show_dataset.csv'
df = pd.read_csv(file_path)

In [23]:
# Label encoding
label_encoder = LabelEncoder()
df_le = df

columns_to_encode = ['gender', 'Ethnicity', 'Socioeconomic Status', 'AppointmentNoshow']
for column in columns_to_encode:
    df_le[f'{column}_encoded'] = label_encoder.fit_transform(df_le[column])
df_le = df_le.drop(columns=columns_to_encode)

In [24]:
print(df_le.dtypes)

Patient ID                        int64
Age                               int64
Distance to Facility            float64
Previous No-shows                 int64
gender_encoded                    int64
Ethnicity_encoded                 int64
Socioeconomic Status_encoded      int64
AppointmentNoshow_encoded         int64
dtype: object


In [25]:
X = df_le.drop('AppointmentNoshow_encoded', axis=1)
y = df_le.AppointmentNoshow_encoded
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, test_size=0.25, random_state=42)

In [29]:
# Initialize TPOTClassifier
tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2, random_state=42)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
tpot.export('tpot_patient_no_show_pipeline_classifier.ipynb')

                                                                             
Generation 1 - Current best internal CV score: 0.5933333333333334
                                                                              
Generation 2 - Current best internal CV score: 0.6
                                                                              
Generation 3 - Current best internal CV score: 0.6026666666666667
                                                                              
Generation 4 - Current best internal CV score: 0.6026666666666667
                                                                              
Generation 5 - Current best internal CV score: 0.6026666666666667
                                                                              
Best pipeline: ExtraTreesClassifier(input_matrix, bootstrap=False, criterion=entropy, max_features=0.35000000000000003, min_samples_leaf=5, min_samples_split=14, n_estimators=100)
0.588


In [26]:

# Initialize TPOTRegressor
tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2, random_state=42)

# Fit TPOTRegressor on the training data
tpot.fit(X_train, y_train)

# Evaluate the best pipeline on the test data
print(tpot.score(X_test, y_test))

# Export the optimized pipeline as a Python script
tpot.export('tpot_patient_no_show_pipeline_regression.ipynb')

                                                                              
Generation 1 - Current best internal CV score: -0.2420068337964348
                                                                              
Generation 2 - Current best internal CV score: -0.2405730695909094
                                                                              
Generation 3 - Current best internal CV score: -0.24038339906887032
                                                                              
Generation 4 - Current best internal CV score: -0.23993073836145662
                                                                              
Generation 5 - Current best internal CV score: -0.23993073836145662
                                                                              
Best pipeline: ExtraTreesRegressor(input_matrix, bootstrap=True, max_features=0.8500000000000001, min_samples_leaf=10, min_samples_split=5, n_estimators=100)
-0.24139291746889552
