In [71]:

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score

In [72]:
# Read the appointment records from the local CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer='noshow.csv')

In [73]:
# Preview the first five rows as a quick sanity check of the load.
df.head(n=5)

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
0,29872500000000.0,5642903,F,2016-04-29T18:38:08Z,2016-04-29T00:00:00Z,62,JARDIM DA PENHA,0,1,0,0,0,0,No
1,558997800000000.0,5642503,M,2016-04-29T16:08:27Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,0,0,0,0,0,No
2,4262962000000.0,5642549,F,2016-04-29T16:19:04Z,2016-04-29T00:00:00Z,62,MATA DA PRAIA,0,0,0,0,0,0,No
3,867951200000.0,5642828,F,2016-04-29T17:29:31Z,2016-04-29T00:00:00Z,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No
4,8841186000000.0,5642494,F,2016-04-29T16:07:23Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,1,1,0,0,0,No


In [74]:
# Dimensions of the frame as (rows, columns).
df.shape

(110527, 14)

In [75]:
# Continue with a seeded 20,000-row random subsample.
# NOTE(review): this rebinds `df`, so re-running the cell samples from the
# already-sampled frame rather than from the freshly loaded data.
df = df.sample(
    n=20000,
    random_state=42,
)


In [76]:
# List the column labels available in the frame.
df.columns


Index(['PatientId', 'AppointmentID', 'Gender', 'ScheduledDay',
       'AppointmentDay', 'Age', 'Neighbourhood', 'Scholarship', 'Hipertension',
       'Diabetes', 'Alcoholism', 'Handcap', 'SMS_received', 'No-show'],
      dtype='object')

In [77]:
# Print per-column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20000 entries, 84674 to 71671
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   PatientId       20000 non-null  float64
 1   AppointmentID   20000 non-null  int64  
 2   Gender          20000 non-null  object 
 3   ScheduledDay    20000 non-null  object 
 4   AppointmentDay  20000 non-null  object 
 5   Age             20000 non-null  int64  
 6   Neighbourhood   20000 non-null  object 
 7   Scholarship     20000 non-null  int64  
 8   Hipertension    20000 non-null  int64  
 9   Diabetes        20000 non-null  int64  
 10  Alcoholism      20000 non-null  int64  
 11  Handcap         20000 non-null  int64  
 12  SMS_received    20000 non-null  int64  
 13  No-show         20000 non-null  object 
dtypes: float64(1), int64(8), object(5)
memory usage: 2.3+ MB


In [78]:
# Count missing values in each column.
df.isna().sum()

PatientId         0
AppointmentID     0
Gender            0
ScheduledDay      0
AppointmentDay    0
Age               0
Neighbourhood     0
Scholarship       0
Hipertension      0
Diabetes          0
Alcoholism        0
Handcap           0
SMS_received      0
No-show           0
dtype: int64

In [79]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

np.int64(0)

In [80]:
# Generate an automated profiling report for `df` and export it as HTML.
import ydata_profiling

profile = ydata_profiling.ProfileReport(df)
profile.to_file(output_file="report.html")

100%|██████████| 14/14 [00:03<00:00,  4.52it/s]4<00:00,  7.40it/s, Describe variable: No-show]      
Summarize dataset: 100%|██████████| 32/32 [00:15<00:00,  2.01it/s, Completed]                           
Generate report structure: 100%|██████████| 1/1 [00:17<00:00, 17.26s/it]
Render HTML: 100%|██████████| 1/1 [00:01<00:00,  1.00s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 33.34it/s]


In [81]:
# Render the profiling report inline as the cell output.
profile



In [82]:
# Waiting time in whole calendar days between booking and appointment.
#
# In the rows shown by `df.head()`, AppointmentDay is stamped at midnight while
# ScheduledDay carries the actual time of day. Subtracting the raw timestamps
# and taking `.dt.days` floors the timedelta, so a same-day booking
# (00:00 minus 18:38) came out as -1 and every other wait was one day short.
# Truncating both sides to midnight first gives a true date difference.
#
# NOTE(review): rows whose appointment date precedes the scheduling date would
# still be negative here; they are not filtered — confirm how to treat them.
appointment_date = pd.to_datetime(df['AppointmentDay']).dt.normalize()
scheduled_date = pd.to_datetime(df['ScheduledDay']).dt.normalize()
df['Waiting_time'] = (appointment_date - scheduled_date).dt.days

In [94]:
# Feature matrix: everything except the target, the identifier columns, the
# raw timestamp strings, and the two columns left out of the model.
excluded_columns = [
    'No-show',
    'PatientId',
    'AppointmentID',
    'ScheduledDay',
    'AppointmentDay',
    'Alcoholism',
    'Neighbourhood',
]
X = df.drop(columns=excluded_columns)

# Binary target: 'Yes' is encoded as 1 and 'No' as 0.
label_to_int = {'No': 0, 'Yes': 1}
y = df['No-show'].map(label_to_int)

In [95]:
# Names of the int64/float64 feature columns (fed to the numeric sub-pipeline).
numeric_frame = X.select_dtypes(include=['int64', 'float64'])
numeric_features = numeric_frame.columns
print("Numeric features:", numeric_features)

Numeric features: Index(['Age', 'Scholarship', 'Hipertension', 'Diabetes', 'Handcap',
       'SMS_received', 'Waiting_time'],
      dtype='object')


In [96]:
# Names of the object-dtype feature columns (fed to the categorical sub-pipeline).
categorical_frame = X.select_dtypes(include=['object'])
categorical_features = categorical_frame.columns
print("Categorical features:", categorical_features)

Categorical features: Index(['Gender'], dtype='object')


In [97]:
# Numeric columns: fill gaps with the column median, then standardize.
numeric_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)


In [98]:
# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

In [99]:
# Route each column group through its matching sub-pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, numeric_features),
        ("cat", categorical_pipeline, categorical_features),
    ]
)

In [None]:
# Build one preprocessing -> SMOTE -> classifier pipeline per candidate model.
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.base import clone


def _build_pipeline(model):
    """Wrap `model` in a preprocessing + SMOTE pipeline.

    Each pipeline receives its own unfitted copy of `preprocessor`. A pipeline
    fits the step objects it is given in place, so passing the same instance to
    both pipelines would make the second `fit` silently refit the transformer
    held by the first.

    NOTE(review): SMOTE runs on the preprocessed matrix, which includes one-hot
    and 0/1 indicator columns; interpolating those yields non-binary values.
    Consider SMOTENC if that matters — confirm.
    """
    return ImbPipeline([
        ("preprocessing", clone(preprocessor)),
        ("smote", SMOTE(random_state=42)),
        ("model", model),
    ])


# Define two separate pipelines
pipeline_lgbm = _build_pipeline(LGBMClassifier(random_state=42))
# `use_label_encoder` is deprecated in XGBoost and no longer has any effect, so
# it is not passed; the target is already encoded as 0/1 integers.
pipeline_xgb = _build_pipeline(XGBClassifier(random_state=42, eval_metric='logloss'))

In [104]:
# Hold out 20% of the rows for testing, keeping the class ratio of `y` in both
# splits; the seed makes the split repeatable for a given row order.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Integrate SMOTE into the pipelines using imblearn.pipeline.Pipeline
# Note: SMOTE will be applied to the preprocessed features during fit
# Class balance of the training labels before any resampling.
print("Before SMOTE:", y_train.value_counts())

# Train both candidates; SMOTE runs inside each pipeline during fit, on the
# preprocessed training features only.
pipeline_lgbm.fit(X_train, y_train)
pipeline_xgb.fit(X_train, y_train)

# The pipeline does not expose the resampled labels, so repeat the
# preprocessing and resampling by hand purely to display the balanced counts.
fitted_preprocessing = pipeline_lgbm.named_steps['preprocessing']
smote_step = pipeline_lgbm.named_steps['smote']
X_pre = fitted_preprocessing.transform(X_train)
_, y_train_smote = smote_step.fit_resample(X_pre, y_train)
print("After SMOTE (example):", pd.Series(y_train_smote).value_counts())

Before SMOTE: No-show
0    12786
1     3214
Name: count, dtype: int64
After SMOTE: No-show
1    12786
0    12786
Name: count, dtype: int64


In [None]:
# Evaluate both fitted pipelines on the test set and persist the best pipeline
# Score both fitted pipelines on the held-out split, then persist the winner.
from sklearn.metrics import classification_report, accuracy_score, f1_score
import joblib, json

y_pred_lgbm = pipeline_lgbm.predict(X_test)
y_pred_xgb = pipeline_xgb.predict(X_test)


def _score(y_pred):
    """Return accuracy and F1 of `y_pred` measured against `y_test`."""
    return {'accuracy': accuracy_score(y_test, y_pred), 'f1': f1_score(y_test, y_pred)}


metrics = {
    'lgbm': _score(y_pred_lgbm),
    'xgb': _score(y_pred_xgb),
}

print('LGBM metrics:', metrics['lgbm'])
print('XGB metrics:', metrics['xgb'])

print('\nLGBM classification report:\n', classification_report(y_test, y_pred_lgbm))
print('\nXGB classification report:\n', classification_report(y_test, y_pred_xgb))

# Pick the pipeline with the higher F1; on a tie the first entry ('lgbm') wins.
# NOTE(review): the winner is chosen on the same test split it is scored on,
# so the reported figure for the selected model is optimistic — consider a
# separate validation split or cross-validation for model selection.
best_name = max(metrics.keys(), key=lambda name: metrics[name]['f1'])
if best_name == 'lgbm':
    best_pipeline = pipeline_lgbm
else:
    best_pipeline = pipeline_xgb

# Write the chosen pipeline and the metrics of both candidates to disk.
joblib.dump(best_pipeline, 'best_model.pkl')
with open('metrics.json', 'w') as handle:
    json.dump(metrics, handle, indent=2)
print('Saved best model as best_model.pkl (', best_name, ') and metrics.json')