In [20]:
import sys
import os

# Get the current working directory (cwd)
cwd = os.getcwd()

# Navigate to the parent directory
parent_dir = os.path.abspath(os.path.join(cwd, os.pardir))

# Add the parent directory to the Python path
sys.path.append(parent_dir)

# Now you can import the preprocessor module
# from ml_logic import preprocessor

In [21]:
import pandas as pd
pd.set_option('display.max_columns', None)
import pickle

from ml_logic.data import clean_training_data, load_data
from ml_logic.preprocessor import preprocess_features, preprocessor

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline

In [22]:
data = load_data()
data.head()

loaded dataset with shape  (25000, 17)


Unnamed: 0,age,time_in_hospital,n_lab_procedures,n_procedures,n_medications,n_outpatient,n_inpatient,n_emergency,medical_specialty,diag_1,diag_2,diag_3,glucose_test,A1Ctest,change,diabetes_med,readmitted
0,[70-80),8,72,1,18,2,0,0,Missing,Circulatory,Respiratory,Other,no,no,no,yes,no
1,[70-80),3,34,2,13,0,0,0,Other,Other,Other,Other,no,no,no,yes,no
2,[50-60),5,45,0,18,0,0,0,Missing,Circulatory,Circulatory,Circulatory,no,no,yes,yes,yes
3,[70-80),2,36,0,12,1,0,0,Missing,Circulatory,Other,Diabetes,no,no,yes,yes,yes
4,[60-70),1,42,0,7,0,0,0,InternalMedicine,Other,Circulatory,Respiratory,no,no,no,yes,no


In [23]:
train_data = clean_training_data(data)
train_data.head()

Dataset cleaned. New shape  (24779, 15)


Unnamed: 0,age,time_in_hospital,n_lab_procedures,n_procedures,n_medications,n_outpatient,n_inpatient,n_emergency,diag_1,diag_2,diag_3,A1Ctest,change,diabetes_med,readmitted
0,0.7,8,72,1,18,2,0,0,Circulatory,Respiratory,Other,no,no,yes,no
1,0.7,3,34,2,13,0,0,0,Other,Other,Other,no,no,yes,no
2,0.5,5,45,0,18,0,0,0,Circulatory,Circulatory,Circulatory,no,yes,yes,yes
3,0.7,2,36,0,12,1,0,0,Circulatory,Other,Diabetes,no,yes,yes,yes
4,0.6,1,42,0,7,0,0,0,Other,Circulatory,Respiratory,no,no,yes,no


In [24]:
X_clean = train_data.drop(columns=['readmitted'])
y = train_data['readmitted']

X = preprocess_features(X_clean)
X.head()


Preprocessing 24779 rows of 14 features...
Preprocessing done. Final shape: (24779, 34)


Unnamed: 0,age,time_in_hospital,n_lab_procedures,n_procedures,n_medications,n_outpatient,n_inpatient,n_emergency,diag_1_Circulatory,diag_1_Diabetes,diag_1_Digestive,diag_1_Injury,diag_1_Musculoskeletal,diag_1_Other,diag_1_Respiratory,diag_2_Circulatory,diag_2_Diabetes,diag_2_Digestive,diag_2_Injury,diag_2_Musculoskeletal,diag_2_Other,diag_2_Respiratory,diag_3_Circulatory,diag_3_Diabetes,diag_3_Digestive,diag_3_Injury,diag_3_Musculoskeletal,diag_3_Other,diag_3_Respiratory,A1Ctest_high,A1Ctest_no,A1Ctest_normal,change_yes,diabetes_med_yes
0,0.6,0.538462,0.633929,0.166667,0.217949,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
1,0.6,0.153846,0.294643,0.333333,0.153846,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
2,0.2,0.307692,0.392857,0.0,0.217949,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
3,0.6,0.076923,0.3125,0.0,0.141026,0.5,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
4,0.4,0.0,0.366071,0.0,0.076923,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0


In [25]:
model = LogisticRegression(max_iter=1000)
cv_results = cross_validate(model, X, y, cv=5)
accuracy= cv_results['test_score'].mean()
accuracy


0.6116465298257359

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size =0.3, random_state = 42)

In [27]:
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [28]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.613666935700834


In [29]:
confusion = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(confusion)

Confusion Matrix:
[[2978  902]
 [1970 1584]]


In [30]:
report = classification_report(y_test, y_pred)
print('Classification Report:')
print(report)

Classification Report:
              precision    recall  f1-score   support

          no       0.60      0.77      0.67      3880
         yes       0.64      0.45      0.52      3554

    accuracy                           0.61      7434
   macro avg       0.62      0.61      0.60      7434
weighted avg       0.62      0.61      0.60      7434



# Pipeline

In [31]:
pipe = Pipeline([
            ('preprocessor', preprocessor()),
            ('log_reg', model),
        ])
pipe

In [32]:
pipe_pred = pipe.predict(X_clean)
pipe_pred

array(['yes', 'no', 'no', ..., 'no', 'no', 'no'], dtype=object)

In [33]:
accuracy = accuracy_score(y, pipe_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.6114855321037975


# Pickle Pipeline

In [34]:
with open('../models/model_baseline_logreg.pkl', 'wb') as file:
    pickle.dump(pipe, file)

In [52]:
# testing...
with open('../models/model_baseline_logreg.pkl', 'rb') as file:
    pipe_test = pickle.load(file)

X_sample = X_clean.sample(1)
display(X_sample)

pipe_test.predict(X_sample)[0]

Unnamed: 0,age,time_in_hospital,n_lab_procedures,n_procedures,n_medications,n_outpatient,n_inpatient,n_emergency,diag_1,diag_2,diag_3,A1Ctest,change,diabetes_med
8962,0.6,2,27,5,21,0,0,0,Circulatory,Circulatory,Diabetes,no,no,yes


'no'