 # Patient Readmission Dataset with Logistic Regression

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline

## DATA

In [2]:
df = pd.read_csv('Readmission.csv')

In [3]:
df.head()

Unnamed: 0,age,gender,primary_diagnosis,num_procedures,days_in_hospital,comorbidity_score,discharge_to,Patient_ID,readmitted
0,52,Male,Heart Disease,3,9,3,Home,1,1
1,47,Female,Diabetes,2,4,0,Skilled Nursing Facility,2,0
2,72,Female,Heart Disease,7,12,4,Home,3,1
3,18,Female,COPD,5,14,3,Home,4,1
4,32,Male,Heart Disease,9,2,4,Rehabilitation Facility,5,1


In [4]:
df.describe()

Unnamed: 0,age,num_procedures,days_in_hospital,comorbidity_score,Patient_ID,readmitted
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,54.318,4.563,7.4195,2.053,1000.5,0.486
std,20.872631,2.846767,3.99944,1.40932,577.494589,0.499929
min,18.0,0.0,1.0,0.0,1.0,0.0
25%,36.0,2.0,4.0,1.0,500.75,0.0
50%,55.0,5.0,7.0,2.0,1000.5,0.0
75%,72.0,7.0,11.0,3.0,1500.25,1.0
max,89.0,9.0,14.0,4.0,2000.0,1.0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   age                2000 non-null   int64 
 1   gender             2000 non-null   object
 2   primary_diagnosis  2000 non-null   object
 3   num_procedures     2000 non-null   int64 
 4   days_in_hospital   2000 non-null   int64 
 5   comorbidity_score  2000 non-null   int64 
 6   discharge_to       2000 non-null   object
 7   Patient_ID         2000 non-null   int64 
 8   readmitted         2000 non-null   int64 
dtypes: int64(6), object(3)
memory usage: 140.8+ KB


In [6]:
df['readmitted'].value_counts()

readmitted
0    1028
1     972
Name: count, dtype: int64

### DATA CLEANING AND PREPARATION

In [7]:
df_2 = pd.get_dummies(df, 
                      columns = ['gender','primary_diagnosis', 'discharge_to'],
                      drop_first = True)

In [8]:
final_data = df_2.drop('Patient_ID', axis=1)

In [9]:
final_data.head()

Unnamed: 0,age,num_procedures,days_in_hospital,comorbidity_score,readmitted,gender_Male,primary_diagnosis_Diabetes,primary_diagnosis_Heart Disease,primary_diagnosis_Hypertension,primary_diagnosis_Kidney Disease,discharge_to_Home Health Care,discharge_to_Rehabilitation Facility,discharge_to_Skilled Nursing Facility
0,52,3,9,3,1,True,False,True,False,False,False,False,False
1,47,2,4,0,0,False,True,False,False,False,False,False,True
2,72,7,12,4,1,False,False,True,False,False,False,False,False
3,18,5,14,3,1,False,False,False,False,False,False,False,False
4,32,9,2,4,1,True,False,True,False,False,False,True,False


## TRAINING AND SPLITTING THE DATA

In [10]:
X = final_data.drop('readmitted', axis=1)
y = final_data['readmitted']

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.55, random_state=101)

## Logistic Regression 

In [13]:
from sklearn.linear_model import LogisticRegression

In [30]:
logmodel = LogisticRegression(class_weight={0:1.6, 1:2})

In [31]:
logmodel.fit(X_train,y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [32]:
predictions = logmodel.predict(X_test)

In [28]:
from sklearn.metrics import classification_report,confusion_matrix

In [33]:
print(classification_report(y_test,predictions))
print('\n')
print(confusion_matrix(y_test,predictions))

              precision    recall  f1-score   support

           0       0.49      0.27      0.34       556
           1       0.49      0.71      0.58       544

    accuracy                           0.49      1100
   macro avg       0.49      0.49      0.46      1100
weighted avg       0.49      0.49      0.46      1100



[[148 408]
 [156 388]]


### TESTING WITH A NEW DATASET

In [34]:
new_df = pd.read_csv('train_df.csv')

In [35]:
new_df.head()

Unnamed: 0,age,gender,primary_diagnosis,num_procedures,days_in_hospital,comorbidity_score,discharge_to,readmitted
0,69,Male,Heart Disease,1,2,1,Home Health Care,0
1,32,Female,COPD,2,13,2,Rehabilitation Facility,0
2,89,Male,Diabetes,1,7,1,Home,0
3,78,Male,COPD,9,2,2,Skilled Nursing Facility,0
4,38,Male,Diabetes,6,4,4,Rehabilitation Facility,0


In [36]:
X_new = pd.get_dummies(new_df.drop('readmitted',axis =1),
                       columns = ['gender','primary_diagnosis','discharge_to'],
                       drop_first=True)
y_new = new_df['readmitted']

In [37]:
new_pred = logmodel.predict(X_new)

In [38]:
print(classification_report(y_new,new_pred))
print('\n')
print(confusion_matrix(y_new,new_pred))

              precision    recall  f1-score   support

           0       0.80      0.29      0.43      4060
           1       0.18      0.69      0.29       940

    accuracy                           0.37      5000
   macro avg       0.49      0.49      0.36      5000
weighted avg       0.69      0.37      0.40      5000



[[1197 2863]
 [ 295  645]]
