# Project: Readmission of diabetic patients

In [5]:
# Import necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

#from imblearn.over_sampling import SMOTE
from collections import Counter

from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV,StratifiedKFold

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
#from xgboost import XGBClassifier

import sklearn.metrics
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score,f1_score,r2_score,mean_absolute_error

import warnings
# NOTE(review): 'ignore' with no category or module filter silences every warning
# raised for the rest of the session, so problems reported as warnings by the
# libraries used below will not be shown.
warnings.filterwarnings('ignore')

In [7]:
# Read the comma-separated encounter file into a DataFrame and report its size.
df = pd.read_csv('diabetic_data.csv', sep=',')
nRow = len(df.index)
nCol = len(df.columns)
print(f'There are {nRow} rows and {nCol} columns')

There are 101766 rows and 50 columns


In [8]:
# Preview the first five rows of the raw table.
df.head(n=5)

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [9]:
# Print a structural summary of the DataFrame: index range, column names,
# non-null counts and the dtype of each column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 50 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   encounter_id              101766 non-null  int64 
 1   patient_nbr               101766 non-null  int64 
 2   race                      101766 non-null  object
 3   gender                    101766 non-null  object
 4   age                       101766 non-null  object
 5   weight                    101766 non-null  object
 6   admission_type_id         101766 non-null  int64 
 7   discharge_disposition_id  101766 non-null  int64 
 8   admission_source_id       101766 non-null  int64 
 9   time_in_hospital          101766 non-null  int64 
 10  payer_code                101766 non-null  object
 11  medical_specialty         101766 non-null  object
 12  num_lab_procedures        101766 non-null  int64 
 13  num_procedures            101766 non-null  int64 
 14  num_

In [10]:
# Display the dtype pandas inferred for each column (nothing is changed here).
df.dtypes

encounter_id                 int64
patient_nbr                  int64
race                        object
gender                      object
age                         object
weight                      object
admission_type_id            int64
discharge_disposition_id     int64
admission_source_id          int64
time_in_hospital             int64
payer_code                  object
medical_specialty           object
num_lab_procedures           int64
num_procedures               int64
num_medications              int64
number_outpatient            int64
number_emergency             int64
number_inpatient             int64
diag_1                      object
diag_2                      object
diag_3                      object
number_diagnoses             int64
max_glu_serum               object
A1Cresult                   object
metformin                   object
repaglinide                 object
nateglinide                 object
chlorpropamide              object
glimepiride         

In [11]:
# Flag near-constant columns: print the value counts of every column in which
# all but at most MAX_MINORITY_ROWS rows share the same value.
# On the raw 101766-row table this is the same test as "top count > 101750",
# but it no longer hard-codes the row count, so it stays meaningful if the
# frame has a different length.
MAX_MINORITY_ROWS = 15
n_rows = len(df)
for col in df.columns:
    # Compute the counts once; they are sorted in descending order, so the
    # first entry is the count of the most frequent value.
    counts = df[col].value_counts()
    # An all-NaN column yields an empty result; skip it instead of raising.
    if not counts.empty and n_rows - counts.iloc[0] <= MAX_MINORITY_ROWS:
        print(counts)

No        101765
Steady         1
Name: acetohexamide, dtype: int64
No        101763
Steady         3
Name: troglitazone, dtype: int64
No    101766
Name: examide, dtype: int64
No    101766
Name: citoglipton, dtype: int64
No        101753
Steady        13
Name: glipizide-metformin, dtype: int64
No        101765
Steady         1
Name: glimepiride-pioglitazone, dtype: int64
No        101764
Steady         2
Name: metformin-rosiglitazone, dtype: int64
No        101765
Steady         1
Name: metformin-pioglitazone, dtype: int64


In [12]:
# Keep a single encounter per patient: only the first row seen for each
# patient_nbr survives, later rows for the same patient are removed.
rows_before = len(df)
unique_patients = len(np.unique(df['patient_nbr']))
print('Total data = ', rows_before)
print('Unique entries = ', unique_patients)
df.drop_duplicates(subset=['patient_nbr'], keep='first', inplace=True)
print('Length after removing Duplicates:', len(df))

Total data =  101766
Unique entries =  71518
Length after removing Duplicates: 71518


In [13]:
# Remove the two identifier columns together with six drug columns that the
# near-constant scan reported as (almost) entirely 'No'.
columns_to_drop = [
    'glimepiride-pioglitazone',
    'metformin-pioglitazone',
    'patient_nbr',
    'encounter_id',
    'acetohexamide',
    'troglitazone',
    'examide',
    'citoglipton',
]
df.drop(columns=columns_to_drop, inplace=True)

In [15]:
# Cells holding the literal string '?' are turned into NaN so that pandas'
# missing-value tools (isna, dropna, ...) recognise them.
df.replace(to_replace='?', value=np.nan, inplace=True)

In [16]:
# For every column containing at least one NaN, print the column name and the
# percentage of rows in which it is missing.
total_rows = len(df)
for column in df.columns:
    n_missing = df[column].isna().sum()
    if n_missing > 0:
        print(column, n_missing * 100 / total_rows)

race 2.7237898151514304
weight 96.0107944853044
payer_code 43.4058558684527
medical_specialty 48.20744427976174
diag_1 0.01538074330937666
diag_2 0.41108532117788527
diag_3 1.7128555049078553


In [17]:
# payer_code, medical_specialty and weight are each missing in more than 40%
# of the rows, so the columns themselves are discarded.
sparse_columns = ['payer_code', 'medical_specialty', 'weight']
df.drop(columns=sparse_columns, inplace=True)

In [18]:
# Discard, in place, every remaining row that has a NaN in any column.
df.dropna(axis=0, how='any', inplace=True)

In [19]:
# Sanity check: count the NaN entries left in 'race'. After the row-wise
# dropna this is expected to be 0.
df['race'].isnull().sum()

0

In [20]:
# Keep only the rows whose 'gender' is a real category, i.e. drop rows marked
# 'Unknown/Invalid'.
# .copy() makes the result an independent DataFrame: later cells assign new
# values to its columns, and assigning into a boolean-mask slice would
# otherwise trigger pandas' SettingWithCopyWarning.
df = df.loc[df['gender'] != 'Unknown/Invalid'].copy()

In [21]:
# Count the rows per gender in the cleaned table (all rows, not only readmissions).
# NOTE(review): the saved output below still lists one 'Unknown/Invalid' row even
# though the previous cell filters that value out - the output looks stale;
# re-run the notebook from a fresh kernel to confirm.
df['gender'].value_counts()

Female             36440
Male               31917
Unknown/Invalid        1
Name: gender, dtype: int64

In [22]:
# Distribution of the raw target labels: 'NO', '>30' and '<30'.
df['readmitted'].value_counts()

NO     40729
>30    21539
<30     6090
Name: readmitted, dtype: int64

In [24]:
# Binarise the target: 1 when the label is '<30', 0 for every other label.
# The guard makes the cell safe to re-run: once the column is numeric the
# comparison with '<30' would be False everywhere and silently overwrite all
# labels with 0.
if not pd.api.types.is_numeric_dtype(df['readmitted']):
    # Vectorised comparison; bracket assignment is used because attribute
    # assignment (df.readmitted = ...) is easy to get wrong with DataFrames.
    df['readmitted'] = (df['readmitted'] == '<30').astype('int64')

In [25]:
# Recode each ten-year age bracket label, '[0-10)' through '[90-100)', to the
# midpoint of the bracket (5, 15, ..., 95).
replaceDict = {
    f'[{lower}-{lower + 10})': lower + 5
    for lower in range(0, 100, 10)
}

# Look each label up through the dict's __getitem__ so that a label missing
# from replaceDict raises KeyError instead of being turned into NaN.
df['age'] = df['age'].map(replaceDict.__getitem__)

In [26]:
# Display the recoded 'age' column (bracket midpoints).
df['age']

1         15
2         25
3         35
4         45
5         55
          ..
101754    75
101755    45
101756    65
101758    85
101765    75
Name: age, Length: 68358, dtype: int64