<a href="https://colab.research.google.com/github/ArtisticWenny/Data_Cleaning/blob/main/550_Cleaning_Diabetic_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Packages

In [25]:
import pandas as pd
import numpy as np
import plotly
import plotly.express as px
import plotly.graph_objs as go
import plotly.offline as py
from plotly.offline import iplot
from plotly.subplots import make_subplots
import plotly.figure_factory as ff

# Load in CSV

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

csv_file_path = '/content/gdrive/My Drive/diabetic_data.csv'

diabetic = pd.read_csv(csv_file_path)

print(diabetic.head)

Mounted at /content/gdrive
<bound method NDFrame.head of         encounter_id  patient_nbr             race  gender      age weight  \
0            2278392      8222157        Caucasian  Female   [0-10)      ?   
1             149190     55629189        Caucasian  Female  [10-20)      ?   
2              64410     86047875  AfricanAmerican  Female  [20-30)      ?   
3             500364     82442376        Caucasian    Male  [30-40)      ?   
4              16680     42519267        Caucasian    Male  [40-50)      ?   
...              ...          ...              ...     ...      ...    ...   
101761     443847548    100162476  AfricanAmerican    Male  [70-80)      ?   
101762     443847782     74694222  AfricanAmerican  Female  [80-90)      ?   
101763     443854148     41088789        Caucasian    Male  [70-80)      ?   
101764     443857166     31693671        Caucasian  Female  [80-90)      ?   
101765     443867222    175429310        Caucasian    Male  [70-80)      ?   

      

# About This Dataset

According to UC Irvine, thie dataset is a representation of clinical care for 130 hospitals in the United States over the span of 10 years (1999 to 2008). The purpose of the data is to detect the possibility of patient readmission within 30 days of discharge.

# Inspect the shape

In [None]:
print("Number of rows:", diabetic.shape[0])
print("Number of columns:", diabetic.shape[1])

Number of rows: 101766
Number of columns: 50


# Visualization of readmissions


In [None]:
y = diabetic['readmitted']
print(f'Percentage of patients readmitted after 30 days: % {round(y.value_counts(normalize=True)[1]*100,2)} --> ({y.value_counts()[1]} patient)\nPercentage of patients not readmitted: % {round(y.value_counts(normalize=True)[0]*100,2)} --> ({y.value_counts()[0]} patient)\nPercentage of patients readmitted under 30 days: % {round(y.value_counts(normalize=True)[2]*100,2)} --> ({y.value_counts()[2]} patient)')

Percentage of patients readmitted after 30 days: % 34.93 --> (35545 patient)
Percentage of patients not readmitted: % 53.91 --> (54864 patient)
Percentage of patients readmitted under 30 days: % 11.16 --> (11357 patient)


In [None]:
fig = px.histogram(diabetic, x="readmitted", title='readmitted', width=400, height=400)
fig.show()

It is inferred from this visualization that,
*   34.93% of patients are re-admitted after 30 days, 35,545 patients.
*   53.91% of patients are not admitted, 54,864 patients.
*   11.16% of patients are re-admitted before 30 days, 11,357 patients.

In [None]:
diabetic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 50 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   encounter_id              101766 non-null  int64 
 1   patient_nbr               101766 non-null  int64 
 2   race                      101766 non-null  object
 3   gender                    101766 non-null  object
 4   age                       101766 non-null  object
 5   weight                    101766 non-null  object
 6   admission_type_id         101766 non-null  int64 
 7   discharge_disposition_id  101766 non-null  int64 
 8   admission_source_id       101766 non-null  int64 
 9   time_in_hospital          101766 non-null  int64 
 10  payer_code                101766 non-null  object
 11  medical_specialty         101766 non-null  object
 12  num_lab_procedures        101766 non-null  int64 
 13  num_procedures            101766 non-null  int64 
 14  num_

# Replacing '?' with '0'

In [None]:
modified_weight = diabetic.replace('?', '0')
print(modified_weight)

        encounter_id  patient_nbr             race  gender      age weight  \
0            2278392      8222157        Caucasian  Female   [0-10)      0   
1             149190     55629189        Caucasian  Female  [10-20)      0   
2              64410     86047875  AfricanAmerican  Female  [20-30)      0   
3             500364     82442376        Caucasian    Male  [30-40)      0   
4              16680     42519267        Caucasian    Male  [40-50)      0   
...              ...          ...              ...     ...      ...    ...   
101761     443847548    100162476  AfricanAmerican    Male  [70-80)      0   
101762     443847782     74694222  AfricanAmerican  Female  [80-90)      0   
101763     443854148     41088789        Caucasian    Male  [70-80)      0   
101764     443857166     31693671        Caucasian  Female  [80-90)      0   
101765     443867222    175429310        Caucasian    Male  [70-80)      0   

        admission_type_id  discharge_disposition_id  admission_

# Checking for duplicates

In [None]:
diabetic.duplicated().sum()


0

# Drop Unnecessary Columns

In [None]:
diabetic.drop(['patient_nbr', 'encounter_id', 'diag_1', 'diag_2', 'diag_3', 'payer_code', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id' ], axis = 1, inplace = True)
diabetic.head()

Unnamed: 0,race,gender,age,time_in_hospital,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,Caucasian,Female,[0-10),1,Pediatrics-Endocrinology,41,0,1,0,0,...,No,No,No,No,No,No,No,No,No,NO
1,Caucasian,Female,[10-20),3,?,59,0,18,0,0,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,AfricanAmerican,Female,[20-30),2,?,11,5,13,2,0,...,No,No,No,No,No,No,No,No,Yes,NO
3,Caucasian,Male,[30-40),2,?,44,1,16,0,0,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,Caucasian,Male,[40-50),1,?,51,0,8,0,0,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


# Finding any missing values

In [None]:
missing_count = diabetic.isnull().sum()
print("Total missing values in each column:")
print(missing_count)

Total missing values in each column:
race                        0
gender                      0
age                         0
admission_type_id           0
discharge_disposition_id    0
admission_source_id         0
time_in_hospital            0
medical_specialty           0
num_lab_procedures          0
num_procedures              0
num_medications             0
number_outpatient           0
number_emergency            0
number_inpatient            0
number_diagnoses            0
max_glu_serum               0
A1Cresult                   0
metformin                   0
repaglinide                 0
nateglinide                 0
chlorpropamide              0
glimepiride                 0
acetohexamide               0
glipizide                   0
glyburide                   0
tolbutamide                 0
pioglitazone                0
rosiglitazone               0
acarbose                    0
miglitol                    0
troglitazone                0
tolazamide                  0
exa

In [None]:
missing_values = diabetic.isnull().any()
if missing_values.any():
    print("Missing values detected in columns:", missing_values[missing_values].index.tolist())
else:
    print("No missing values.")

No missing values.
