In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw data set from the current working directory.
# NOTE(review): the missing-value checks later in this notebook use isnull(),
# which only detects NaN. If this CSV marks missing entries with a sentinel
# string (e.g. '?'), those checks will report 0 -- confirm against the raw
# file and pass `na_values=` here if that is the case.
diabetes = pd.read_csv(filepath_or_buffer='diabetic_data.csv')

### Variables used in the analysis (distributions summarized below)

1. **change**
2. **diabetesMed**
3. **num_lab_procedures**
4. **num_procedures**
5. **num_medications**
6. **number_outpatient**
7. **number_emergency**
8. **number_inpatient**
9. **number_diagnoses**
10. age
11. time_in_hospital
12. **admission_type_id**
13. **discharge_disposition_id**
14. **admission_source_id**
15. **diag_1**
16. **diag_2**

### change (categorical)


|             | `change`     |
| ----------- | ----------- |
| Levels      | 2 (No, Ch) |
| Missing values   | 0        |
| Number of unique values   | 2        |
| Frequency at all levels   | {No : 54755, Ch : 47011}   |

In [3]:
# Frequency of each level of `change`, most frequent first.
diabetes["change"].value_counts()

No    54755
Ch    47011
Name: change, dtype: int64

### diabetesMed (categorical)

|             | `diabetesMed`     |
| ----------- | ----------- |
| Levels      | 2 (No, Yes) |
| Missing values   | 0        |
| Number of unique values   | 2        |
| Frequency at all levels   | {No : 23403, Yes: 78363}   |

In [4]:
# Frequency of each level of `diabetesMed`, most frequent first.
diabetes["diabetesMed"].value_counts()

Yes    78363
No     23403
Name: diabetesMed, dtype: int64

In [5]:
# Number of NaN entries in `diabetesMed` (isna is the alias of isnull).
diabetes["diabetesMed"].isna().sum()

0

### diag_1

|             | `diag_1`     |
| ----------- | ----------- |
| Levels      | 717 |
| Missing values   | 0        |
| Number of unique values   | 717        |
| Frequency at all levels   | Varies by level. There are too many levels to list individually, so binning is necessary to make this variable interpretable   |

In [6]:
# Total number of non-null `diag_1` entries: value_counts() drops NaN by
# default, so summing the per-level counts gives the non-null row count.
# This is a row count, not the number of distinct levels.
diabetes["diag_1"].value_counts().sum()

101766

In [7]:
# Distinct values observed in `diag_1`; the cell displays how many there are.
vals = diabetes["diag_1"].unique()
len(vals)

717

In [8]:
# Number of NaN entries in `diag_1` (isna is the alias of isnull).
diabetes["diag_1"].isna().sum()

0

### diag_2

|             | `diag_2`     |
| ----------- | ----------- |
| Levels      | 749 |
| Missing values   | 0        |
| Number of unique values   | 749       |
| Frequency at all levels   | Varies by level. There are too many levels to list individually, so binning is necessary to make this variable interpretable   |

In [14]:
# Frequency of each `diag_2` code, most frequent first.
diabetes["diag_2"].value_counts()

276     6752
428     6662
250     6071
427     5036
401     3736
        ... 
E918       1
46         1
V13        1
E850       1
927        1
Name: diag_2, Length: 749, dtype: int64

In [16]:
# Distinct values observed in `diag_2`; the cell displays how many there are.
# NOTE(review): this computes the same count as the `vals` cell that follows;
# one of the two cells is likely redundant.
unique = diabetes["diag_2"].unique()
len(unique)

749

In [10]:
# Distinct values observed in `diag_2`; the cell displays how many there are.
vals = diabetes["diag_2"].unique()
len(vals)

749

In [11]:
# Number of NaN entries in `diag_2` (isna is the alias of isnull).
diabetes["diag_2"].isna().sum()

0

### admission_source_id

|             | `admission_source_id`     |
| ----------- | ----------- |
| Levels      | 17 |
| Missing values   | 0        |
| Number of unique values   | 17       |
| Frequency at all levels   | {7 : 57494, 1 : 29565, 17 : 6781, 4 : 3187} *Note: only the most frequent levels are shown*  |

In [20]:
# Frequency of each `admission_source_id` level, most frequent first.
diabetes["admission_source_id"].value_counts()

7     57494
1     29565
17     6781
4      3187
6      2264
2      1104
5       855
3       187
20      161
9       125
8        16
22       12
10        8
14        2
11        2
25        2
13        1
Name: admission_source_id, dtype: int64

In [18]:
# Distinct values observed in `admission_source_id`; the cell displays how many there are.
vals = diabetes["admission_source_id"].unique()
len(vals)

17

### discharge_disposition_id

|             | `discharge_disposition_id`     |
| ----------- | ----------- |
| Levels      | 26 |
| Missing values   | 0        |
| Number of unique values   | 26       |
| Frequency at all levels   | {1 : 60234, 3 : 13954, 6 : 12902, 18 : 3691, 2 : 2128} *Note: only the most frequent levels are shown*  |

In [21]:
# Frequency of each `discharge_disposition_id` level, most frequent first.
diabetes["discharge_disposition_id"].value_counts()

1     60234
3     13954
6     12902
18     3691
2      2128
22     1993
11     1642
5      1184
25      989
4       815
7       623
23      412
13      399
14      372
28      139
8       108
15       63
24       48
9        21
17       14
16       11
19        8
10        6
27        5
12        3
20        2
Name: discharge_disposition_id, dtype: int64

In [22]:
# Distinct values observed in `discharge_disposition_id`; the cell displays how many there are.
vals = diabetes["discharge_disposition_id"].unique()
len(vals)

26

### admission_type_id

|             | `admission_type_id`     |
| ----------- | ----------- |
| Levels      | 8 |
| Missing values   | 0        |
| Number of unique values   | 8       |
| Frequency at all levels   | {1 : 53990, 3 : 18869, 2 : 18480, 6 : 5291} *Note: only the most frequent levels are shown*  |

In [23]:
# Frequency of each `admission_type_id` level, most frequent first.
diabetes["admission_type_id"].value_counts()

1    53990
3    18869
2    18480
6     5291
5     4785
8      320
7       21
4       10
Name: admission_type_id, dtype: int64

In [24]:
# Distinct values observed in `admission_type_id`; the cell displays how many there are.
vals = diabetes["admission_type_id"].unique()
len(vals)

8

### num_lab_procedures

|             | `num_lab_procedures`     |
| ----------- | ----------- |
| count      | 101766.000000 |
| mean   | 43.095641        |
| std   | 19.674362       |
| min   | 1.000000       |
| 25%   | 31.000000       |
| 50%   | 44.000000       |
| 75%   | 57.000000       |
| max   | 132.000000       |


In [25]:
# Summary statistics (count, mean, std, min, quartiles, max) for `num_lab_procedures`.
diabetes["num_lab_procedures"].describe()

count    101766.000000
mean         43.095641
std          19.674362
min           1.000000
25%          31.000000
50%          44.000000
75%          57.000000
max         132.000000
Name: num_lab_procedures, dtype: float64

### num_procedures

|             | `num_procedures`     |
| ----------- | ----------- |
| count      | 101766.000000 |
| mean   | 1.339730        |
| std   | 1.705807       |
| min   | 0.000000      |
| 25%   | 0.000000       |
| 50%   | 1.000000       |
| 75%   | 2.000000       |
| max   | 6.000000       |

In [26]:
# Summary statistics (count, mean, std, min, quartiles, max) for `num_procedures`.
diabetes["num_procedures"].describe()

count    101766.000000
mean          1.339730
std           1.705807
min           0.000000
25%           0.000000
50%           1.000000
75%           2.000000
max           6.000000
Name: num_procedures, dtype: float64

### num_medications

|             | `num_medications`     |
| ----------- | ----------- |
| count      | 101766.000000 |
| mean   | 16.021844        |
| std   | 8.127566       |
| min   | 1.000000      |
| 25%   | 10.000000       |
| 50%   | 15.000000       |
| 75%   | 20.000000       |
| max   | 81.000000       |

In [28]:
# Summary statistics (count, mean, std, min, quartiles, max) for `num_medications`.
diabetes["num_medications"].describe()

count    101766.000000
mean         16.021844
std           8.127566
min           1.000000
25%          10.000000
50%          15.000000
75%          20.000000
max          81.000000
Name: num_medications, dtype: float64

### number_outpatient

|             | `number_outpatient`     |
| ----------- | ----------- |
| count      | 101766.000000 |
| mean   | 0.369357        |
| std   | 1.267265       |
| min   | 0.000000      |
| 25%   | 0.000000       |
| 50%   | 0.000000      |
| 75%   | 0.000000       |
| max   | 42.000000       |

In [30]:
# Summary statistics (count, mean, std, min, quartiles, max) for `number_outpatient`.
diabetes["number_outpatient"].describe()

count    101766.000000
mean          0.369357
std           1.267265
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max          42.000000
Name: number_outpatient, dtype: float64

### number_emergency

|             | `number_emergency`     |
| ----------- | ----------- |
| count      | 101766.000000 |
| mean   | 0.197836        |
| std   | 0.930472       |
| min   | 0.000000      |
| 25%   | 0.000000       |
| 50%   | 0.000000      |
| 75%   | 0.000000       |
| max   | 76.000000      |

In [31]:
# Summary statistics (count, mean, std, min, quartiles, max) for `number_emergency`.
diabetes["number_emergency"].describe()

count    101766.000000
mean          0.197836
std           0.930472
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max          76.000000
Name: number_emergency, dtype: float64

### number_inpatient

|             | `number_inpatient`     |
| ----------- | ----------- |
| count      | 101766.000000 |
| mean   | 0.635566       |
| std   | 1.262863       |
| min   | 0.000000      |
| 25%   | 0.000000       |
| 50%   | 0.000000      |
| 75%   | 1.000000       |
| max   | 21.000000      |

In [32]:
# Summary statistics (count, mean, std, min, quartiles, max) for `number_inpatient`.
diabetes["number_inpatient"].describe()

count    101766.000000
mean          0.635566
std           1.262863
min           0.000000
25%           0.000000
50%           0.000000
75%           1.000000
max          21.000000
Name: number_inpatient, dtype: float64

### number_diagnoses

|             | `number_diagnoses`     |
| ----------- | ----------- |
| count      | 101766.000000 |
| mean   | 7.422607       |
| std   | 1.933600       |
| min   | 1.000000      |
| 25%   | 6.000000       |
| 50%   | 8.000000      |
| 75%   | 9.000000       |
| max   | 16.000000      |

In [36]:
# Summary statistics (count, mean, std, min, quartiles, max) for `number_diagnoses`.
diabetes["number_diagnoses"].describe()

count    101766.000000
mean          7.422607
std           1.933600
min           1.000000
25%           6.000000
50%           8.000000
75%           9.000000
max          16.000000
Name: number_diagnoses, dtype: float64

### age

|             | `age`     |
| ----------- | ----------- |
| count      | 101766 |
| unique   | 10       |
| top   | [70-80)       |
| freq   | 26068      |


In [37]:
# `age` is stored as interval labels (object dtype), so describe() reports
# count / unique / top / freq rather than numeric statistics.
diabetes["age"].describe()

count      101766
unique         10
top       [70-80)
freq        26068
Name: age, dtype: object

### time_in_hospital

|             | `time_in_hospital`     |
| ----------- | ----------- |
| count      | 101766.000000 |
| mean   | 4.395987       |
| std   | 2.985108       |
| min   | 1.000000      |
| 25%   | 2.000000       |
| 50%   | 4.000000      |
| 75%   | 6.000000       |
| max   | 14.000000      |

In [39]:
# Summary statistics (count, mean, std, min, quartiles, max) for `time_in_hospital`.
diabetes["time_in_hospital"].describe()

count    101766.000000
mean          4.395987
std           2.985108
min           1.000000
25%           2.000000
50%           4.000000
75%           6.000000
max          14.000000
Name: time_in_hospital, dtype: float64