# Case Study: Diabetes Data Analysis

In [1]:
import numpy as np
import pandas as pd
df = pd.read_csv("diabetes.csv", sep=',')
df.head(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,2,138,62,35,0,33.6,0.127,47,1
1,0,84,82,31,125,38.2,0.233,23,0
2,0,145,0,0,0,44.2,0.63,31,1
3,0,135,68,42,250,42.3,0.365,24,1
4,1,139,62,41,480,40.7,0.536,21,0
5,0,173,78,32,265,46.5,1.159,58,0
6,4,99,72,17,0,25.6,0.294,28,0
7,8,194,80,0,0,26.1,0.551,67,0
8,2,83,65,28,66,36.8,0.629,24,0
9,2,89,90,30,0,33.5,0.292,42,0


<b>Data structures Series</b>

In [2]:
row_0 = df.iloc[0]
type(row_0)

pandas.core.series.Series

In [3]:
print(row_0)

Pregnancies                   2.000
Glucose                     138.000
BloodPressure                62.000
SkinThickness                35.000
Insulin                       0.000
BMI                          33.600
DiabetesPedigreeFunction      0.127
Age                          47.000
Outcome                       1.000
Name: 0, dtype: float64


In [4]:
row_0.index

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [5]:
row_0['Pregnancies']

2.0

In [6]:
'Age' in row_0

True

<b>DataFrames</b>

In [9]:
df.head

<bound method NDFrame.head of       Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0               2      138             62             35        0  33.6   
1               0       84             82             31      125  38.2   
2               0      145              0              0        0  44.2   
3               0      135             68             42      250  42.3   
4               1      139             62             41      480  40.7   
5               0      173             78             32      265  46.5   
6               4       99             72             17        0  25.6   
7               8      194             80              0        0  26.1   
8               2       83             65             28       66  36.8   
9               2       89             90             30        0  33.5   
10              4       99             68             38        0  32.8   
11              4      125             70             18      122  28.

In [8]:
df.index

RangeIndex(start=0, stop=2000, step=1)

In [10]:
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [11]:
#Extract 0,3, 700
df.iloc[[0,3,700]]

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,2,138,62,35,0,33.6,0.127,47,1
3,0,135,68,42,250,42.3,0.365,24,1
700,2,122,76,27,200,35.9,0.483,26,0


# Descriptive Statistics

In [14]:
df['Pregnancies'].describe()

count    2000.000000
mean        3.703500
std         3.306063
min         0.000000
25%         1.000000
50%         3.000000
75%         6.000000
max        17.000000
Name: Pregnancies, dtype: float64

In [15]:
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,3.7035,121.1825,69.1455,20.935,80.254,32.193,0.47093,33.0905,0.342
std,3.306063,32.068636,19.188315,16.103243,111.180534,8.149901,0.323553,11.786423,0.474498
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,63.5,0.0,0.0,27.375,0.244,24.0,0.0
50%,3.0,117.0,72.0,23.0,40.0,32.3,0.376,29.0,0.0
75%,6.0,141.0,80.0,32.0,130.0,36.8,0.624,40.0,1.0
max,17.0,199.0,122.0,110.0,744.0,80.6,2.42,81.0,1.0


In [16]:
df['Age'].mean()

33.0905

In [18]:
df['Age'].median()

29.0

In [19]:
df['Age'].min()

21

In [20]:
df['Age'].max()

81

In [21]:
filter_1 = df['Age'] > 40
filter_1

0        True
1       False
2       False
3       False
4       False
5        True
6       False
7        True
8       False
9        True
10      False
11       True
12      False
13       True
14      False
15      False
16       True
17      False
18      False
19      False
20       True
21       True
22      False
23      False
24      False
25      False
26      False
27       True
28       True
29      False
        ...  
1970    False
1971    False
1972    False
1973     True
1974    False
1975     True
1976    False
1977    False
1978    False
1979    False
1980     True
1981    False
1982     True
1983    False
1984    False
1985    False
1986     True
1987    False
1988    False
1989    False
1990    False
1991    False
1992    False
1993    False
1994    False
1995    False
1996    False
1997     True
1998    False
1999    False
Name: Age, Length: 2000, dtype: bool

In [22]:
filter_1.any()

True

## Data Cleaning: Handling Missing Data

In [27]:
df.shape

(2000, 9)

In [28]:
#making row 100 have a null value
df.loc[100,'Age'] = np.nan

In [29]:
df.isnull().any()

Pregnancies                 False
Glucose                     False
BloodPressure               False
SkinThickness               False
Insulin                     False
BMI                         False
DiabetesPedigreeFunction    False
Age                          True
Outcome                     False
dtype: bool

In [30]:
#then we drop it
df = df.dropna()

In [32]:
df.isnull().any()

Pregnancies                 False
Glucose                     False
BloodPressure               False
SkinThickness               False
Insulin                     False
BMI                         False
DiabetesPedigreeFunction    False
Age                         False
Outcome                     False
dtype: bool

In [34]:
#Calculating the sum of all null records
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [35]:
df.shape

(1999, 9)