## Loading CSV Data File & Cleaning

In [40]:
# Dependencies And Setup
import pandas as pd
import numpy as np
from pathlib import Path

# File Path To Target CSV File
# (relative path, so it is resolved against the directory the notebook is run from)
diabetes_data_to_load = Path("Resources/diabetes_prediction_dataset.csv")

# Reading Of Diabetes CSV Data File, Storage In Pandas DataFrame
# `diabetes_data` is the frame the following cells rename, inspect and clean.
diabetes_data = pd.read_csv(diabetes_data_to_load)

# Display DataFrame
# Preview only the first 10 rows as the cell's output.
diabetes_data.head(10)

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0
5,Female,20.0,0,0,never,27.32,6.6,85,0
6,Female,44.0,0,0,never,19.31,6.5,200,1
7,Female,79.0,0,0,No Info,23.86,5.7,85,0
8,Male,42.0,0,0,never,33.64,4.8,145,0
9,Female,32.0,0,0,never,27.32,5.0,100,0


In [41]:
# Renaming Columns
# Map each raw CSV header to the human-readable label used in the rest of the
# notebook. Keeping the mapping in one dict makes it easy to review.
column_labels = {
    "gender": "Gender",
    "age": "Age",
    "hypertension": "Hypertension Status",
    "heart_disease": "Heart Disease Status",
    "smoking_history": "Smoking History",
    "bmi": "Body Mass Index",
    "HbA1c_level": "HbA1c Level",
    "blood_glucose_level": "Blood Glucose Level",
    "diabetes": "Diabetes Status",
}

# Reassign instead of using inplace=True: rename() returns a new frame, which
# keeps the step chainable and avoids mutating an object in place across cells.
# Keys that are not present are ignored by rename(), so re-running this cell
# after the columns were already renamed is a no-op.
diabetes_data = diabetes_data.rename(columns=column_labels)

# Preview the first 10 rows to confirm the new headers.
diabetes_data.head(10)

Unnamed: 0,Gender,Age,Hypertension Status,Heart Disease Status,Smoking History,Body Mass Index,HbA1c Level,Blood Glucose Level,Diabetes Status
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0
5,Female,20.0,0,0,never,27.32,6.6,85,0
6,Female,44.0,0,0,never,19.31,6.5,200,1
7,Female,79.0,0,0,No Info,23.86,5.7,85,0
8,Male,42.0,0,0,never,33.64,4.8,145,0
9,Female,32.0,0,0,never,27.32,5.0,100,0


In [42]:
# Data Types For Columns
# Shows the dtype pandas assigned to each column (displayed as the cell's output).
diabetes_data.dtypes

Gender                   object
Age                     float64
Hypertension Status       int64
Heart Disease Status      int64
Smoking History          object
Body Mass Index         float64
HbA1c Level             float64
Blood Glucose Level       int64
Diabetes Status           int64
dtype: object

In [43]:
# Counts For Each Column
# count() reports the number of non-missing values per column, so a column whose
# count is lower than the others contains missing entries.
diabetes_data.count()

Gender                  100000
Age                     100000
Hypertension Status     100000
Heart Disease Status    100000
Smoking History         100000
Body Mass Index         100000
HbA1c Level             100000
Blood Glucose Level     100000
Diabetes Status         100000
dtype: int64

In [44]:
# Inspect the raw values of the numeric columns before cleaning.

# Check For Data Within Age Column
# unique() on this column prints as a NumPy array, so pandas' display.max_rows
# setting does not affect it.
print(diabetes_data["Age"].unique())
print("\n")

# Check For Data Within Body Mass Index, HbA1c Level And Blood Glucose Level Columns
# The row limit is lifted only inside this `with` block so the complete
# value_counts() listings are printed here. Using option_context instead of
# pd.set_option restores the previous display.max_rows afterwards, so later
# cells are not silently switched to unlimited output.
with pd.option_context("display.max_rows", None):
    for column in ("Body Mass Index", "HbA1c Level", "Blood Glucose Level"):
        print(diabetes_data[column].value_counts())
        print("\n")

[80.   54.   28.   36.   76.   20.   44.   79.   42.   32.   53.   78.
 67.   15.   37.   40.    5.   69.   72.    4.   30.   45.   43.   50.
 41.   26.   34.   73.   77.   66.   29.   60.   38.    3.   57.   74.
 19.   46.   21.   59.   27.   13.   56.    2.    7.   11.    6.   55.
  9.   62.   47.   12.   68.   75.   22.   58.   18.   24.   17.   25.
  0.08 33.   16.   61.   31.    8.   49.   39.   65.   14.   70.    0.56
 48.   51.   71.    0.88 64.   63.   52.    0.16 10.   35.   23.    0.64
  1.16  1.64  0.72  1.88  1.32  0.8   1.24  1.    1.8   0.48  1.56  1.08
  0.24  1.4   0.4   0.32  1.72  1.48]


Body Mass Index
27.32    25495
23.00      103
27.12      101
27.80      100
24.96      100
22.40       99
25.00       99
25.60       98
26.70       94
24.50       94
21.30       93
28.45       93
22.32       93
29.29       93
22.05       92
27.22       90
25.20       90
24.00       89
24.05       89
26.00       89
25.94       89
28.00       88
29.60       87
24.36       86
25.84     

In [45]:
# Disabled scratch export of the Body Mass Index frequency table (left commented out).
# NOTE(review): if this is re-enabled, `index=False` would omit the BMI values
# (they form the index of the value_counts() result) and write only the counts --
# confirm that is intended before using the exported file.
# #Check For Data Within Body Mass Index Column
# counts = diabetes_data["Body Mass Index"].value_counts()
# counts.to_csv('name.csv',index=False)

In [46]:
# Check For Data Within The Categorical / Flag Columns
# Print the frequency table of each column in turn, followed by the same
# blank-line separator after every table.
categorical_columns = [
    "Gender",
    "Hypertension Status",
    "Heart Disease Status",
    "Smoking History",
    "Diabetes Status",
]

for column_name in categorical_columns:
    print(diabetes_data[column_name].value_counts())
    print("\n")

Gender
Female    58552
Male      41430
Other        18
Name: count, dtype: int64


Hypertension Status
0    92515
1     7485
Name: count, dtype: int64


Heart Disease Status
0    96058
1     3942
Name: count, dtype: int64


Smoking History
No Info        35816
never          35095
former          9352
current         9286
not current     6447
ever            4004
Name: count, dtype: int64


Diabetes Status
0    91500
1     8500
Name: count, dtype: int64




In [47]:
# Clean Smoking Column For Repeats
# Collapse the six raw labels into four capitalised categories:
# "N/A", "Never", "Former" and "Current".
smoking_history_labels = {
    "No Info": "N/A",
    "never": "Never",
    "former": "Former",
    "current": "Current",
    # "not current" is merged into the same bucket as "former".
    "not current": "Former",
    # NOTE(review): "ever" is folded into "Never" -- confirm this is the intended
    # reading of the raw label rather than "has smoked at some point".
    "ever": "Never",
}

# NOTE(review): pandas.read_csv treats the string "N/A" as a missing value by
# default, so if this frame is written to CSV and read back with default
# settings these entries come back as NaN -- verify downstream handling.
diabetes_data["Smoking History"] = diabetes_data["Smoking History"].replace(
    smoking_history_labels
)

# Preview the first 10 rows to check the recoded column.
diabetes_data.head(10)

Unnamed: 0,Gender,Age,Hypertension Status,Heart Disease Status,Smoking History,Body Mass Index,HbA1c Level,Blood Glucose Level,Diabetes Status
0,Female,80.0,0,1,Never,25.19,6.6,140,0
1,Female,54.0,0,0,,27.32,6.6,80,0
2,Male,28.0,0,0,Never,27.32,5.7,158,0
3,Female,36.0,0,0,Current,23.45,5.0,155,0
4,Male,76.0,1,1,Current,20.14,4.8,155,0
5,Female,20.0,0,0,Never,27.32,6.6,85,0
6,Female,44.0,0,0,Never,19.31,6.5,200,1
7,Female,79.0,0,0,,23.86,5.7,85,0
8,Male,42.0,0,0,Never,33.64,4.8,145,0
9,Female,32.0,0,0,Never,27.32,5.0,100,0


In [None]:
## Breaking Down Into Age Quartiles