In [27]:
import os
import pickle
import click
import pandas as pd
from kaggle.api.kaggle_api_extended import KaggleApi
import logging

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

In [31]:
def read_dataframe(
    directory_name="../Data",
    dataset="alexteboul/diabetes-health-indicators-dataset",
    csv_name="diabetes_012_health_indicators_BRFSS2015.csv",
):
    """Download a Kaggle dataset into a local directory and load its CSV.

    Steps, in order:
      1. Create ``directory_name`` if it does not exist yet.
      2. Delete every regular file directly inside ``directory_name``
         (sub-directories are left alone).
      3. Authenticate with the Kaggle API and download + unzip ``dataset``
         into ``directory_name``.
      4. Read ``csv_name`` from that directory with ``pandas.read_csv``.

    Args:
        directory_name: Directory used both as download target and as the
            location of the CSV file.
        dataset: Kaggle dataset identifier passed to
            ``KaggleApi.dataset_download_files``.
        csv_name: File name of the CSV to load after the download.

    Returns:
        pandas.DataFrame with the contents of the CSV file.

    Raises:
        PermissionError / OSError: if the directory cannot be created.
        FileNotFoundError: if the CSV is missing after the download.
        Exception: any other error raised by ``pandas.read_csv`` is logged
            and re-raised unchanged.
    """
    try:
        os.mkdir(directory_name)
        logging.info(f"Directory '{directory_name}' created successfully.")
    except FileExistsError:
        logging.warning(f"Directory '{directory_name}' already exists.")
    except PermissionError:
        logging.error(f"Permission denied: Unable to create '{directory_name}'.")
        # Without the directory nothing below can work; surface the real cause
        # instead of failing later inside os.listdir.
        raise
    except Exception as e:
        logging.error(f"An error occurred: {e}")
        raise

    # Removing data if they are present in the Data Directory
    for filename in os.listdir(directory_name):
        file_path = os.path.join(directory_name, filename)
        if os.path.isfile(file_path):
            os.remove(file_path)

    api = KaggleApi()
    api.authenticate()

    api.dataset_download_files(dataset, path=directory_name, unzip=True)

    csv_file = os.path.join(directory_name, csv_name)

    try:
        diabetes_df = pd.read_csv(csv_file)
    except FileNotFoundError:
        # Re-raise: returning here would otherwise hit an unbound local and
        # mask the actual problem with an UnboundLocalError.
        logging.error("Diabetes.csv not found.")
        raise
    except Exception as e:
        logging.error(f"Error while loading Diabetes.csv: {e}")
        raise

    logging.info(f"File .csv loaded. Num of rows: {len(diabetes_df)}")
    return diabetes_df

In [32]:
# Download the dataset and load it into the notebook-level DataFrame.
diabetes_df = read_dataframe()



Dataset URL: https://www.kaggle.com/datasets/alexteboul/diabetes-health-indicators-dataset


In [18]:
# Show summary statistics (count, mean, std, quantiles) for the loaded frame.
diabetes_df.describe()

Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
count,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,...,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0
mean,0.296921,0.429001,0.424121,0.96267,28.382364,0.443169,0.040571,0.094186,0.756544,0.634256,...,0.951053,0.084177,2.511392,3.184772,4.242081,0.168224,0.440342,8.032119,5.050434,6.053875
std,0.69816,0.494934,0.49421,0.189571,6.608694,0.496761,0.197294,0.292087,0.429169,0.481639,...,0.215759,0.277654,1.068477,7.412847,8.717951,0.374066,0.496429,3.05422,0.985774,2.071148
min,0.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
25%,0.0,0.0,0.0,1.0,24.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,6.0,4.0,5.0
50%,0.0,0.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,8.0,5.0,7.0
75%,0.0,1.0,1.0,1.0,31.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,3.0,2.0,3.0,0.0,1.0,10.0,6.0,8.0
max,2.0,1.0,1.0,1.0,98.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,1.0,13.0,6.0,8.0


In [19]:
# Columns kept for modelling, as selected in Features.ipynb.
# 'Diabetes_012' is listed alongside the predictors
# (presumably the label column — confirm against Features.ipynb).
top_features = [
    'Diabetes_012',
    'BMI',
    'Age',
    'Income',
    'PhysHlth',
    'Education',
    'GenHlth',
    'MentHlth',
    'HighBP',
    'Fruits',
]

In [20]:
# Keep only the selected columns. The explicit .copy() makes the result an
# independent frame, so the later in-place column assignment on diabetes_df
# does not trigger pandas' SettingWithCopyWarning.
diabetes_df = diabetes_df[top_features].copy()

In [21]:
# Preview the first rows after restricting the frame to the selected columns.
diabetes_df.head()

Unnamed: 0,Diabetes_012,BMI,Age,Income,PhysHlth,Education,GenHlth,MentHlth,HighBP,Fruits
0,0.0,40.0,9.0,3.0,15.0,4.0,5.0,18.0,1.0,0.0
1,0.0,25.0,7.0,1.0,0.0,6.0,3.0,0.0,0.0,0.0
2,0.0,28.0,9.0,8.0,30.0,4.0,5.0,30.0,1.0,1.0
3,0.0,27.0,11.0,6.0,0.0,3.0,2.0,0.0,1.0,1.0
4,0.0,24.0,11.0,4.0,0.0,5.0,2.0,3.0,1.0,1.0


In [24]:
# Columns standardised with StandardScaler; all other columns are left as-is.
# NOTE(review): the scaler is fitted on every row of diabetes_df, i.e. before
# any train/test split — confirm this is intended, since rows that later end
# up in the test set then contribute to the scaling statistics.
continuous = ['Age', 'BMI', 'Income', 'PhysHlth', 'MentHlth']
scaler = StandardScaler().fit(diabetes_df[continuous])
diabetes_df[continuous] = scaler.transform(diabetes_df[continuous])

In [25]:
# Preview the first rows again to inspect the standardised columns.
diabetes_df.head()

Unnamed: 0,Diabetes_012,BMI,Age,Income,PhysHlth,Education,GenHlth,MentHlth,HighBP,Fruits
0,0.0,1.757936,0.3169,-1.474487,1.233999,4.0,5.0,1.998592,1.0,0.0
1,0.0,-0.511806,-0.337933,-2.440138,-0.486592,6.0,3.0,-0.42963,0.0,0.0
2,0.0,-0.057858,0.3169,0.939638,2.95459,4.0,5.0,3.617407,1.0,1.0
3,0.0,-0.209174,0.971733,-0.026012,-0.486592,3.0,2.0,-0.42963,1.0,1.0
4,0.0,-0.663122,0.971733,-0.991662,-0.486592,5.0,2.0,-0.024926,1.0,1.0


In [None]:
def balancing_classes(X_train, y_train):
    """Resample a training set with imblearn's SMOTE.

    Args:
        X_train: Training features, passed straight to ``SMOTE.fit_resample``.
        y_train: Training labels, passed straight to ``SMOTE.fit_resample``.

    Returns:
        Tuple ``(X_resampled, y_resampled)`` as produced by SMOTE with a
        fixed ``random_state`` of 42.
    """
    X_balanced, y_balanced = SMOTE(random_state=42).fit_resample(X_train, y_train)
    return X_balanced, y_balanced

In [None]:
def split_data(diabetes_df, target_col=None, output_dir="../Data"):
    """Split the frame into train/test sets, balance the training set, and pickle all four parts.

    The data is split 80/20 with ``random_state=42``. Only the training
    portion is passed through ``balancing_classes``; the test portion is
    written out as produced by the split.

    Args:
        diabetes_df: DataFrame containing the feature columns and the label
            column.
        target_col: Name of the label column. When ``None`` (default), the
            originally hard-coded ``"Diabetes_binary"`` is used if present;
            otherwise ``"Diabetes_012"``, the label column of the frame
            built in this notebook.
        output_dir: Directory that receives ``X_train.pkl``, ``X_test.pkl``,
            ``y_train.pkl`` and ``y_test.pkl``.

    Raises:
        KeyError: if no label column can be found in ``diabetes_df``.
    """
    if target_col is None:
        # Prefer the name the function used originally so existing callers
        # with a "Diabetes_binary" column behave exactly as before.
        known_targets = ("Diabetes_binary", "Diabetes_012")
        present = [name for name in known_targets if name in diabetes_df.columns]
        if not present:
            raise KeyError(
                f"None of the label columns {known_targets} found in the DataFrame."
            )
        target_col = present[0]

    X = diabetes_df.drop(target_col, axis=1)
    y = diabetes_df[target_col]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # Oversample after the split so the test set is left as produced by it.
    X_train, y_train = balancing_classes(X_train, y_train)

    artifacts = {
        "X_train": X_train,
        "X_test": X_test,
        "y_train": y_train,
        "y_test": y_test,
    }
    for name, obj in artifacts.items():
        with open(os.path.join(output_dir, f"{name}.pkl"), "wb") as f:
            pickle.dump(obj, f)