In [2]:
import pandas as pd

# Read the BRFSS 2015 diabetes health-indicators CSV into a DataFrame.
# The path is relative, so the file must sit next to the notebook.
csv_path = 'diabetes_012_health_indicators_BRFSS2015.csv'
data = pd.read_csv(csv_path)

# Preview the first five rows as a quick sanity check of the load
preview_rows = data.head(5)
print(preview_rows)

   Diabetes_012  HighBP  HighChol  CholCheck   BMI  Smoker  Stroke  \
0           0.0     1.0       1.0        1.0  40.0     1.0     0.0   
1           0.0     0.0       0.0        0.0  25.0     1.0     0.0   
2           0.0     1.0       1.0        1.0  28.0     0.0     0.0   
3           0.0     1.0       0.0        1.0  27.0     0.0     0.0   
4           0.0     1.0       1.0        1.0  24.0     0.0     0.0   

   HeartDiseaseorAttack  PhysActivity  Fruits  ...  AnyHealthcare  \
0                   0.0           0.0     0.0  ...            1.0   
1                   0.0           1.0     0.0  ...            0.0   
2                   0.0           0.0     1.0  ...            1.0   
3                   0.0           1.0     1.0  ...            1.0   
4                   0.0           1.0     1.0  ...            1.0   

   NoDocbcCost  GenHlth  MentHlth  PhysHlth  DiffWalk  Sex   Age  Education  \
0          0.0      5.0      18.0      15.0       1.0  0.0   9.0        4.0   
1     

In [3]:
# Print the column dtypes and non-null counts.
# DataFrame.info() writes its report to stdout itself and returns None,
# so it must not be wrapped in print() (that would emit a stray "None").
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253680 entries, 0 to 253679
Data columns (total 22 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Diabetes_012          253680 non-null  float64
 1   HighBP                253680 non-null  float64
 2   HighChol              253680 non-null  float64
 3   CholCheck             253680 non-null  float64
 4   BMI                   253680 non-null  float64
 5   Smoker                253680 non-null  float64
 6   Stroke                253680 non-null  float64
 7   HeartDiseaseorAttack  253680 non-null  float64
 8   PhysActivity          253680 non-null  float64
 9   Fruits                253680 non-null  float64
 10  Veggies               253680 non-null  float64
 11  HvyAlcoholConsump     253680 non-null  float64
 12  AnyHealthcare         253680 non-null  float64
 13  NoDocbcCost           253680 non-null  float64
 14  GenHlth               253680 non-null  float64
 15  

In [9]:
# Summary statistics per column (count, mean, std, min, quartiles, max).
# Gives a quick feel for each feature's range and spread.
summary_statistics = data.describe()
print(summary_statistics)

        Diabetes_012         HighBP       HighChol      CholCheck  \
count  253680.000000  253680.000000  253680.000000  253680.000000   
mean        0.296921       0.429001       0.424121       0.962670   
std         0.698160       0.494934       0.494210       0.189571   
min         0.000000       0.000000       0.000000       0.000000   
25%         0.000000       0.000000       0.000000       1.000000   
50%         0.000000       0.000000       0.000000       1.000000   
75%         0.000000       1.000000       1.000000       1.000000   
max         2.000000       1.000000       1.000000       1.000000   

                 BMI         Smoker         Stroke  HeartDiseaseorAttack  \
count  253680.000000  253680.000000  253680.000000         253680.000000   
mean       28.382364       0.443169       0.040571              0.094186   
std         6.608694       0.496761       0.197294              0.292087   
min        12.000000       0.000000       0.000000              0.000000  

In [2]:
# Report the (rows, columns) shape of the dataset as loaded.
# NOTE: this only reflects the *initial* shape if the cell runs before the
# low-variability column-drop cell, which changes `data`; run cells top to bottom.
initial_shape = data.shape
print("Initial shape of the dataframe:", initial_shape)

Initial shape of the dataframe: (253680, 19)


In [5]:
# Count the null entries in every column and print the per-column totals
missing_values = data.isna().sum()
print("Number of missing values for each column:", missing_values, sep="\n")

Number of missing values for each column:
Diabetes_012            0
HighBP                  0
HighChol                0
CholCheck               0
BMI                     0
Smoker                  0
Stroke                  0
HeartDiseaseorAttack    0
PhysActivity            0
Fruits                  0
Veggies                 0
HvyAlcoholConsump       0
AnyHealthcare           0
NoDocbcCost             0
GenHlth                 0
MentHlth                0
PhysHlth                0
DiffWalk                0
Sex                     0
Age                     0
Education               0
Income                  0
dtype: int64


In [3]:
# Remove near-constant columns: a column whose single most frequent value
# (NaN counted as a value) covers more than LOW_VARIABILITY_THRESHOLD of the
# rows carries very little information.
LOW_VARIABILITY_THRESHOLD = 0.95
# The prediction target is needed by the modelling cells, so it is never
# dropped here even if one class dominates it.
TARGET_COLUMN = 'Diabetes_012'

# Decide what to drop first, then drop once, instead of mutating `data`
# in place while looping over its own columns.
low_variability_columns = []
for column in data.columns:
    if column == TARGET_COLUMN:
        continue
    # value_counts sorts descending, so the first entry is the dominant value's share
    value_shares = data[column].value_counts(normalize=True, dropna=False)
    # An empty frame yields an empty Series; skip it rather than fail on .iloc[0]
    if not value_shares.empty and value_shares.iloc[0] > LOW_VARIABILITY_THRESHOLD:
        low_variability_columns.append(column)

# Rebind `data` to the reduced frame (no inplace mutation); re-running this
# cell is harmless because already-removed columns are simply not found again.
data = data.drop(columns=low_variability_columns)

print("Dropped low-variability columns:", low_variability_columns)
print("Shape of the dataframe after dropping columns:", data.shape)

Shape of the dataframe after dropping columns: (253680, 19)


In [21]:
# Import libraries for data splitting and feature scaling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Separate the features from the target column (Diabetes status)
X = data.drop('Diabetes_012', axis=1)
y = data['Diabetes_012']

# Columns to standardize: every int64/float64 feature column.
# NOTE(review): this selects *all* numeric features, so any 0/1 indicator
# columns are standardized as well — confirm that is intended.
continuous_columns = [col for col in X.columns if X[col].dtype in ['int64', 'float64']]

# Split FIRST into training (70%), validation (15%) and test (15%) sets.
# NOTE(review): the splits are not stratified on y; consider stratify= if the
# class balance of the target matters for evaluation.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Work on explicit copies so the assignments below never write into a view of X
X_train, X_val, X_test = X_train.copy(), X_val.copy(), X_test.copy()

# Fit the scaler on the training set ONLY, then apply the same fitted
# mean/std to validation and test. Fitting on the full dataset before the
# split would leak validation/test statistics into preprocessing.
# X itself is left unscaled.
scaler = StandardScaler()
if continuous_columns:  # nothing to scale if no numeric feature columns exist
    X_train[continuous_columns] = scaler.fit_transform(X_train[continuous_columns])
    X_val[continuous_columns] = scaler.transform(X_val[continuous_columns])
    X_test[continuous_columns] = scaler.transform(X_test[continuous_columns])

# Check the shapes of the resulting datasets
print("Training set shape:", X_train.shape)
print("Validation set shape:", X_val.shape)
print("Test set shape:", X_test.shape)


Training set shape: (177576, 18)
Validation set shape: (38052, 18)
Test set shape: (38052, 18)
