# Pre-processing and creating training data
- Create dummy or indicator features for categorical variables
- Standardize the magnitude of numeric features using a scaler
- Split the data into training and testing datasets

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import scale

In [2]:
# Location of the cleaned diabetes CSV. The path can be overridden by setting the
# DIABETES_CSV_PATH environment variable so the notebook also runs on machines other
# than the original author's; when unset, the original absolute path is used.
DIABETES_CSV_PATH = os.environ.get(
    'DIABETES_CSV_PATH',
    r'C:\Users\ashle\OneDrive\Documents\GitHub\Capstone-2\Diabetes_Class_Cleaned.csv',
)

# The first CSV column is used as the row index rather than as a data column.
diabetes_cleaned = pd.read_csv(DIABETES_CSV_PATH, index_col=0)

In [3]:
# Summarize the loaded frame: column names, non-null counts, dtypes and memory usage.
diabetes_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 390 entries, 1 to 390
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Cholesterol      390 non-null    int64  
 1   Glucose          390 non-null    int64  
 2   HDL Chol         390 non-null    int64  
 3   Chol/HDL ratio   390 non-null    float64
 4   Age              390 non-null    int64  
 5   Gender           390 non-null    object 
 6   Height           390 non-null    int64  
 7   Weight           390 non-null    int64  
 8   BMI              390 non-null    float64
 9   Systolic BP      390 non-null    int64  
 10  Diastolic BP     390 non-null    int64  
 11  waist            390 non-null    int64  
 12  hip              390 non-null    int64  
 13  Waist/hip ratio  390 non-null    float64
 14  Diabetes         390 non-null    object 
dtypes: float64(3), int64(10), object(2)
memory usage: 48.8+ KB


### Create dummy features:

In [4]:
# List the distinct labels present in the 'Diabetes' target column before encoding it.
diabetes_cleaned['Diabetes'].unique()

array(['No diabetes', 'Diabetes'], dtype=object)

In [5]:
# Encode the target as integers: 'Diabetes' -> 1, 'No diabetes' -> 0.
#
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the selected column: that form is chained in-place
# mutation, which pandas does not guarantee to write through to the parent frame
# (under copy-on-write it leaves the frame unchanged).
#
# The explicit integer cast makes the column numeric regardless of how replace()
# infers the dtype of an object column, and raises if an unexpected label is present.
# The statement is safe to re-run: on an already-encoded column both steps are no-ops.
diabetes_cleaned['Diabetes'] = (
    diabetes_cleaned['Diabetes']
    .replace({'Diabetes': 1, 'No diabetes': 0})
    .astype('int64')
)

In [6]:
# Preview the first rows to confirm the 'Diabetes' column now holds the encoded values.
diabetes_cleaned.head()

Unnamed: 0_level_0,Cholesterol,Glucose,HDL Chol,Chol/HDL ratio,Age,Gender,Height,Weight,BMI,Systolic BP,Diastolic BP,waist,hip,Waist/hip ratio,Diabetes
Patient number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,193,77,49,3.9,19,female,61,119,22.5,118,70,32,38,0.84,0
2,146,79,41,3.6,19,female,60,135,26.4,108,58,33,40,0.83,0
3,217,75,54,4.0,20,female,67,187,29.3,110,72,40,45,0.89,0
4,226,97,70,3.2,20,female,64,114,19.6,122,64,31,39,0.79,0
5,164,91,67,2.4,20,female,70,141,20.2,122,86,32,39,0.82,0


In [7]:
# Expand the remaining categorical (object-typed) columns into indicator columns;
# columns that are already numeric are passed through unchanged.
diabetes_cleaned = pd.get_dummies(data=diabetes_cleaned)

In [8]:
# Number of rows in each target class (1 = diabetes, 0 = no diabetes).
class_counts = diabetes_cleaned.loc[:, 'Diabetes'].value_counts()
class_counts

0    330
1     60
Name: Diabetes, dtype: int64

In [9]:
# Share of each target class as a percentage of all rows.
# Dividing the counts Series directly (instead of rebuilding a Series from a list)
# keeps the class labels as the index, so each percentage stays tied to its class
# rather than to its position in the sorted counts.
class_percentages = (class_counts / diabetes_cleaned.shape[0]) * 100.00
class_percentages

0    84.615385
1    15.384615
dtype: float64

### Standardize the data:

In [10]:
# Keep the current column labels so they can be re-applied after scaling,
# since the scaler returns a plain array without labels.
columns = diabetes_cleaned.columns

# Scaler used to standardize the data in the next step.
scaler = preprocessing.StandardScaler()

In [11]:
# Standardize the feature columns only. The 'Diabetes' target is kept out of the
# scaler: standardizing it would turn the 0/1 class labels into continuous z-scores
# that can no longer be used as classification labels.
feature_columns = columns.drop('Diabetes')
scaled_features = scaler.fit_transform(diabetes_cleaned[feature_columns])

# Rebuild a labelled frame, preserving the original row index so scaled rows can
# still be traced back to their source rows, then re-attach the unscaled target.
scaled_df = pd.DataFrame(
    scaled_features, columns=feature_columns, index=diabetes_cleaned.index
)
scaled_df['Diabetes'] = diabetes_cleaned['Diabetes']

# NOTE(review): the scaler is still fitted on all rows before the train/test split,
# so test-set statistics influence the scaling. Consider fitting it on the training
# rows only and applying transform() to the test rows.

### Train and test data:

In [12]:
# Separate the target from the features without mutating scaled_df, so this cell
# can be re-run (pop() would remove the column and fail on a second run).
y = scaled_df['Diabetes']
X = scaled_df.drop(columns='Diabetes')

# Hold out 25% of the rows for testing with a fixed seed for reproducibility.
# The classes are imbalanced (roughly 85% / 15%), so the split is stratified on y
# to keep the same class proportions in both the training and the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=246, stratify=y
)