In [1]:
from google.colab import drive
drive.mount('/content/drive')

MessageError: ignored

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

## 1. Data Description

### 1.1 Important Risk Factors

Research has identified the following as **important risk factors** for diabetes:

```high blood pressure, high cholesterol, smoking, obesity, age and sex, race, diet, exercise, alcohol consumption, BMI, household income, marital status, sleep, time since last checkup, education, health care coverage, mental Health```

Given these risk factors, we selected features from a open survey of diabetes related to these risk factors.



### 1.2 Features

`Diabetes_binary`

(Ever diagonsed) diabetes 

`HighBP` -> `Bool`

High Blood Pressure

`HighChol` -> `Bool`

High Cholesterol

`CholCheck` -> `Bool`

Cholesterol check within past five years

`BMI` -> `Float`

Body Mass Index (BMI)

`Smoker` -> `Bool`

Have you smoked at least 100 cigarettes (5 packs) in your entire life? 

`Stroke` -> `Bool`

(Ever diagosed) stroke. 

`HeartDiseaseorAttack` -> `Bool`

Respondents that have ever reported having coronary heart disease (CHD) or myocardial infarction (MI)

`PhysActivity` -> `Bool`

Adults who reported doing physical activity or exercise during the past 30 days other than their regular job

`Fruits` -> `Bool`

Consume Fruit 1 or more times per day 

`Veggies` -> `Bool`

Consume Vegetables 1 or more times per day 

`HvyAlcoholConsump` -> `Bool`

Heavy drinkers (adult men having more than 14 drinks per week and adult women having more than 7 drinks per week)

`AnyHealthcare` -> `Bool`

Do you have any kind of health care coverage, including health insurance, prepaid plans such as HMOs, or government plans such as Medicare, or Indian Health Service? 

`NoDocbcCost` -> `Bool`

Was there a time in the past 12 months when you needed to see a doctor but could not because of cost?

`GenHlth` -> `Int`

Would you say that in general your health is between 5 (highest) and 1 (lowest).

`MentHlth` -> `Int`

Now thinking about your mental health, which includes stress, depression, and problems with emotions, for how many days during the past 30 days was your mental health not good? 

`PhysHlth` -> `Int`

Now thinking about your physical health, which includes physical illness and injury, for how many days during the past 30 days was your physical health not good? 

`DiffWalk` -> `Int`

Do you have serious difficulty walking or climbing stairs? 


`Sex`, and `Age` -> `Int`

`Education` -> `Int`

This is already an ordinal variable with 1 being never attended school or kindergarten only up to 6 being college 4 years or more


`Income` -> `Int`

Variable is already ordinal with 1 being less than \$10,000 all the way up to 8 being \$75,000 or more

To make life easier, we convert above data type to data type `float64` in Panda.

## 2. Task

Your task is to design a machine learning/deep learning algorithm to help determine/predict whether the patient is non-diabetic (int `0`) or diabetic (int `1`).


## 3. Data Loading
### 3.1 Load the data 



In [None]:
full_train = pd.read_csv('full_data_train.csv')
X_test = pd.read_csv('indicators_test.csv')
y_test = pd.read_csv('y_test.csv')

### 3.2 Check the infomation of the raw data

Use ```pandas.DataFrame.info``` to describe null values, data type, memory usage

In [None]:
full_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 228312 entries, 0 to 228311
Data columns (total 23 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   PID                   228312 non-null  int64  
 1   Diabetes_binary       228312 non-null  float64
 2   HighBP                228312 non-null  float64
 3   HighChol              228312 non-null  float64
 4   CholCheck             228312 non-null  float64
 5   BMI                   228312 non-null  float64
 6   Smoker                228312 non-null  float64
 7   Stroke                228312 non-null  float64
 8   HeartDiseaseorAttack  228312 non-null  float64
 9   PhysActivity          228312 non-null  float64
 10  Fruits                228312 non-null  float64
 11  Veggies               228312 non-null  float64
 12  HvyAlcoholConsump     228312 non-null  float64
 13  AnyHealthcare         228312 non-null  float64
 14  NoDocbcCost           228312 non-null  float64
 15  

In [None]:
y_test[:5]

Unnamed: 0,PID,Diabetes_binary
0,81031,0.0
1,117450,0.0
2,94759,0.0
3,45241,0.0
4,101007,0.0


In [None]:
y_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25368 entries, 0 to 25367
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   PID              25368 non-null  int64  
 1   Diabetes_binary  25368 non-null  float64
dtypes: float64(1), int64(1)
memory usage: 396.5 KB


## 4. Data Preprocessing
Notice that there is one column name **PID** in both *full_train* and *X_test* which we should drop it.

### 4.1 Why dropping PID

In [None]:
full_train = full_train.drop(['PID'],axis=1) #drop the PID column
X_test = X_test.drop(['PID'],axis=1) #drop the PID column

Seperate the features (indicators) and labels in *full_train*

In [None]:
X_train = full_train.drop(['Diabetes_binary'],axis=1)
y_train = full_train['Diabetes_binary']
y_test = y_test['Diabetes_binary']

### 4.2 Feature Scaling 
Notice that some of the features are categorical data. You are not allowed to use any package for normalizing (e.g. `StandardScaler`).

In [None]:
# Creating StandardScaler instance
sc = StandardScaler()

# Fitting Standard Scaller
X_train = sc.fit_transform(X_train)

# Scaling data
X_test = sc.fit_transform(X_test)

## 5 Machine Learning Model

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

In [None]:
grid_models = [(LogisticRegression(),[{'C':[0.25,0.5,0.75,1],'random_state':[0]}]), 
               (GaussianNB(),[{'var_smoothing': [1e-09]}]), 
               (DecisionTreeClassifier(),[{'criterion':['gini','entropy'],'random_state':[0]}]), 
            #    (RandomForestClassifier(),[{'n_estimators':[100,150,200],'criterion':['gini','entropy'],'random_state':[0]}]),
            #    (AdaBoostClassifier(),[{'n_estimators':[100,150,200],'learning_rate':[0.1, 0.5, 0.8, 1],'algorithm':['SAMME', 'SAMME.R'], 'random_state':[0]}]),
            #    (GradientBoostingClassifier(),[{'n_estimators':[100,150,200],'criterion':['friedman_mse','mse'],'loss':['deviance','exponential'],'learning_rate':[0.1, 0.5, 0.8, 1],'random_state':[0]}]),
               (XGBClassifier(), [{'learning_rate': [0.01, 0.05, 0.1], 'eval_metric': ['error']}])]

In [None]:
for i,j in grid_models:
    grid = GridSearchCV(estimator=i,param_grid = j, scoring = 'accuracy',cv=None)
    grid.fit(X_train, y_train)
    best_accuracy = grid.best_score_
    best_param = grid.best_params_
    print('{}:\nBest Accuracy : {:.2f}%'.format(i,best_accuracy*100))
    print('Best Parameters : ',best_param)
    print('')
    print('----------------')
    print('')

LogisticRegression():
Best Accuracy : 86.38%
Best Parameters :  {'C': 0.5, 'random_state': 0}

----------------

GaussianNB():
Best Accuracy : 77.39%
Best Parameters :  {'var_smoothing': 1e-09}

----------------

DecisionTreeClassifier():
Best Accuracy : 80.10%
Best Parameters :  {'criterion': 'entropy', 'random_state': 0}

----------------

XGBClassifier():
Best Accuracy : 86.66%
Best Parameters :  {'eval_metric': 'error', 'learning_rate': 0.1}

----------------



## 6. Deep Learning Model

In [None]:
import os
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

In [None]:
class YourModel(nn.Module):
    """ Your model should inherite from torch.nn.Module.
    """
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(21,64)
        self.fc2 = nn.Linear(64,128)
        self.fc3 = nn.Linear(128,512)
        self.fc4 = nn.Linear(512,2)
        self.dropout = nn.Dropout(0.25)

    def forward(self, x):
        # Forward pass.
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        x = F.relu(x)
        x = self.dropout(x)
        x = self.fc3(x)
        x = F.relu(x)
        x = self.dropout(x)
        x = self.fc4(x)
        print(x.shape)
        return x

### Prepare the data for PyTorch

1. The input data type should be the same with model and loss function.

2. The input of 1D convolution should be $(N, C_{in}, L_{in})$ where $N$ is the batch size, $C_{in}$ is the channel (feature), and $L_{in}$ is the length.

In [None]:
X = torch.from_numpy(X_train).to(torch.float)
# X = X.unsqueeze(1)
y = y_train.to_numpy()
y = torch.tensor(y, dtype=torch.long)

X_t = torch.from_numpy(X_test).to(torch.float)
y_t = y_test.to_numpy()
y_t = torch.tensor(y_test, dtype=torch.long)

In [None]:
def train(model, train_loader, criterion, optimizer, epoch):
    model.train()
    # Iterate over the DataLoader for training data
    for batch_idx, (data, target) in enumerate(train_loader):
        # Zero the gradients
        optimizer.zero_grad()
        # Perform forward pass
        output = model(data)
        # Compute loss
        loss = criterion(output, target)
        # Perform backward pass
        loss.backward()
        # Perform optimization
        optimizer.step()
        # Printing
        if batch_idx % 50 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))


def test(model, test_loader, criterion):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            output = model(data)
            test_loss += criterion(output, target).item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))
    

def main(train_data, train_label, test_data, test_label, batch_size, epochs):
    """ Training your model.

    Args:
        train_data (tensor): The training data. It should have a shape of (n_instance, 1, n_features).
        train_label (tensor): The labels of training instances. It should have a shape of (n_instance, 1).
        batch_size  (Union[int, NoneType]): The number of samples loaded for one iteration.
        epochs (Union[int, NoneType]): The number of epochs. When this reaches, the training stops.
    """
    # Set fixed random number seed. DO NOT CHANGE IT.
    torch.manual_seed(336699)
    
    # Prepare series dataset.
    train_dataset = TensorDataset(train_data, train_label)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
    test_dataset = TensorDataset(test_data, test_label)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, drop_last=False)

    # Initialize proposed model.
    model = YourModel()

    # Define the loss function and optimizer. You can freely choose your loss function and optimizer based on your task.
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    criterion = torch.nn.CrossEntropyLoss()
    criterion_test = torch.nn.CrossEntropyLoss(reduction='sum')
    # Run the training loop
    for epoch in range(1, epochs+1):
        # Print epoch
        print(f'Starting epoch {epoch}')

        train(model, train_loader, criterion, optimizer, epoch)
        test(model, test_loader, criterion_test)
        
    # Process is complete.
    print('Training process has finished.')


if __name__ == '__main__':
    main(X, y, X_t, y_t, 512, 5)

Starting epoch 1
torch.Size([512, 2])
torch.Size([512, 2])
torch.Size([512, 2])
torch.Size([512, 2])
torch.Size([512, 2])
torch.Size([512, 2])
torch.Size([512, 2])
torch.Size([512, 2])
torch.Size([512, 2])
torch.Size([512, 2])
torch.Size([512, 2])
torch.Size([512, 2])
torch.Size([512, 2])
torch.Size([512, 2])
torch.Size([512, 2])
torch.Size([512, 2])
torch.Size([512, 2])
torch.Size([512, 2])
torch.Size([512, 2])
torch.Size([512, 2])
torch.Size([512, 2])
torch.Size([512, 2])
torch.Size([512, 2])
torch.Size([512, 2])
torch.Size([512, 2])
torch.Size([512, 2])
torch.Size([512, 2])
torch.Size([512, 2])
torch.Size([512, 2])
torch.Size([512, 2])
torch.Size([512, 2])
torch.Size([512, 2])
torch.Size([512, 2])
torch.Size([512, 2])
torch.Size([512, 2])
torch.Size([512, 2])
torch.Size([512, 2])
torch.Size([512, 2])
torch.Size([512, 2])
torch.Size([512, 2])
torch.Size([512, 2])
torch.Size([512, 2])
torch.Size([512, 2])
torch.Size([512, 2])
torch.Size([512, 2])
torch.Size([512, 2])
torch.Size([512, 