In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset

In [2]:
# Load the preprocessed diabetes prediction dataset from the data directory
# and display it as the cell output.
df = pd.read_csv("../data/diabetes_prediction_dataset_preprocessed.csv")
df

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,0.438677,1.000000,0.199008,0.810193,0.160953,0.300626,0.622459,6.144175e-06,0
1,0.438677,0.674675,0.199008,0.189807,0.199005,0.342808,0.622459,3.775135e-11,0
2,0.561323,0.349349,0.199008,0.189807,0.160953,0.342808,0.017986,2.248168e-04,0
3,0.438677,0.449449,0.199008,0.189807,0.172344,0.266167,0.000553,1.233946e-04,0
4,0.561323,0.949950,0.800992,0.810193,0.172344,0.200615,0.000203,1.233946e-04,0
...,...,...,...,...,...,...,...,...,...
99995,0.438677,1.000000,0.199008,0.189807,0.160953,0.342808,0.182426,2.789468e-10,0
99996,0.438677,0.024024,0.199008,0.189807,0.160953,0.145758,0.500000,2.061154e-09,0
99997,0.561323,0.824825,0.199008,0.189807,0.287018,0.352908,0.017986,1.233946e-04,0
99998,0.438677,0.299299,0.199008,0.189807,0.160953,0.503221,0.000004,2.061154e-09,0


## GPU를 사용할 수 있다면 사용 가능하도록 설정

In [3]:
# Prefer the first CUDA GPU when one is available; otherwise run on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

## PyTorch가 사용할 수 있도록 tensor로 변형

In [4]:
# Convert the DataFrame to a NumPy array (the intermediate form that is turned
# into a tensor in the next step) and display it.
df_np = df.to_numpy()
df_np

array([[4.38676740e-01, 1.00000000e+00, 1.99008115e-01, ...,
        6.22459331e-01, 6.14417460e-06, 0.00000000e+00],
       [4.38676740e-01, 6.74674675e-01, 1.99008115e-01, ...,
        6.22459331e-01, 3.77513454e-11, 0.00000000e+00],
       [5.61323260e-01, 3.49349349e-01, 1.99008115e-01, ...,
        1.79862100e-02, 2.24816770e-04, 0.00000000e+00],
       ...,
       [5.61323260e-01, 8.24824825e-01, 1.99008115e-01, ...,
        1.79862100e-02, 1.23394576e-04, 0.00000000e+00],
       [4.38676740e-01, 2.99299299e-01, 1.99008115e-01, ...,
        3.72663928e-06, 2.06115362e-09, 0.00000000e+00],
       [4.38676740e-01, 7.12212212e-01, 1.99008115e-01, ...,
        6.22459331e-01, 2.78946809e-10, 0.00000000e+00]])

In [5]:
# Copy the NumPy data into a tensor allocated on the selected device.
df_tensor = torch.tensor(df_np, device=device)
df_tensor

tensor([[4.3868e-01, 1.0000e+00, 1.9901e-01,  ..., 6.2246e-01, 6.1442e-06,
         0.0000e+00],
        [4.3868e-01, 6.7467e-01, 1.9901e-01,  ..., 6.2246e-01, 3.7751e-11,
         0.0000e+00],
        [5.6132e-01, 3.4935e-01, 1.9901e-01,  ..., 1.7986e-02, 2.2482e-04,
         0.0000e+00],
        ...,
        [5.6132e-01, 8.2482e-01, 1.9901e-01,  ..., 1.7986e-02, 1.2339e-04,
         0.0000e+00],
        [4.3868e-01, 2.9930e-01, 1.9901e-01,  ..., 3.7266e-06, 2.0612e-09,
         0.0000e+00],
        [4.3868e-01, 7.1221e-01, 1.9901e-01,  ..., 6.2246e-01, 2.7895e-10,
         0.0000e+00]], device='cuda:0', dtype=torch.float64)

## trainset, testset으로 나눈다 (4:1 비율로 나누기)

In [6]:
# Row index of the train/test boundary: the first four fifths of the rows
# (using integer division) go to the training split, the rest to the test split.
cutting = (df_tensor.shape[0] // 5) * 4

trainset, testset = df_tensor[:cutting], df_tensor[cutting:]

for split in (trainset, testset):
    print(split.shape)

torch.Size([80000, 9])
torch.Size([20000, 9])


## feature와 target으로 나눈다

## 참고로 tensor의 열은 [:,n] 형식으로 접근한다

In [7]:
# Columns 0-7 are the features; column 8 is the target, kept as a column
# vector of shape (rows, 1).
x_trainset, y_trainset = trainset[:, :8], trainset[:, 8].unsqueeze(1)
x_testset, y_testset = testset[:, :8], testset[:, 8].unsqueeze(1)

In [8]:
# Print the training features, then show their shape as the cell output.
print(x_trainset)
x_trainset.shape

tensor([[4.3868e-01, 1.0000e+00, 1.9901e-01,  ..., 3.0063e-01, 6.2246e-01,
         6.1442e-06],
        [4.3868e-01, 6.7467e-01, 1.9901e-01,  ..., 3.4281e-01, 6.2246e-01,
         3.7751e-11],
        [5.6132e-01, 3.4935e-01, 1.9901e-01,  ..., 3.4281e-01, 1.7986e-02,
         2.2482e-04],
        ...,
        [5.6132e-01, 3.2432e-01, 1.9901e-01,  ..., 3.2063e-01, 1.1920e-01,
         1.0000e+00],
        [4.3868e-01, 5.8709e-01, 8.0099e-01,  ..., 3.2320e-01, 6.2246e-01,
         1.0000e+00],
        [4.3868e-01, 8.1231e-01, 8.0099e-01,  ..., 5.2540e-01, 2.0343e-04,
         2.7458e-04]], device='cuda:0', dtype=torch.float64)


torch.Size([80000, 8])

In [9]:
# Display the training targets (a single column of class labels).
y_trainset

tensor([[0.],
        [0.],
        [0.],
        ...,
        [1.],
        [1.],
        [0.]], device='cuda:0', dtype=torch.float64)

## target 텐서를 원-핫 인코딩 형태로 바꾸자

In [10]:
def _one_hot_labels(labels, num_classes=2):
    """Return ``labels`` one-hot encoded along a new trailing axis.

    Floating-point tensors are treated as raw class indices: they are cast to
    int64 and encoded. Integer tensors are returned unchanged, because in this
    notebook the only integer label tensors are ones this cell has already
    encoded. Without this guard, re-running the cell would encode the encoding
    itself and append a second class axis, breaking the training loop.

    Args:
        labels: Tensor of class indices (floating point) or an already
            encoded integer tensor.
        num_classes: Number of classes in the one-hot encoding.

    Returns:
        An int64 tensor with a trailing axis of size ``num_classes``.
    """
    if not labels.is_floating_point():
        return labels
    return torch.nn.functional.one_hot(labels.to(torch.int64), num_classes=num_classes)


y_trainset = _one_hot_labels(y_trainset)
y_testset = _one_hot_labels(y_testset)

In [11]:
# Display the one-hot encoded training targets.
y_trainset

tensor([[[1, 0]],

        [[1, 0]],

        [[1, 0]],

        ...,

        [[0, 1]],

        [[0, 1]],

        [[1, 0]]], device='cuda:0')

## 모델을 설계하자

In [12]:
# Under construction
class FullyConnected(nn.Module):
    """Two-branch classifier that handles continuous and discrete features separately.

    Each branch is Linear(4 -> 6) followed by Linear(6 -> 1). The two branch
    outputs are concatenated and mapped to two class scores, which softmax
    turns into class probabilities.

    All layers use float64 parameters so they accept the float64 feature
    tensors built earlier in this notebook. The output layer produces two
    values (one per class): a softmax over a single value is always 1.0, and
    two outputs match the one-hot targets and ``FullyConnected2``.

    Args:
        device: Device on which the layer parameters are allocated.
    """

    # Column positions in the feature matrix, which is laid out as:
    # gender age hypertension heart_disease smoking_history bmi HbA1c_level blood_glucose_level
    CONTINUOUS_COLUMNS = (1, 5, 6, 7)
    DISCRETE_COLUMNS = (0, 2, 3, 4)

    def __init__(self, device=device):
        super().__init__()
        layer_kwargs = {"device": device, "dtype": torch.float64}
        self.continuous_input = nn.Linear(in_features=4, out_features=6, **layer_kwargs)
        self.discrete_input = nn.Linear(in_features=4, out_features=6, **layer_kwargs)
        self.continuous_hidden = nn.Linear(in_features=6, out_features=1, **layer_kwargs)
        self.discrete_hidden = nn.Linear(in_features=6, out_features=1, **layer_kwargs)
        self.output = nn.Linear(in_features=2, out_features=2, **layer_kwargs)
        # Normalize over the class axis explicitly instead of relying on the
        # deprecated implicit-dimension behavior.
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        """Return class probabilities for ``x``.

        Args:
            x: Feature tensor whose last axis holds the 8 feature columns.

        Returns:
            Tensor with the same leading axes as ``x`` and a last axis of size 2.
        """
        # Select feature columns along the last axis; indexing with a bare
        # list would pick rows of the batch instead.
        x_continuous = x[..., list(self.CONTINUOUS_COLUMNS)]
        x_discrete = x[..., list(self.DISCRETE_COLUMNS)]

        out1 = self.continuous_input(x_continuous)
        out2 = self.discrete_input(x_discrete)
        out1 = self.continuous_hidden(out1)
        out2 = self.discrete_hidden(out2)

        # Join the two single-value branch outputs into one 2-feature tensor.
        out = self.output(torch.cat([out1, out2], dim=-1))
        out = self.softmax(out)
        return out

model = FullyConnected().to(device)

In [13]:
class FullyConnected2(nn.Module):
    """Fully connected classifier: 8 features -> 12 -> 6 -> 2 class probabilities.

    All layers use float64 parameters so they accept the float64 feature
    tensors built earlier in this notebook.

    NOTE(review): there is no activation between the Linear layers, so the
    three layers compose into a single affine map before the softmax. Adding
    a nonlinearity would change the trained results, so it is left as is.

    Args:
        device: Device on which the layer parameters are allocated.
    """

    def __init__(self, device=device):
        super().__init__()
        self.input = nn.Linear(in_features=8, out_features=12, device=device, dtype=torch.float64)
        self.hidden = nn.Linear(in_features=12, out_features=6, device=device, dtype=torch.float64)
        self.output = nn.Linear(in_features=6, out_features=2, device=device, dtype=torch.float64)
        # Normalize over the class axis explicitly. Calling nn.Softmax() with
        # no dim relies on a deprecated implicit choice and emits a warning on
        # every forward pass; for 1-D and 2-D inputs dim=-1 selects the same axis.
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        """Return class probabilities for ``x``.

        Args:
            x: Feature tensor whose last axis has size 8.

        Returns:
            Tensor of class probabilities whose last axis has size 2.
        """
        out = self.input(x)
        out = self.hidden(out)
        out = self.output(out)
        out = self.softmax(out)
        return out  # (batch, 2) for a batched input

# Re-instantiates the two-branch model alongside the model that is trained below.
model = FullyConnected().to(device)
model2 = FullyConnected2().to(device)

## DataLoader, learning rate, optimizer, criterion을 정의하자

In [14]:
# Optimization setup: Adam over model2's parameters with a binary
# cross-entropy objective.
lr = 0.0001
optimizer = torch.optim.Adam(params=model2.parameters(), lr=lr)
criterion = torch.nn.BCELoss()

# Pair each feature row with its target and serve them in shuffled mini-batches.
trainbatchsize = 100
trainset = TensorDataset(x_trainset, y_trainset)
print(type(trainset))
print(trainset)
trainloader = DataLoader(dataset=trainset, batch_size=trainbatchsize, shuffle=True)

<class 'torch.utils.data.dataset.TensorDataset'>
<torch.utils.data.dataset.TensorDataset object at 0x000002709CB83D30>


## 실제로 학습시킨다.

In [15]:
num_epochs = 8  # 8 epochs gave the highest accuracy (96%)
log_every = 100  # report the mean loss once per this many mini-batches
batchcount = 0

for epoch in range(num_epochs):
    batchcount = 0
    running_loss = 0.0

    for features, targets in trainloader:
        features = features.to(device)
        # Targets are built earlier in this notebook as integer one-hot rows
        # with a singleton middle axis. Drop only that axis: a bare squeeze()
        # would also drop the batch axis of a size-1 batch. BCELoss needs
        # floating-point targets matching the model's float64 output.
        targets = targets.to(device).squeeze(dim=1).to(torch.float64)

        outputs = model2(features)

        loss = criterion(outputs, targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        batchcount += 1

        running_loss += loss.item()
        if batchcount % log_every == 0:
            # Mean loss over the mini-batches since the previous report.
            print(f'[{epoch + 1}, {batchcount}] loss: {running_loss / log_every:.3f}')
            running_loss = 0.0
            print("%dth epoch and %dth batchcount" %(epoch + 1, batchcount))

  return self._call_impl(*args, **kwargs)


[1, 100] loss: 0.643
1th epoch and 100th batchcount
[1, 200] loss: 0.586
1th epoch and 200th batchcount
[1, 300] loss: 0.524
1th epoch and 300th batchcount
[1, 400] loss: 0.471
1th epoch and 400th batchcount
[1, 500] loss: 0.420
1th epoch and 500th batchcount
[1, 600] loss: 0.381
1th epoch and 600th batchcount
[1, 700] loss: 0.353
1th epoch and 700th batchcount
[1, 800] loss: 0.334
1th epoch and 800th batchcount
[2, 100] loss: 0.314
2th epoch and 100th batchcount
[2, 200] loss: 0.306
2th epoch and 200th batchcount
[2, 300] loss: 0.303
2th epoch and 300th batchcount
[2, 400] loss: 0.290
2th epoch and 400th batchcount
[2, 500] loss: 0.288
2th epoch and 500th batchcount
[2, 600] loss: 0.278
2th epoch and 600th batchcount
[2, 700] loss: 0.269
2th epoch and 700th batchcount
[2, 800] loss: 0.270
2th epoch and 800th batchcount
[3, 100] loss: 0.270
3th epoch and 100th batchcount
[3, 200] loss: 0.253
3th epoch and 200th batchcount
[3, 300] loss: 0.249
3th epoch and 300th batchcount
[3, 400] los

## 모델을 검증한다

In [16]:
# Pair the held-out features with their targets and serve them in mini-batches.
testbatchsize = 100
testset = TensorDataset(x_testset, y_testset)
print(type(testset))
print(testset)
testloader = DataLoader(dataset=testset, batch_size=testbatchsize, shuffle=True)

<class 'torch.utils.data.dataset.TensorDataset'>
<torch.utils.data.dataset.TensorDataset object at 0x00000270CA8C16D0>


In [17]:
# Measure classification accuracy of model2 on the test loader.
correct = 0
total = 0

with torch.no_grad():
    for features, targets in testloader:
        features = features.to(device)
        # Drop only the singleton middle axis of the one-hot targets; a bare
        # squeeze() would also drop the batch axis of a size-1 batch.
        targets = targets.to(device).squeeze(dim=1)

        outputs = model2(features)
        total += outputs.size(0)

        # Class 0 is predicted when its probability is at least that of
        # class 1 (ties go to class 0); otherwise class 1 is predicted.
        predicted_negative = outputs[:, 0] >= outputs[:, 1]
        is_negative = (targets[:, 0] == 1) & (targets[:, 1] == 0)
        is_positive = (targets[:, 0] == 0) & (targets[:, 1] == 1)

        # Compare the whole batch at once instead of looping over samples in
        # Python, which forces one device synchronization per comparison.
        hits = (predicted_negative & is_negative) | (~predicted_negative & is_positive)
        correct += int(hits.sum().item())
print("Accuracy : %d %%" %(100*correct / total))

Accuracy : 96 %
