In [32]:
import torch
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import torchvision.transforms as transforms

from torchvision.datasets import MNIST
from torch.utils.data import DataLoader, TensorDataset

from torch import nn

In [33]:
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


## 데이터셋 불러오기

In [53]:
df = pd.read_csv("all_data.csv")

# target 열에 데이터가 있으면 train, 없으면 final 나누기
train = df[df.target.notnull()]
final = df[df.target.isnull()]

  df = pd.read_csv("all_data.csv")


In [54]:
from sklearn.preprocessing import LabelEncoder

# 범주형 변수를 인코딩하는 함수
def encode_categorical_features(data):
    label_encoders = {}
    for column in data.select_dtypes(include=['object']).columns:
        le = LabelEncoder()
        data[column] = le.fit_transform(data[column].astype(str))  # 문자열로 변환 후 인코딩
        label_encoders[column] = le
    return data, label_encoders

In [55]:
#test and split 임포트
from sklearn.model_selection import train_test_split

X = train.drop('target', axis=1)
y = train['target']

#데이터셋 분리(학습X, 테스트X, 학습Y, 테스트Y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


X_train, _ = encode_categorical_features(X_train)
X_test, _ = encode_categorical_features(X_test)

# Prepare data for DataLoader
X_train = torch.tensor(X_train.values, dtype=torch.float32)

y_temp = y_train.values
y_temp = np.where(y_temp == "Normal", 0, 1)
y_train = torch.tensor(y_temp, dtype=torch.long)

X_test = torch.tensor(X_test.values, dtype=torch.float32)

y_temp = y_test.values
y_temp = np.where(y_temp == "Normal", 0, 1)
y_test = torch.tensor(y_temp, dtype=torch.long)

train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)

In [56]:
# Final데이터(test.csv)
X_final = final.drop('target', axis=1)
X_final, _ = encode_categorical_features(X_final)
X_final = torch.tensor(X_final.values, dtype=torch.float32)

final_dataset = TensorDataset(X_final)
final_loader = DataLoader(final_dataset, batch_size=128, shuffle=False)

In [57]:
#데이터로더 만들기
train_loader = DataLoader(dataset=train_dataset, batch_size=128, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=128, shuffle=False)

## Fully Connected Model

In [58]:
# Model definition
model1 = nn.Sequential(
    nn.Linear(in_features=X_train.shape[1], out_features=256), 
    nn.ReLU(),
    nn.Linear(in_features=256, out_features=2)  # Assuming binary classification
    #nn.Softmax() #꼭 해줘야함. 확률값으로 바꿔주는 것
)

In [59]:
# 만든 모델을 device(cuda)로 전달하자
#device는 위에 cuda, cpu 부분 참고
#연산을 GPU의 VRAM으로 모델을 보내줌 데이터는 아직 안보냄 후에 배치로 데이터를 보낼 것이다.
model1.to(device)

Sequential(
  (0): Linear(in_features=47, out_features=256, bias=True)
  (1): ReLU()
  (2): Linear(in_features=256, out_features=2, bias=True)
)

### 실제 학습이 일어나는 train() 함수

In [60]:
from torch import optim
import time

In [61]:
def train(model, train_loader):
    epochs = 10
    optimizer = optim.Adam(model.parameters(), lr=0.0001)
    criterion = nn.CrossEntropyLoss()
    
    start_time = time.time()
    for epoch in range(epochs):
        model.train()
        print(f'epochs:{epoch + 1} / {epochs}')
        
        for x_t, y_t in train_loader:
            x_t = x_t.to(device)
            y_t = y_t.to(device)
            
            pred = model(x_t)
            loss = criterion(pred, y_t)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        model.eval()
        correct = 0
        # Evaluate model on test_loader
        for xx, yy in test_loader:  
            xx = xx.to(device)
            yy = yy.to(device)
            pred = model(xx)
            _, predicted = torch.max(pred, 1)
            correct += predicted.eq(yy.data).sum()

        print(f'test_accuracy: {(100. * correct / len(test_loader.dataset)).item()}')

    torch.cuda.synchronize()
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f'Total training time: {elapsed_time // 60} minutes {elapsed_time % 60} seconds.')

In [62]:
train(model1, train_loader)

epochs:1 / 10
test_accuracy: 81.54776763916016
epochs:2 / 10
test_accuracy: 93.5571517944336
epochs:3 / 10
test_accuracy: 94.28536224365234
epochs:4 / 10
test_accuracy: 49.605037689208984
epochs:5 / 10
test_accuracy: 94.13725280761719
epochs:6 / 10
test_accuracy: 88.75586700439453
epochs:7 / 10
test_accuracy: 93.60652160644531
epochs:8 / 10
test_accuracy: 93.69291687011719
epochs:9 / 10
test_accuracy: 94.27302551269531
epochs:10 / 10
test_accuracy: 94.17427825927734
Total training time: 0.0 minutes 7.249930381774902 seconds.


## Submission 파일 만들기

In [63]:
import numpy as np

In [64]:
model1.eval()

all_predictions = []

#예측 수행
with torch.no_grad():
    for xx in final_loader:
        xx = xx[0].to(device)
        pred = model1(xx)
        _, predicted = torch.max(pred, 1)
        all_predictions.extend(predicted.cpu().numpy())

In [65]:
print(all_predictions)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [66]:
submission = pd.read_csv("./data/submission.csv")

#예측값을 데이터프레임에 넣어줌
submission['target'] = np.where(np.array(all_predictions) == 0, "Normal", "AbNormal")

submission.to_csv("submission_dl2.csv", index=False)

In [67]:
#submission의 target열의 AbNormal의 개수를 세어줌
submission['target'].value_counts()


target
AbNormal    13110
Normal       4251
Name: count, dtype: int64