In [41]:
import torch
import torch.nn as nn

In [42]:
# !conda install anaconda::pandas -y
# !conda install anaconda::seaborn -y

In [43]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
dataset = pd.read_csv("data/car_evaluation.csv")
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   price         1728 non-null   object
 1   maint         1728 non-null   object
 2   doors         1728 non-null   object
 3   persons       1728 non-null   object
 4   lug_capacity  1728 non-null   object
 5   safety        1728 non-null   object
 6   output        1728 non-null   object
dtypes: object(7)
memory usage: 94.6+ KB


In [9]:
categorical_columns = ['price','maint','doors','persons','lug_capacity','safety']
for category in categorical_columns:
    dataset[category] = dataset[category].astype('category') # category로 type 변경

In [12]:
# 범주형데이터 -> dataset[category] -> 넘파이배열 -> 텐서
price = dataset['price'].cat.codes.values
maint = dataset['maint'].cat.codes.values
doors = dataset['doors'].cat.codes.values
persons = dataset['persons'].cat.codes.values
lug_capacity = dataset['lug_capacity'].cat.codes.values
safety = dataset['safety'].cat.codes.values

In [13]:
categorical_data = np.stack([price,maint,doors,persons,lug_capacity,safety],1) # 축이 1인 넘파이객체를 만듦
categorical_data[:10]

array([[3, 3, 0, 0, 2, 1],
       [3, 3, 0, 0, 2, 2],
       [3, 3, 0, 0, 2, 0],
       [3, 3, 0, 0, 1, 1],
       [3, 3, 0, 0, 1, 2],
       [3, 3, 0, 0, 1, 0],
       [3, 3, 0, 0, 0, 1],
       [3, 3, 0, 0, 0, 2],
       [3, 3, 0, 0, 0, 0],
       [3, 3, 0, 1, 2, 1]], dtype=int8)

In [15]:
outputs = pd.get_dummies(dataset["output"]).values
outputs

array([[False, False,  True, False],
       [False, False,  True, False],
       [False, False,  True, False],
       ...,
       [False, False,  True, False],
       [False,  True, False, False],
       [False, False, False,  True]])

In [18]:
categorical_data = torch.tensor(categorical_data, dtype=torch.int64)
outputs = torch.tensor(outputs).flatten() # 원핫인코딩(자기해당되는 칸 빼고는 0으로 나오는 행렬 -> flatten으로 하나짜리로 만들어줘야함)

In [21]:
print(categorical_data.size(), outputs.size())

torch.Size([1728, 6]) torch.Size([6912])


In [None]:
# data와 output의 size가 달라서 

In [25]:
categorical_column_sizes =  [len(dataset[column].cat.categories) for column in categorical_columns]
categorical_embedding_sizes = [(col_size, min(50, (col_size)+1)//2) for col_size in categorical_column_sizes]
categorical_embedding_sizes

In [26]:
total_records = 1728
test_records = int(total_records*0.2)

In [30]:
categorical_train_data = categorical_data[:total_records-test_records]
categorical_test_data = categorical_data[total_records-test_records:total_records]
train_outputs = outputs[:total_records-test_records]
test_outputs = outputs[total_records-test_records:total_records]
print(len(categorical_train_data), len(train_outputs), len(categorical_test_data), len(test_outputs))

1383 1383 345 345


In [44]:
class Model(nn.Module):

    # 생성자
    def __init__(self, embedding_size, output_size, layers, p=0.4):
        super().__init__()
        self.all_embeddings = nn.ModuleList([nn.Embedding(ni,nf) for ni,
                                            nf in embedding_size])
        self.embedding_dropout = nn.Dropout(p) # 일정량 수준으로 데이터를 끊어주면 결과가 잘나옴
 
        all_layers = []
        num_categorical_cols = sum((nf for ni, nf in embedding_size))
        input_size = num_categorical_cols # 입력층의 크기를 찾으려고 범주형 칼럼 개수 저장 (p=0.4)

        for i in layers:
            all_layers.append(nn.Linear(input_size, i))
            all_layers.append(nn.ReLU(inplace = True))
            all_layers.append(nn.BatchNorm1d(i))
            all_layers.append(nn.Dropout(p))
            input_size = 1

        all_layers.append(nn.Linear(layers[-1], output_size))
        self.layers = nn.Sequential(*all_layers) # 순차적으로 실행되도록 모든 계층 목록을 nn.Sequential로 전달

    # forward 전방연산자
    def forward(self, x_categorical):
        embeddings = []
        for i,e in enumerate(self.all_embeddings):
            embeddings.append(e(x_categorical[:,i]))
            x = torch.cat(embeddings, 1)
            x = self.embedding_dropout(x)
            x = self.layers(x)
            return x

In [45]:
model = Model(categorical_embedding_sizes, 4, [200,100,50], p=0.4) # [200,100,50] 중간과정에서 몇개를 받을지
print(model)
# ReLU : 가중치를 줘서 버릴거버림
# Dropout : 기울기를 0으로 만들어서 0.4 버림
# 200 -> 100 -> 50 -> 4

Model(
  (all_embeddings): ModuleList(
    (0-2): 3 x Embedding(4, 2)
    (3-5): 3 x Embedding(3, 2)
  )
  (embedding_dropout): Dropout(p=0.4, inplace=False)
  (layers): Sequential(
    (0): Linear(in_features=12, out_features=200, bias=True)
    (1): ReLU(inplace=True)
    (2): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.4, inplace=False)
    (4): Linear(in_features=1, out_features=100, bias=True)
    (5): ReLU(inplace=True)
    (6): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Dropout(p=0.4, inplace=False)
    (8): Linear(in_features=1, out_features=50, bias=True)
    (9): ReLU(inplace=True)
    (10): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (11): Dropout(p=0.4, inplace=False)
    (12): Linear(in_features=50, out_features=4, bias=True)
  )
)


In [46]:
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001) # loss_function을 가능한 작게 하는데 도움을 줌

In [47]:
epochs = 500
aggregated_losses = []
train_outputs = train_outputs.to(device="cpu", dtype=torch.int64)
for i in range(epochs):
    i += 1
    y_pred = model(categorical_train_data)
    single_loss = loss_function(y_pred, train_outputs)
    aggregated_losses.append(single_loss)

    if i % 25 == 1:
        print(f"epoch: {i:3} loss: {single_loss.item():10.8f}")

    optimizer.zero_grad()
    single_loss.backward() # optimizer값 초기화
    optimizer.step()

print(f"epoch: {i:3} loss: {single_loss.item():10.10f}")

RuntimeError: mat1 and mat2 shapes cannot be multiplied (1383x2 and 12x200)