In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn.init as init

def set_seed(seed=777):
    """Seed every random number generator this notebook can draw from.

    Seeds Python's ``random`` module, NumPy's legacy global RNG and PyTorch
    (CPU generator and all CUDA devices), then switches cuDNN to
    deterministic algorithm selection with autotuning disabled.

    Args:
        seed: Integer seed applied to every generator. Defaults to 777.
    """
    # Function-local import: the visible import cell does not bring in
    # ``random``, and this keeps the helper self-contained.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Trade cuDNN speed for run-to-run reproducibility.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Fix all random seeds up front so later cells start from a known RNG state.
set_seed(777)


#####################
# YOU MUST WRITE YOUR STUDENT ID IN THE VARIABLE STUDENT_ID
# EXAMPLE: STUDENT_ID = "12345678"
#####################
# Used by save_data as the prefix of the answer CSV file names.
STUDENT_ID = "20251189"


In [2]:
def save_data(df1, df2):
    """Write the two answer frames to their submission CSV files.

    The file names must keep the pattern
    "{STUDENT_ID}_simple_seq.p#.answer.csv"; they are built from the
    module-level STUDENT_ID and written relative to the working directory
    with ``to_csv`` defaults.

    Args:
        df1: Frame saved as the ``p1`` answer file.
        df2: Frame saved as the ``p2`` answer file.
    """
    targets = (
        (df1, f'{STUDENT_ID}_simple_seq.p1.answer.csv'),
        (df2, f'{STUDENT_ID}_simple_seq.p2.answer.csv'),
    )
    for frame, path in targets:
        frame.to_csv(path)

In [2]:
class CustomLinear(torch.nn.Module):
    """Fully connected layer computing ``x @ W + b`` with hand-managed parameters.

    ``W`` has shape ``(in_features, out_features)`` and is Xavier-uniform
    initialised; ``b`` has shape ``(out_features,)`` and starts at zero.

    Args:
        in_features: Size of the last dimension of the input.
        out_features: Size of the last dimension of the output.
    """

    def __init__(self, in_features, out_features):
        super().__init__()
        self.W = torch.nn.Parameter(torch.empty(in_features, out_features, dtype=torch.float32))
        self.b = torch.nn.Parameter(torch.empty(out_features, dtype=torch.float32))

        # Xavier (Glorot) uniform initialisation for the weights; zero bias.
        init.xavier_uniform_(self.W)
        init.zeros_(self.b)

    def forward(self, x):
        """Apply the affine map to ``x``.

        Args:
            x: Tensor whose last dimension equals ``in_features``. Any number
                of leading batch dimensions is accepted.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of ``out_features``.
        """
        # torch.matmul agrees with Tensor.mm for 2-D input but also handles
        # extra leading dimensions, which Tensor.mm rejects.
        return torch.matmul(x, self.W) + self.b

class CustomModel(torch.nn.Module):
    """Three-layer MLP classifier: Linear -> ReLU -> Linear -> ReLU -> Linear.

    The defaults reproduce the original fixed architecture
    (``input_dim -> 1000 -> 100 -> 19``).

    Args:
        input_dim: Number of input features per sample.
        hidden1: Width of the first hidden layer. Defaults to 1000.
        hidden2: Width of the second hidden layer. Defaults to 100.
        num_classes: Number of output logits. Defaults to 19.
    """

    def __init__(self, input_dim, hidden1=1000, hidden2=100, num_classes=19):
        super().__init__()
        # Attribute names and creation order are kept so parameter ordering
        # and RNG consumption during initialisation are unchanged.
        self.layer1 = CustomLinear(input_dim, hidden1)
        self.layer2 = CustomLinear(hidden1, hidden2)
        self.layer3 = CustomLinear(hidden2, num_classes)

    def forward(self, x):
        """Return unnormalised class logits for a batch of inputs.

        Args:
            x: Input batch whose last dimension equals ``input_dim``.

        Returns:
            Logits with a last dimension of ``num_classes``; no softmax is
            applied here.
        """
        # Call the sub-modules instead of their .forward() so that
        # Module.__call__ machinery (e.g. registered hooks) is not bypassed.
        x = torch.relu(self.layer1(x))
        x = torch.relu(self.layer2(x))
        return self.layer3(x)


In [3]:
# Column labels "1".."21". Because names= is passed without header=, pandas
# reads the first line of the file as data rather than as a header.
# NOTE(review): rows with fewer than 21 fields presumably come back padded with
# NaN; the dropna() calls in the helpers below rely on that - confirm against the CSV.
column_names = [f'{i}' for i in range(1, 22)]
train = pd.read_csv('./dataset/simple_seq.train.csv', names=column_names)

def extract_last_value(row):
    """Return the final non-missing entry of ``row``.

    Args:
        row: A pandas Series, possibly containing NaN entries.

    Returns:
        The last value left after dropping NaNs.

    Raises:
        IndexError: If the row has no non-missing values.
    """
    return row.dropna().tolist().pop()

# One label per training row: the last non-missing cell of that row.
true_labels = train.apply(extract_last_value, axis=1) 

def remove_last_value(row):
    """Return the non-missing entries of ``row`` without the final one.

    Args:
        row: A pandas Series, possibly containing NaN entries.

    Returns:
        A new Series (fresh 0-based index) holding every non-missing value
        except the last.

    Raises:
        IndexError: If the row has no non-missing values.
    """
    kept = row.dropna().tolist()
    del kept[-1]  # drop the trailing entry
    return pd.Series(kept)

# Remove the label from every row, then mark the remaining empty cells with "PAD".
train = train.apply(remove_last_value, axis=1).fillna("PAD")

# Wrap the label Series in a single-column frame.
true_labels_df = pd.DataFrame(true_labels, columns=["true_label"])

# Quick previews followed by the row counts of both frames.
print(train.head(7), true_labels_df.head(), sep="\n")
print(len(train), len(true_labels_df), sep="\n")


     0     1     2     3     4    5    6     7     8     9    10    11    12  \
0   W25   W26   W27   W19   W28  W29  W30   W31   W32   W33  W34   W35   W36   
1   W41    W4   W42   W43   W44  W45  W46   W47   W48   W49  W50   W51   W52   
2   W55   W19   W46   W32   W32  W56  W57   W58   W59   W19  W13   W60   W19   
3   W13   W83   W32   W32   W56  W57  W13   W84   W19   W28  W85   W86   W24   
4   W87   W88   W89   W90   W32  W91  W13   W92   W93   W90  W94   W95   W24   
5   W13   W52   W32   W53   W17  W13  W96   W97   W10    W2  W98   W99   W19   
6  W122  W123  W110  W124  W125  W19  W13  W126  W127  W128  W32  W129  W130   

    13    14    15    16   17    18   19  
0  W37   W38   W39   W24  W40   PAD  PAD  
1  W53   W17   W54   W24  PAD   PAD  PAD  
2  W13   W61   W62   PAD  PAD   PAD  PAD  
3  PAD   PAD   PAD   PAD  PAD   PAD  PAD  
4  PAD   PAD   PAD   PAD  PAD   PAD  PAD  
5  W13  W100   W24   PAD  PAD   PAD  PAD  
6  W36   W13  W131  W132  W17  W133  W24  
  true_label
0 

In [4]:
# Load the test split with the same 21 column labels used for the training file.
column = [f'{i}' for i in range(1, 22)]
# Drop column '21' BEFORE padding and fill out of place: the recorded run of
# this cell emitted a pandas warning pointing at the in-place fillna.
# NOTE(review): column '21' is presumably all-NaN for the test split (no label
# column), so filling it in place with the string "PAD" forces a float->object
# upcast - confirm against the CSV. The resulting frame is the same as before.
test = (
    pd.read_csv('./dataset/simple_seq.test.csv', header=None, names=column)
    .drop(columns=['21'])
    .fillna("PAD")
)
print(test.head(7))

       1     2      3     4      5      6     7      8      9     10     11  \
0    W13   W81    W19  W346   W846  W1582   W70    W28  W5433    W19  W1163   
1  W5413  W111  W5414   W32    W68  W5415   W12  W2402    W19  W5438  W5439   
2  W5413  W111  W5414   W32    W68  W5415   W12   W417   W346   W336    W17   
3  W5413  W111  W5414   W32    W68  W5415   W12   W346    W32  W2833    W93   
4  W5413  W111  W5414   W32    W68  W5415   W12   W111   W346    W47   W336   
5  W5413  W111  W5414   W32    W68  W5415   W12   W346   W168  W2464  W5448   
6    W87   W31    W47   W38  W1196    W97  W627  W5449    PAD    PAD    PAD   

      12     13     14     15     16     17    18     19   20  
0  W2261    W24    PAD    PAD    PAD    PAD   PAD    PAD  PAD  
1  W5440    W12   W346   W240  W5441  W5442   W24    PAD  PAD  
2    W28  W5443    W12   W122    W47    W38  W335  W1248  W24  
3    W28  W5444  W5445    W17   W346  W5446   W24    PAD  PAD  
4   W286  W5415   W552  W5447   W641   W346   W

  test.fillna("PAD", inplace=True)


In [5]:
# Word vocabulary: every distinct cell value of the training frame plus an
# "UNK" fallback entry, indexed by sorted position.
unique_words = set(train.values.flatten()) | {"UNK"}
vocab = sorted(unique_words)
word_to_index = dict(zip(vocab, range(len(vocab))))
print(len(vocab), vocab[:10], sep="\n")

# Label vocabulary, built the same way from the label column.
unique_labels = set(true_labels_df['true_label'].values)
label_vocab = sorted(unique_labels)
label_to_index = dict(zip(label_vocab, range(len(label_vocab))))
print(len(label_vocab), label_vocab[:19], sep="\n")

2548
['PAD', 'UNK', 'W1', 'W10', 'W100', 'W1003', 'W1004', 'W1008', 'W1009', 'W1010']
19
['D1', 'D11', 'D12', 'D13', 'D15', 'D16', 'D17', 'D18', 'D19', 'D20', 'D21', 'D27', 'D28', 'D3', 'D32', 'D4', 'D5', 'D6', 'D7']


In [6]:
# def one_hot_encode(row, vocab_size, word_to_index):
#     one_hot_matrix = np.zeros((len(row), vocab_size))
#     for i, word in enumerate(row):
#         word = word if word in word_to_index else "UNK"
#         if word != "PAD":
#             one_hot_matrix[i, word_to_index[word]] = 1
#     return one_hot_matrix

def one_hot_encode_label(label, label_to_index):
    """Return a ``(1, num_labels)`` one-hot row vector for ``label``.

    The vector width is taken from ``label_to_index`` itself, so the function
    no longer depends on a module-level ``label_vocab`` being defined.

    Args:
        label: Label to encode; must be a key of ``label_to_index``.
        label_to_index: Mapping from label to its column position.

    Returns:
        A float NumPy array of shape ``(1, len(label_to_index))`` with a
        single 1 at the label's position.

    Raises:
        KeyError: If ``label`` is not in ``label_to_index``.
    """
    one_hot_matrix = np.zeros((1, len(label_to_index)))
    one_hot_matrix[0, label_to_index[label]] = 1
    return one_hot_matrix

In [7]:
def one_hot_encode(row, vocab_size, word_to_index, max_seq_length=20):
    """Map the first ``max_seq_length`` tokens of ``row`` to vocabulary indices.

    Despite the name, no one-hot vectors are built: the result is a plain
    list of integer indices. Tokens missing from ``word_to_index`` are mapped
    to the "UNK" entry; "PAD" tokens are encoded like any other token.

    Args:
        row: Sequence of tokens.
        vocab_size: Unused; kept for signature compatibility.
        word_to_index: Mapping from token to integer index (needs "UNK" if
            unknown tokens can occur).
        max_seq_length: Maximum number of leading tokens to encode.

    Returns:
        List of integer indices, one per encoded token.
    """
    return [
        word_to_index[token if token in word_to_index else "UNK"]
        for token in row[:max_seq_length]
    ]

In [15]:
# Encode every training row with whichever one_hot_encode definition is
# currently bound (this name is defined in more than one cell), and every
# label as a (1, num_labels) one-hot row.
X_train_encoded = [one_hot_encode(row, len(vocab), word_to_index) for row in train.values]
Y_train_encoded = np.array([one_hot_encode_label(label, label_to_index) for label in true_labels_df["true_label"]])

# NOTE(review): one_hot_encode returns vocabulary indices, so X_train holds
# raw word ids cast to float32, not one-hot vectors. The MLP therefore treats
# ids as numeric magnitudes; the very large first-epoch loss in the recorded
# output is consistent with that. Consider a bag-of-words / embedding input.
X_train = torch.tensor(X_train_encoded, dtype=torch.float32)
# argmax over each one-hot row recovers the integer class index that
# CrossEntropyLoss expects as its target.
y_train = torch.tensor(np.array([np.argmax(one_hot) for one_hot in Y_train_encoded]), dtype=torch.long)

# Feature width = number of encoded positions per row.
input_dim = X_train.shape[1]

model = CustomModel(input_dim)
optimizer = torch.optim.SGD(model.parameters(), lr=5e-4)
criterion = torch.nn.CrossEntropyLoss()

batch_size = 32
dataset = torch.utils.data.TensorDataset(X_train, y_train)
# Dedicated, seeded generator for the loader, passed in together with shuffle=True.
generator = torch.Generator().manual_seed(777)
# NOTE(review): num_workers is left at its default, so worker_init_fn is
# presumably never invoked here - confirm against the DataLoader docs.
dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, worker_init_fn=lambda _: np.random.seed(777), generator=generator)

epochs = 500
for epoch in range(epochs):
    # Sum of per-batch losses for this epoch.
    total_loss = 0.0

    for batch_X, batch_y in dataloader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        
    # Report the mean batch loss (sum divided by the number of batches).
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(dataloader)}")

Epoch 1, Loss: 226.36210966932362
Epoch 2, Loss: 2.907420857199307
Epoch 3, Loss: 2.9153895213686187
Epoch 4, Loss: 2.851698875427246
Epoch 5, Loss: 2.847069559426143
Epoch 6, Loss: 2.823448131824362
Epoch 7, Loss: 2.9939020502156226
Epoch 8, Loss: 2.850068947364544
Epoch 9, Loss: 2.8209226789145636
Epoch 10, Loss: 2.82082357899896
Epoch 11, Loss: 2.812918243737056
Epoch 12, Loss: 2.8442505556961586
Epoch 13, Loss: 2.812787754782315
Epoch 14, Loss: 2.805862911816301
Epoch 15, Loss: 2.798146996004828
Epoch 16, Loss: 2.794185317795852
Epoch 17, Loss: 2.783752260536983
Epoch 18, Loss: 2.748584434903901
Epoch 19, Loss: 2.747150388257257
Epoch 20, Loss: 2.7277306770456247
Epoch 21, Loss: 2.7071708070820777
Epoch 22, Loss: 2.7031356301800957
Epoch 23, Loss: 2.6981079413973053
Epoch 24, Loss: 2.6828941805609343
Epoch 25, Loss: 2.64678562098536
Epoch 26, Loss: 2.5917820930480957
Epoch 27, Loss: 2.598069585602859
Epoch 28, Loss: 2.616361502943368
Epoch 29, Loss: 2.485735457519005
Epoch 30, Loss

In [16]:
# Model evaluation helper
def evaluate_model(model, X_train, y_train, label_vocab, num_samples=500):
    """Print the accuracy of ``model`` on the given data plus sample predictions.

    The model is switched to evaluation mode (and left in it) and run once on
    the whole input without gradient tracking.

    Args:
        model: Module returning one row of class scores per input row.
        X_train: Input tensor passed to ``model`` in a single call.
        y_train: Targets, either integer class indices or one-hot rows
            (one-hot rows are reduced with argmax).
        label_vocab: Sequence mapping a class index to its label name.
        num_samples: Maximum number of per-sample lines to print. Clamped to
            the number of available samples so small datasets no longer
            raise IndexError. Defaults to 500.
    """
    model.eval()  # evaluation mode
    with torch.no_grad():
        outputs = model(X_train)
        predictions = torch.argmax(outputs, dim=1)  # predicted class indices

        # Convert one-hot targets to class indices when necessary.
        if y_train.dim() > 1:
            y_train = torch.argmax(y_train, dim=1)

        correct = (predictions == y_train).sum().item()  # number of hits
        accuracy = correct / len(y_train) * 100  # percentage

    print(f"Train Accuracy: {accuracy:.2f}%")

    # Print some individual predictions.
    print("\n===== Sample Predictions =====")
    for i in range(min(num_samples, len(y_train))):
        true_label = label_vocab[y_train[i].item()]
        pred_label = label_vocab[predictions[i].item()]
        print(f"Sample {i+1}: True Label = {true_label}, Predicted = {pred_label}")

# Run the accuracy evaluation on the training data itself (no held-out split is used here).
evaluate_model(model, X_train, y_train, label_vocab)


Train Accuracy: 76.89%

===== Sample Predictions =====
Sample 1: True Label = D11, Predicted = D16
Sample 2: True Label = D1, Predicted = D20
Sample 3: True Label = D3, Predicted = D3
Sample 4: True Label = D20, Predicted = D12
Sample 5: True Label = D20, Predicted = D12
Sample 6: True Label = D20, Predicted = D12
Sample 7: True Label = D20, Predicted = D20
Sample 8: True Label = D1, Predicted = D20
Sample 9: True Label = D20, Predicted = D20
Sample 10: True Label = D20, Predicted = D20
Sample 11: True Label = D20, Predicted = D20
Sample 12: True Label = D12, Predicted = D12
Sample 13: True Label = D20, Predicted = D12
Sample 14: True Label = D15, Predicted = D15
Sample 15: True Label = D15, Predicted = D20
Sample 16: True Label = D4, Predicted = D16
Sample 17: True Label = D20, Predicted = D20
Sample 18: True Label = D20, Predicted = D20
Sample 19: True Label = D5, Predicted = D16
Sample 20: True Label = D16, Predicted = D16
Sample 21: True Label = D3, Predicted = D3
Sample 22: True L

In [19]:
# Sanity check: show which class index each label string was assigned.
print("Label to index mapping:", label_to_index)


Label to index mapping: {'D1': 0, 'D11': 1, 'D12': 2, 'D13': 3, 'D15': 4, 'D16': 5, 'D17': 6, 'D18': 7, 'D19': 8, 'D20': 9, 'D21': 10, 'D27': 11, 'D28': 12, 'D3': 13, 'D32': 14, 'D4': 15, 'D5': 16, 'D6': 17, 'D7': 18}


In [20]:
# true_labels_df에서 변환된 Y_train_encoded 확인
for label in list(true_labels_df["true_label"])[:10]:  # 10개 샘플 확인
    one_hot_label = one_hot_encode_label(label, label_to_index)
    print(f"Label: {label}, One-Hot: {one_hot_label}")


Label: D11, One-Hot: [[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
Label: D1, One-Hot: [[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
Label: D3, One-Hot: [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]]
Label: D20, One-Hot: [[0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
Label: D20, One-Hot: [[0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
Label: D20, One-Hot: [[0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
Label: D20, One-Hot: [[0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
Label: D1, One-Hot: [[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
Label: D20, One-Hot: [[0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
Label: D20, One-Hot: [[0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


In [40]:
print(true_labels_df.head(20))  # show the first 20 true labels
print(true_labels_df['true_label'].value_counts())  # show how many rows each label has


   true_label
0         D11
1          D1
2          D3
3         D20
4         D20
5         D20
6         D20
7          D1
8         D20
9         D20
10        D20
11        D12
12        D20
13        D15
14        D15
15         D4
16        D20
17        D20
18         D5
19        D16
true_label
D20    271
D12    200
D15    143
D1      81
D16     61
D3      49
D28     30
D4      18
D17     10
D5       8
D11      7
D27      6
D19      5
D6       3
D13      3
D18      2
D21      1
D7       1
D32      1
Name: count, dtype: int64


In [13]:
def one_hot_encode(row, vocab_size, word_to_index, max_seq_length=20):
    """Map the first ``max_seq_length`` tokens of ``row`` to indices, skipping "PAD".

    Token order is preserved. Despite the name, the result is a list of
    integer vocabulary indices, not one-hot vectors. Tokens missing from
    ``word_to_index`` are mapped to the "UNK" entry.

    NOTE(review): this redefines ``one_hot_encode`` from an earlier cell, which
    keeps "PAD" tokens; after this cell runs, any re-run of code calling
    ``one_hot_encode`` gets variable-length output instead.

    Args:
        row: Sequence of tokens.
        vocab_size: Unused; kept for signature compatibility.
        word_to_index: Mapping from token to integer index.
        max_seq_length: Maximum number of leading tokens to consider.

    Returns:
        List of integer indices for the non-"PAD" tokens.
    """
    resolved = (
        token if token in word_to_index else "UNK"
        for token in row[:max_seq_length]
    )
    return [word_to_index[token] for token in resolved if token != "PAD"]

def decode_one_hot(encoded_indices, word_to_index):
    """Turn rows of vocabulary indices back into rows of tokens.

    Args:
        encoded_indices: Iterable of index sequences, one per sentence.
        word_to_index: Mapping from token to integer index; it is inverted
            here to look tokens up by index.

    Returns:
        List of token lists, in input order. Indices that do not appear in
        ``word_to_index`` are silently dropped.
    """
    lookup = dict(zip(word_to_index.values(), word_to_index.keys()))
    decoded = []
    for indices in encoded_indices:
        decoded.append([lookup[idx] for idx in indices if idx in lookup])
    return decoded


# Test sentence (chosen arbitrarily) for an encode -> decode round trip.
test_sentence = ["W42", "W17", "W31", "W10", "W100"]
test_encoded = one_hot_encode(test_sentence, len(vocab), word_to_index)

# The printed heading says "One-Hot Vector", but the value is a list of vocabulary indices.
print("\n===== Encoded One-Hot Vector =====")
print(test_encoded)
# decode_one_hot expects a list of rows, hence the extra list around test_encoded.
test_decoded = decode_one_hot([test_encoded], word_to_index)

print("\n===== Decoded Words =====")
print(test_decoded)




===== Encoded One-Hot Vector =====
[1662, 397, 1044, 3, 4]

===== Decoded Words =====
[['W42', 'W17', 'W31', 'W10', 'W100']]
