In [2]:
import numpy as np
from collections import defaultdict
import pandas as pd

In [10]:
data = pd.read_csv("data.csv")

texts = data["Message"].tolist()
labels = data["Category"].tolist()
print(texts)
print(labels)

[nan, "Box this lap, we're switching to Plan B.", 'Box now, box now for softs.', 'Copy that, coming in at the end of this lap.', 'Stay out, stay out, abort pit stop.', 'Pit confirm, pit confirm.', 'Box opposite to what Leclerc does this lap.', "Let's undercut Hamilton, box now, box now.", 'Safety car deployed, box now for softs.', 'Box this lap, box box box.', 'Box now, we need to react to Verstappen.', 'Box now for intermediates, rain is getting heavier.', "We'll pit in two laps, prepare for a front wing change.", 'Virtual safety car, box now, box now.', 'The undercut is powerful here, box this lap.', 'Staying out, staying out, this tire has more life.', 'Pit next lap for the prime tire, acknowledge.', "Box when you feel it's right, your call on slicks.", "Box this lap, we need to cover Alonso's stop.", 'Pit now for mediums, we have a gap in traffic.', 'Box box box, slow puncture suspected.', 'Rear tire pressures dropping, keep an eye on it.', 'Engine temperatures rising, we need to m

In [9]:
tokenized = [text.lower().split() for text in texts]
word2idx = {}
idx2word = {}
idx = 0
for sentence in tokenized:
    for word in sentence:
        if word not in word2idx:
            word2idx[word] = idx
            idx2word[idx] = word
            idx += 1
vocab_size = len(word2idx)

AttributeError: 'float' object has no attribute 'lower'

In [6]:
label2idx = {
    "진입 명령": 0,
    "차량 상태": 1,
    "불만/요청": 2,
    "전략/전술": 3,
    "격려": 4
}
idx2label = {v: k for k, v in label2idx.items()}

In [7]:
X_data = []
y_data = []

for sentence, label in zip(tokenized, labels):
    context_vec = np.zeros(vocab_size)
    for word in sentence:
        context_vec[word2idx[word]] += 1
    X_data.append(context_vec / len(sentence))
    y_data.append(label)

X = np.array(X_data)
y = [label2idx[label] for label in labels]
num_classes = len(set(y))

In [8]:
embedding_dim = 16
W_embed = np.random.randn(vocab_size, embedding_dim) * 0.01  # 단어 임베딩
W_out = np.random.randn(embedding_dim, num_classes) * 0.01   # 출력 가중치
b_out = np.zeros((1, num_classes))
learning_rate = 0.1

In [14]:
def softmax(z):
    exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))
    return exp_z / np.sum(exp_z, axis=1, keepdims=True)

def cross_entropy(probs, y_true):
    batch_size = probs.shape[0]
    log_probs = np.log(probs + 1e-9)
    loss = -log_probs[range(batch_size), y_true].mean()
    return loss

In [16]:
def train_test_split(X, y, test_size=0.2, seed=42):
    np.random.seed(seed)
    indices = np.arange(len(X))
    np.random.shuffle(indices)
    split = int(len(X) * (1 - test_size))
    train_idx, test_idx = indices[:split], indices[split:]
    return (
        np.array([X[i] for i in train_idx]),  # Convert to NumPy array
        np.array([X[i] for i in test_idx]),  # Convert to NumPy array
        [y[i] for i in train_idx],
        [y[i] for i in test_idx]
    )

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y)
epochs = 100

for epoch in range(epochs):
    # 1) 임베딩 평균화
    X_embed = np.dot(X_train, W_embed)  # shape: (batch_size, embedding_dim)

    # 2) 출력 계산
    logits = np.dot(X_embed, W_out) + b_out
    probs = softmax(logits)

    # 3) 손실
    loss = cross_entropy(probs, y_train)

    # 4) 역전파
    N = X_train.shape[0]
    one_hot = np.zeros_like(probs)
    one_hot[np.arange(N), y_train] = 1
    dL_dz = (probs - one_hot) / N

    dW_out = np.dot(X_embed.T, dL_dz)
    db_out = np.sum(dL_dz, axis=0, keepdims=True)

    dX_embed = np.dot(dL_dz, W_out.T)  # shape: (N, embedding_dim)
    dW_embed = np.dot(X_train.T, dX_embed)  # shape: (vocab_size, embedding_dim)

    # 5) 파라미터 업데이트
    W_out -= learning_rate * dW_out
    b_out -= learning_rate * db_out
    W_embed -= learning_rate * dW_embed

    if epoch % 10 == 0:
        print(f"Epoch {epoch} | Loss: {loss:.4f}")

Epoch 0 | Loss: 1.5590
Epoch 10 | Loss: 1.5581
Epoch 20 | Loss: 1.5571
Epoch 30 | Loss: 1.5561
Epoch 40 | Loss: 1.5550
Epoch 50 | Loss: 1.5537
Epoch 60 | Loss: 1.5521
Epoch 70 | Loss: 1.5502
Epoch 80 | Loss: 1.5478
Epoch 90 | Loss: 1.5448


In [26]:
test_logits = np.dot(X_test, W_embed) @ W_out + b_out  # Forward pass
test_probs = softmax(test_logits)  # Softmax to get probabilities
preds = np.argmax(test_probs, axis=1)  # Predicted class
acc = np.mean(preds == y_test)  # Accuracy calculation

print("\n테스트 정확도:", acc)


테스트 정확도: 1.0


In [23]:
# new data 예측
new_texts = [
    "Push harder now", 
    "Engine is failing"
]

# 1. 分词并向量化
new_tokenized = [text.lower().split() for text in new_texts]
new_X_data = []

for sentence in new_tokenized:
    context_vec = np.zeros(vocab_size)
    for word in sentence:
        if word in word2idx:  # if in vocabulary
            context_vec[word2idx[word]] += 1
    new_X_data.append(context_vec / len(sentence))

new_X = np.array(new_X_data)

new_X_embed = np.dot(new_X, W_embed)

new_logits = np.dot(new_X_embed, W_out) + b_out
new_probs = softmax(new_logits)

new_y_pred = np.argmax(new_probs, axis=1)

new_y_pred_labels = [idx2label[idx] for idx in new_y_pred]

for text, label in zip(new_texts, new_y_pred_labels):
    print(f"Text: \"{text}\" | Predicted Label: \"{label}\"")

Text: "Push harder now" | Predicted Label: "전략/전술"
Text: "Engine is failing" | Predicted Label: "차량 상태"


In [24]:
import pickle

model_data = {
    "W_embed": W_embed,
    "W_out": W_out,
    "b_out": b_out,
    "word2idx": word2idx,
    "idx2label": idx2label,
    "vocab_size": vocab_size,
    "embedding_dim": embedding_dim,
    "num_classes": num_classes
}

with open("cbow_model.pkl", "wb") as f:
    pickle.dump(model_data, f)

print("Model saved successfully!")

Model saved successfully!
