In [5]:
import numpy as np
from collections import defaultdict
from itertools import islice
import re

In [2]:
# 1. Dataset (small illustrative example)
# Every utterance is written next to its label so the pairing is easy to audit.
# Label glossary: "진입 명령" = entry command, "차량 상태" = car status,
# "불만/요청" = complaint/request, "전략/전술" = strategy/tactics,
# "격려" = encouragement.
labeled_samples = [
    ("Box box box", "진입 명령"),
    ("My tyres are gone", "차량 상태"),
    ("Tell him to get out", "불만/요청"),
    ("We need to push", "전략/전술"),
    ("Engine is overheating", "차량 상태"),
    ("Good job keep pushing", "격려"),
    ("I'm losing power", "차량 상태"),
    ("Switching to plan B", "전략/전술"),
    ("Push now push now", "전략/전술"),
    ("Let me race please", "불만/요청"),
]
# Parallel lists: labels[i] is the label of texts[i].
texts = [utterance for utterance, _ in labeled_samples]
labels = [label for _, label in labeled_samples]

In [6]:
def generate_ngrams(text, n=2):
    """Return the word n-grams of ``text`` in order of appearance.

    The text is lower-cased and tokenized into maximal runs of word
    characters (``\\w``). Punctuation therefore acts as a separator, so an
    apostrophe splits a word: "I'm" becomes the two tokens "i" and "m".

    Args:
        text: Input string.
        n: Number of consecutive tokens per n-gram; must be at least 1.

    Returns:
        A list of n-grams, each one the ``n`` tokens joined by a single
        space. Duplicates are kept. The list is empty when the text has
        fewer than ``n`` tokens.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    # Without this guard n=0 yields len(tokens)+1 empty strings and a
    # negative n yields arbitrary slices, both of which would silently end
    # up as bogus features downstream.
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")
    tokens = re.findall(r'\b\w+\b', text.lower())
    # One window per start position; the range is empty when len(tokens) < n.
    return [' '.join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

In [7]:
def build_vocab(texts, ngram_range=(1, 2)):
    """Map every distinct n-gram found in ``texts`` to an integer index.

    Args:
        texts: Iterable of strings to scan.
        ngram_range: ``(min_n, max_n)`` inclusive range of n-gram sizes,
            each of which is extracted with ``generate_ngrams``.

    Returns:
        A plain ``dict`` from n-gram string to index. Indices run from 0 to
        ``len(vocab) - 1`` in order of first appearance (texts in the given
        order; within a text, smaller ``n`` first).
    """
    vocab = {}
    for text in texts:
        for n in range(ngram_range[0], ngram_range[1] + 1):
            for ngram in generate_ngrams(text, n):
                # Only first-seen order matters for the index, so no
                # frequency counting is needed: an unseen n-gram takes the
                # next free index, a known one keeps its existing index.
                vocab.setdefault(ngram, len(vocab))
    return vocab

In [8]:
def vectorize_texts(texts, vocab, ngram_range=(1, 2)):
    """Convert texts into a matrix of n-gram counts.

    Args:
        texts: Iterable of strings to vectorize.
        vocab: Mapping from n-gram string to column index (for example the
            result of ``build_vocab``). Each index must be a valid column,
            i.e. lie in ``0 .. len(vocab) - 1``.
        ngram_range: ``(min_n, max_n)`` inclusive range of n-gram sizes,
            each of which is extracted with ``generate_ngrams``.

    Returns:
        A float ``numpy.ndarray`` of shape ``(number of texts, len(vocab))``
        where entry ``[i, j]`` is how many times the n-gram with index ``j``
        occurs in the i-th text. N-grams missing from ``vocab`` are ignored.
    """
    # Materialize so the row count is known even for a generator.
    texts = list(texts)
    # Preallocate the whole matrix: stacking per-text vectors would collapse
    # to a 1-D array of shape (0,) when there are no texts, whereas this
    # keeps the result 2-D with shape (0, len(vocab)).
    vectors = np.zeros((len(texts), len(vocab)))
    for row, text in enumerate(texts):
        for n in range(ngram_range[0], ngram_range[1] + 1):
            for ngram in generate_ngrams(text, n):
                if ngram in vocab:
                    vectors[row, vocab[ngram]] += 1
    return vectors