In [1]:
import pandas as pd
import re

# TSV 파일 읽기
with open("review.sorted.uniq.preprocessing.tok.bpe.tsv", 'r') as f:
    lines = f.readlines()

# 데이터를 저장할 리스트 초기화
data = []
labels = []

# 각 줄을 처리
for line in lines:
    # 탭(\t)으로 구분
    split_line = line.strip().split('\t')
    
    # 첫 번째 열: 레이블 처리
    if split_line[0].lower() == 'positive':
        labels.append(1)
    elif split_line[0].lower() == 'negative':
        labels.append(0)
    else:
        raise ValueError(f"Unknown label: {split_line[0]}")
    data.append(split_line[1])

In [10]:
## 리뷰 벡터화
from sklearn.feature_extraction.text import CountVectorizer

# CountVectorizer 객체 생성
count_vectorizer = CountVectorizer()

# sentences 데이터에 대한 피처 변환 수행
# sentences는 분석할 텍스트 데이터의 리스트

# neg는 앞에, pos는 뒤에 있기에 이를 합침
# 30만개는 너무 커서 한번에 안담긴다...
data_mini = data[:10000]+data[-10000:]
labels_mini = labels[:10000] +labels[-10000:]
features = count_vectorizer.fit_transform(data_mini)
print(f"document 수: {features.shape[0]}")
print(f"단어수: {features.shape[1]}")

# features 객체를 NumPy 배열로 변환
vectorized_sentences = features.toarray()

document 수: 20000
단어수: 15327


In [11]:
feature_names = count_vectorizer.get_feature_names_out()
# 벡터화된 문장과 피처 이름을 이용해 DataFrame 생성
df = pd.DataFrame(vectorized_sentences, columns=feature_names)

# 데이터프레임의 인덱스 이름 지정
df.index.name = 'sentence'

## 원핫 인코딩

In [9]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# 문장으로 부터 상위 100 개 단어로 vocabulary 작성
tokenizer = Tokenizer(num_words=1000, oov_token='<OOV>')
# sentences에 포함된 문장들을 기반으로 단어의 토큰화를
# 수행하며, 각 단어에 고유한 인덱스를 할당
tokenizer.fit_on_texts(data_mini)

# sentences 데이터를 시퀀스로 변환
sequences = tokenizer.texts_to_sequences(data_mini)

# 시퀀스에 패딩 적용 (문장의 뒤쪽을 패딩하고, 필요시 뒤쪽을 잘라냄)
padded = pad_sequences(sequences, padding='post', truncating='post')

In [104]:
# 패딩된 시퀀스를 원-핫 인코딩으로 변환
ohencode =to_categorical(padded, num_classes=None)

MemoryError: Unable to allocate 224. GiB for an array with shape (30000000, 1000) and data type float64

In [26]:
import reservoirpy as rpy
from reservoirpy.nodes import Reservoir
from reservoirpy.nodes import Reservoir, Ridge, Input
import numpy as np

# 단순한 +이어붙이기는 오류가 발생한다, padded는 2차원이기 때문이다
X_train = np.concatenate([padded[:9000], padded[-9000:]], axis=0)
Y_train = labels_mini[:9000] + labels_mini[-9000:]
X_test = padded[9001 : 10999]
Y_test = labels_mini[9001 : 10999]

source = Input()
reservoir = Reservoir(500, sr=0.9, lr=0.1)
readout = Ridge(ridge=1e-6)

model = source >> reservoir >> readout

In [None]:
import logging

# 로그 레벨을 WARNING 이상으로 설정하여 디버그 메시지를 무시합니다.
logging.basicConfig(level=logging.WARNING)  # ERROR로 설정하면 더 적은 메시지가 출력됩니다.

# 벡터에서 학습
states_train = []
for x in X_train:  # disable=True will hide the progress bar
    states = reservoir.run(x, reset=True)
    states_train.append(states[-1, np.newaxis])

Running Reservoir-0: 100%|██████████| 1/1 [00:00<?, ?it/s]
Running Reservoir-0: 100%|██████████| 1/1 [00:00<?, ?it/s]
Running Reservoir-0: 100%|██████████| 1/1 [00:00<?, ?it/s]
Running Reservoir-0: 100%|██████████| 1/1 [00:00<00:00, 1002.94it/s]
Running Reservoir-0: 100%|██████████| 1/1 [00:00<?, ?it/s]
Running Reservoir-0: 100%|██████████| 1/1 [00:00<?, ?it/s]
Running Reservoir-0: 100%|██████████| 1/1 [00:00<?, ?it/s]
Running Reservoir-0: 100%|██████████| 1/1 [00:00<?, ?it/s]
Running Reservoir-0: 100%|██████████| 1/1 [00:00<?, ?it/s]
Running Reservoir-0: 100%|██████████| 1/1 [00:00<?, ?it/s]
Running Reservoir-0: 100%|██████████| 1/1 [00:00<00:00, 1003.66it/s]
Running Reservoir-0: 100%|██████████| 1/1 [00:00<?, ?it/s]
Running Reservoir-0: 100%|██████████| 1/1 [00:00<?, ?it/s]
Running Reservoir-0: 100%|██████████| 1/1 [00:00<00:00, 996.98it/s]
Running Reservoir-0: 100%|██████████| 1/1 [00:00<00:00, 1001.98it/s]
Running Reservoir-0: 100%|██████████| 1/1 [00:00<00:00, 999.36it/s]
Running 

In [14]:
readout.fit(states_train, Y_train)

'Ridge-0': Ridge(ridge=1e-06, input_bias=True, in=500, out=1)

In [15]:
Y_pred = []
for x in X_test:
    states = reservoir.run(x, reset=True)
    y = readout.run(states[-1, np.newaxis])
    Y_pred.append(y)
Y_pred[:5]

Running Reservoir-0: 100%|██████████| 1/1 [00:00<00:00, 994.85it/s]
Running Ridge-0: 100%|██████████| 1/1 [00:00<?, ?it/s]
Running Reservoir-0: 100%|██████████| 1/1 [00:00<00:00, 937.69it/s]
Running Ridge-0: 100%|██████████| 1/1 [00:00<?, ?it/s]
Running Reservoir-0: 100%|██████████| 1/1 [00:00<00:00, 1000.31it/s]
Running Ridge-0: 100%|██████████| 1/1 [00:00<00:00, 1001.74it/s]
Running Reservoir-0: 100%|██████████| 1/1 [00:00<00:00, 1000.31it/s]
Running Ridge-0: 100%|██████████| 1/1 [00:00<?, ?it/s]
Running Reservoir-0: 100%|██████████| 1/1 [00:00<?, ?it/s]
Running Ridge-0: 100%|██████████| 1/1 [00:00<?, ?it/s]
Running Reservoir-0: 100%|██████████| 1/1 [00:00<?, ?it/s]
Running Ridge-0: 100%|██████████| 1/1 [00:00<?, ?it/s]
Running Reservoir-0: 100%|██████████| 1/1 [00:00<00:00, 998.64it/s]
Running Ridge-0: 100%|██████████| 1/1 [00:00<00:00, 1000.55it/s]
Running Reservoir-0: 100%|██████████| 1/1 [00:00<00:00, 1000.07it/s]
Running Ridge-0: 100%|██████████| 1/1 [00:00<?, ?it/s]
Running Res

[array([[0.71276431]]),
 array([[0.65262871]]),
 array([[0.20028738]]),
 array([[0.06812992]]),
 array([[0.51088992]])]

In [18]:
from sklearn.metrics import accuracy_score

Y_pred_class = [np.argmax(y_p) for y_p in Y_pred]
Y_test_class = [np.argmax(y_t) for y_t in Y_test]
print(Y_test_class)

score = accuracy_score(Y_test_class, Y_pred_class)

print("Accuracy: ", f"{score * 100:.3f} %")
score

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

1.0

In [62]:
# DataFrame 생성
df_data = pd.DataFrame(ohencode)  # 데이터 열
df_labels = pd.DataFrame(labels, columns=['label'])  # 레이블 열

# 레이블이 0번 열, 데이터가 1번 열부터 오도록 concat
df = pd.concat([df_labels, df_data], axis=1)

# TSV 파일로 저장
df.to_csv("review.sorted.uniq.preprocessing.tok.bpe.labelint.tsv", sep='\t', index=False, header=False)

ValueError: Must pass 2-d input. shape=(2000, 1500, 100)