# TensorFlow (one-hot encoding & Embedding)

In [None]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

# Toy corpus: a single document made of two Korean sentences.
corpus = ['우리말은 우리 생활과 문화와 역사와 정신과 모든 것을 담고 있는 우리 삶의 기본이다. 우리말은 우리 삶 그 자체다.']

# Tokenizing: fit the tokenizer on the corpus, then show the word -> id table
# and the per-word occurrence counts.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
print('----tokenizing----')
print(tokenizer.word_index)
print(tokenizer.word_counts)

# Integer encoding: replace every word with its id from `word_index`.
integers = tokenizer.texts_to_sequences(corpus)
print('----to integers----')
print(integers)  # one inner list per document in `corpus`

# Vectorization
# (1) One-hot vectors. Word ids start at 1 (see `word_index`), so one extra
#     class is reserved for the unused index 0.
n = len(tokenizer.word_index) + 1
onehot_encoding = to_categorical(integers, num_classes=n)
print('----one-hot vectors----')
print(onehot_encoding)  # one row of length `n` per token

# (2) Embedding vectors: map each id to a 3-dimensional vector. The model is
#     never fitted in this cell, so `predict` only returns the layer's
#     initial weights for each id.
model = Sequential([Embedding(input_dim=n, output_dim=3)])
model.compile(optimizer='rmsprop', loss='mse')
embedding = model.predict(integers)
print(embedding.shape)

----tokenizing----
{'우리': 1, '우리말은': 2, '생활과': 3, '문화와': 4, '역사와': 5, '정신과': 6, '모든': 7, '것을': 8, '담고': 9, '있는': 10, '삶의': 11, '기본이다': 12, '삶': 13, '그': 14, '자체다': 15}
OrderedDict([('우리말은', 2), ('우리', 3), ('생활과', 1), ('문화와', 1), ('역사와', 1), ('정신과', 1), ('모든', 1), ('것을', 1), ('담고', 1), ('있는', 1), ('삶의', 1), ('기본이다', 1), ('삶', 1), ('그', 1), ('자체다', 1)])
----to integers----
[[2, 1, 3, 4, 5, 6, 7, 8, 9, 10, 1, 11, 12, 2, 1, 13, 14, 15]]
----one-hot vectors----
[[[0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
  [0. 1. 0. 0. 0. 0

In [None]:
# Vocabulary size: number of distinct words the tokenizer indexed.
len(tokenizer.word_index)

15

In [None]:
# Length of the first (and only) integer-encoded document; repeated words are counted each time.
len(integers[0])

18

# Word2Vec

In [None]:
!pip install konlpy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
Collecting JPype1>=0.7.0
  Downloading JPype1-1.4.1-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (465 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m465.6/465.6 KB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: JPype1, konlpy
Successfully installed JPype1-1.4.1 konlpy-0.6.0


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from konlpy.tag import Okt

In [None]:
# Load the train/test rating files (tab-separated, first row is the header).
# The recorded run of this cell failed with FileNotFoundError, so the files
# are fetched first when they are not already in the working directory.
from pathlib import Path
from urllib.request import urlretrieve

# NOTE(review): the file names and the 'document'/'label' columns used later
# match the NSMC corpus (github.com/e9t/nsmc) — confirm this is the intended source.
RATINGS_BASE_URL = 'https://raw.githubusercontent.com/e9t/nsmc/master/'
for ratings_file in ('ratings_train.txt', 'ratings_test.txt'):
  if not Path(ratings_file).exists():
    urlretrieve(RATINGS_BASE_URL + ratings_file, ratings_file)

train_data = pd.read_csv('ratings_train.txt', header = 0, delimiter = '\t')
test_data = pd.read_csv('ratings_test.txt', header = 0, delimiter = '\t')
print(train_data.shape)
print(test_data.shape)

FileNotFoundError: ignored

In [None]:
# Stack the train and test splits into one frame for corpus-wide preprocessing.
# ignore_index=True renumbers the rows 0..n-1; without it both splits keep
# their own labels and the combined frame carries duplicate index values.
data = pd.concat([train_data, test_data], ignore_index = True)
print(data.shape)

## 데이터 전처리

### 결측치 처리

In [None]:
# Count the missing values in each column.
data.isnull().sum()

In [None]:
# Show the rows whose 'document' value is missing.
data[data['document'].isnull()]

In [None]:
# Drop every row that has a missing value in any column, then report the new shape.
data = data.dropna(axis = 0, how = 'any')
print(data.shape)

### 이상치 처리

In [None]:
# Number of characters in each review.
# NOTE: `setnece_length` is a misspelling of "sentence_length"; the name is
# kept because other cells refer to it.
setnece_length = data['document'].str.len()

# Number of whitespace-separated tokens in each review
# (this counts the tokens, not the space characters).
spacing_count = data['document'].str.split().str.len()

In [None]:
# Histogram of the per-review character counts.
setnece_length.plot.hist()

In [None]:
# Summary statistics of the per-review character counts.
setnece_length.describe()

In [None]:
# Histogram of the per-review whitespace-separated token counts.
spacing_count.plot.hist()

In [None]:
# Summary statistics of the per-review whitespace-separated token counts.
spacing_count.describe()

In [None]:
# Inspect the reviews that contain at most two whitespace-separated tokens.
data[spacing_count <= 2]

In [None]:
# 성희님 data = data[spacing_count > 3]
# 종현님 data = data[setnece_length > 10]
# 한솔님 data = data[(spacing_count > 3) & (setnece_length > 10)]
# 나머지 : data

In [None]:
# Use a regular expression to keep only Hangul and spaces: every character
# that is not a Hangul jamo (ㄱ-ㅎ, ㅏ-ㅣ), a Hangul syllable (가-힣) or a space
# is removed.
import re

non_hangul = re.compile('[^ㄱ-ㅎㅏ-ㅣ가-힣 ]')
data['document'] = data['document'].map(lambda text: non_hangul.sub('', text))
# Alternative noted by the author: data['document'].str.replace('[^ㄱ-ㅎㅏ-ㅣ가-힣 ]', '')

In [None]:
# Stop-word removal: tokenize every review with Okt and drop the stop words.
from tqdm import tqdm

# Stop-word definition.
stopwords = ['의', '를', '와', '과', '으로', '것', '잘', '되', '주', '보', '수',
             '좀', '이', '있', '은', '한', '아', '도', '에', '하', '는', '자',
             '하다', '가', '들', '걍', '등']
# Set copy built once so each membership test in the loop is a hash lookup
# rather than a scan of the list; `stopwords` itself is left as a list.
stopword_set = set(stopwords)

okt = Okt()
tokenized_data = []  # one list of kept tokens per review, in row order
for sentence in tqdm(data['document']):
  # Split the review into morphemes; stem=True is passed through to Okt.
  tokenized_sentence = okt.morphs(sentence, stem = True)
  stopwords_removed_sentences = [token for token in tokenized_sentence if token not in stopword_set]
  tokenized_data.append(stopwords_removed_sentences)

In [None]:
# 강사의 데이터를 불러오는 경우
# tokenized_data = pd.read_pickle(r'/content/tokenized_data.pkl')

- size : 임베딩 된 벡터의 차원 (gensim 4.x에서는 `vector_size`)
- window : 컨텍스트 윈도우 크기(사용할 주변 단어의 개수)
- min_count : 단어의 최소 빈도 수(이보다 적게 등장한 단어는 학습에서 제외)
- workers : 학습에 사용할 워커 스레드 수
- sg : 0이면 CBOW, 1이면 Skip-gram

In [None]:
# Number of tokenized reviews.
len(tokenized_data)

In [None]:
# len(([item for sublist in tokenized_data for item in sublist])) ** (1/4) # 네제곱근

In [None]:
import gensim
from gensim.models import Word2Vec

# Train a CBOW (sg=0) Word2Vec model on the tokenized reviews.
# gensim 4.0 renamed the `size` keyword to `vector_size`, so the keyword is
# chosen from the installed major version to keep this cell working on both.
# NOTE: this rebinds `model`, which an earlier cell used for the Keras model.
EMBEDDING_DIM = 100
dim_kwarg = 'vector_size' if int(gensim.__version__.split('.')[0]) >= 4 else 'size'
model = Word2Vec(sentences = tokenized_data, window = 5, min_count = 5, workers = 4, sg = 0,
                 **{dim_kwarg: EMBEDDING_DIM})

In [None]:
# Number of tokenized reviews (same check as before training).
len(tokenized_data)

In [None]:
# Number of rows in `data`; the tokenizing loop appends one token list per row,
# so this is expected to equal len(tokenized_data).
len(data)

In [None]:
# model.wv.vocab.keys() 

In [None]:
# Save the word vectors (model.wv) in word2vec format to the file 'Word2Vec'.
model.wv.save_word2vec_format('Word2Vec')

In [None]:
from gensim.models import KeyedVectors
# Load the vectors back from the 'Word2Vec' file written by the save cell.
loaded_model = KeyedVectors.load_word2vec_format('Word2Vec')

In [None]:
# Nearest neighbours of '노잼'. `loaded_model` is already a KeyedVectors object,
# so `most_similar` is called on it directly; the `.wv` indirection is
# deprecated in gensim 3.x and removed in 4.x.
print(loaded_model.most_similar('노잼'))

In [None]:
# Nearest neighbours of '교도소'. `loaded_model` is already a KeyedVectors object,
# so `most_similar` is called on it directly; the `.wv` indirection is
# deprecated in gensim 3.x and removed in 4.x.
print(loaded_model.most_similar('교도소'))

In [None]:
# Nearest neighbours of '오버'. `loaded_model` is already a KeyedVectors object,
# so `most_similar` is called on it directly; the `.wv` indirection is
# deprecated in gensim 3.x and removed in 4.x.
print(loaded_model.most_similar('오버'))

In [None]:
# Nearest neighbours of '액션'. `loaded_model` is already a KeyedVectors object,
# so `most_similar` is called on it directly; the `.wv` indirection is
# deprecated in gensim 3.x and removed in 4.x.
print(loaded_model.most_similar('액션'))

In [None]:
# Nearest neighbours of '노잼' again (same query as an earlier cell).
# `most_similar` is called on the KeyedVectors object directly; the `.wv`
# indirection is deprecated in gensim 3.x and removed in 4.x.
print(loaded_model.most_similar('노잼'))

In [None]:
# !pip install gensim==3.6.0

In [None]:
# list(model.wv.vocab)

In [None]:
# Shape of the trained embedding matrix: (vocabulary size, vector dimension).
# The matrix is read from `wv.vectors` because indexing the model itself is
# deprecated in gensim 3.x and `wv.vocab` no longer exists in 4.x.
model.wv.vectors.shape

In [None]:
from sklearn.manifold import TSNE

# Vocabulary words: gensim 4.x exposes them as `wv.key_to_index`, 3.x as `wv.vocab`.
word_table = getattr(model.wv, 'key_to_index', None)
if word_table is None:
  word_table = model.wv.vocab
vocab = list(word_table)

# Look the vectors up through `model.wv`; indexing the model object directly
# is deprecated in gensim 3.x and removed in 4.x.
X = model.wv[vocab]
USED_VECTOR_COUNT = 200

# A fixed random_state makes the 2-D layout reproducible across runs.
tsne = TSNE(n_components = 2, random_state = 42)
X_tsne = tsne.fit_transform(X[:USED_VECTOR_COUNT,:]) # reduce only the first 200 vectors

In [None]:
# Label each 2-D t-SNE point with its word so the scatter plot can annotate it.
df = pd.DataFrame(data = X_tsne, columns = ['x', 'y'], index = vocab[:USED_VECTOR_COUNT])
df.head()

In [None]:
import plotly.express as px
# Scatter plot of the 2-D t-SNE coordinates; each point is labelled with its word (the frame index).
fig = px.scatter(df, x="x", y="y", text = df.index)
# Place each text label above its marker.
fig.update_traces(textposition="top center")
fig.show()

In [None]:
# https://projector.tensorflow.org/     참고

In [None]:
# Run gensim's word2vec2tensor script on the saved 'Word2Vec' file, using 'Word2vecTSV' as the output name.
# Presumably this produces the TSV files to upload to the Embedding Projector — confirm the output file names.
!python -m gensim.scripts.word2vec2tensor --input Word2Vec --output Word2vecTSV

In [None]:
import pickle

# Persist the labels and the tokenized reviews so later work can skip the
# slow tokenizing step.
artifacts = (
  ('labels.pkl', data['label'].values),   # label
  ('tokenized_data.pkl', tokenized_data), # fully tokenized list
)
for file_name, payload in artifacts:
  with open(file_name, 'wb') as f:
    pickle.dump(payload, f)