In [1]:
# Mount Google Drive inside the Colab VM at /content/drive (Colab-only import).
# NOTE(review): no cell visible in this notebook reads from the mount —
# confirm it is still needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import nltk

# word_tokenize() needs the Punkt tokenizer data. Newer NLTK releases look the
# data up under the name 'punkt_tab' instead of 'punkt', so fetch both to keep
# Restart & Run All working regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
# Problem 1: word-level Tokenizer (preprocessing, fit, transform)

In [4]:
class Tokenizer():
  """Word-level tokenizer that encodes sentences as lists of integer ids.

  Attributes:
    word_dict: vocabulary mapping word -> id. It starts with 'oov' at id 0;
      fitting appends every unseen word with the next free id.
    fit_checker: True once fit() or fit_transform() has completed.
  """

  # Characters deleted from each sentence before tokenizing. Written as a raw
  # string so the backslashes reach the regex engine unchanged instead of
  # being parsed as (invalid) Python string escapes.
  _SPECIAL_CHARS = r'[-=+,#/\?:^.@*\"※~ㆍ!』‘|\(\)\[\]`\'…》\”\“\’·]'

  def __init__(self):
    self.word_dict = {'oov': 0}
    self.fit_checker = False

  def preprocessing(self, sequences):
    """Clean and tokenize sentences.

    Args:
      sequences: iterable of sentence strings.

    Returns:
      One list of lowercase word tokens per input sentence.
    """
    # Problem 1-1
    from nltk.tokenize import word_tokenize
    import re

    result = []
    for sentence in sequences:
      # Remove special characters, then tokenize and lowercase each word.
      cleaned = re.sub(self._SPECIAL_CHARS, '', sentence)
      result.append([word.lower() for word in word_tokenize(cleaned)])
    return result

  def _fit_tokenized(self, tokenized):
    """Add every unseen word of the tokenized sentences to word_dict."""
    # The flag stays False if anything below raises.
    self.fit_checker = False
    for sentence in tokenized:
      for word in sentence:
        if word not in self.word_dict:
          # Ids are assigned in first-seen order.
          self.word_dict[word] = len(self.word_dict)
    self.fit_checker = True

  def _encode(self, tokenized):
    """Map tokenized sentences to id lists; unknown words get the 'oov' id."""
    oov_index = self.word_dict.get('oov')
    return [[self.word_dict.get(word, oov_index) for word in sentence]
            for sentence in tokenized]

  def fit(self, sequences):
    """Build (or extend) the vocabulary from the given sentences.

    Args:
      sequences: iterable of sentence strings.
    """
    # Problem 1-2
    self.fit_checker = False
    self._fit_tokenized(self.preprocessing(sequences))

  def transform(self, sequences):
    """Encode sentences as lists of word ids using the fitted vocabulary.

    Args:
      sequences: iterable of sentence strings.

    Returns:
      One list of integer ids per sentence; words missing from the
      vocabulary are encoded with the id stored under 'oov'.

    Raises:
      Exception: if the tokenizer has not been fitted yet.
    """
    # Check the fitted state first so no tokenizing work is done in vain.
    if not self.fit_checker:
      raise Exception("Tokenizer instance is not fitted yet.")

    # Problem 1-3
    return self._encode(self.preprocessing(sequences))

  def fit_transform(self, sequences):
    """Fit the vocabulary on the sentences and return their encoding.

    The input is preprocessed only once, so one-shot iterables (e.g.
    generators) are handled correctly.
    """
    self.fit_checker = False
    tokenized = self.preprocessing(sequences)
    self._fit_tokenized(tokenized)
    return self._encode(tokenized)

In [5]:
# Sample sentences for trying out the Tokenizer.
# input_list is the corpus passed to fit_transform in a later cell;
# test_input is not used by any cell visible in this notebook.
input_list = ['I go to school.','I LIKE pizza!']
test_input = ["You are So beautiful.","i can't make it"]

In [7]:
# Fresh, unfitted tokenizer; its vocabulary holds only the 'oov' entry.
tok = Tokenizer()

In [8]:
# Learn the vocabulary from input_list and show each sentence as word ids.
# Special characters are removed and words lowercased first, so the 'I' at the
# start of both sentences maps to the same id.
tok.fit_transform(input_list)

[[1, 2, 3, 4], [1, 5, 6]]

In [9]:
# Problem 2: TF-IDF vectorizer built on top of the Tokenizer

In [17]:
class TfidfVectorizer():
  """TF-IDF vectorizer built on top of a word-id tokenizer.

  The tokenizer must provide fit_transform(sequences) and
  transform(sequences), each returning one list of integer token ids per
  input sentence.

  Attributes:
    tokens: token ids seen while fitting, in ascending order; they define
      the columns of the TF-IDF matrix.
    idf_matrix: IDF weight of each column, aligned with tokens.
    tfidf_matrix: result of the most recent transform() call.
    fit_checker: True once fit() has completed.
  """

  def __init__(self, tokenizer):
    self.tokenizer = tokenizer
    self.fit_checker = False
    self.tokens = []
    self.idf_matrix = []

  def fit(self, sequences):
    """Fit the tokenizer and learn one IDF weight per distinct token.

    The weight of a token is log(N / (1 + df)), where N is the number of
    sentences and df the number of sentences that contain the token.

    Args:
      sequences: sentence strings forming the training corpus.

    Returns:
      The list of IDF weights, ordered by ascending token id.
    """
    # math.log replaces the deprecated scipy.log alias of numpy.log.
    import math
    from collections import Counter

    tokenized = self.tokenizer.fit_transform(sequences)

    # Problem 2-1
    n_docs = len(tokenized)

    # Document frequency: every token counts once per sentence it occurs in.
    doc_freq = Counter()
    for doc in tokenized:
      doc_freq.update(set(doc))

    # Sorted ids give a deterministic column order that does not depend on
    # set iteration order.
    self.tokens = sorted(doc_freq)
    self.idf_matrix = [math.log(n_docs / float(1 + doc_freq[t]))
                       for t in self.tokens]

    self.fit_checker = True
    # Hand back a copy so callers cannot alter the fitted weights by accident.
    return list(self.idf_matrix)

  def transform(self, sequences):
    """Compute the TF-IDF matrix of the sentences with the fitted weights.

    Neither the tokenizer nor the IDF weights are re-fitted here. Tokens that
    were not seen during fit() have no column and are ignored.

    Args:
      sequences: sentence strings to vectorize.

    Returns:
      One row per sentence and one column per fitted token; each entry is
      the token's count in the sentence times its IDF weight.

    Raises:
      Exception: if the vectorizer has not been fitted yet.
    """
    if not self.fit_checker:
      raise Exception("TfidfVectorizer instance is not fitted yet.")

    from collections import Counter

    tokenized = self.tokenizer.transform(sequences)

    # Problem 2-2
    self.tfidf_matrix = []
    for sentence in tokenized:
      # Term frequency of every token id in this sentence (0 when absent).
      counts = Counter(sentence)
      row = [counts[t] * idf for t, idf in zip(self.tokens, self.idf_matrix)]
      self.tfidf_matrix.append(row)

    return self.tfidf_matrix

  def fit_transform(self, sequences):
    """Fit on the sentences and return their TF-IDF matrix."""
    self.fit(sequences)
    return self.transform(sequences)

In [18]:
# Test corpus for the TfidfVectorizer: four short English sentences.
test_list = ['I go to school.','I LIKE pizza!','You know you are So beautiful.','hey guys! you love pizza?']

In [19]:
# Vectorizer wrapping a fresh Tokenizer instance.
tfidf = TfidfVectorizer(Tokenizer())

In [20]:
# IDF values: one entry per distinct token of test_list.
tfidf.fit(test_list)



[0.28768207245178085,
 0.6931471805599453,
 0.6931471805599453,
 0.6931471805599453,
 0.6931471805599453,
 0.28768207245178085,
 0.28768207245178085,
 0.6931471805599453,
 0.6931471805599453,
 0.6931471805599453,
 0.6931471805599453,
 0.6931471805599453,
 0.6931471805599453,
 0.6931471805599453]

In [21]:
# TF-IDF matrix: one row per sentence of test_list, one column per distinct token.
tfidf.fit_transform(test_list)



[[0.28768207245178085,
  0.6931471805599453,
  0.6931471805599453,
  0.6931471805599453,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 [0.28768207245178085,
  0.0,
  0.0,
  0.0,
  0.6931471805599453,
  0.28768207245178085,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 [0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.5753641449035617,
  0.6931471805599453,
  0.6931471805599453,
  0.6931471805599453,
  0.6931471805599453,
  0.0,
  0.0,
  0.0],
 [0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.28768207245178085,
  0.28768207245178085,
  0.0,
  0.0,
  0.0,
  0.0,
  0.6931471805599453,
  0.6931471805599453,
  0.6931471805599453]]