In [1]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Download the NLTK 'punkt' tokenizer data package.
# NOTE(review): presumably needed by nltk.tokenize.word_tokenize, which
# Tokenizer.preprocessing calls below — confirm.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
# Problem 1: Tokenizer

In [4]:
class Tokenizer:
  """Word-level tokenizer that maps lower-cased words to integer ids.

  Attributes:
    word_dict: dict mapping each known word to its integer id. Id 0 is
      reserved under the key 'oov' and is used for words that were never
      seen by fit().
    fit_checker: True once fit() has completed at least once.
  """

  # Characters treated as punctuation. A token is dropped when *every* one of
  # its characters belongs to this set, so multi-character punctuation tokens
  # (e.g. '...' or '?!') are removed as well as single symbols.
  SYMBOLS = frozenset('-=+,#/?:^$.@*"※~&%ㆍ!』\\‘|()[]<>`\'…》')

  def __init__(self):
    self.word_dict = {'oov': 0}
    self.fit_checker = False

  def preprocessing(self, sequences):
    """Split each sentence into lower-cased word tokens without punctuation.

    Args:
      sequences: iterable of sentence strings.

    Returns:
      A list with one list of tokens per input sentence, in input order.
    """
    # Problem 1-1
    # Imported here (as in the original cell) so the class can be defined
    # even when NLTK is only needed at call time.
    from nltk.tokenize import word_tokenize

    result = []
    for sentence in sequences:
      words = [word.lower() for word in word_tokenize(sentence)]
      # Keep a token only if it contains at least one non-symbol character.
      words = [word for word in words
               if not all(ch in self.SYMBOLS for ch in word)]
      result.append(words)

    return result

  def fit(self, sequences):
    """Add every new word in `sequences` to the vocabulary.

    Ids are assigned in order of first appearance. Words that already have an
    id keep it, so calling fit() again only extends the vocabulary.

    Args:
      sequences: iterable of sentence strings.
    """
    self.fit_checker = False

    # Problem 1-2
    for tokens in self.preprocessing(sequences):
      for word in tokens:
        if word not in self.word_dict:
          self.word_dict[word] = len(self.word_dict)

    self.fit_checker = True

  def transform(self, sequences):
    """Convert sentences to lists of word ids.

    Args:
      sequences: iterable of sentence strings.

    Returns:
      A list with one list of integer ids per sentence. Words that are not in
      the vocabulary are mapped to the 'oov' id.

    Raises:
      RuntimeError: if fit() has not been called yet.
    """
    # Fail before doing any tokenization work.
    if not self.fit_checker:
      raise RuntimeError("Tokenizer instance is not fitted yet.")

    # Problem 1-3
    oov_id = self.word_dict['oov']
    return [[self.word_dict.get(word, oov_id) for word in tokens]
            for tokens in self.preprocessing(sequences)]

  def fit_transform(self, sequences):
    """Fit the vocabulary on `sequences`, then return their id lists."""
    self.fit(sequences)
    result = self.transform(sequences)
    return result

In [5]:
### test
# Sample sentences for exercising the Tokenizer.
input_list = ['I go to school.','I LIKE pizza!']
# NOTE(review): test_input is not used by any later cell shown here.
test_input = ['You are So beautiful.','hey guys! you love pizza?']

In [6]:
# Fresh, unfitted tokenizer used by the next cell.
a = Tokenizer()

In [7]:
# Build the vocabulary from input_list and show the resulting word-id lists.
a.fit_transform(input_list)

[[1, 2, 3, 4], [1, 5, 6]]

In [8]:
# Problem 2: TfidfVectorizer

In [9]:
class TfidfVectorizer:
  """TF-IDF vectorizer built on top of a tokenizer object.

  The tokenizer must provide fit_transform(sequences) and
  transform(sequences), each returning one list of token ids per document.

  Attributes:
    tokenizer: the tokenizer passed to the constructor.
    fit_checker: True once fit() has completed at least once.
    vocab_ids: token ids seen while fitting, in ascending order. This is the
      column order of idf_matrix and of every row of tfidf_matrix.
    idf_matrix: list of idf weights, one per entry of vocab_ids.
    tfidf_matrix: result of the most recent transform() call.
  """

  def __init__(self, tokenizer):
    self.tokenizer = tokenizer
    self.fit_checker = False
    self.vocab_ids = []
    self.idf_matrix = []
    # Declare tfidf_matrix.
    self.tfidf_matrix = []

  def fit(self, sequences):
    """Fit the tokenizer and learn one idf weight per token.

    For a token with document frequency df over n documents the weight is
    log(n / (1 + df)).

    Args:
      sequences: list of document strings.

    Returns:
      A list with the idf weights, ordered by ascending token id.
    """
    # Local imports keep this cell self-contained; math.log replaces the
    # deprecated scipy.log alias.
    import math
    from collections import Counter

    tokenized = self.tokenizer.fit_transform(sequences)

    # Problem 2-1
    n = len(tokenized)

    # Document frequency: count each token at most once per document.
    doc_freq = Counter()
    for doc in tokenized:
      doc_freq.update(set(doc))

    # Build the idf matrix. Sorting makes the column order deterministic
    # instead of depending on set iteration order.
    vocab_ids = sorted(doc_freq)
    idf_matrix = [math.log(n / float(1 + doc_freq[t])) for t in vocab_ids]

    # Store the learned state only after everything was computed.
    self.vocab_ids = vocab_ids
    self.idf_matrix = idf_matrix
    self.fit_checker = True
    # Return a copy so callers cannot alter the stored weights by accident.
    return list(idf_matrix)

  def transform(self, sequences):
    """Compute the TF-IDF matrix of `sequences` with the fitted idf weights.

    The vectorizer and its tokenizer are not re-fitted. Token ids that were
    not seen by fit() (including the tokenizer's out-of-vocabulary id) have
    no column and therefore do not contribute.

    Args:
      sequences: list of document strings.

    Returns:
      A list with one row per document; each row holds tf * idf for every
      entry of vocab_ids. The same list is stored in self.tfidf_matrix.

    Raises:
      RuntimeError: if fit() has not been called yet.
    """
    if not self.fit_checker:
      raise RuntimeError("TfidfVectorizer instance is not fitted yet.")

    from collections import Counter

    tokenized = self.tokenizer.transform(sequences)

    # Problem 2-2
    # Start from an empty matrix on every call so rows from earlier calls
    # do not pile up.
    tfidf_matrix = []
    for doc in tokenized:
      # Build the tf row: raw count of every known token in this document.
      counts = Counter(doc)
      # tf-idf = tf * idf
      row = [counts[t] * idf for t, idf in zip(self.vocab_ids, self.idf_matrix)]
      tfidf_matrix.append(row)

    self.tfidf_matrix = tfidf_matrix
    return self.tfidf_matrix

  def fit_transform(self, sequences):
    """Fit on `sequences` and return their TF-IDF matrix."""
    self.fit(sequences)
    return self.transform(sequences)

In [10]:
# Test: sample documents for the TfidfVectorizer.
test_list = ['I go to school.','I LIKE pizza!','You know you are So beautiful.','hey guys! you love pizza?']

In [11]:
# Vectorizer backed by a fresh, unfitted Tokenizer.
tfidf = TfidfVectorizer(Tokenizer())

In [12]:
# idf matrix: fit on test_list and display the returned idf values.
tfidf.fit(test_list)



[0.28768207245178085,
 0.6931471805599453,
 0.6931471805599453,
 0.6931471805599453,
 0.6931471805599453,
 0.28768207245178085,
 0.28768207245178085,
 0.6931471805599453,
 0.6931471805599453,
 0.6931471805599453,
 0.6931471805599453,
 0.6931471805599453,
 0.6931471805599453,
 0.6931471805599453]

In [13]:
# tf-idf values: fit on test_list, then display its TF-IDF matrix.
tfidf.fit_transform(test_list)



[[0.28768207245178085,
  0.6931471805599453,
  0.6931471805599453,
  0.6931471805599453,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 [0.28768207245178085,
  0.0,
  0.0,
  0.0,
  0.6931471805599453,
  0.28768207245178085,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 [0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.5753641449035617,
  0.6931471805599453,
  0.6931471805599453,
  0.6931471805599453,
  0.6931471805599453,
  0.0,
  0.0,
  0.0],
 [0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.28768207245178085,
  0.28768207245178085,
  0.0,
  0.0,
  0.0,
  0.0,
  0.6931471805599453,
  0.6931471805599453,
  0.6931471805599453]]