In [1]:
import math
import re


In [2]:
class Tokenizer():
    """Whitespace tokenizer that maps lower-cased alphanumeric words to integer ids.

    Id 0 is reserved for the 'oov' (out-of-vocabulary) entry. Words seen by
    `fit` receive consecutive ids, in sorted order, appended after whatever the
    vocabulary already contains.
    """

    def __init__(self):
        self.word_dict = {'oov': 0}   # word -> integer id
        self.fit_checker = False      # True once fit() has completed

    def preprocessing(self, sequences):
        """Lower-case, strip non-alphanumeric characters and split into words.

        Args:
            sequences: a single string, or an iterable (e.g. list) of strings.

        Returns:
            A flat list of words for a single string, or a list of word lists
            (one per document) for an iterable of strings.
        """
        def prepro(text):
            # Keep only lowercase letters, digits and spaces.
            cleaned = re.sub(r"[^a-z0-9 ]", "", text.lower())
            # split() without a separator collapses runs of spaces and never
            # yields empty-string tokens (split(" ") did both).
            return cleaned.split()

        # A string must be handled on its own; iterating it would split it
        # into characters.
        if isinstance(sequences, str):
            return prepro(sequences)
        return [prepro(text) for text in sequences]

    def _tokenize_documents(self, sequences):
        """Return a list of word lists; a single string becomes one document."""
        if isinstance(sequences, str):
            return [self.preprocessing(sequences)]
        return self.preprocessing(sequences)

    def fit(self, sequences):
        """Extend the vocabulary with every new word found in `sequences`.

        Args:
            sequences: a single string, or an iterable of strings. An empty
                iterable is accepted and adds nothing.
        """
        self.fit_checker = False
        documents = self._tokenize_documents(sequences)

        # Deduplicate and sort so that id assignment is deterministic.
        vocabulary = sorted({word for doc in documents for word in doc})
        for word in vocabulary:
            if word not in self.word_dict:
                # The current size is the next free id because ids start at 0.
                self.word_dict[word] = len(self.word_dict)

        self.fit_checker = True

    def transform(self, sequences):
        """Convert text into lists of word ids using the fitted vocabulary.

        Args:
            sequences: a single string, or an iterable of strings.

        Returns:
            A nested list with one list of ids per document. A single string
            is treated as one document, so the result is always nested.
            Unknown words map to the 'oov' id.

        Raises:
            Exception: if `fit` has not been called successfully yet.
        """
        if not self.fit_checker:
            raise Exception("Tokenizer instance is not fitted yet.")

        oov_id = self.word_dict['oov']
        return [
            [self.word_dict.get(word, oov_id) for word in doc]
            for doc in self._tokenize_documents(sequences)
        ]

    def fit_transform(self, sequences):
        """Fit the vocabulary on `sequences`, then return their id encoding."""
        self.fit(sequences)
        return self.transform(sequences)

In [3]:
class TfidfVectorizer:
    """TF-IDF vectorizer on top of a tokenizer that encodes documents as id lists.

    The tokenizer must provide `fit_transform(sequences)` and
    `transform(sequences)`, each returning one list of token ids per document.

    Weights: tf is the raw count of a token in a document and
    idf(t) = ln(N / (df(t) + 1)), where N is the number of fitted documents
    and df(t) the number of fitted documents containing t.
    """

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
        self.fit_checker = False
        self.vocabulary_ = []   # sorted token ids seen by fit(); one output column each
        self.idf_ = []          # idf weights, aligned with self.vocabulary_

    def fit(self, sequences):
        """Fit the tokenizer and compute idf weights for the corpus.

        Args:
            sequences: the documents, in whatever form the tokenizer accepts.

        Returns:
            The list of idf weights, ordered by ascending token id. The same
            weights are stored on the instance for later `transform` calls.
        """
        tokenized = self.tokenizer.fit_transform(sequences)
        n_docs = len(tokenized)

        # Count, for every token id, how many documents contain it.
        doc_freq = {}
        for doc in tokenized:
            for token in set(doc):
                doc_freq[token] = doc_freq.get(token, 0) + 1

        self.vocabulary_ = sorted(doc_freq)
        # +1 in the denominator keeps the division defined for any df.
        self.idf_ = [
            math.log(n_docs / (doc_freq[token] + 1), math.e)
            for token in self.vocabulary_
        ]
        self.fit_checker = True

        return list(self.idf_)

    def transform(self, sequences, idf=None):
        """Compute the TF-IDF matrix of `sequences`.

        Columns always follow the vocabulary learned by `fit`, so the weights
        stay aligned even when `sequences` differs from the fitted corpus;
        token ids that were not seen during `fit` contribute no column.

        Args:
            sequences: the documents, in whatever form the tokenizer accepts.
            idf: optional idf weights as returned by `fit`. When omitted, the
                weights stored by the last `fit` call are used.

        Returns:
            A list with one row per document; each row holds count * idf for
            every fitted token id in ascending order.

        Raises:
            Exception: if `fit` has not been called yet.
            ValueError: if `idf` does not have one weight per fitted token.
        """
        if not self.fit_checker:
            raise Exception("TfidfVectorizer instance is not fitted yet.")

        weights = self.idf_ if idf is None else idf
        if len(weights) != len(self.vocabulary_):
            raise ValueError(
                "idf must contain one weight per fitted token "
                "(expected %d, got %d)." % (len(self.vocabulary_), len(weights))
            )

        tokenized = self.tokenizer.transform(sequences)
        return [
            [doc.count(token) * weight
             for token, weight in zip(self.vocabulary_, weights)]
            for doc in tokenized
        ]

    def fit_transform(self, sequences):
        """Fit on `sequences` and return their TF-IDF matrix."""
        idf = self.fit(sequences)
        return self.transform(sequences, idf)

In [4]:
import pandas as pd #확인용 판다스

# Sample inputs used to exercise Tokenizer and TfidfVectorizer.
test=['I go to school.', 'I LIKE pizza!']
test2=['Im so sorry~ but i love u~ Like Pizza!']
test3='Holly Molly 2'
# Six separate sentences. Each one needs its own trailing comma, otherwise
# Python joins adjacent string literals into a single element.
test4=['If you are not sure about your level, dont worry, you can take this online test.',
       'There are 60 multiple-choice questions and there is no time limit.',
       'You will be able to see the answers when you finish the test.',
       'Improve your writing with the exercises suggested in each lesson.',
       'You will learn how to organise and connect the text in your compositions.',
       'Different types of texts for each level: A1, A2, B1, B1+, or B2.',]
test5 = [
  'eat want apple',
  'eat want banana',
  'long yello banana banana',
  'i"m furuit like'
]


In [5]:
# Build a fresh tokenizer for the sample corpus.
for_test1=Tokenizer()

# Fit the vocabulary on test5 and display the resulting id sequences.
for_test1.fit_transform(test5)

[[3, 8, 1], [3, 8, 2], [7, 9, 2, 2], [5, 4, 6]]

In [6]:
# Vectorize the sample corpus with a vectorizer wrapping a fresh tokenizer.
for_test2 = TfidfVectorizer(Tokenizer())
tfidf_rows = for_test2.fit_transform(test5)

# Quick check only: label the columns with every vocabulary word except the
# leading 'oov' entry. In real use the labels should be taken from the tokens
# that actually appear in the corpus.
column_names = list(for_test2.tokenizer.word_dict)[1:]

df = pd.DataFrame(tfidf_rows, columns=column_names)
df

Unnamed: 0,apple,banana,eat,furuit,im,like,long,want,yello
0,0.693147,0.0,0.287682,0.0,0.0,0.0,0.0,0.287682,0.0
1,0.0,0.287682,0.287682,0.0,0.0,0.0,0.0,0.287682,0.0
2,0.0,0.575364,0.0,0.0,0.0,0.0,0.693147,0.0,0.693147
3,0.0,0.0,0.0,0.693147,0.693147,0.693147,0.0,0.0,0.0
