In [9]:
import nltk

import utils.luigi_wrapper as luigi
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

stop_words = set(stopwords.words('english'))
porter = PorterStemmer()

from preprocess.data_extractor import DataExtractor
from preprocess.page_list_extractor import WikipediaListExtractorTask
from utils.utils import *

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [10]:
def requires():
    return DataExtractor()

def tokenize_doc(doc):
    tokens = nltk.word_tokenize(doc)
    tokens = [w for w in tokens if not w in stop_words]
        
    tokens = [porter.stem(w) for w in tokens]
    return list(tokens)

def __get_vocabulary(texts) -> set:
    vocab: set = set()
    for doc in texts:
        tokens = tokenize_doc(doc)
        vocab = vocab.union(tokens)
    return vocab

In [14]:
full_df = requires().get_output()

vocab = __get_vocabulary(full_df['text'])

tokenized_text = full_df['text'].apply(lambda text: ' '.join(tokenize_doc(text)))

vectorizer = TfidfVectorizer()
vectorizer.fit(tokenized_text)
transformed_array = vectorizer.transform(tokenized_text)

tokenized_df = pd.DataFrame(index=full_df.index)
for col, value in zip(vectorizer.get_feature_names(), transformed_array.toarray().T):
    tokenized_df[col] = value
# save_data(tokenized_df, output().path)


'DataExtractor' output's path is D:\אלון\תואר שני\שיטות מתקדמות בלמידה חישובית\wikipedia-summary\cache\full_df.pickle


In [13]:
tokenized_df

Unnamed: 0_level_0,000,007,01,02,03,04,0489,05,06,07,...,여자,예쁘장한,오징어,육성재,전화기를,정대현,좋아,즐거운,참소녀,한방울
page,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Selene Vigil-Wilk,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Jay Park,0.029705,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.004952,0.0,0.004952,0.0,0.0,0.004952
Miho Miyazaki,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Wu Xuanyi,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Shizuka Ōya,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Brad Arnold,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Emika Kamieda,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Yook Sung-jae,0.0,0.0,0.0,0.015464,0.0,0.0,0.0,0.014278,0.0,0.0,...,0.0,0.017136,0.017136,0.017136,0.0,0.0,0.0,0.0,0.0,0.0
Kazumi Urano,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Tetsuya Kajiwara (musician),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
