Download the Hebrew dataset from Wikipedia
   - Go to: https://dumps.wikimedia.org/hewiki/latest/
   - Download `hewiki-latest-pages-articles.xml.bz2`
   
   On Linux this can be done with:
   
   wget https://dumps.wikimedia.org/hewiki/latest/hewiki-latest-pages-articles.xml.bz2

   

In [18]:
import os
import sys
import bz2
import logging
import multiprocessing

import gensim
from config import PROJECT_DIR
from nlp.yap_api import YapApi


# initialize parameters:

In [19]:

# Input data directory and output directory for the word2vec artifacts.
DATA_PATH = os.path.join(PROJECT_DIR, 'data', 'wiki')
MODEL_PATH = os.path.join(PROJECT_DIR, 'word2vec/')

# Cached dictionary files and the raw Wikipedia dump, all under DATA_PATH.
DICTIONARY_FILEPATH = os.path.join(DATA_PATH, 'wiki-hebrew_wordids.txt.bz2')
DICTIONARY_FILEPATH_PROCESSED = os.path.join(DATA_PATH, 'wiki-hebrew_wordids_PROCESSED.txt.bz2')
WIKI_DUMP_FILEPATH = os.path.join(DATA_PATH, 'hewiki-latest-pages-articles.xml.bz2')

# Check if the required files have been downloaded.
# The path string is always truthy, so the check has to look at the file
# system rather than at the variable itself.
if not os.path.isfile(WIKI_DUMP_FILEPATH):
    print('Wikipedia articles dump could not be found..')
    print('Please see README.md for instructions!')
    sys.exit()

# Get number of available cpus
cores = multiprocessing.cpu_count()

# Make sure the model output directory exists (no error if it already does).
os.makedirs(MODEL_PATH, exist_ok=True)

# Initialize logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Build corpus from Wikipedia

In [20]:

# The dictionary is built once and cached on disk; when the cache file is
# already present the expensive construction below is skipped.
if not os.path.isfile(DICTIONARY_FILEPATH):
    logging.info('Dictionary has not been created yet..')
    logging.info('Creating dictionary (takes about 9h)..')

    # Construct corpus; its `dictionary` attribute is what gets filtered and saved.
    wiki = gensim.corpora.WikiCorpus(WIKI_DUMP_FILEPATH)

    # Drop tokens found in fewer than 20 documents or in more than 10% of
    # all documents; keep_n caps the vocabulary size.
    wiki.dictionary.filter_extremes(
        no_below=20,
        no_above=0.1,
        keep_n=100000,
    )

    # Save dictionary to file, then release the temporary corpus object.
    wiki.dictionary.save_as_text(DICTIONARY_FILEPATH)
    del wiki

# Load dictionary from file
dictionary = gensim.corpora.Dictionary.load_from_text(DICTIONARY_FILEPATH)

# Construct corpus using dictionary
wiki = gensim.corpora.WikiCorpus(
    WIKI_DUMP_FILEPATH,
    dictionary=dictionary,
)


# Preprocess wiki

In [None]:
# Preprocessing pass, entered only while the processed-dictionary file does not exist yet.
if not os.path.isfile(DICTIONARY_FILEPATH_PROCESSED):
    logging.info('preprocessing dictionary (takes about 9h)..')
    # `yap` becomes a notebook-level global here; it is only created when this branch runs.
    yap = YapApi()

    for doc in wiki:
        # TODO(review): the loop body was never written, so this cell does not parse as-is.

# Train word2vec on corpus

Start Yap call
Tokens: 4
End Yap call 0 /0


In [30]:

class SentencesIterator:
    """Re-iterable stream of token lists taken from a wiki corpus.

    Every call to ``__iter__`` starts a fresh pass over ``wiki.get_texts()``,
    so the same instance can be consumed more than once.

    Parameters
    ----------
    wiki : object exposing ``get_texts()``, which yields one token sequence
        per document.
    processed : bool, default True
        When true, each document is joined into one string, sent through
        YAP, and the tokens of YAP's processed text are yielded instead of
        the original tokens.
    yap : optional object with a ``run(text)`` method returning a 6-tuple
        whose second item is the processed text. When omitted, the
        notebook-level global ``yap`` is used, as before.
    """

    def __init__(self, wiki, processed=True, yap=None):
        self.wiki = wiki
        self.processed = processed
        self.yap = yap

    def __iter__(self):
        for tokens in self.wiki.get_texts():
            if self.processed:
                # Prefer the injected analyzer; otherwise fall back to the global one.
                analyzer = self.yap if self.yap is not None else yap
                _raw, processed_text, _lemmas, _dep_tree, _md_lattice, _ma_lattice = analyzer.run(" ".join(tokens))
                # Split the processed text into tokens. The previous
                # `" ".split(processed)` had receiver and argument swapped and
                # therefore discarded the YAP output.
                # Assumes the processed text is a whitespace-separated string — TODO confirm.
                tokens = processed_text.split()
            yield list(tokens)


In [84]:

# Initialize simple sentence iterator required for the Word2Vec model
# Wrap the wiki corpus in the sentence iterator that is handed to Word2Vec.
sentences = SentencesIterator(wiki)

logging.info('Training word2vec model..')
model = gensim.models.Word2Vec(
    sentences=sentences,
    workers=cores,
)

# Persist the trained model under MODEL_PATH.
logging.info('Saving model..')
model.save(
    os.path.join(MODEL_PATH, 'word2vec.model')
)
logging.info('Done training word2vec model!')

2022-03-29 19:33:28,088 : INFO : Training word2vec model..
2022-03-29 19:33:28,089 : INFO : collecting all words and their counts
2022-03-29 19:33:29,173 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types


KeyboardInterrupt: 

# Train doc2vec on corpus

In [241]:
import gensim.models.doc2vec

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
# Abort early rather than start a very slow training run.
# NOTE(review): FAST_VERSION == -1 presumably means gensim's compiled routines are missing — confirm.
assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"

# NOTE: this rebinds MODEL_PATH; from here on it points at the doc2vec directory.
MODEL_PATH = os.path.join(PROJECT_DIR, 'doc2vec/')
# Create the output directory up front so the later model.save() cannot fail on a missing folder.
os.makedirs(MODEL_PATH, exist_ok=True)



In [242]:

class DocsIterator:
    """Re-iterable stream of ``TaggedDocument`` objects built from a wiki corpus.

    Each document from ``wiki.get_texts()`` is tagged with its 1-based
    position in the corpus. Numbering restarts on every pass, so a document
    gets the same tag during vocabulary building and during every training
    epoch. The previous version kept counting across passes, which gave every
    later pass tags unrelated to those seen on the first one.

    Attributes
    ----------
    i : int
        Tag of the most recently yielded document in the current pass
        (0 before any document has been yielded).
    """

    i = 0

    def __init__(self, wiki):
        self.wiki = wiki

    def __iter__(self):
        # Reset at the start of each pass so tags are stable across passes.
        self.i = 0
        for tag, tokens in enumerate(self.wiki.get_texts(), start=1):
            self.i = tag
            yield TaggedDocument(list(tokens), [tag])

In [243]:
# Tagged-document view of the wiki corpus (rebinds `sentences`).
sentences = DocsIterator(wiki)

# Untrained Doc2Vec model; vocabulary building and training happen in the next cell.
model = gensim.models.doc2vec.Doc2Vec(
    dm=1,
    dm_concat=1,
    vector_size=100,
    min_count=2,
    epochs=10,
    workers=multiprocessing.cpu_count(),
    negative=5,
    window=10,
)


2022-03-30 16:33:28,878 : INFO : using concatenative 2100-dimensional layer1
2022-03-30 16:33:28,879 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec(dm/c,d100,n5,w10,mc2,s0.001,t8)', 'datetime': '2022-03-30T16:33:28.879410', 'gensim': '4.1.2', 'python': '3.8.13 | packaged by conda-forge | (default, Mar 25 2022, 06:04:14) \n[Clang 12.0.1 ]', 'platform': 'macOS-12.3-arm64-arm-64bit', 'event': 'created'}


In [244]:
# First pass over the corpus: collect words, counts and tags.
model.build_vocab(sentences)

# Training passes; corpus size and epoch count are read back from the model.
model.train(
    sentences,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

# Persist the trained model under MODEL_PATH.
logging.info('Saving model..')
model.save(
    os.path.join(MODEL_PATH, 'doc2vec.model')
)
logging.info('Done training doc2vec model!')

2022-03-30 16:33:32,101 : INFO : collecting all words and their counts
2022-03-30 16:33:33,325 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2022-03-30 16:34:05,679 : INFO : PROGRESS: at example #10000, processed 13621561 words (421096/s), 477061 word types, 0 tags
2022-03-30 16:34:31,000 : INFO : PROGRESS: at example #20000, processed 22638376 words (356116/s), 633026 word types, 0 tags
2022-03-30 16:34:52,561 : INFO : PROGRESS: at example #30000, processed 30181875 words (349873/s), 736766 word types, 0 tags
2022-03-30 16:35:13,333 : INFO : PROGRESS: at example #40000, processed 36863708 words (321705/s), 823066 word types, 0 tags
2022-03-30 16:35:32,401 : INFO : PROGRESS: at example #50000, processed 42985227 words (321041/s), 904963 word types, 0 tags
2022-03-30 16:35:52,587 : INFO : PROGRESS: at example #60000, processed 49200063 words (307911/s), 981085 word types, 0 tags
2022-03-30 16:36:13,082 : INFO : PROGRESS: at example #70000, processed 550

In [255]:
##### base
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# matplotlib
%matplotlib inline

# display
from IPython.display import display

# autoreload
%load_ext autoreload
%autoreload 2

# warnings
import warnings
# Blanket filter: every warning category is ignored from here on.
warnings.filterwarnings('ignore')

# fix random seed
from numpy.random import seed as set_random_seed
# Seeds numpy's global random state; other random sources are not touched here.
set_random_seed(42)


# debug
from icecream import ic
# `debug` is just a second name for icecream's `ic`.
debug = ic


# Load the preprocessed articles.
from data_extraction.articles import get_articles
articles_processed = get_articles(processed=True)

# Keep only rows whose value has non-zero length in every column.
for col in articles_processed.columns:
    articles_processed = articles_processed[articles_processed[col].str.len() > 0]

# Stop words: work on a private copy extended with ',' so that spaCy's
# shared module-level set is not modified for the rest of the session.
from spacy.lang.he.stop_words import STOP_WORDS
STOP_WORDS = set(STOP_WORDS) | {','}

# Tokenize row by row; the stop-word collection is forwarded to tokenize_article.
from feature_extraction.article import tokenize_article
articles = articles_processed.apply(tokenize_article, axis = 1, args = [[STOP_WORDS]])
raw_articles = get_articles()

# Inner join on the index; overlapping column names coming from
# raw_articles receive the '_raw' suffix.
data = pd.merge(
    articles,
    raw_articles,
    right_index = True,
    left_index = True,
    how = 'inner',
    suffixes= ('', '_raw')
)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [305]:
def read_corpus(data, tokens_only=False):
    """Yield one flattened token list per row of ``data``.

    The ``'content'`` entry of every row is treated as an iterable of
    iterables and is flattened one level into a single list.

    Parameters
    ----------
    data : table-like object supporting ``len()`` and ``.iloc[i]['content']``.
    tokens_only : bool, default False
        If true, yield the plain token list; otherwise wrap it in a
        ``TaggedDocument`` tagged with the row's position.
    """
    for row_idx in range(len(data)):
        lines = data.iloc[row_idx]['content']

        # Flatten one nesting level: lines -> words.
        tokens = []
        for line in lines:
            tokens.extend(line)

        if not tokens_only:
            # For training data, attach the row position as the tag.
            yield gensim.models.doc2vec.TaggedDocument(tokens, [row_idx])
        else:
            yield tokens


# Materialize the generator into a list so the corpus can be traversed more than once.
sents = list(read_corpus(data))


In [306]:
# Doc2Vec model for the article corpus (rebinds `model`).
model = gensim.models.doc2vec.Doc2Vec(
    vector_size=100,
    min_count=2,
    epochs=20,
    workers=multiprocessing.cpu_count(),
    negative=5,
)

# Vocabulary pass followed by training on the in-memory tagged documents.
model.build_vocab(sents)
model.train(
    sents,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

# NOTE(review): same file name under MODEL_PATH as the Wikipedia doc2vec
# model saved earlier in this notebook — confirm overwriting is intended.
logging.info('Saving model..')
model.save(
    os.path.join(MODEL_PATH, 'doc2vec.model')
)
logging.info('Done training doc2vec model!')

2022-03-30 19:47:48,714 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec(dm/m,d100,n5,w5,mc2,s0.001,t8)', 'datetime': '2022-03-30T19:47:48.714044', 'gensim': '4.1.2', 'python': '3.8.13 | packaged by conda-forge | (default, Mar 25 2022, 06:04:14) \n[Clang 12.0.1 ]', 'platform': 'macOS-12.3-arm64-arm-64bit', 'event': 'created'}
2022-03-30 19:47:48,715 : INFO : collecting all words and their counts
2022-03-30 19:47:48,716 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2022-03-30 19:47:48,906 : INFO : collected 36780 word types and 1791 unique tags from a corpus of 1791 examples and 886260 words
2022-03-30 19:47:48,907 : INFO : Creating a fresh vocabulary
2022-03-30 19:47:49,031 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=2 retains 22815 unique words (62.03099510603589%% of original 36780, drops 13965)', 'datetime': '2022-03-30T19:47:49.031191', 'gensim': '4.1.2', 'python': '3.8.13 | packaged by conda-forge | (default, Mar 25 2022, 06:

# TF-IDF Model

In [86]:
##### base
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os


# Load the preprocessed articles.
from data_extraction.articles import get_articles
articles_processed = get_articles(processed=True)

# Keep only rows whose value has non-zero length in every column.
for col in articles_processed.columns:
    articles_processed = articles_processed[articles_processed[col].str.len() > 0]

# No stop words are removed for the TF-IDF run (the spaCy Hebrew list is not used here).
# NOTE(review): presumably because IDF weighting already discounts very common tokens — confirm.
STOP_WORDS = []


# Tokenize row by row; the (empty) stop-word list is forwarded to tokenize_article.
from feature_extraction.article import tokenize_article
articles = articles_processed.apply(tokenize_article, axis = 1, args = [[STOP_WORDS]])
raw_articles = get_articles()


# Inner join on the index; overlapping column names coming from
# raw_articles receive the '_raw' suffix.
data = pd.merge(
    articles,
    raw_articles,
    right_index = True,
    left_index = True,
    how = 'inner',
    suffixes= ('', '_raw')
)


  art_tokenized = pd.Series(index=art.index, name=art.name)


In [89]:

def read_corpus(data, tokens_only=False):
    """Yield one flattened token list per row of ``data``.

    The ``'content'`` entry of every row is treated as an iterable of
    iterables and is flattened one level into a single list.

    ``tokens_only`` is accepted but has no effect in this version: plain
    token lists are always yielded.
    """
    total_rows = len(data)
    for position in range(total_rows):
        segments = data.iloc[position]['content']
        flat_tokens = [token for segment in segments for token in segment]
        yield flat_tokens
   

# Materialize the generator into a list so the corpus can be traversed more than once.
sents = list(read_corpus(data))


In [90]:
# Build the token dictionary from the article corpus.
# NOTE: this rebinds `dictionary`, which earlier in the notebook held the Wikipedia dictionary.
dictionary = gensim.corpora.Dictionary(sents)

2022-04-01 22:06:01,396 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2022-04-01 22:06:01,860 : INFO : built Dictionary(36780 unique tokens: ['', ',', 'אורניום', 'אחוז', 'אחרים']...) from 1791 documents (total 886260 corpus positions)
2022-04-01 22:06:01,861 : INFO : Dictionary lifecycle event {'msg': "built Dictionary(36780 unique tokens: ['', ',', 'אורניום', 'אחוז', 'אחרים']...) from 1791 documents (total 886260 corpus positions)", 'datetime': '2022-04-01T22:06:01.861435', 'gensim': '4.1.2', 'python': '3.8.13 | packaged by conda-forge | (default, Mar 25 2022, 06:04:14) \n[Clang 12.0.1 ]', 'platform': 'macOS-12.3-arm64-arm-64bit', 'event': 'created'}


In [91]:
# One bag-of-words representation per article, in corpus order.
bow_corpus = list(map(dictionary.doc2bow, sents))


In [92]:
# Fit the TF-IDF model on the bag-of-words corpus (rebinds `model`); normalization is requested explicitly.
model = gensim.models.TfidfModel(bow_corpus, normalize=True)


2022-04-01 22:06:14,199 : INFO : collecting document frequencies
2022-04-01 22:06:14,201 : INFO : PROGRESS: processing document #0
2022-04-01 22:06:14,287 : INFO : TfidfModel lifecycle event {'msg': 'calculated IDF weights for 1791 documents and 36780 features (504125 matrix non-zeros)', 'datetime': '2022-04-01T22:06:14.287025', 'gensim': '4.1.2', 'python': '3.8.13 | packaged by conda-forge | (default, Mar 25 2022, 06:04:14) \n[Clang 12.0.1 ]', 'platform': 'macOS-12.3-arm64-arm-64bit', 'event': 'initialize'}


In [98]:
# Spot check: weight a handful of words with the fitted TF-IDF model.
words = [
    'ממשלה',
    'מה',
    'אתם',
    'מקרר',
]
print(
    model[dictionary.doc2bow(words)]
)


[(82, 0.033901128074771594), (169, 0.22013880197684657), (404, 0.08236429332515928), (35474, 0.9713937124382183)]


In [108]:
# Output directory for the TF-IDF artifacts.
TFIDF_PATH = os.path.join(PROJECT_DIR, 'tfidf/')
# Create it up front so saving cannot fail on a missing folder.
os.makedirs(TFIDF_PATH, exist_ok=True)

model.save(os.path.join(TFIDF_PATH, 'tfidf.model'))

2022-04-02 12:46:19,763 : INFO : TfidfModel lifecycle event {'fname_or_handle': '/Users/uri/Documents/Code/ynet-summarizer/tfidf/tfidf.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2022-04-02T12:46:19.763617', 'gensim': '4.1.2', 'python': '3.8.13 | packaged by conda-forge | (default, Mar 25 2022, 06:04:14) \n[Clang 12.0.1 ]', 'platform': 'macOS-12.3-arm64-arm-64bit', 'event': 'saving'}
2022-04-02 12:46:19,811 : INFO : saved /Users/uri/Documents/Code/ynet-summarizer/tfidf/tfidf.model


In [109]:
# Store the dictionary as text in the same directory as the TF-IDF model.
dictionary.save_as_text(os.path.join(TFIDF_PATH, 'dict.txt'))

2022-04-02 12:46:31,461 : INFO : saving dictionary mapping to /Users/uri/Documents/Code/ynet-summarizer/tfidf/dict.txt
