In [1]:
import logging
import gensim
import pandas as pd

from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence

from helpers import remove_tags, seperate_ontology

# Surface INFO-level messages (gensim reports vocabulary and training progress
# through the logging module) with a timestamped prefix.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [2]:
# Word2Vec hyper-parameters, consumed when the model is constructed and trained below.
W2V_SIZE = 300  # passed as `vector_size` (embedding dimensionality)
W2V_WINDOW = 7  # passed as `window` (context window size)
W2V_EPOCH = 32  # passed as `epochs` to `train`
W2V_MIN_COUNT = 10  # passed as `min_count` (words rarer than this are dropped from the vocabulary)

In [3]:
# Read the raw workbook, then derive the two "_Clean" columns with the project helpers.
# NOTE(review): remove_tags / seperate_ontology live in helpers.py; their exact
# behaviour is not visible from this notebook.
data = pd.read_excel('../data/0_raw/RegInsight_Dataset.xlsx', engine='openpyxl')
data['RegInsightTextNative_Clean'] = data['RegInsightTextNative'].apply(remove_tags)
data['RegOntologyId_Clean'] = data['RegOntologyId'].apply(seperate_ontology)

In [4]:
# Jurisdiction labels that make up the North America subset.
NorthAmerica = [
    'United States of America',
    'Canada',
    'US State - New York',
]
# Keep only the rows whose CUBEJurisdiction is one of the labels above.
df_NorthAmerica = data.loc[data['CUBEJurisdiction'].isin(NorthAmerica)]

In [21]:
# Single-column frame holding the cleaned text under the name 'text'.
df = data['RegInsightTextNative_Clean'].to_frame(name='text')

In [22]:
%%time
documents = [_text.split() for _text in df.text] 

CPU times: user 1.74 s, sys: 822 ms, total: 2.56 s
Wall time: 3.97 s


In [23]:
# Create an untrained Word2Vec model; the vocabulary is built and the weights
# are trained in the following cells.
w2v_model = gensim.models.Word2Vec(
    vector_size=W2V_SIZE,
    window=W2V_WINDOW,
    min_count=W2V_MIN_COUNT,
    workers=8,
)

2021-10-04 10:22:30,925 : INFO : Word2Vec lifecycle event {'params': 'Word2Vec(vocab=0, vector_size=300, alpha=0.025)', 'datetime': '2021-10-04T10:22:30.924798', 'gensim': '4.1.2', 'python': '3.7.0 (v3.7.0:1bf9cc5093, Jun 26 2018, 23:26:24) \n[Clang 6.0 (clang-600.0.57)]', 'platform': 'Darwin-18.0.0-x86_64-i386-64bit', 'event': 'created'}


In [24]:
# Scan the tokenised corpus once to collect the vocabulary (trailing ';' hides the cell result).
w2v_model.build_vocab(corpus_iterable=documents);

2021-10-04 10:22:31,015 : INFO : collecting all words and their counts
2021-10-04 10:22:31,021 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-10-04 10:22:34,756 : INFO : collected 393219 word types from a corpus of 12808329 raw words and 8693 sentences
2021-10-04 10:22:34,759 : INFO : Creating a fresh vocabulary
2021-10-04 10:22:35,247 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=10 retains 43889 unique words (11.16146473084973%% of original 393219, drops 349330)', 'datetime': '2021-10-04T10:22:35.246985', 'gensim': '4.1.2', 'python': '3.7.0 (v3.7.0:1bf9cc5093, Jun 26 2018, 23:26:24) \n[Clang 6.0 (clang-600.0.57)]', 'platform': 'Darwin-18.0.0-x86_64-i386-64bit', 'event': 'prepare_vocab'}
2021-10-04 10:22:35,248 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=10 leaves 12141544 word corpus (94.79412966359624%% of original 12808329, drops 666785)', 'datetime': '2021-10-04T10:22:35.248469', 'gensim': '4.1.2', 'python': '3.7

In [25]:
# Words retained in the Word2Vec vocabulary, and how many there are.
words = w2v_model.wv.index_to_key
vocab_size = len(words)
print(f"Vocab size {vocab_size}")

Vocab size 43889


In [26]:


%%time
w2v_model.train(documents, total_examples=len(documents), epochs=W2V_EPOCH);



2021-10-04 10:22:36,790 : INFO : Word2Vec lifecycle event {'msg': 'training model with 8 workers on 43889 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=7 shrink_windows=True', 'datetime': '2021-10-04T10:22:36.790827', 'gensim': '4.1.2', 'python': '3.7.0 (v3.7.0:1bf9cc5093, Jun 26 2018, 23:26:24) \n[Clang 6.0 (clang-600.0.57)]', 'platform': 'Darwin-18.0.0-x86_64-i386-64bit', 'event': 'train'}
2021-10-04 10:22:37,808 : INFO : EPOCH 1 - PROGRESS: at 4.01% examples, 537884 words/s, in_qsize 15, out_qsize 0
2021-10-04 10:22:38,809 : INFO : EPOCH 1 - PROGRESS: at 8.73% examples, 569414 words/s, in_qsize 16, out_qsize 3
2021-10-04 10:22:39,817 : INFO : EPOCH 1 - PROGRESS: at 14.79% examples, 594929 words/s, in_qsize 15, out_qsize 0
2021-10-04 10:22:40,824 : INFO : EPOCH 1 - PROGRESS: at 20.89% examples, 603548 words/s, in_qsize 15, out_qsize 0
2021-10-04 10:22:41,824 : INFO : EPOCH 1 - PROGRESS: at 27.16% examples, 599878 words/s, in_qsize 15, out_qsize 0
2021-10

CPU times: user 23min 47s, sys: 14.3 s, total: 24min 1s
Wall time: 9min 23s


(311896808, 409866528)

In [27]:
# Sanity-check the embeddings: list the nearest neighbours of a probe word.
# Other probes previously tried here: "america", "data".
w2v_model.wv.most_similar("money", topn=10)

[('funds', 0.48755329847335815),
 ('proceeds', 0.3728924095630646),
 ('cash', 0.36217111349105835),
 ('money.', 0.3498491048812866),
 ('monies', 0.3430376648902893),
 ('payments', 0.33700433373451233),
 ('funds,', 0.33405762910842896),
 ('hedge', 0.32827451825141907),
 ('Wiseman', 0.32818976044654846),
 ('misappropriating', 0.3251149356365204)]

In [32]:
# Similarity score between the vectors of two specific words.
w2v_model.wv.similarity(w1='money', w2='withdrawal')

0.094042175

In [33]:
# Same probe as the 'withdrawal' check, this time against 'deposit'.
w2v_model.wv.similarity(w1='money', w2='deposit')

0.24093191

In [29]:
# Peek at the first two vocabulary entries.
# NOTE(review): presumably ordered most-frequent first — confirm against gensim docs.
w2v_model.wv.index_to_key[:2]

['the', 'of']

In [30]:
%%time
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df.text)

vocab_size = len(tokenizer.word_index) + 1
print("Total words", vocab_size)



Total words 154495
CPU times: user 17 s, sys: 2.14 s, total: 19.2 s
Wall time: 20.5 s
