In [39]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
import nltk
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer
import gensim
import re
import logging
from helpers import remove_tags, seperate_ontology

# Set log
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [48]:
nltk.download('stopwords')

[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1045)>


False

In [41]:
# DATASET
DATASET_COLUMNS = ["target", "ids", "date", "flag", "user", "text"]
DATASET_ENCODING = "ISO-8859-1"
TRAIN_SIZE = 0.8

# TEXT CLENAING
TEXT_CLEANING_RE = "@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

# WORD2VEC 
W2V_SIZE = 300
W2V_WINDOW = 7
W2V_EPOCH = 32
W2V_MIN_COUNT = 10

# KERAS
SEQUENCE_LENGTH = 300
EPOCHS = 8
BATCH_SIZE = 1024

# SENTIMENT
POSITIVE = "POSITIVE"
NEGATIVE = "NEGATIVE"
NEUTRAL = "NEUTRAL"
SENTIMENT_THRESHOLDS = (0.4, 0.7)

# EXPORT
KERAS_MODEL = "model.h5"
WORD2VEC_MODEL = "model.w2v"
TOKENIZER_MODEL = "tokenizer.pkl"
ENCODER_MODEL = "encoder.pkl"

In [42]:
data = pd.read_excel('../data/0_raw/RegInsight_Dataset.xlsx', engine='openpyxl')
data['RegInsightTextNative_Clean']=data['RegInsightTextNative'].apply(lambda cw : remove_tags(cw))
data['RegOntologyId_Clean']=data['RegOntologyId'].apply(lambda cw : seperate_ontology(cw))

In [66]:
df = data[['RegInsightTextNative_Clean']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [67]:
decode_map = {0: "NEGATIVE", 2: "NEUTRAL", 4: "POSITIVE"}
def decode_sentiment(label):
    return decode_map[int(label)]

In [68]:
stop_words = stopwords.words("english")
stemmer = SnowballStemmer("english")

In [69]:
def preprocess(text, stem=False):
    # Remove link,user and special characters
    text = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    tokens = []
    for token in text.split():
        if token not in stop_words:
            if stem:
                tokens.append(stemmer.stem(token))
            else:
                tokens.append(token)
    return " ".join(tokens)

In [70]:
%%time
df.text = df.text.apply(lambda x: preprocess(x))

CPU times: user 42.2 s, sys: 537 ms, total: 42.8 s
Wall time: 49.2 s


In [71]:


df_train, df_test = train_test_split(df, test_size=1-TRAIN_SIZE, random_state=42)
print("TRAIN size:", len(df_train))
print("TEST size:", len(df_test))



TRAIN size: 6954
TEST size: 1739


In [72]:
%%time
documents = [_text.split() for _text in df_train.text] 

CPU times: user 1.28 s, sys: 609 ms, total: 1.89 s
Wall time: 2.38 s


In [81]:
w2v_model = gensim.models.word2vec.Word2Vec(vector_size=W2V_SIZE, 
                                            window=W2V_WINDOW, 
                                            min_count=W2V_MIN_COUNT, 
                                            workers=8)

2021-10-03 10:55:56,201 : INFO : Word2Vec lifecycle event {'params': 'Word2Vec(vocab=0, vector_size=300, alpha=0.025)', 'datetime': '2021-10-03T10:55:56.200751', 'gensim': '4.1.2', 'python': '3.7.0 (v3.7.0:1bf9cc5093, Jun 26 2018, 23:26:24) \n[Clang 6.0 (clang-600.0.57)]', 'platform': 'Darwin-18.0.0-x86_64-i386-64bit', 'event': 'created'}


In [82]:
w2v_model.build_vocab(documents);

2021-10-03 10:56:00,048 : INFO : collecting all words and their counts
2021-10-03 10:56:00,050 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-10-03 10:56:03,429 : INFO : collected 105466 word types from a corpus of 6977128 raw words and 6954 sentences
2021-10-03 10:56:03,430 : INFO : Creating a fresh vocabulary
2021-10-03 10:56:03,659 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=10 retains 19257 unique words (18.25896497449415%% of original 105466, drops 86209)', 'datetime': '2021-10-03T10:56:03.659004', 'gensim': '4.1.2', 'python': '3.7.0 (v3.7.0:1bf9cc5093, Jun 26 2018, 23:26:24) \n[Clang 6.0 (clang-600.0.57)]', 'platform': 'Darwin-18.0.0-x86_64-i386-64bit', 'event': 'prepare_vocab'}
2021-10-03 10:56:03,660 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=10 leaves 6802624 word corpus (97.4989135931002%% of original 6977128, drops 174504)', 'datetime': '2021-10-03T10:56:03.660568', 'gensim': '4.1.2', 'python': '3.7.0 (v

In [93]:
words = w2v_model.wv.index_to_key
vocab_size = len(words)
print("Vocab size", vocab_size)

Vocab size 19257


In [94]:


%%time
w2v_model.train(documents, total_examples=len(documents), epochs=W2V_EPOCH);



2021-10-03 11:02:27,158 : INFO : Word2Vec lifecycle event {'msg': 'training model with 8 workers on 19257 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=7 shrink_windows=True', 'datetime': '2021-10-03T11:02:27.158541', 'gensim': '4.1.2', 'python': '3.7.0 (v3.7.0:1bf9cc5093, Jun 26 2018, 23:26:24) \n[Clang 6.0 (clang-600.0.57)]', 'platform': 'Darwin-18.0.0-x86_64-i386-64bit', 'event': 'train'}
2021-10-03 11:02:28,214 : INFO : EPOCH 1 - PROGRESS: at 7.25% examples, 438653 words/s, in_qsize 15, out_qsize 0
2021-10-03 11:02:29,296 : INFO : EPOCH 1 - PROGRESS: at 13.16% examples, 394724 words/s, in_qsize 15, out_qsize 0
2021-10-03 11:02:30,315 : INFO : EPOCH 1 - PROGRESS: at 20.25% examples, 418064 words/s, in_qsize 16, out_qsize 0
2021-10-03 11:02:31,318 : INFO : EPOCH 1 - PROGRESS: at 27.11% examples, 425432 words/s, in_qsize 15, out_qsize 0
2021-10-03 11:02:32,332 : INFO : EPOCH 1 - PROGRESS: at 35.45% examples, 442185 words/s, in_qsize 16, out_qsize 1
2021-1

CPU times: user 14min 40s, sys: 8.62 s, total: 14min 49s
Wall time: 5min 44s


(208086747, 223268096)

In [135]:
w2v_model.wv.most_similar("america")
# w2v_model.wv.most_similar("data")

[('pondent', 0.3976114094257355),
 ('ontario', 0.3891284167766571),
 ('nicl', 0.38507279753685),
 ('jfk', 0.37874433398246765),
 ('bofa', 0.37341007590293884),
 ('1617', 0.3613406717777252),
 ('inc', 0.35106754302978516),
 ('stonestreet', 0.35037073493003845),
 ('bnp', 0.3469139039516449),
 ('determinationschanges', 0.34622156620025635)]

In [133]:
w2v_model.wv.index_to_key[:2]

['exchange', 'rule']

In [105]:
%%time
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_train.text)

vocab_size = len(tokenizer.word_index) + 1
print("Total words", vocab_size)



Total words 105467
CPU times: user 5.51 s, sys: 109 ms, total: 5.62 s
Wall time: 6.47 s
