## Sources in which this notebook was inspired from 
### https://www.kaggle.com/pierremegret/gensim-word2vec-tutorial
### https://machinelearningmastery.com/develop-word-embeddings-python-gensim/
### https://towardsdatascience.com/visualising-high-dimensional-datasets-using-pca-and-t-sne-in-python-8ef87e7915b
### https://nbviewer.jupyter.org/github/skipgram/modern-nlp-in-python/blob/master/executable/Modern_NLP_in_Python.ipynb#topic=0&lambda=1&term=
### https://districtdatalabs.silvrback.com/principal-component-analysis-with-python
### https://plot.ly/ipython-notebooks/principal-component-analysis/
### https://www.kaggle.com/hamishdickson/training-and-plotting-word2vec-with-bigrams

In [1]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from gensim.models import Word2Vec
from mpl_toolkits.mplot3d import Axes3D
import nltk
# nltk.download('punkt')
# nltk.download('stopwords')
import matplotlib.cm as cm
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pyodbc
import os
import time
import string
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
import pickle
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import HoverTool, ColumnDataSource, value
output_notebook()

## Grabbing the full IMA data from SQL 

In [2]:
con = pyodbc.connect(r"Driver={/usr/local/lib/libmsodbcsql.17.dylib};Server=EPGLBSQLDSDV01,50429;Database=GILA;Trusted_Connection=yes;timeout=login_timeout")
rules = pd.read_sql(f'SELECT * FROM dbo.TrainingDB_v2 with(nolock) WHERE Comments IS NOT NULL', con)

## Creating a few cleaner functions for the Comments field 

In [3]:
def note_cleaner(rules_df):
    """
    This takes in the comments from the text cleans them for certain strings shown in note_list below
    :param rules_df: dataframe with current rules
    :return: dataframe with cleaned comments
    """
    note_list = ['OCU NOTE:', 'BBG NOTE: ', 'US NOTE:', 'AIM RULE:', 'MANUAL CHECK ON FAIL', 'NOTE:']
    rules_df['Comments'] = rules_df.Comments.str.split('|'.join(note_list)).str.get(0)
    return rules_df

In [4]:
def reference_sentence_preprocessing(clean_rules_df):
    '''
    Function that takes in a dataframe with SQL table with all fund mandate information.

    :param rules: pandas dataframe
    :return: list of word_tokenized paragraphs/sentences
    '''
    # NLTK word tokenizer with stop words removed
    stop_words = set(nltk.corpus.stopwords.words('english'))
    stopped_sentences = []
    for row in clean_rules_df['Comments']:
        for sentence in nltk.sent_tokenize(row):
            cleaned_sentence = sentence.translate(str.maketrans('', '', string.punctuation))
            cleaned_sentences = []
            for word in nltk.word_tokenize(cleaned_sentence):
                if word.lower() not in stop_words:
                    cleaned_sentences.append(word.lower())
            stopped_sentences.append(cleaned_sentences)
    return stopped_sentences

In [6]:
def reference_sentence_preprocessing_yo(row):
    '''
    Function that takes in a dataframe with SQL table with all fund mandate information.

    :param rules: pandas dataframe
    :return: list of word_tokenized paragraphs/sentences
    '''
    # NLTK word tokenizer with stop words removed
    stop_words = set(nltk.corpus.stopwords.words('english'))
    stopped_words = []
#     for row in clean_rules_df['Comments']:
    for word in nltk.word_tokenize(row):
            if word.lower() not in stop_words:
                stopped_words.append(word.lower().translate(str.maketrans('', '', string.punctuation)))
    return stopped_words

In [7]:
clean_rules_df = note_cleaner(rules)

In [8]:
clean_rules_df['clean_comments'] = clean_rules_df['Comments'].apply(lambda row: reference_sentence_preprocessing_yo(row))
clean_rules_df['clean_comments']

target = clean_rules_df['Description']
target
X_train, X_test, y_train, y_test = train_test_split(clean_rules_df, target, test_size=0.10, random_state=42)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(423, 21)
(47, 21)
(423,)
(47,)


In [9]:
import multiprocessing

cores = multiprocessing.cpu_count()

## Starting the training of the Word2Vec models on cleaned sentence data 

In [10]:
model_cbow = Word2Vec(size=100, #dimensionality of vectors
                      sample=0.001, # threshold for configuring which high-freq. words are randomly downsampled
                      alpha=0.025, # the initial learning rate 
                      min_count=2, # Ignores all words with total absolute frequency lower than this 
                      min_alpha=0.0001, #Learning rate will linearly drop to min_alpha as training progresses. 
                      sg=0, # 0= CBOW and 1= skipgram
                      workers=cores-1)

In [11]:
model_skipgram = Word2Vec(size=100, #dimensionality of vectors
                      sample=0.001, # threshold for configuring which high-freq. words are randomly downsampled
                      alpha=0.025, # the initial learning rate 
                      min_count=2, # Ignores all words with total absolute frequency lower than this 
                      min_alpha=0.0001, #Learning rate will linearly drop to min_alpha as training progresses. 
                      sg=1, # 0= CBOW and 1= skipgram
                      workers=cores-1)

## Build the Vocabulary 


In [12]:
model_cbow.build_vocab(clean_rules_df['clean_comments'], progress_per=1)

In [13]:
model_skipgram.build_vocab(clean_rules_df['clean_comments'], progress_per=1000)

In [14]:
model_cbow.iter


Call to deprecated `iter` (Attribute will be removed in 4.0.0, use self.epochs instead).



5

## Training the Model


In [15]:
model_cbow.train(clean_rules_df['clean_comments'],
                total_examples=model_cbow.corpus_count, # count of sentences
                epochs=30, # number of iterations over the corpus
                report_delay=1)

(484055, 729180)

In [16]:
model_skipgram.train(clean_rules_df['clean_comments'],
                total_examples=model_skipgram.corpus_count, # count of sentences
                epochs=30, # number of iterations over the corpus
                report_delay=1)

(483799, 729180)

In [17]:
model_cbow.init_sims(replace=True)
model_skipgram.init_sims(replace=True)

## Preparing Cbow for Visualization with T-SNE


In [18]:
# build a list of the terms, integer indices,
# and term counts from the word2vec model vocabulary
ordered_vocab = [(term, voc.index, voc.count) for term, voc in model_cbow.wv.vocab.items()]

# sort by the term counts, so the most common terms appear first
ordered_vocab = sorted(ordered_vocab, key=lambda x:x[2], reverse=True)

# unzip the terms, integer indices, and counts into separate lists
ordered_terms, term_indices, term_counts = zip(*ordered_vocab)

# create a DataFrame with the word2vec vectors as data,
# and the terms as row labels
tsne_input = pd.DataFrame(model_cbow.wv.syn0norm[term_indices, :],index=ordered_terms)

tsne_input.shape

tsne_filepath = os.path.join(os.getcwd(),
                             u'tsne_model_cbow')
tsne_vectors_filepath = os.path.join(os.getcwd(),
                                     u'tsne_vectors_cbow.npy')

tsne = TSNE(n_components=3)
tsne_vectors = tsne.fit_transform(tsne_input.values)

with open(tsne_filepath, 'wb') as f:
    pickle.dump(tsne, f)

pd.np.save(tsne_vectors_filepath, tsne_vectors)
    
with open(tsne_filepath, 'rb') as f:
    tsne = pickle.load(f, encoding='utf-8', errors='ignore')
    
tsne_vectors = pd.np.load(tsne_vectors_filepath)

tsne_vectors = pd.DataFrame(tsne_vectors,
                            index=pd.Index(tsne_input.index),
                            columns=[u'x_coord', u'y_coord', u'z_coord'])

tsne_vectors.head()

tsne_vectors[u'word'] = tsne_vectors.index


Call to deprecated `syn0norm` (Attribute will be removed in 4.0.0, use self.wv.vectors_norm instead).



In [19]:
tsne_vectors.head()

Unnamed: 0,x_coord,y_coord,z_coord,word
,-3.645095,-20.349792,-7.971199,
fund,-11.264399,7.351821,0.671343,fund
securities,-7.152228,-14.008588,-9.168035,securities
investment,0.754216,17.764633,1.327887,investment
may,-11.271077,-1.933685,-8.044405,may


## Visualizing Model_cbow with T-SNE

In [None]:
# add our DataFrame as a ColumnDataSource for Bokeh
plot_data = ColumnDataSource(tsne_vectors)

# create the plot and configure the
# title, dimensions, and tools
tsne_plot = figure(title=u't-SNE Word Embeddings',
                   plot_width = 800,
                   plot_height = 800,
                   tools= (u'pan, wheel_zoom, box_zoom,'
                           u'box_select, reset'),
                   active_scroll=u'wheel_zoom')

# add a hover tool to display words on roll-over
tsne_plot.add_tools( HoverTool(tooltips = u'@word') )

# draw the words as circles on the plot
tsne_plot.circle(u'x_coord', u'y_coord', source=plot_data,
                 color=u'blue', line_alpha=0.2, fill_alpha=0.1,
                 size=10, hover_line_color=u'black')

# configure visual elements of the plot
tsne_plot.title.text_font_size = value(u'16pt')
tsne_plot.xaxis.visible = False
tsne_plot.yaxis.visible = False
tsne_plot.grid.grid_line_color = None
tsne_plot.outline_line_color = None

# engage!
show(tsne_plot);

In [None]:
trace1 = go.Scatter3d(
    x=tsne_vectors['x_coord'], 
    y=tsne_vectors['y_coord'], 
    z=tsne_vectors['z_coord'],
    mode='markers',
    marker=dict(
        size=5,
#         color=subset_bidask.values,                # set color to an array/list of desired values
        colorscale='Jet',   # choose a colorscale
        opacity=0.8,
        colorbar={"thickness": 15, "len": 0.8, "x": 0.8, "y": 0.4, }
    )
)

data = [trace1]
layout = go.Layout(
    margin=dict(
        l=0,
        r=0,
        b=0,
        t=0
    )
)
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='3d-scatter-colorscale')

## Preparing Skipgram for Visualization with T-SNE


In [None]:
# build a list of the terms, integer indices,
# and term counts from the food2vec model vocabulary
ordered_vocab = [(term, voc.index, voc.count)
                 for term, voc in model_skipgram.wv.vocab.items()]
# sort by the term counts, so the most common terms appear first
ordered_vocab = sorted(ordered_vocab, key=lambda x:x[2], reverse=True)

# unzip the terms, integer indices, and counts into separate lists
ordered_terms, term_indices, term_counts = zip(*ordered_vocab)

# create a DataFrame with the food2vec vectors as data,
# and the terms as row labels
tsne_input = pd.DataFrame(model_skipgram.wv.syn0norm[term_indices, :],
                            index=ordered_terms)

tsne_filepath = os.path.join(os.getcwd(),
                             u'tsne_model_skipgram')
tsne_vectors_filepath = os.path.join(os.getcwd(),
                                     u'tsne_vectors_skipgram.npy')
tsne = TSNE(n_components=3)
tsne_vectors = tsne.fit_transform(tsne_input.values)

with open(tsne_filepath, 'wb') as f:
    pickle.dump(tsne, f)

pd.np.save(tsne_vectors_filepath, tsne_vectors)
    
with open(tsne_filepath, 'rb') as f:
    tsne = pickle.load(f, encoding='utf-8', errors='ignore')
    
tsne_vectors = pd.np.load(tsne_vectors_filepath)

tsne_vectors = pd.DataFrame(tsne_vectors,
                            index=pd.Index(tsne_input.index),
                            columns=[u'x_coord', u'y_coord', u'z_coord'])

tsne_vectors[u'word'] = tsne_vectors.index

## Visualizing Model_skipgram with T-SNE

In [None]:
# add our DataFrame as a ColumnDataSource for Bokeh
plot_data = ColumnDataSource(tsne_vectors)

# create the plot and configure the
# title, dimensions, and tools
tsne_plot = figure(title=u't-SNE Word Embeddings',
                   plot_width = 800,
                   plot_height = 800,
                   tools= (u'pan, wheel_zoom, box_zoom,'
                           u'box_select, reset'),
                   active_scroll=u'wheel_zoom')

# add a hover tool to display words on roll-over
tsne_plot.add_tools( HoverTool(tooltips = u'@word') )

# draw the words as circles on the plot
tsne_plot.circle(u'x_coord', u'y_coord', source=plot_data,
                 color=u'blue', line_alpha=0.2, fill_alpha=0.1,
                 size=10, hover_line_color=u'black')

# configure visual elements of the plot
tsne_plot.title.text_font_size = value(u'16pt')
tsne_plot.xaxis.visible = False
tsne_plot.yaxis.visible = False
tsne_plot.grid.grid_line_color = None
tsne_plot.outline_line_color = None

# engage!
show(tsne_plot);

In [None]:
trace1 = go.Scatter3d(
    x=tsne_vectors['x_coord'], 
    y=tsne_vectors['y_coord'], 
    z=tsne_vectors['z_coord'],
    mode='markers',
    marker=dict(
        size=5,
#         color=subset_bidask.values,                # set color to an array/list of desired values
        colorscale='Jet',   # choose a colorscale
        opacity=0.8,
        colorbar={"thickness": 15, "len": 0.8, "x": 0.8, "y": 0.4, }
    )
)

data = [trace1]
layout = go.Layout(
    margin=dict(
        l=0,
        r=0,
        b=0,
        t=0
    )
)
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='3d-scatter-colorscale')