In [23]:
# dependences
import pandas as pd
import pickle as pkl
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE
import numpy as np
from gensim import models
from gensim.models import Word2Vec
from sklearn.manifold import TSNE

In [24]:
# import pickles
def read_pickle(pickle_name):
    path = "../pickles/"
    return pkl.load(open(path+pickle_name, "rb"))

In [25]:
def dump_pickle(obj, pickle_name):
    path = "../pickles/"
    pkl.dump(obj, open(path+pickle_name, "wb"))

In [26]:
def label_map(labels):
    label_dict = {}
    temp1 = list(set(labels))
    temp2 = [(i+1) for i in range(len(temp1))]
    for i in range(len(temp1)):
        label_dict[temp1[i]] = int(temp2[i])
    mapped_labels = []
    for label in labels:
        mapped_labels.append(label_dict[label])
    return mapped_labels, label_dict

In [27]:
def tf_idf(texts):
    tfidf = TfidfVectorizer()
    tfs = tfidf.fit_transform(texts)
    tfs_df = pd.DataFrame(tfs.A, columns = tfidf.get_feature_names())
    vocab = tfidf.get_feature_names()
    dump_pickle(tfs_df, 'tfs.pkl')
    dump_pickle(vocab, 'tfs_vocab.pkl')
    return tfs_df, vocab

In [28]:
def untokenize(texts):
    docs = []
    for doc in texts:
        temp = ""
        for word in doc:
            temp += word + " "
        docs.append(temp)
    return docs

In [29]:
def w2v(texts, size):
    model = Word2Vec(texts, size = size)
    word_dict = dict(zip(model.wv.index2word, model.wv.vectors))
    dump_pickle(word_dict, 'w2v_dict.pkl')
    return word_dict

In [30]:
def d2v(texts, word_dict, size):
    doc_vecs = []
    for doc in texts:
        counter = 0
        vecs = np.zeros(size)
        for words in doc:
            if words in word_dict:
                vecs = np.add(vecs,word_dict[words])
                counter += 1
        vecs = np.divide(vecs,counter)
        doc_vecs.append(vecs)
    dump_pickle(doc_vecs, 'doc_vecs.pkl')
    return doc_vecs

In [31]:
# texts = list of documents
# tfs_df = dataframe of tfidf values
# word_dict = dict that maps words to w2v vectors
# vocab = vocabulary of tfidf
# size = size of w2v vectors
def tfidf_d2v(texts, tfs_df, word_dict, vocab, size):
    doc_vecs = []
    for i in range(len(tfs_df)):
        n = 0
        vectors = [0]*size
        for word in vocab:
            if word in word_dict:
                vectors += word_dict[word]*tfs_df[word][i]
                n += 1
        doc_vecs.append([i/n for i in vectors])
    dump_pickle(doc_vecs, 'weighted_doc_vecs.pkl')
    return doc_vecs

In [32]:
def tsne(vecs, model_name, perplexity = 30):
    model_tsne = TSNE(n_components=2, verbose = 1, perplexity = perplexity,
                  learning_rate = 30, random_state=0)
    tsne = model_tsne.fit_transform(vecs)
    dump_pickle(tsne, model_name+"_tsne.pkl")

In [33]:
labels = read_pickle("cleaned_train_data.pkl")['labels']
texts = read_pickle("cleaned_train_data.pkl")['texts']
size = 50
tfs_df, vocab = tf_idf(untokenize(texts))
dic = w2v(texts, size)
doc_vecs = d2v(texts, dic, size)
weighted_doc_vecs = tfidf_d2v(texts, tfs_df, dic, vocab, size)
tsne(tfs_df, "tfidf")
tsne(doc_vecs, "d2v")
tsne(weighted_doc_vecs, "weighted_d2v")

[t-SNE] Computing pairwise distances...
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Computed conditional probabilities for sample 534 / 534
[t-SNE] Mean sigma: 0.281165
[t-SNE] KL divergence after 100 iterations with early exaggeration: 1.891549
[t-SNE] Error after 300 iterations: 1.891549
[t-SNE] Computing pairwise distances...
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Computed conditional probabilities for sample 534 / 534
[t-SNE] Mean sigma: 0.150990
[t-SNE] KL divergence after 100 iterations with early exaggeration: 1.515203
[t-SNE] Error after 375 iterations: 1.515203
[t-SNE] Computing pairwise distances...
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Computed conditional probabilities for sample 534 / 534
[t-SNE] Mean sigma: 0.000532
[t-SNE] KL divergence after 100 iterations with early exaggeration: 1.438098
[t-SNE] Error after 325 iterations: 1.438098
