 note: type of document is not consistent - we take what the inst has labeled as the true label

In [None]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from datetime import datetime as dt
import seaborn as sns

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import SpectralClustering
from sklearn.metrics import normalized_mutual_info_score, adjusted_rand_score, v_measure_score
from sklearn.feature_extraction.text import TfidfVectorizer

import spacy
nlp = spacy.load('en')
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

import networkx as nx
from scipy.spatial import distance

%matplotlib inline
sns.set(style='darkgrid')
stop = stopwords.words('english')
today = dt.today().strftime('%Y%m%d')

# 1. Load Data

In [None]:
# Read every line of the raw corpus. The context manager closes the handle
# even if reading fails part-way through.
in_file = '../data/coe.txt'
with open(in_file, 'r') as f:
    raw_file = f.readlines()

## 1.2. Preparing Data

In [None]:
# text cleaning: lowercase and trim every line, then drop lines left empty
raw_file = [cleaned
            for cleaned in (line.lower().strip() for line in raw_file)
            if cleaned]

In [None]:
# get metadata
# Every record is anchored on its 'region:' line; the remaining header fields
# are read at fixed offsets from that anchor.
region_indices = [pos for pos, line in enumerate(raw_file) if 'region:' in line]
name_indices = [anchor - 1 for anchor in region_indices]     # line just before the anchor
country_indices = [anchor + 1 for anchor in region_indices]
org_indices = [anchor + 2 for anchor in region_indices]
doc_indices = [anchor + 3 for anchor in region_indices]

In [None]:
# The body of a record starts four lines after its 'region:' anchor.
text_start = [anchor + 4 for anchor in region_indices]
# The end of each body is derived from the NEXT record's name line; the last
# record runs to the end of the file.
# NOTE(review): these ends are used as exclusive slice bounds, so
# `name line - 1` leaves out the line directly before the next name - confirm
# that line is a separator and not body text.
text_end = [name_pos - 1 for name_pos in name_indices[1:]] + [len(raw_file)]
text_indices = list(zip(text_start, text_end))

In [None]:
# Empty frame with one column per parsed field; the columns are filled in
# by the cells that follow.
col_name = ['ORG_NAME', 'ORG_TYPE', 'DOC_TYPE', 'COUNTRY', 'REGION', 'TEXT']
df = pd.DataFrame(columns=col_name)

In [None]:
# The organisation name is stored as the whole line.
df['ORG_NAME'] = [raw_file[i] for i in name_indices]

# The other header lines look like "<label>:<value>"; keep the part after the
# first colon (any whitespace following the colon is kept as-is).
for column, indices in (('ORG_TYPE', org_indices),
                        ('DOC_TYPE', doc_indices),
                        ('COUNTRY', country_indices),
                        ('REGION', region_indices)):
    df[column] = [raw_file[i] for i in indices]
    df[column] = df[column].str.split(':').str[1]

In [None]:
# One space-joined string per document, built from its (start, end) line span.
df['TEXT'] = [' '.join(raw_file[start:stop]) for start, stop in text_indices]
# df['TEXT'] = df['TEXT'].apply(lambda x: x.split('.'))

In [9]:
# Normalise one inconsistent region spelling and one organisation-type spelling.
df['REGION'] = df['REGION'].map(
    lambda region: region.replace("asian-pacific group", "asia-pacific group"))
df['ORG_TYPE'] = df['ORG_TYPE'].map(
    lambda org_type: org_type.replace("chemistry-industry", "chemistry - industry"))

In [10]:
df.groupby('DOC_TYPE').size()

DOC_TYPE
 code of conduct    74
 code of ethics     64
 combined            5
dtype: int64

In [11]:
def plot_count(col, y_lable, title, df=df):
    """Draw and save a horizontal bar chart of how often each value of `col` occurs.

    col     : column of `df` whose values are counted
    y_lable : text for the y-axis label
    title   : figure title
    df      : frame to plot; defaults to the module-level `df` as it existed
              when this function was defined

    The figure is written to ../result/img/count_<col>_<today>.jpg.
    """
    plt.figure(figsize=(12, 10))
    # bars are ordered by descending frequency
    by_frequency = df[col].value_counts().index
    sns.countplot(y=col, data=df, order=by_frequency, color='k')
    plt.xticks(rotation=45)
    plt.ylabel(y_lable)
    plt.title(title)
    plt.tight_layout()
    out_path = '../result/img/count_{}_{}.jpg'.format(col, today)
    plt.savefig(out_path)

In [12]:
# plot_count('REGION', "Type of Organization", "Count of Documents per Region")

# 2. Topic Analysis

### 2.1. LDA

In [13]:
from gensim import models

In [14]:
# NLTK's English stop words plus corpus-specific terms (generic
# code-of-conduct vocabulary and what look like organisation names).
# NOTE(review): 'dragon_oil' is an underscore-joined phrase token, but
# remove_stopwords() is applied before make_bigrams() further down - confirm
# this entry can ever match.
stop_words = stopwords.words('english') + [
    'from', 'subject', 're', 'edu', 'use', 'code', 'conduct', 'business', 'solvay', 'shall',
    'may', 'must', 'braskem', 'sasol', 'petrobra', 'petkim', 'yara', 'corporation',
    'dragon_oil', 'akzonobel',
]
#TODO: expand the stopword vocabulary

In [15]:
from gensim.utils import simple_preprocess

# Tokenize words and Clean-up text
def sent_to_words(sentences):
    """Lazily yield one list of tokens per item of `sentences`.

    Each item is coerced to `str` and tokenised with gensim's
    `simple_preprocess`; `deacc=True` asks it to strip accent marks.
    """
    yield from (simple_preprocess(str(text), deacc=True) for text in sentences)

In [16]:
data_words = list(sent_to_words(df['TEXT']))

In [17]:
from gensim.models.phrases import Phrases, Phraser
import gensim.corpora as corpora
from gensim.models.ldamodel import LdaModel as lda
from gensim.models import CoherenceModel
import math

# Build the bigram and trigram models
bigram = Phrases(data_words, min_count=5, threshold=10) # higher threshold fewer phrases.
# The trigram model is trained on the documents after the bigram model has
# been applied to them.
trigram = Phrases(bigram[data_words], threshold=10)

# Faster way to get a sentence clubbed as a trigram/bigram
# (these two objects are what make_bigrams / make_trigrams index into)
bigram_mod = Phraser(bigram)
trigram_mod = Phraser(trigram)



In [18]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Re-tokenise every document and drop the tokens listed in `stop_words`.

    texts : iterable of documents; each one is passed through `str()` and
            gensim's `simple_preprocess` before filtering.
    Returns a list holding one list of lower-cased tokens per document.
    """
    # Build the lookup once: testing membership against the module-level list
    # is linear in its length for every token, against a set it is constant.
    blocked = set(stop_words)
    return [[word.lower() for word in simple_preprocess(str(doc)) if word not in blocked]
            for doc in texts]

def make_bigrams(texts):
    """Return every document with the module-level bigram phraser applied to it."""
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged

def make_trigrams(texts):
    """Return every document after applying the bigram and then the trigram phraser."""
    merged = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged.append(trigram_mod[with_bigrams])
    return merged

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatise every document, keeping only tokens with an allowed POS tag.

    https://spacy.io/api/annotation

    texts           : iterable of token lists; each list is joined with spaces
                      and run through the module-level spaCy pipeline `nlp`
    allowed_postags : coarse part-of-speech tags (`token.pos_`) to keep; the
                      default is an immutable tuple so it cannot be modified
                      by accident between calls
    Returns one list of lemmas per document.
    """
    return [
        [token.lemma_ for token in nlp(" ".join(sent)) if token.pos_ in allowed_postags]
        for sent in texts
    ]

In [19]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
# This rebinds the module-level `nlp`; lemmatization() looks the name up when
# it is called, so the call below runs with this reduced pipeline.
nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

In [30]:
def prepare_corpus(doc_clean):
    """
    Input  : clean documents (one list of tokens per document)
    Purpose: build the term dictionary of the corpus and convert every document to its bag-of-words form
    Output : term dictionary and the bag-of-words corpus (one entry per document)
    """
    # Term dictionary built from all documents.
    term_dictionary = corpora.Dictionary(doc_clean)
    # Bag-of-words representation of each document under that dictionary.
    bow_corpus = list(map(term_dictionary.doc2bow, doc_clean))
    return term_dictionary, bow_corpus

In [33]:
a, b = prepare_corpus(data_lemmatized)
type(b)

list

In [29]:
lsi = models.LsiModel(b, id2word=a, num_topics=3)

TypeError: 'int' object is not iterable

In [20]:
def create_gensim_lsa_model(processed_text, number_of_topics, random_state=100,
                            chunksize=143, passes=20):
    """
    Input  : processed_text   : tokenised documents (one list of tokens each)
             number_of_topics : number of topics to fit
             random_state, chunksize, passes : forwarded to the gensim LDA
                 constructor; the defaults are the values that used to be
                 hard-coded here
    Purpose: train an LDA model with gensim. Despite the "lsa" in its name
             (kept so existing callers keep working) no LSA model is built.
    Output : tuple (lda_model, dictionary), where dictionary is the term
             dictionary the corpus was built with
    """
    dictionary, corpus = prepare_corpus(processed_text)

    # generate LDA model
    lda_model = lda(corpus=corpus,
                    id2word=dictionary,
                    num_topics=number_of_topics,
                    random_state=random_state,
                    chunksize=chunksize,
                    passes=passes,
                    alpha='auto',
                    per_word_topics=True)

    return lda_model, dictionary

In [24]:
# https://radimrehurek.com/gensim/tut2.html
a = create_gensim_lsa_model(data_lemmatized, 5)

TypeError: 'int' object is not iterable

In [None]:
def compute_coherence_values(processed_text, number_of_topics):
    """
    Input   : processed_text   : tokenised documents
              number_of_topics : exclusive upper bound of the topic counts tried (the first is 2)
    purpose : train one LDA model per topic count and compute its c_v coherence
    Output  : model_list : the trained models, ordered by topic count
              coherence_values : coherence value of the model at the same position in model_list
    """
    model_list, coherence_values = [], []
    for k in range(2, number_of_topics):
        print("training LDA for {} topics".format(k))
        # fit the model for this topic count
        fitted, term_dictionary = create_gensim_lsa_model(processed_text, k)
        model_list.append(fitted)
        print("calculating coherence score for {} topics \n".format(k))
        scorer = CoherenceModel(model=fitted, texts=processed_text,
                                dictionary=term_dictionary, coherence='c_v')
        coherence_values.append(scorer.get_coherence())
    return model_list, coherence_values

In [None]:
model_list, coh_val = compute_coherence_values(data_lemmatized, 20)
# model, dictionary  = create_gensim_lsa_model(data_lemmatized, 3)

In [None]:
def plot_graph(coherence_values, stop):
    """Plot coherence score against number of topics, save the figure and show it.

    coherence_values : one score per topic count, for the counts 2 .. stop-1
    stop             : exclusive upper bound of the topic counts evaluated
    """
    plt.figure(figsize=(12,8))
    # Show graph
    x = range(2, stop, 1)
    plt.plot(x, coherence_values)
    plt.xlabel("Number of Topics")
    plt.ylabel("Coherence score")
#     plt.legend(("coherence_values"), loc='best')
    # One tick per evaluated topic count. An undefined name was passed here
    # before, which raised NameError as soon as the function ran.
    plt.xticks(list(x))
    plt.savefig('../result/img/20181230/topics_coherence_20181225.png')
    plt.show()

In [None]:
plot_graph(coh_val, 20)


In [None]:
# Select the model and print the topics
# compute_coherence_values starts at 2 topics, so model_list[k] was trained
# with k + 2 topics and index 16 is the 18-topic model.
# NOTE(review): the index is hard-coded - confirm it still matches the
# coherence curve after any re-run.
optimal_model = model_list[16]
model_topics = optimal_model.show_topics(formatted=False)
from pprint import pprint
pprint(optimal_model.print_topics(num_words=10))

In [None]:
#TODO: find dominant topic in each document, use it as a new feature

### 2.2. LSA 
https://www.datacamp.com/community/tutorials/discovering-hidden-topics-python

In [None]:
# from gensim.models import LsiModel
# from gensim.models.phrases import Phraser, Phrases
# from nltk.tokenize import RegexpTokenizer
# from nltk.corpus import stopwords
# from nltk.stem.porter import PorterStemmer
# from gensim.models.coherencemodel import CoherenceModel

# # TODO: remove numbers, use lemmatizer
# # better tokenization: n grams : phrases: abu dhabi [6]
# # https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/
# def preprocess_data(doc_set):
#     """
#     Input  : docuemnt list
#     Purpose: preprocess text (tokenize, removing stopwords, and stemming)
#     Output : preprocessed text
#     """
#     # initialize regex tokenizer
#     tokenizer = RegexpTokenizer(r'\w+')
#     # create English stop words list
#     en_stop = set(stopwords.words('english'))
#     # Create p_stemmer of class PorterStemmer
#     p_stemmer = PorterStemmer()
#     # list for tokenized documents in loop
#     texts = []
#     # loop through document list
#     for i in doc_set:
#         # clean and tokenize document string
#         raw = i.lower()
#         tokens = tokenizer.tokenize(raw)
#         # remove stop words from tokens
#         stopped_tokens = [i for i in tokens if not i in en_stop]
#         # stem tokens
# #         stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
#         # add tokens to list
#         texts.append(stopped_tokens)
#     return texts

# def prepare_corpus(doc_clean):
#     """
#     Input  : clean document
#     Purpose: create term dictionary of our courpus and Converting list of documents (corpus) into Document Term Matrix
#     Output : term dictionary and Document Term Matrix
#     """
#     # Creating the term dictionary of our courpus, where every unique term is assigned an index. dictionary = corpora.Dictionary(doc_clean)
#     dictionary = corpora.Dictionary(doc_clean)
#     # Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
#     doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
#     # generate LDA model
#     return dictionary,doc_term_matrix

# def create_gensim_lsa_model(doc_clean,number_of_topics,words):
#     """
#     Input  : clean document, number of topics and number of words associated with each topic
#     Purpose: create LSA model using gensim
#     Output : return LSA model
#     """
#     dictionary,doc_term_matrix=prepare_corpus(doc_clean)
#     # generate LSA model
#     lsamodel = LsiModel(doc_term_matrix, num_topics=number_of_topics, id2word = dictionary)  # train model
#     print(lsamodel.print_topics(num_topics=number_of_topics, num_words=words))
#     return lsamodel

# def compute_coherence_values(dictionary, doc_term_matrix, doc_clean, stop, start=2, step=3):
#     """
#     Input   : dictionary : Gensim dictionary
#               corpus : Gensim corpus
#               texts : List of input texts
#               stop : Max num of topics
#     purpose : Compute c_v coherence for various number of topics
#     Output  : model_list : List of LSA topic models
#               coherence_values : Coherence values corresponding to the LDA model with respective number of topics
#     """
#     coherence_values = []
#     model_list = []
#     for num_topics in range(start, stop, step):
#         # generate LSA model
#         model = LsiModel(doc_term_matrix, num_topics=number_of_topics, id2word = dictionary)  # train model
#         model_list.append(model)
#         coherencemodel = CoherenceModel(model=model, texts=doc_clean, dictionary=dictionary, coherence='c_v')
#         coherence_values.append(coherencemodel.get_coherence())
#     return model_list, coherence_values

# def plot_graph(doc_clean,start, stop, step):
#     dictionary,doc_term_matrix=prepare_corpus(doc_clean)
#     model_list, coherence_values = compute_coherence_values(dictionary, doc_term_matrix,doc_clean,
#                                                             stop, start, step)
#     # Show graph
#     x = range(start, stop, step)
#     plt.plot(x, coherence_values)
#     plt.xlabel("Number of Topics")
#     plt.ylabel("Coherence score")
#     plt.legend(("coherence_values"), loc='best')
#     plt.savefig('../result/img/number_clusters_20181225.png')
#     plt.show()

# # LSA Model
# number_of_topics=7
# words=10
# clean_text=preprocess_data(df['TEXT'])
# model=create_gensim_lsa_model(clean_text,number_of_topics,words)

# plt.figure(figsize=(12,10))
# start,stop,step=2,10,1
# plot_graph(clean_text,start,stop,step)

# model=create_gensim_lsa_model(clean_text,6,words)

# # model.print_topics(num_topics=number_of_topics, num_words=words)
# model.print_topic(1)

## 2.3. Topics

In [None]:
import pyLDAvis
import pyLDAvis.gensim

In [None]:
pyLDAvis.enable_notebook()
# `corpus` and `id2word` were never defined at module level (they only exist
# as locals inside the helper functions). Take the dictionary from the chosen
# model and rebuild the bag-of-words corpus from the documents it was trained on.
id2word = optimal_model.id2word
corpus = [id2word.doc2bow(doc) for doc in data_lemmatized]
vis = pyLDAvis.gensim.prepare(optimal_model, corpus, id2word)
vis

# 4. Clustering

In [None]:
#TODO: use the same preprocessing as gensim
#TODO: 

In [None]:
# def remove_numbers(x):
#     clean = ''.join([i for i in x if not i.isdigit()])
#     return clean

# def lemmatize(df):

#     docs = df['TEXT_'].tolist()

# #     def token_filter(token):
# #         return not (token.is_punct | token.is_space | token.is_stop | len(token.text) <= 4)
# # for doc in nlp.pipe(docs):
# #     tokens = [token.lemma_ for token in doc if token_filter(token)]
# #     filtered_tokens.append(tokens)

#     filtered_tokens = []

#     for doc in nlp.pipe(docs):
#         tokens = [token.lemma_ for token in doc]
#         filtered_tokens.append(tokens)
        
#     lemmatized_doc = [' '.join(x for x in doc) for doc in filtered_tokens] 
#     df['TEXT_lemma'] = lemmatized_doc
#     return df

# # def remove_stop_words(x):
# #     clean = [i for i in word_tokenize(x) if i not in stop]
# #     sent = ' '.join(clean)
# #     return sent
# # [stop.append(x) for x in ['chemistry', 'chemists', 'chemical', 'chemist', 'chemicals', 'code', 'conduct', 'ethics']]

# # def remove_special_char(x):
# #     clean = re.sub('\W+',' ', x)
# #     return clean

In [None]:
# # remove numbers
# df['TEXT_'] = df['TEXT'].apply(lambda x: remove_numbers(x))
# df = lemmatize(df)

# # remove specail characters
# # df['TEXT_'] = df['TEXT_'].apply(lambda x: remove_special_char(x))
# # df['TEXT_'] = df['TEXT'].apply(lambda x: remove_stop_words(x))

# df['TEXT_lemma'] = df['TEXT_lemma'].apply(lambda x: x.replace("PRON", ""))

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

# Two-step pipeline: n-gram counts ('vect') followed by TF-IDF weighting ('tfidf').
# Later cells reach the fitted CountVectorizer through text_clf.steps[0][1].
text_clf = Pipeline([
    ('vect', CountVectorizer(encoding='utf-8', strip_accents='unicode', lowercase=True,
                             analyzer='word', ngram_range=(1, 5),  # word n-grams of length 1 to 5
                             min_df=5, max_df=0.5,
                             stop_words=stop_words  # , max_features=200
                             )),
    ('tfidf', TfidfTransformer(smooth_idf=True))
])

In [None]:
matrix = text_clf.fit_transform(df['TEXT'])
feature_names = text_clf.steps[0][1].get_feature_names()

In [None]:
# top features for all documents - each document
def top_features(vectorizer, tfidf_matrix, row, n):
    '''Return the names of the n highest-weighted features of one document.

    vectorizer   : fitted vectorizer exposing get_feature_names()
    tfidf_matrix : sparse document-term matrix (rows are documents)
    row          : index of the document to inspect
    n            : number of feature names to return

    The names are ordered from highest to lowest weight.
    '''
    feature_names = np.array(vectorizer.get_feature_names())  # get all features
    # Densify only the requested row, then rank its columns by weight.
    # Reversing the argsort of this single row gives descending order; the
    # earlier version reversed the row axis of the whole matrix instead and
    # so returned the lowest-weighted features of a different document.
    row_weights = tfidf_matrix[row].toarray().ravel()
    top_columns = np.argsort(row_weights)[::-1][:n]
    return [feature_names[i] for i in top_columns]

In [None]:
# each doc
top_features(text_clf.steps[0][1], matrix, 0, 11)

In [None]:
def keyword_heatmap(vectorizer, tfidf_matrix, keyword_list):
    """Draw a heatmap of the weights of selected keywords across documents.

    vectorizer   : fitted pipeline whose first step exposes get_feature_names()
    tfidf_matrix : document-term matrix produced by that pipeline
    keyword_list : feature names to show; names missing from the vocabulary
                   are ignored

    Documents in which every selected keyword has weight 0 are left out.
    Raises ValueError when none of the keywords is a known feature.
    """
    # matrix to array
    bow = tfidf_matrix.toarray()
    # get all features
    feature_names = np.asarray(vectorizer.steps[0][1].get_feature_names())

    # find index of keywords in the feature space
    idx = np.where(np.isin(feature_names, keyword_list))[0]
    if idx.size == 0:
        raise ValueError("none of the keywords is in the feature space: {}".format(keyword_list))

    # Label the columns with the features that were actually matched, in
    # matrix column order. Using keyword_list itself mislabels columns when
    # its order differs from the feature order and fails when a keyword is
    # missing from the vocabulary.
    df_kw = pd.DataFrame(bow[:, idx], columns=feature_names[idx])
    # keep only documents with at least one non-zero keyword weight
    df_kw = df_kw[df_kw != 0].dropna(thresh=1)
    df_kw = df_kw.fillna(0)

    plt.figure(figsize=(5, 12))
    sns.heatmap(df_kw, fmt="g", cmap='viridis')

In [None]:
keyword = ['chemical weapons', 'chemistry']
keyword_heatmap(text_clf, matrix, keyword)
# plt.savefig('../result/img/feature_heatmap_20181225.png')

In [None]:
# all docs
def get_top_n_words(vectorizer, bow, n=None):
    """
    Rank the vocabulary by its summed weight over all documents and return the first n (word, total) pairs.
    With n=None the full ranking is returned.
    """
    column_totals = bow.sum(axis=0)
    ranked = []
    for term, column in vectorizer.vocabulary_.items():
        ranked.append((term, column_totals[0, column]))
    # largest total first
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]

In [None]:
get_top_n_words(text_clf.steps[0][1], matrix, 20)

## tsne

In [None]:
from yellowbrick.text import TSNEVisualizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
df.columns

In [None]:
docs = text_clf.fit_transform(df['TEXT'])

In [None]:
# The organisation type of each document is passed to the visualizer as its label.
labels = df['ORG_TYPE']

# Create the visualizer and draw the vectors
tsne = TSNEVisualizer(random_state=42)
tsne.fit(docs, labels)
tsne.poof()
# plt.savefig('../result/img/tsne/tsne_org.png')

In [None]:
# Same projection input as before, this time labelled by region.
labels = df['REGION']

# Create the visualizer and draw the vectors
tsne = TSNEVisualizer(random_state=42)
tsne.fit(docs, labels)
tsne.poof()
# plt.savefig('../result/img/tsne/tsne_region.png')

In [None]:
# Same projection input as before, this time labelled by document type.
labels = df['DOC_TYPE']

# Create the visualizer and draw the vectors
tsne = TSNEVisualizer(random_state=42)
tsne.fit(docs, labels)
tsne.poof()
# plt.savefig('../result/img/tsne/type_org.png')

## Clustering

In [None]:
sim_matrix = cosine_similarity(matrix, matrix)
simdf = pd.DataFrame(sim_matrix)
simdf.head()

In [None]:
def create_cluster(m, n_cluster):
    """Fit a spectral clustering with `n_cluster` clusters on `m` and return the fitted estimator.

    The estimator uses affinity='rbf', n_init=100 and a fixed random_state of 42.
    NOTE(review): callers in this file pass a cosine-similarity matrix as `m`
    while the affinity is 'rbf' rather than precomputed - confirm this is intended.
    """
    model = SpectralClustering(n_cluster, random_state=42, affinity='rbf', n_init=100)
    # the labels returned here are not needed; callers read them from the estimator
    model.fit_predict(m)
    return model
# The algorithm takes the top k eigenvectors of the input matrix corresponding to the largest eigenvalues, then runs the k-mean algorithm on the new matrix.

In [None]:
def cluster_map(sc, input_data):
    """Lay the documents out in 2-D and attach their cluster labels.

    sc         : fitted clustering estimator exposing `affinity_matrix_` and `labels_`
    input_data : frame with one row per document, in the order used for clustering

    Returns a frame with the layout coordinates 'X' and 'Y', a 1-based
    'CLUSTER' column and all columns of `input_data`, joined on the index.
    """
    # Build a weighted graph from the affinity matrix. from_numpy_array works
    # on NetworkX 2.x and 3.x; from_numpy_matrix was removed in 3.0.
    G = nx.from_numpy_array(sc.affinity_matrix_)
    # Use the public function path instead of the accidental `nx.nx.` one.
    pos = nx.fruchterman_reingold_layout(G)

    p = pd.DataFrame.from_dict(pos, orient='index')
    p.columns = ['X', 'Y']
    p['CLUSTER'] = sc.labels_ + 1  # shift labels so clusters are numbered from 1

#     p = pd.concat([p, input_data.iloc[:,:-1]],axis=1)   # without text
    p = pd.concat([p, input_data], axis=1)
    return p

In [None]:
# TODO: which metric?
# https://scikit-learn.org/stable/modules/clustering.html#clustering-evaluation

def evaluate_cluster(df, sim_mat, number_of_clusters, metric):
    """Cluster the documents and score the clusters against three metadata columns.

    df                 : frame with 'ORG_TYPE', 'DOC_TYPE' and 'REGION' columns
    sim_mat            : matrix handed to create_cluster
    number_of_clusters : number of clusters to fit
    metric             : callable taking two label sequences and returning a score

    Returns [region_score, org_type_score, doc_type_score].
    """
    clustering = create_cluster(sim_mat, number_of_clusters)
    df_plot = cluster_map(clustering, df)

    # encode each categorical column as integer codes
    for source, target in (('ORG_TYPE', 'CODE_ORG_TYPE'),
                           ('DOC_TYPE', 'CODE_DOC_TYPE'),
                           ('REGION', 'CODE_REGION')):
        df_plot[target] = df[source].astype('category').cat.codes

    # score order: region, organisation type, document type
    return [
        metric(list(df_plot['CLUSTER'].values), list(df_plot[code_column].values))
        for code_column in ('CODE_REGION', 'CODE_ORG_TYPE', 'CODE_DOC_TYPE')
    ]
# score = pd.DataFrame(all_dist, columns=['clusters', 'region', 'org', 'doc'])

In [None]:
# evaluating clusters
# Try 1 .. 15 clusters and record the scores together with the cluster count
# that was actually used (the count was previously recorded one too low).
all_score = []
for n_clusters in range(1, 16):
    cluster_score = evaluate_cluster(df, sim_matrix, n_clusters, v_measure_score)
    cluster_score.append(n_clusters)
    all_score.append(cluster_score)

score_df = pd.DataFrame(all_score, columns=['region', 'org', 'doc', 'clusters'])
score_df.to_csv('../result/clusters_score_{}.csv'.format(today))
score_df.head()

In [None]:
# One line per attribute: score as a function of the recorded cluster count.
plt.figure(figsize=(16, 8))
ax = sns.lineplot(x="clusters", y='region', label='Region',
                  data=score_df)
ax = sns.lineplot(x="clusters", y='org', label='Type of Organization',
                  data=score_df)
ax = sns.lineplot(x="clusters", y='doc', label='Type of Document',
                  data=score_df)
plt.xticks(range(0, 20))
plt.xlabel("Number of Clusters")
plt.ylabel("V Measure Score")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.title("Evaluation of Clusters for Each Attribute")

# Mark the best 'org' score. Take scalars from the first best row so the
# annotation also works when several rows share the maximum (a boolean-mask
# selection yields a Series, not a number).
best_row = score_df['org'].idxmax()
x_max = score_df.loc[best_row, 'clusters']
y_max = score_df.loc[best_row, 'org']
plt.vlines(x=x_max, ymin=0, ymax=(1.05 * y_max), linestyles="dashed", linewidth=1)
plt.text(1.01 * x_max, 1.02 * y_max,'best performing # of clusters')

plt.tight_layout()
plt.savefig("../result/img/clustesr_score_{}.jpg".format(today))

In [None]:
# find the best number of clusters
score_df['clusters'][score_df['org'] == score_df['org'].max()].values[0]

In [None]:
# assign clusters
# NOTE(review): the cluster count 7 is hard-coded rather than read from
# score_df - confirm it still matches the best-scoring count.
best_cluster = create_cluster(sim_matrix, 7)

# map clusters to X and Y
df_plot = cluster_map(best_cluster, df)
df_plot.head()

In [None]:
import seaborn as sns

def scatter_plot(df, hue_, save=False):
    """Scatter the documents at their layout coordinates, coloured by `hue_`.

    df   : frame with 'X', 'Y' and 'CLUSTER' columns plus the column named by `hue_`
    hue_ : column that sets the colour; 'CLUSTER' sets the marker style
    save : when true, also write the figure to ../result/tfidf_cosine_<hue_>_<today>.jpg
    """
    sns.set(style="darkgrid")
    plt.figure(figsize=(16,10))

    axes = sns.scatterplot(data=df, x="X", y="Y",
                           hue=hue_, style="CLUSTER",
                           s=100, legend="full")
    plt.legend(loc=4)

    # hide axis labels and ticks
    axes.set_xlabel('')
    axes.set_ylabel('')
    axes.set_xticks([])
    axes.set_yticks([])

    plt.tight_layout()
    plt.axis('equal')

    if not save:
        return
    plt.savefig('../result/tfidf_cosine_{}_{}.jpg'.format(hue_, today))

In [None]:
scatter_plot(df_plot, hue_='REGION', save=False)

#TODO: try doc2vec
#https://www.kaggle.com/sgunjan05/document-clustering-using-doc2vec-word2vec
#https://medium.com/@mishra.thedeepak/doc2vec-simple-implementation-example-df2afbbfbad5