In [51]:
"""Utility functions useful for analysis"""
import pickle
import re
import string
from collections import Counter

from tqdm import tqdm_notebook

import pandas as pd
import numpy as np
import scipy as sp

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import contractions

import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from adjustText import adjust_text
import squarify
plt.style.use('fivethirtyeight')
%matplotlib inline

In [48]:
def remove_nonascii(words):
    """Keep only the words made up entirely of ``string.printable`` characters.

    Args:
        words: iterable of string tokens.

    Returns:
        A list with the tokens that contain no character outside
        ``string.printable``. Empty strings are kept.
    """
    # Test character by character: `w in string.printable` is a substring
    # check, which rejected nearly every multi-character word.
    printable = set(string.printable)
    return [w for w in words if all(ch in printable for ch in w)]


def remove_stop(words):
    """Drop every token that appears in NLTK's English stop-word list.

    Args:
        words: iterable of string tokens. The comparison is exact, so the
            tokens are matched as given (no case folding happens here).

    Returns:
        A list of the tokens that are not stop words, in their original order.
    """
    english_stops = frozenset(stopwords.words("english"))
    kept = []
    for token in words:
        if token not in english_stops:
            kept.append(token)
    return kept


def lemmatize(words):
    """Lemmatize each token with NLTK's ``WordNetLemmatizer``.

    The lemmatizer is called without a part-of-speech argument, so its
    default part of speech applies to every token.

    Args:
        words: iterable of string tokens.

    Returns:
        A list of lemmas, one per input token, in the same order.
    """
    to_lemma = WordNetLemmatizer().lemmatize
    return list(map(to_lemma, words))


def remove_punct(words):
    """Drop tokens that consist solely of ``string.punctuation`` characters.

    Args:
        words: iterable of string tokens.

    Returns:
        A list of the tokens containing at least one non-punctuation
        character. Empty strings are dropped; tokens that merely contain
        punctuation (e.g. ``"it's"``) are kept.
    """
    # Test character by character: `w in string.punctuation` is a substring
    # check, so multi-character tokens such as "..." or "--" slipped through.
    punct = set(string.punctuation)
    return [w for w in words if not all(ch in punct for ch in w)]
    
    
def lowercase(words):
    """Return a list with every token converted via its ``lower()`` method.

    Args:
        words: iterable of tokens.

    Returns:
        A new list of lower-cased tokens in the original order.
    """
    lowered = []
    for token in words:
        lowered.append(token.lower())
    return lowered


def remove_short(words):
    """Discard tokens of one or two characters.

    Args:
        words: iterable of sized tokens.

    Returns:
        A list of the tokens whose length is at least three, order preserved.
    """
    min_length = 3
    return list(filter(lambda token: len(token) >= min_length, words))


def filter_pos(words, pos):
    """Keep only the tokens whose part-of-speech tag is one of ``pos``.

    Args:
        words: list of tokens, passed to ``nltk.pos_tag``.
        pos: the tags to keep. Either an iterable of tag strings or a single
            string; whitespace inside a string separates several tags
            (``"NN VB"`` is the same as ``["NN", "VB"]``).

    Returns:
        A list of the tokens whose tag matches exactly, in input order.
    """
    if isinstance(pos, str):
        pos = [pos]
    # Exact membership in a set of tags. The previous substring test against
    # the joined string also accepted any tag embedded in a requested one
    # (asking for "NNS" kept "NN", asking for "PRP$" kept "PRP" and "$").
    wanted = set(" ".join(pos).split())
    return [token for token, tag in nltk.pos_tag(words) if tag in wanted]

In [41]:
def clean_string(s, pos=None):
    """Turn raw text into a list of cleaned tokens.

    The text has its contractions expanded and is tokenized; the tokens are
    optionally restricted to the given part-of-speech tags and then
    lower-cased, lemmatized, and stripped of stop words, punctuation tokens
    and tokens shorter than three characters, in that order.

    Args:
        s: the input text.
        pos: optional part-of-speech tags forwarded to ``filter_pos``; when
            falsy, no part-of-speech filtering takes place.

    Returns:
        The list of cleaned tokens.
    """
    tokens = word_tokenize(contractions.fix(s))

    # Tag-based filtering runs on the tokens as produced by the tokenizer,
    # before any of the normalisation steps below.
    if pos:
        tokens = filter_pos(tokens, pos)

    pipeline = (lowercase, lemmatize, remove_stop, remove_punct, remove_short)
    for step in pipeline:
        tokens = step(tokens)

    assert tokens is not None
    return tokens

In [42]:
def load_embeddings(path):
    """Load word vectors from a text file into a dictionary.

    Each line is expected to hold a token followed by its vector components,
    all separated by single spaces. Lines that hold no vector component
    (for instance blank lines) are skipped.

    Args:
        path: path of the UTF-8 encoded embedding file.

    Returns:
        A dict mapping each token to a 1-D ``numpy`` float array.

    Raises:
        ValueError: if a vector component cannot be parsed as a float.
    """
    print("Loading Glove Model")
    embs = {}
    with open(path, 'r', encoding='utf8') as f:
        for line in tqdm_notebook(f):
            # Strip trailing whitespace first: splitting the raw line left a
            # bare "\n" field behind a trailing space, which float() rejects.
            fields = line.rstrip().split(' ')
            if len(fields) < 2:
                # Blank or vector-less line; it used to be stored as a junk
                # key with an empty array.
                continue
            embs[fields[0]] = np.array([float(val) for val in fields[1:]])

    print("Done.",len(embs)," words loaded!")
    return embs

In [43]:
def embed_string(s, glove):
    """Embed a text as the averaged sum of its tokens' vectors.

    The text has its contractions expanded, is tokenized and lower-cased.
    Vectors of the tokens found in ``glove`` are summed and the sum is divided
    by the total number of tokens, so tokens missing from ``glove`` count as
    zero vectors.

    Args:
        s: the input text.
        glove: dict mapping tokens to vectors, e.g. the result of
            ``load_embeddings``.

    Returns:
        A ``numpy`` array with the shape of the vectors in ``glove``; all
        zeros when the text yields no tokens.

    Raises:
        KeyError: if ``glove`` holds no vectors at all.
    """
    # Size the accumulator from any stored vector rather than from the
    # hard-coded 'hi' entry, which failed on vocabularies lacking that token.
    reference = next(iter(glove.values()), None)
    if reference is None:
        raise KeyError("glove contains no embeddings")
    emb = np.zeros(np.shape(reference))

    words = lowercase(word_tokenize(contractions.fix(s)))
    if not words:
        # Nothing to average; dividing by zero here used to produce NaNs.
        return emb

    for w in words:
        try:
            emb += glove[w]
        except KeyError:
            # word not found in glove
            continue

    return emb / len(words)

In [44]:
def similarity(emb1, emb2):
    """Cosine similarity: dot product divided by the product of the norms.

    Args:
        emb1: first vector.
        emb2: second vector.

    Returns:
        The dot product of the two vectors divided by the product of their
        Euclidean norms. No guard is applied for zero-norm inputs.
    """
    dot_product = np.dot(emb1, emb2)
    norm_product = np.linalg.norm(emb1) * np.linalg.norm(emb2)
    return dot_product / norm_product

In [None]:
# Count how often each word occurs; returns parallel (words, counts) tuples sorted by descending count.
def count(words):
    """Count word occurrences, most frequent first.

    Args:
        words: iterable of hashable tokens.

    Returns:
        A ``(labels, counts)`` pair of equally long tuples: the distinct
        tokens and their occurrence counts, sorted by descending count.
        Tokens with equal counts keep the order of their first appearance.
        Both tuples are empty when ``words`` is empty.
    """
    # One pass with Counter instead of a list.count() call per distinct word.
    ranked = Counter(words).most_common()
    if not ranked:
        # zip(*[]) yields nothing to unpack, which raised ValueError before.
        return (), ()
    labels, counts = zip(*ranked)
    return labels, counts

In [50]:
# Count how often each word occurs; returns a {word: count} dict ordered by descending count.
def count_dict(words):
    """Count word occurrences and return them as an ordered dictionary.

    Args:
        words: iterable of hashable tokens.

    Returns:
        A dict mapping each distinct token to its occurrence count, with keys
        inserted in order of descending count. Tokens with equal counts keep
        the order of their first appearance. Empty input gives an empty dict.
    """
    # One pass with Counter instead of a list.count() call per distinct word.
    return dict(Counter(words).most_common())

In [None]:
# Plot the most frequent words as a treemap whose tile areas follow the word counts.
def plot_squarify(words, show):
    """Draw the most frequent words with ``squarify.plot``.

    Each tile is labelled ``"<word> - <count>"`` and sized by the word's
    count. The drawing goes to the current matplotlib figure, which is
    resized to 40 x 15 inches.

    Args:
        words: list of tokens, counted with ``count``.
        show: maximum number of distinct words to draw; capped at the number
            of distinct words available.

    Returns:
        None.
    """
    labels, sizes = count(words)

    # Cap the request: indexing past the last distinct word raised IndexError.
    show = min(show, len(labels))

    _labels = [label + " - " + str(size)
               for label, size in zip(labels[:show], sizes[:show])]

    # NOTE: this changes the global matplotlib font size, so figures drawn
    # afterwards inherit it as well.
    plt.rcParams.update({'font.size':36})
    fig = plt.gcf()
    fig.set_size_inches(40, 15)
    squarify.plot(sizes=sizes[:show],
                  label=_labels,
                  # Skip the first 11 shades of the reversed palette.
                  color=sns.color_palette('GnBu_r', show + 11)[11:],
                  alpha=0.9)
    plt.axis('off')

In [None]:
def compare_freqs(words1, words2, n=15):
    """Scatter-plot word frequencies of two token lists against each other.

    Each plotted word sits at (count in ``words1``, count in ``words2``),
    with a word absent from one list counted as 0 there. Both coordinates
    get random normal jitter from ``np.random.normal``. The words drawn are
    the top ``n`` of ``words1`` except its most frequent one, plus the top
    ``n`` of ``words2`` that are not already among the top ``n`` of
    ``words1``. A diagonal reference line is drawn from 0 to just under 30
    on both axes.

    Args:
        words1: list of tokens for the x axis, counted with ``count``.
        words2: list of tokens for the y axis, counted with ``count``.
        n: how many top words to take from each list; capped at the number of
            distinct words available in each.

    Returns:
        None. The plot is drawn on a new 30 x 14 inch figure.
    """
    words1, counts1 = count(words1)
    words2, counts2 = count(words2)

    count_dict1 = dict(zip(words1, counts1))
    count_dict2 = dict(zip(words2, counts2))

    fig, ax = plt.subplots()
    fig.set_size_inches(30, 14)

    texts = []

    def _plot_word(word):
        # Draw one jittered point and its label.
        noise = np.random.normal(size=2)
        f1 = count_dict1.get(word, 0) + noise[0]
        f2 = count_dict2.get(word, 0) + noise[1]
        ax.scatter(f1, f2, c='#008fd5', s=100)
        texts.append(plt.text(f1, f2, word, fontsize=27))

    # Slices stop at the end of each ranking, so an `n` larger than the number
    # of distinct words no longer raises IndexError.
    n = max(n, 0)
    top1 = words1[:n]
    top2 = words2[:n]

    # NOTE(review): the most frequent word of words1 (index 0) is skipped here
    # and, being part of top1, is also excluded below, so it is never plotted.
    # Kept as in the original; confirm this is intentional.
    for word in top1[1:]:
        _plot_word(word)

    for word in top2:
        if word in top1:
            continue
        _plot_word(word)

    plt.xticks(fontsize=28)
    plt.yticks(fontsize=28)

    adjust_text(texts, arrowprops=dict(arrowstyle="-", color='b', lw=1.1))

    # Reference diagonal where both frequencies are equal; its extent is fixed.
    diagonal = np.arange(0, 30, 0.1)
    plt.plot(diagonal, diagonal, linewidth=3, c='#fc4f30', alpha=0.7)

    plt.tight_layout()