# Federal Data Strategy Working Group Tagging
This notebook contains code to identify the most relevant working group for each instance of feedback collected by the Federal Data Strategy.

The output is a spreadsheet containing every feedback instance, its recommended working group, and its similarity score for each working group.

Methodologies are documented inline.

In [1]:
import pandas as pd
import numpy as np
from spacy.lang.en import English
import nltk
from nltk.corpus import wordnet as wn
from gensim import corpora
from nltk.stem.wordnet import WordNetLemmatizer
import pickle
import gensim
import pyLDAvis.gensim
from bs4 import BeautifulSoup
from nltk.tokenize.toktok import ToktokTokenizer
import enchant
import contractions
import re
from nltk.tokenize import word_tokenize
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import StandardScaler
import warnings
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity, polynomial_kernel
warnings.filterwarnings('ignore')

  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)


In [2]:
# Both input workbooks are resolved relative to the notebook's working directory so the
# notebook is not tied to one user's machine.
# NOTE(review): assumes the comments workbook sits next to Working_Group_Terms.xlsx — confirm.
COMMENTS_PATH = 'Federal Data Strategy Comments.xlsx'
KEYWORDS_PATH = 'Working_Group_Terms.xlsx'

data = pd.read_excel(COMMENTS_PATH)
keywords = pd.read_excel(KEYWORDS_PATH)

In [3]:
def clean(doc, spellcheck=True):
    """
    Description:
        Normalize one free-text document into a string of space-separated, lower-case words.

        Steps, in order: lower-case, drop English stop words, expand contractions, strip HTML,
        strip URLs, strip e-mail addresses, optionally drop words unknown to the en_US
        dictionary, and finally keep only alphabetic words of 3 to 17 characters.

    Arguments:
        doc: the text to normalize. None and NaN are treated as an empty document; any other
            non-string value is converted with str().
        spellcheck: when True, purely alphabetic tokens that the en_US dictionary does not
            recognise are removed.

    Returns:
        str: the normalized text ("" when nothing survives the filters).
    """
    # Empty spreadsheet cells arrive as None/NaN; NaN is the only float not equal to itself.
    if doc is None or (isinstance(doc, float) and doc != doc):
        return ""
    if not isinstance(doc, str):
        doc = str(doc)

    en_stop = set(nltk.corpus.stopwords.words('english'))
    tokenizer = ToktokTokenizer()

    def strip_html_tags(text):
        # Keep only the text nodes of whatever markup is present.
        return BeautifulSoup(text, "html.parser").get_text()

    def strip_urls(text):
        #url regex
        url_re = re.compile(r"""(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))""")
        return url_re.sub('', text)

    def strip_emails(text):
        # Unanchored on purpose: with ^...$ the pattern only matched when the whole document
        # was a single e-mail address, so addresses embedded in a comment were never removed.
        email_re = re.compile(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9.-]+')
        return email_re.sub('', text)

    def strip_nonsense(text):
        # leave words that are at least three characters long, do not contain a number, and are
        # no more than 17 chars long; 'nan' is dropped because it is what a stringified NaN
        # cell looks like
        candidates = re.findall(r'\b[a-z][a-z][a-z]+\b', text)
        return ' '.join(w for w in candidates if w != 'nan' and len(w) <= 17)

    def expand_contractions(text, contraction_mapping=contractions.contractions_dict):
        if not contraction_mapping:
            return re.sub("'", "", text)

        # Case-insensitive lookup table: the pattern is compiled with IGNORECASE, so a match
        # such as "i'm" must still find the mapping stored under "I'm".
        lowered = {key.lower(): value for key, value in contraction_mapping.items()}
        # Longest alternatives first so a longer contraction wins over a shorter one that is
        # its prefix; keys are escaped because they are literal text, not regex.
        alternatives = sorted(lowered, key=len, reverse=True)
        contractions_pattern = re.compile('({})'.format('|'.join(re.escape(k) for k in alternatives)),
                                          flags=re.IGNORECASE|re.DOTALL)

        def expand_match(contraction):
            match = contraction.group(0)
            expansion = contraction_mapping.get(match) or lowered.get(match.lower())
            if not expansion:
                # Returning None would make re.sub delete the matched text; keep it instead.
                return match
            # Preserve a leading capital from the original text.
            if match[0].isupper():
                expansion = expansion[0].upper() + expansion[1:]
            return expansion

        expanded_text = contractions_pattern.sub(expand_match, text)
        # Any apostrophes left over are removed outright.
        return re.sub("'", "", expanded_text)

    def strip_misspellings(text):
        d = enchant.Dict("en_US")
        # Session-only additions: Dict.add() would write these words to the user's persistent
        # personal word list every time a document is cleaned.
        for w in ['api','git','github','apis']:
            d.add_to_session(w)

        tokens = tokenizer.tokenize(text)
        # Only purely alphabetic tokens are spell-checked; everything else is left for
        # strip_nonsense to deal with.
        non_dict_words = {word for word in tokens
                          if word and re.match('^[a-zA-Z ]*$', word) and not d.check(word)}
        return " ".join(x for x in tokens if x not in non_dict_words)

    # NOTE(review): stop words are removed before contractions are expanded, so words introduced
    # by an expansion (e.g. "not") stay in the output — confirm this is intended.
    doc = doc.lower()
    doc = " ".join(word for word in tokenizer.tokenize(doc) if word not in en_stop)
    contraction_free = expand_contractions(doc)
    tag_free = strip_html_tags(contraction_free)
    url_free = strip_urls(tag_free)
    email_free = strip_emails(url_free)

    if spellcheck:
        return strip_nonsense(strip_misspellings(email_free))
    return strip_nonsense(email_free)

### Clean Instances

In [4]:
# Missing comments are treated as empty text, the same way the keyword fields are filled before
# cleaning, so that clean() always receives a string.
data['Clean Instances'] = data['Instance'].fillna("").astype(str).apply(clean)

In [5]:
# Cleaned feedback text, one entry per row of `data`
instance_corpus = data.loc[:, 'Clean Instances']

### Clean Keywords

In [6]:
# Build one text field per keyword row: "<Topic> <Description>", with missing cells treated as
# empty strings
keywords['Text'] = (
    keywords['Topic'].fillna("").astype(str)
    .str.cat(keywords['Description'].fillna("").astype(str), sep=" ")
)

In [7]:
# Clean the concatenated text, then turn rows whose cleaned text is empty into np.nan so they
# can be removed with dropna() later. The result is assigned back to the column instead of
# calling replace(..., inplace=True) on a column selection, which is chained assignment and
# does not reliably update the DataFrame under pandas copy-on-write.
keywords['Clean Text'] = keywords['Text'].apply(clean).replace('', np.nan)

In [8]:
# Cleaned keyword text, one entry per row of `keywords` (NaN where nothing survived cleaning)
keyword_corpus = keywords.loc[:, 'Clean Text']

In [9]:
# Combined corpus: keyword texts followed by feedback instances, with missing entries removed
corpus = pd.concat([keyword_corpus, instance_corpus], axis=0).dropna()

In [10]:
# Drop the helper column so it does not end up in the exported spreadsheet.
# errors='ignore' plus rebinding (instead of an in-place drop) keeps this cell safe to re-run:
# the in-place version raised KeyError the second time it was executed.
data = data.drop(columns='Clean Instances', errors='ignore')

# Word Embeddings

In [11]:
class TfidfEmbeddingVectorizer(BaseEstimator, TransformerMixin):
    """
    Description:
        This class is designed for use as a transformer within a sklearn pipeline. The pipeline will call
        the fit and transform instance methods.

        Constructing an instance trains a FastText word embedding model on the supplied corpus and keeps
        a dictionary mapping each vocabulary word to its vector of shape [300,].
        The fit method learns an idf weight per word; the transform method uses those weights when
        averaging the word vectors of each document into a single document vector.
    """

    def __init__(self, corpus):
        """
        Description:
            Train the FastText model on the corpus and store the resulting word -> vector mapping.

        Arguments:
            corpus: iterable of document strings used to learn the word embeddings.

        """
        # Kept so that sklearn's get_params()/repr(), which read every __init__ argument back
        # from an attribute of the same name, work on this estimator.
        self.corpus = corpus

        # let tokens be a list of tokenized texts (i.e. list of lists of tokens)
        tokens = [word_tokenize(s) for s in corpus]

        #### Create Fast Text Model ####
        # Set values for various parameters
        print("="*80)
        print("Learning word embeddings for FastText Model...")
        feature_size = 300    # Word vector dimensionality
        window_context = 50   # Context window size
        min_word_count = 3    # Minimum word count
        sample = 1e-3         # Downsample setting for frequent words

        # NOTE(review): `size`, `iter` and `wv.vocab` look like the gensim 3.x API — confirm the
        # pinned gensim version before upgrading.
        ft_model = gensim.models.fasttext.FastText(tokens, size=feature_size, window=window_context,
                                                   min_count=min_word_count,sample=sample, sg=1, iter=50)
        # Look every word up by key. Zipping wv.vocab.keys() with wv.vectors relies on the dict's
        # iteration order matching the row order of the vector matrix, which is not guaranteed
        # and can silently pair words with the wrong vectors.
        ft_embedding = {w: ft_model.wv[w] for w in ft_model.wv.vocab}
        print("\tDone learning word embeddings for FastText Model.")
        print("_"*80)

        self.model = ft_embedding
        # Equal to the length of every stored vector, and still defined when the vocabulary is empty.
        self.dim = feature_size
        self.word2weight = None

    @staticmethod
    def _tokenize(instance):
        """
        Description:
            Split one document into the tokens used by both fit and transform.

        Arguments:
            instance: a document string (split on single spaces) or an already tokenized iterable.

        Returns:
            list of tokens.
        """
        if isinstance(instance, str):
            return instance.split(" ")
        return list(instance)

    def fit(self, X_train, y=None):
        """
        Description:
            When this method is called by the sklearn pipeline, it creates the tf_idf scores for the words.
            These will be used by transform as weights when aggregating the vector representations of each
            word at the instance level.

        Arguments:
            X_train: iterable of documents (strings or token lists).
            y: ignored; accepted for sklearn API compatibility.

        Returns:
            self

        """
        # The analyzer must turn a document into its tokens. Passing the raw string straight
        # through (analyzer=lambda x: x) makes the vectorizer iterate over characters, so no
        # real word ever gets its own idf weight.
        tfidf = TfidfVectorizer(analyzer=self._tokenize)
        tfidf.fit(X_train)
        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of
        # known idf's
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(lambda: max_idf,
                                       [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])
        return self

    def transform(self, X_train):
        """
        Description:
            Turn each document into the mean of its idf-weighted word vectors. Words without an
            embedding are skipped; a document with no embedded words becomes a zero vector.

        Arguments:
            X_train: iterable of documents (strings or token lists).

        Returns:
            float numpy array with one row per document.
        """
        doc_vectors = []
        for instance in X_train:
            weighted = [self.model[w] * self.word2weight[w]
                        for w in self._tokenize(instance) if w in self.model]
            doc_vectors.append(np.mean(weighted, axis=0) if weighted else np.zeros(self.dim))
        return np.array(doc_vectors).astype('float')

In [12]:
# Train the FastText embeddings on the combined keyword + feedback corpus
vec = TfidfEmbeddingVectorizer(corpus)

Learning word embeddings for FastText Model...
	Done learning word embeddings for FastText Model.
________________________________________________________________________________


In [13]:
# Embed every cleaned feedback instance; the idf weights are learned from the instances themselves
X = instance_corpus
X_vec = vec.fit(X).transform(X)

# Standardize the embedding columns with a scaler fitted on the instance vectors
scaler = StandardScaler()
X_vec_scaled = scaler.fit(X_vec).transform(X_vec)

In [14]:
# Build one document per working group by joining the cleaned text of every keyword row that is
# flagged for that group (a non-empty cell in its WG column).
melted_keywords = (
    keywords[['Clean Text', 'WG1', 'WG2', 'WG3', 'WG4']]
    .melt(id_vars='Clean Text')
    .dropna()
    .drop(columns='value')
)
# groupby sorts its keys, so the rows come out ordered WG1, WG2, WG3, WG4
wg_df = melted_keywords.groupby('variable')['Clean Text'].apply(" ".join)
y = wg_df.values

# NOTE(review): this refits the idf weights on the working-group documents only, replacing the
# weights learned from the instances — confirm that is intended.
y_vec = vec.fit(y).transform(y)

# NOTE(review): a second scaler is fitted on just these working-group rows, so instances and
# working groups are standardized with different statistics — confirm that is intended.
scaler = StandardScaler()
y_vec_scaled = scaler.fit(y_vec).transform(y_vec)

## Cosine Similarity
`cosine_similarity` computes the L2-normalized dot product of vectors. That is, if $x$ and $y$ are row vectors, their cosine similarity $k$ is defined as:

$$k(x, y) = \frac{x y^\top}{\|x\| \|y\|}$$

This is called cosine similarity because Euclidean (L2) normalization projects the vectors onto the unit sphere, and their dot product is then the cosine of the angle between the points denoted by the vectors.

The function returns a kernel matrix, which is an array with shape (n_samples_X, n_samples_Y). In this notebook each row is a feedback instance and each column is a working group.

### Score Interpretation
The resulting similarity scores range from $-1$ (meaning exactly opposite) to $1$ (meaning exactly the same). A $0$ indicates orthogonality or decorrelation, while in-between values indicate intermediate similarity or dissimilarity.

In [15]:
# Rows: feedback instances; columns: working groups
d_cos = cosine_similarity(X=X_vec_scaled, Y=y_vec_scaled)

In [16]:
# For every instance (row), find the best-scoring working group column and its score
max_indices = d_cos.argmax(axis=1)
max_values = d_cos.max(axis=1)
# (column position, score) pairs, one per instance
max_cos = [(position, score) for position, score in zip(max_indices, max_values)]

# Create Spreadsheet

In [17]:
# Column position in the similarity matrix -> working group display name
wg_name_map = dict(enumerate(['Enterprise Data Governance',
                              'Access, Use and Augmentation',
                              'Decision-Making and Accountability',
                              'Commercialization, Innovation, and Public Use']))

# Split the (position, score) pairs and write one column each; the position is translated to
# the working group's name on the way in.
best_positions = pd.Series([pair[0] for pair in max_cos], index=data.index)
data['Recommended Working Group'] = best_positions.map(wg_name_map)
data['Recommended Working Group Similarity Score'] = [pair[1] for pair in max_cos]

In [18]:
# One similarity-score column per working group, in the same order as the columns of d_cos
score_columns = [wg + ' Similarity Score' for wg in ('Enterprise Data Governance',
                                                     'Access, Use and Augmentation',
                                                     'Decision-Making and Accountability',
                                                     'Commercialization, Innovation, and Public Use')]
# Label the matrix first so a shape mismatch is reported before anything is written to `data`
scores = pd.DataFrame(d_cos, index=data.index, columns=score_columns)
for column in score_columns:
    data[column] = scores[column].values

In [19]:
# Write the tagged feedback to a single-sheet workbook. The context manager saves and closes the
# file even if to_excel raises; ExcelWriter.save() is no longer available in current pandas, and
# sheet_name is passed by keyword because the positional form is deprecated.
with pd.ExcelWriter('Instances Mapped to Working Groups.xlsx') as writer:
    data.to_excel(writer, sheet_name='Sheet1')