Explore the part-of-speech (POS) tagging results obtained using various packages  
Data: the POS results produced by stanza on the projects' names 

In [20]:
import re
import os
import pandas as pd
import nltk
import numpy as np
from nltk.corpus import wordnet as wn
# Shared nltk normalizers, instantiated once and reused by the cells below:
# a WordNet-based lemmatizer and the Porter and Lancaster stemmers.
wnl = nltk.WordNetLemmatizer()
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()

def isint(value):
    '''
    Try to convert the input value to an integer.

    Returns the converted int when int(value) succeeds and False otherwise.
    Callers should test the result with ``type(isint(x)) == int`` rather than
    by truthiness: a valid 0 is falsy, while False is not of type int.
    Values that int() rejects with TypeError or OverflowError (e.g. None,
    infinity) are reported as False as well instead of raising.
    '''
    try:
        return int(value)
    except (ValueError, TypeError, OverflowError):
        return False

In [31]:
# Folder holding the pickled stanza output; the spreadsheets produced by the
# following cells are written to the same place.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
results_dir = 'C:\\Users\\RonyArmon\\Projects_Code\\Cluster_Activities\\results'
pos_pickle = os.path.join(results_dir, 'pos.pkl')
# NOTE(review): unpickling can execute code stored in the file - only load trusted pickles.
pos_df = pd.read_pickle(pos_pickle)
row_count = pos_df.shape[0]
print('{} rows'.format(row_count))
print(pos_df.head())

159626 rows
   token_id  name_id loc      token      lemma   upos xpos head deprel  \
0         1        1   1  construct  construct   VERB   VB    0   root   
0         2        1   2     trials      trial   NOUN  NNS    1    obj   
0         3        1   3          &          &  CCONJ   CC    4     cc   
0         4        1   4       test       test   NOUN   NN    2   conj   
0         5        1   5        for        for    ADP   IN    7   case   

  start_char end_char                  feats  
0          0        9  Mood=Imp|VerbForm=Fin  
0         10       16            Number=Plur  
0         17       18                      _  
0         19       23            Number=Sing  
0         24       27                      _  


# Stems and Lemmas  
Compare stanza and nltk lemmatizers and stemmers   

In [32]:
# One row per distinct (token, stanza lemma, upos) combination
distinct_tokens = pos_df[['token', 'lemma', 'upos']].drop_duplicates()
stems_lemmas = distinct_tokens.rename(columns={'lemma': 'StanzaLemmatizer'})
tokens = list(stems_lemmas['token'])
# Run each nltk normalizer over the same token list, adding one column apiece
normalizers = [
    ('WordNetLemmatizer', wnl.lemmatize),
    ('PorterStemmer', porter.stem),
    ('LancasterStemmer', lancaster.stem),
]
for column, normalize in normalizers:
    stems_lemmas[column] = [normalize(t) for t in tokens]
stems_lemmas = stems_lemmas.reset_index(drop=True)
print(stems_lemmas.head())
# Export the side-by-side comparison for manual inspection
stems_lemmas_path = os.path.join(results_dir, 'stems_lemmas.xlsx')
stems_lemmas.to_excel(stems_lemmas_path, index=False)

       token StanzaLemmatizer   upos WordNetLemmatizer PorterStemmer  \
0  construct        construct   VERB         construct     construct   
1     trials            trial   NOUN             trial         trial   
2          &                &  CCONJ                 &             &   
3       test             test   NOUN              test          test   
4        for              for    ADP               for           for   

  LancasterStemmer  
0        construct  
1              tri  
2                &  
3             test  
4              for  


# Progressive forms  

In [36]:
# Split the verb tokens ending in 'ing' by whether the stanza lemma differs
# from the token (lemmatized) or is identical to it (not lemmatized).
# NOTE(review): this replaces stems_lemmas with its VERB subset for the rest
# of the session.
stems_lemmas = stems_lemmas[stems_lemmas['upos'] == 'VERB']
indices, lemmatized, not_lemmatized = [], [], []
verb_rows = zip(stems_lemmas.index, stems_lemmas['token'], stems_lemmas['StanzaLemmatizer'])
for index, raw_token, raw_lemma in verb_rows:
    token, lemma = str(raw_token), str(raw_lemma)
    if not re.search('ing$', token):
        continue
    if token != lemma:
        lemmatized.append(token)
        continue
    # Lemma equals the token: stanza left the '-ing' form untouched
    not_lemmatized.append(token)
    indices.append(index)
# Keep only the untouched rows and the columns relevant for comparing them
untouched_rows = stems_lemmas.index.isin(indices)
progressive_forms = stems_lemmas[untouched_rows].drop(
    columns=['upos', 'WordNetLemmatizer', 'LancasterStemmer'])
print(progressive_forms.head())
print('{} verbs lemmatized, {} verbs not lemmatized'.format(len(lemmatized), len(not_lemmatized)))
progressive_forms.to_excel(os.path.join(results_dir,'progressive_forms.xlsx'), index=False)

            token StanzaLemmatizer PorterStemmer
2050      probing          probing         probe
3047  engineering      engineering         engin
5146    lightning        lightning        lightn
5213      tracing          tracing         trace
5462      fencing          fencing          fenc
116 verbs lemmatized, 6 verbs not lemmatized


# Synonyms by Wordnet synsets

In [48]:
# Build a synonym -> lemma lookup from the WordNet synsets of every usable
# lemma found in pos_df.
results = []
# Sorted so that re-runs are reproducible: a bare set of strings iterates in a
# hash-seed-dependent order, and the lookup below is last-writer-wins when two
# lemmas share a synonym. Missing lemmas are dropped, the rest handled as text.
lemmas = sorted(set(pos_df['lemma'].dropna().astype(str)))
synonyms_tokens = {}
# Ordinals such as 1st, 2nd, 3rd, 4th. The suffixes must be a group: a
# character class like [st|nd|rd|th] matches ONE character, so '2nd' would
# never be recognised.
ordinal_pattern = re.compile(r'^\d+(?:st|nd|rd|th)$')
# Characters that join the parts of a multi word WordNet lemma name
multi_word_markers = ('_', '-')
for token in lemmas:
    # Exclude numeric tokens, single character tokens and ordinals
    if type(isint(token)) == int or len(token) <= 1 or ordinal_pattern.search(token):
        continue
    # Collect the lemma names of all the Wordnet synsets of the token
    candidates = {l.name() for syn in wn.synsets(token) for l in syn.lemmas()}
    synonyms = [
        s for s in sorted(candidates)
        # Exclude multi word synonyms (e.g. 'survival_of_the_fittest')
        if not any(m in s for m in multi_word_markers)
        # Exclude proper nouns, such as names (e.g. Clarence_Day) and capitalized forms
        and s == s.lower()
        # Exclude numeric synonyms
        and type(isint(s)) != int
        # Exclude single character synonyms
        and len(s) > 1
        # Exclude the token itself
        and s != token
    ]
    for synonym in synonyms:
        synonyms_tokens[synonym] = token

In [54]:
# Tabulate (lemma, synonym) pairs from the WordNet synsets of every usable
# lemma found in pos_df and export them to a spreadsheet.
results = []
# Sorted so that the exported rows come out in the same order on every run
# (a bare set of strings iterates in a hash-seed-dependent order). Missing
# lemmas are dropped, the rest handled as text.
lemmas = sorted(set(pos_df['lemma'].dropna().astype(str)))
# Ordinals such as 1st, 2nd, 3rd, 4th. The suffixes must be a group: a
# character class like [st|nd|rd|th] matches ONE character, so '2nd' would
# never be recognised.
ordinal_pattern = re.compile(r'^\d+(?:st|nd|rd|th)$')
# Characters that join the parts of a multi word WordNet lemma name
multi_word_markers = ('_', '-')

for lemma in lemmas:
    # Exclude numeric lemmas, single character lemmas and ordinals
    if type(isint(lemma)) == int or len(lemma) <= 1 or ordinal_pattern.search(lemma):
        continue
    # Collect the lemma names of all the Wordnet synsets of the lemma
    candidates = {l.name() for syn in wn.synsets(lemma) for l in syn.lemmas()}
    synonyms = [
        s for s in sorted(candidates)
        # Exclude multi word synonyms (e.g. 'survival_of_the_fittest')
        if not any(m in s for m in multi_word_markers)
        # Exclude proper nouns, such as names (e.g. Clarence_Day) and capitalized forms
        and s == s.lower()
        # Exclude numeric synonyms
        and type(isint(s)) != int
        # Exclude single character synonyms
        and len(s) > 1
        # Exclude the lemma itself
        and s != lemma
    ]
    results.extend([lemma, synonym] for synonym in synonyms)
lemmas_synonyms = pd.DataFrame(results, columns = ['lemma', 'synonym'])
lemmas_synonyms.to_excel(os.path.join(results_dir, 'lemmas_synonyms.xlsx'), index=False)