# Names normalization using the nltk stemmers and lemmatizer 

In [1]:
from prep import *
import nltk
#nltk.download('wordnet')
#nltk.download('punkt')
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import EnglishStemmer
import stanza

nlp = stanza.Pipeline(lang="en", use_gpu= True)
def lemmatize_text (text):
    '''
    :params:
    text: input text
    param1: stanza's language model
    :return: list of token-lemma tuple for each token in text
    '''

    text_tokens_lemmas = []
    doc = nlp(text)
    tokens_lemmas = [(word.text, word.lemma) \
                     for sent in doc.sentences for word in sent.words]
    for token_lemma in tokens_lemmas:
        token = token_lemma[0]
        lemma = token_lemma[1]
        #print('{t}:{l}'.format(t=token, l=lemma))
        if ('_' not in token) and (token != lemma) and (lemma != 'None'):
            text_tokens_lemmas.append(token_lemma)

    return text, text_tokens_lemmas

task_names = task_names[:1000]
start = time.time()
names_tokens_lemmas = []
print('analyzing {n} sentences'.format(n=len(task_names)))
for index, name in enumerate(task_names):
    print('name number ', index+1)
    name = name.lower()
    name, name_tokens_lemmas = lemmatize_text(name)
    if name_tokens_lemmas:
        print(30*'-')
        #print('task name:               ', name)
        print('tokens, different lemmas:', name_tokens_lemmas)
        names_tokens_lemmas += name_tokens_lemmas
end = time.time()
print('duration=', end-start)

2021-12-26 09:15:19 INFO: Loading these models for language: en (English):
| Processor    | Package   |
----------------------------
| tokenize     | combined  |
| pos          | combined  |
| lemma        | combined  |
| depparse     | combined  |
| sentiment    | sstplus   |
| constituency | wsj       |
| ner          | ontonotes |

2021-12-26 09:15:19 INFO: Use device: cpu
2021-12-26 09:15:19 INFO: Loading: tokenize
2021-12-26 09:15:19 INFO: Loading: pos
2021-12-26 09:15:19 INFO: Loading: lemma
2021-12-26 09:15:19 INFO: Loading: depparse
2021-12-26 09:15:19 INFO: Loading: sentiment
2021-12-26 09:15:20 INFO: Loading: constituency
2021-12-26 09:15:20 INFO: Loading: ner
2021-12-26 09:15:21 INFO: Done loading processors!


analyzing 1000 sentences
name number  1
------------------------------
tokens, different lemmas: [('stated', 'state')]
name number  2
name number  3
------------------------------
tokens, different lemmas: [('works', 'work'), ('works', 'work')]
name number  4
name number  5
------------------------------
tokens, different lemmas: [('tunnels', 'tunnel')]
name number  6
------------------------------
tokens, different lemmas: [('works', 'work')]
name number  7
name number  8
name number  9
name number  10
name number  11
name number  12
name number  13
name number  14
name number  15
name number  16
name number  17
name number  18
name number  19
name number  20
name number  21
name number  22
name number  23
name number  24


KeyboardInterrupt: 

In [14]:
unique_tokens_lemas = list(set(names_tokens_lemmas)) 
print(unique_tokens_lemas)

[('1,367', '1367'), ('cylinders', 'cylinder'), ('services', 'service'), ('4,277', '4277'), ('m.s', 'm.'), ('placing', 'place'), ('bridges', 'bridge'), ('parts', 'part'), ('utiltiies', 'utiltiy'), ('pre-drilling', 'pre-dril'), ('weeks', 'week'), ('2,608', '2608'), ('corridors', 'corridor'), ('approved', 'approve'), ('tunnelling', 'tunnel'), ('devices', 'device'), ('stairs', 'stair'), ('backs', 'back'), ('poured', 'pour'), ('pipes', 'pipe'), ('dismantling', 'dismantle'), ('risers', 'riser'), ('procedures', 'procedure'), ('accessories', 'accessory'), ('leading', 'lead'), ('1,292', '1292'), ('purchasing', 'purchase'), ('sams', 'sam'), ('26,322', '26322'), ('8,781', '8781'), ('i', 'I'), ('chartered', 'charter'), ('fixed', 'fix'), ('rms', 'rm'), ('running', 'run'), ('grating', 'grate'), ('1,130', '1130'), ('demolished', 'demolish'), ('5,052', '5052'), ('defects', 'defect'), ('2,076', '2076'), ('10,694', '10694'), ('repositioning', 'reposition'), ('1,560', '1560'), ('footing', 'foot'), ('boll

# Observations
### Calculation Duration
Lemmatizing all sentences takes a long time (40 minutes and running)

### Results  
1. Verbs: Lemmas appear to be useful to normalize inflections:  
'specified', 'specify'; 'designated', 'designate'; 'built', 'build'    
2. Nouns: Plural form lemmas may be misleading as the plural form may indicate a significant difference in work load:   
'tunnels' -> 'tunnel', 'gates' -> 'gate', 'wkys', 'wky'  
<font color='red'>! Lemmatization or another process to distinguish plural from singular forms may be useful to identify dissimilarities between tasks </font>   
difflib.ndiff may also be useful to distinguish plural from singular forms: https://stackoverflow.com/questions/17904097/python-difference-between-two-strings/17904977