<a href="https://colab.research.google.com/github/Bhargavi61/CAPSTONE/blob/main/Copy_of_Introduction_to_Natural_Language_Processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Introduction to Natural Language Processing

# Installing the libraries

- spaCy: https://spacy.io/

In [None]:
!pip install spacy --upgrade

In [None]:
# Import spaCy and display the version that is actually installed.
import spacy
spacy.__version__

In [None]:
!python -m spacy download pt_core_news_sm
# Portuguese pipeline package

In [None]:
!python -m spacy download fr_core_news_sm
# French pipeline package

In [None]:
!python -m spacy download en_core_web_sm
# English pipeline package (the one loaded with spacy.load further down)

# POS (part-of-speech)

- POS (part-of-speech): noun, adjective, verb
- It is important to find named entities
- Tags: https://ashutoshtripathi.com/2020/04/13/parts-of-speech-tagging-and-dependency-parsing-using-spacy-nlp/

In [6]:
import en_core_web_sm
# Import the English pipeline package that was downloaded above

In [7]:
nlp = spacy.load('en_core_web_sm')
# Load the English pipeline into the variable 'nlp'; calling nlp(text) processes a string

In [8]:
nlp
# Display the object to check that nlp is the English language pipeline

<spacy.lang.en.English at 0x7cad5a287f40>

In [9]:
# Process a two-sentence sample; the resulting `document` is reused by the following cells.
document = nlp('I am learning natural language processing. The course is in London')

In [10]:
# Show every token next to its coarse-grained part-of-speech tag.
for token in document:
  print(f"{token.text} - {token.pos_}")

I - PRON
am - AUX
learning - VERB
natural - ADJ
language - NOUN
processing - NOUN
. - PUNCT
The - DET
course - NOUN
is - AUX
in - ADP
London - PROPN


## Legend

- lemma: "root" of the word
- pos: part-of-speech  
- tag: morphological information (present, future, past)
- dep: syntactic dependency
- shape: lowercase, uppercase
- alpha: if it consists only of alphabetic characters
- stop: if it is a stop word

In [11]:
# One row per token: text, POS, lemma, tag, dependency label, shape,
# is_alpha flag and is_stop flag (same order as the legend above).
for token in document:
  head = f"{token.text} - {token.pos_} -- {token.lemma_}"
  details = (token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop)
  tail = " - ".join(str(value) for value in details)
  print(head, "-", tail)

I - PRON -- I - PRP - nsubj - X - True - True
am - AUX -- be - VBP - aux - xx - True - True
learning - VERB -- learn - VBG - ROOT - xxxx - True - False
natural - ADJ -- natural - JJ - amod - xxxx - True - False
language - NOUN -- language - NN - compound - xxxx - True - False
processing - NOUN -- processing - NN - dobj - xxxx - True - False
. - PUNCT -- . - . - punct - . - False - False
The - DET -- the - DT - det - Xxx - True - True
course - NOUN -- course - NN - nsubj - xxxx - True - False
is - AUX -- be - VBZ - ROOT - xx - True - True
in - ADP -- in - IN - prep - xx - True - True
London - PROPN -- London - NNP - pobj - Xxxxx - True - False


In [12]:
# Print only the tokens tagged as proper nouns.
for token in document:
  if token.pos_ != 'PROPN':
    continue
  print(token.text)

London


In [13]:
# Print only the tokens tagged VERB (auxiliaries such as 'am'/'is' are tagged AUX).
verbs = [tok.text for tok in document if tok.pos_ == 'VERB']
for verb in verbs:
  print(verb)

learning


# Lemmatization and stemming

- Lemmatization: meaning of the word based on the dictionary (morphological analysis) - extract the base word
- Stemming: extract the root of the word

In [14]:
# Compare every token with its lemma.
for token in document:
  print(f"{token.text} - {token.lemma_}")

I - I
am - be
learning - learn
natural - natural
language - language
processing - processing
. - .
The - the
course - course
is - be
in - in
London - London


In [16]:
# Several inflections of two verbs: each one maps back to the same lemma.
inflections = 'learn learning watch watching watched'
doc = nlp(inflections)
[word.lemma_ for word in doc]

['learn', 'learn', 'watch', 'watch', 'watch']

## Lemmatization vs. Stemming

In [None]:
import nltk

In [None]:
# Create NLTK's Porter stemmer and stem a sample word.
stemmer = nltk.stem.PorterStemmer()
stemmer.stem('learning')

'learn'

In [None]:
# Second sample word for the stemmer.
stemmer.stem('watching')

'watch'

In [None]:
# Side-by-side comparison per token: original text, spaCy lemma, Porter stem.
for token in document:
  word = token.text
  print(word, token.lemma_, stemmer.stem(word))

I I i
am be am
learning learn learn
natural natural natur
language language languag
processing processing process
. . .
The the the
course course cours
is be is
in in in
London London london


# Named-entity recognition (NER)

- List of tags: https://towardsdatascience.com/named-entity-recognition-ner-using-spacy-nlp-part-4-28da2ece57c6

In [None]:
# Sample sentence for entity recognition (organisation, places, a year and an amount).
text = (
  'IBM is a US company on information technology. '
  'It is located in San Francisco and revenue in 2018 '
  'was approximately 320 billion dolars'
)

In [None]:
# Run the pipeline on the sample sentence; this rebinds `document`.
document = nlp(text)

In [None]:
# List each recognised entity span followed by its label.
for entity in document.ents:
  print(f'{entity.text} {entity.label_}')

IBM ORG
US GPE
San Francisco GPE
2018 DATE
approximately 320 billion dolars MONEY


In [None]:
from spacy import displacy
# Render the document with its entities highlighted in the notebook output.
displacy.render(document, style = 'ent', jupyter=True)

In [None]:
# Second sample sentence: a person, a city, an ISO-formatted date and a company.
text = (
  'Bill Gates was born in Seattle on 1955-10-28 '
  'and is the founder of Microsoft'
)

In [None]:
# Process the second sample and list its entities with their labels.
document = nlp(text)
for entity in document.ents:
  print('{} {}'.format(entity.text, entity.label_))

Bill Gates PERSON
Seattle GPE
1955-10-28 DATE
Microsoft ORG


In [None]:
# Highlight the entities of the current document in the notebook output.
displacy.render(document, style = 'ent', jupyter=True)

In [None]:
# Keep only the entities labelled PERSON.
for entity in document.ents:
  if entity.label_ != 'PERSON':
    continue
  print(entity.text)

Bill Gates


# Stopwords

- Words that appear very often and don't help to understand the context of the document

In [None]:
# French stop words. Imported under an alias so that the name STOP_WORDS is not
# rebound to a different language by every import in this section; the
# membership/length checks further down are meant to use the English set only.
from spacy.lang.fr.stop_words import STOP_WORDS as FR_STOP_WORDS
print(FR_STOP_WORDS)

{'devra', 'proche', "t'", 'troisième', 'sera', 'concernant', 'autrui', 'nombreux', 'va', 'onzième', 'hormis', 'quelconque', 'quel', "s'", 'ceci', 'suit', 'ai', 'après', 'quatrièmement', 'suis', 'moi', 'â', 'dit', 'tel', 'mienne', 'maint', 'cinquantaine', 'celles-ci', 'il', 'celle-ci', 'cinq', 'anterieur', 'hi', 'huit', 'revoila', 'uns', 'seules', 'spécifiques', 'reste', 'tend', 'elle-meme', 'outre', 'car', 'différents', 'quels', 'as', 'revoici', 'onze', 'différentes', 'parler', 'dejà', 'quoi', 'à', 'par', 'te', 'avais', 'ceux-là', 'lui-même', 'sixième', 'procedant', 'quatrième', 'seront', 'differentes', 'ait', 'lorsque', 'vont', 'suivre', 'aupres', 'jusqu', 'tenant', 'ont', 'dixième', 'hem', 'douzième', 'au', 'effet', 'moi-même', 'sienne', 'semblable', 'nous', 'elle', "c'", 'a', "j'", 'ouverts', 'aurait', 'certains', 'seize', 'hou', 'deux', 'quoique', 'soi-même', 'sous', 'lors', 'chacune', 'cinquante', 'ce', 'antérieures', 'allaient', 'suivants', 'vé', 'certaines', 'cet', 'lesquels', '

In [None]:
# Portuguese stop words. Imported under an alias so that the name STOP_WORDS is
# not rebound to a different language by every import in this section.
from spacy.lang.pt.stop_words import STOP_WORDS as PT_STOP_WORDS
print(PT_STOP_WORDS)

{'fora', 'momento', 'tais', 'ligado', 'máximo', 'ponto', 'ele', 'meio', 'nós', 'próxima', 'só', 'meses', 'zero', 'podia', 'vinda', 'vêm', 'partir', 'caminho', 'tudo', 'pela', 'nossas', 'for', 'coisa', 'sim', 'põe', 'mas', 'isto', 'uns', 'pelo', 'aqueles', 'pôde', 'sem', 'querem', 'aquilo', 'teus', 'direita', 'estiveste', 'cada', 'as', 'através', 'antes', 'onze', 'quinto', 'à', 'te', 'todos', 'vindo', 'deve', 'tem', 'numa', 'posição', 'nossa', 'cento', 'favor', 'tivestes', 'põem', 'contudo', 'outras', 'foi', 'suas', 'é', 'vem', 'vossas', 'pouco', 'duas', 'nova', 'tive', 'talvez', 'dois', 'outros', 'a', 'ao', 'nas', 'quanto', 'fazia', 'fazemos', 'esses', 'usar', 'exemplo', 'outra', 'baixo', 'área', 'ser', 'nível', 'tentar', 'fará', 'elas', 'certeza', 'breve', 'debaixo', 'até', 'povo', 'menos', 'estás', 'vez', 'catorze', 'deste', 'e', 'tanto', 'contra', 'esse', 'bem', 'forma', 'ora', 'nesta', 'local', 'minha', 'vos', 'conhecida', 'seus', 'dentro', 'usa', 'vossa', 'foste', 'obrigada', 'és'

In [None]:
# English stop words; the membership and length checks in the next cells use this set.
from spacy.lang.en.stop_words import STOP_WORDS
print(STOP_WORDS)

{'eight', 'upon', 'every', '’m', 'quite', 'eleven', 'meanwhile', 'their', 'which', 'everything', 'nevertheless', 'call', 'n‘t', 'for', 'several', 'back', 'behind', 'wherever', 'as', 'but', 'per', '’ve', 'themselves', 'put', '‘m', 'along', 'namely', 'whereupon', 'just', 'why', 'used', 'between', 'mostly', 'until', 'bottom', 'regarding', 'after', 'each', 'will', 'well', 'besides', 'beside', 'this', 'you', 'us', 'done', 'another', 'own', 'become', 'afterwards', 'three', 'a', 'however', 'when', '’s', '‘ll', 'throughout', 'top', 'still', 'beyond', 'its', 'latterly', 'latter', 'third', 'such', 'the', 'mine', 'again', 'by', 'thereupon', 'others', 'see', 'ten', 'neither', 'whom', 'it', 'too', 'say', 'herein', 'who', 'both', 'hence', 'did', 'how', 'next', 'n’t', 'perhaps', 'seems', 'very', 'all', 'whenever', '‘ve', 'get', 'thus', 'fifty', 'less', 'moreover', 'therefore', 'everyone', 'seeming', 'using', 'name', 'off', 'thereafter', 'becoming', 'give', 'becomes', 'there', 'otherwise', 'anywhere',

In [None]:
# Membership test against the currently imported STOP_WORDS set.
'it' in STOP_WORDS

True

In [None]:
# Number of entries in the stop-word set.
len(STOP_WORDS)

326

In [None]:
# The stop-word flag can also be read from the pipeline's vocabulary entry.
nlp.vocab['it'].is_stop

True

In [None]:
# Same lookup for a content word, for comparison.
nlp.vocab['walk'].is_stop

False

In [None]:
# Re-create the two-sentence sample, since `document` was rebound in the NER section.
document = nlp('I am learning natural language processing. The course is in London')

In [None]:
# Print the stop words found in the document. The flag is read straight from
# the token (token.is_stop) instead of looking the text up again in nlp.vocab.
for token in document:
  if token.is_stop:
    print(token.text)

I
am
The
is
in


In [None]:
# Print everything that is NOT a stop word. The flag is read straight from the
# token (token.is_stop) instead of looking the text up again in nlp.vocab.
# Punctuation is not a stop word, so '.' is printed as well.
for token in document:
  if not token.is_stop:
    print(token.text)

learning
natural
language
processing
.
course
London


# Dependency parsing

- Parent-child relation

## Example 1

In [None]:
# Sentence with an origin and a destination, used to explore the dependency tree.
document = nlp('book a ticket from London to Paris')

In [None]:
# Select the two city tokens by position: index 4 is 'London', index 6 is 'Paris'.
origin = document[4]
destiny = document[6]  # i.e. the destination
print(origin, destiny)

London Paris


In [None]:
# Tokens that 'London' depends on, directly or indirectly.
list(origin.ancestors)

[from, ticket, book]

In [None]:
# Tokens that 'Paris' depends on, directly or indirectly.
list(destiny.ancestors)

[to, ticket, book]

In [None]:
# Check whether document[0] ('book') is an ancestor of document[2] ('ticket').
document[0].is_ancestor(document[2])

True

## Example 2

In [None]:
# Sentence with two requests (table, taxi) and two places (restaurant, hotel).
document = nlp('Book a table for the restaurant and a taxi to the hotel')

In [None]:
# Tokens selected by position: the things to book and the places they belong to.
tasks = document[2], document[8]  # table, taxi
locations = document[5], document[11]  # restaurant, hotel
print(tasks, locations)

(table, taxi) (restaurant, hotel)


In [None]:
# For each location, list all of its ancestors in the dependency tree.
# In the recorded output, the chain for 'hotel' also passes through 'restaurant'.
for local in locations:
  print('-----', local)
  for obj in local.ancestors:
    print(obj)

----- restaurant
for
table
Book
----- hotel
to
taxi
restaurant
for
table
Book


In [None]:
# Pair each location with the first of its ancestors that is one of the tasks.
for local in locations:
  task = next((ancestor for ancestor in local.ancestors if ancestor in tasks), None)
  if task is not None:
    print(f'Reservation of a {task} to the {local}')

Reservation of a table to the restaurant
Reservation of a taxi to the hotel


In [None]:
# Direct children of document[5] ('restaurant') in the dependency tree.
list(document[5].children)

[the, and, taxi]

## Example 3

In [None]:
from spacy import displacy

In [None]:
# Same sentence as in Example 2, parsed again for the visualisation below.
document = nlp('Book a table for the restaurant and a taxi to the hotel')

In [None]:
# Draw the dependency parse in the notebook; the 'distance' option is passed on to displaCy.
displacy.render(document, style='dep', jupyter=True, options={'distance': 90})

In [None]:
# Ancestors of document[2] ('table').
list(document[2].ancestors)

[Book]

In [None]:
# Direct children of document[2] ('table').
list(document[2].children)

[a, for]

## Example 4

In [None]:
# Question with two actions and two places; the tokens are selected by position.
document = nlp('What places can we visit in London and stay in Paris?')
locations = document[6], document[10]  # London, Paris
actions = document[4], document[8]  # visit, stay
print(locations, actions)

(London, Paris) (visit, stay)


In [None]:
# Pair each location with the first of its ancestors that is one of the actions.
for local in locations:
  matching = [ancestor for ancestor in local.ancestors if ancestor in actions]
  if matching:
    print(f'{local} to {matching[0]}')

London to visit
Paris to stay


In [None]:
# Draw the dependency parse of the question above.
displacy.render(document, style='dep', jupyter=True, options={'distance': 90})

# Similarity between words and sentences

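- spaCy computes similarity from word vectors such as those produced by the GloVe algorithm (Global Vectors for Word Representation). Note that the small `en_core_web_sm` pipeline loaded in this notebook ships without static word vectors, so the scores below are only rough approximations (spaCy emits a warning); load `en_core_web_md` or `en_core_web_lg` for meaningful similarities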
- Original paper: https://nlp.stanford.edu/pubs/glove.pdf

## Example 1

In [None]:
# Three single-word documents to compare with each other.
# NOTE(review): `nlp` is the small en_core_web_sm pipeline, which presumably has
# no static word vectors; the recorded outputs of the similarity cells echo the
# calling line, which looks like the tail of a warning. Confirm, and consider
# a pipeline with vectors (e.g. en_core_web_md) for meaningful scores.
w1 = nlp('hello')
w2 = nlp('hi')
w3 = nlp('or')

In [None]:
# Similarity between 'hello' and 'hi'.
w1.similarity(w2)

  w1.similarity(w2)


0.7161104995803708

In [None]:
# Same pair in reverse order; the recorded score is identical.
w2.similarity(w1)

  w2.similarity(w1)


0.7161104995803708

In [None]:
# Similarity between 'hello' and 'or'.
w1.similarity(w3)

  w1.similarity(w3)


0.1586774408088656

In [None]:
# Similarity between 'hi' and 'or'.
w2.similarity(w3)

  w2.similarity(w3)


0.31475242221455424

In [None]:
# Three sentences: the first two are about the same topic, the third is not.
text1 = nlp('When will the new movie be released?')
text2 = nlp('The new movie will be released next month')
text3 = nlp('What color is the car?')

In [None]:
# Similarity between the two sentences about the movie release.
text1.similarity(text2)

  text1.similarity(text2)


0.701367333985553

In [None]:
# Similarity between the movie question and the car question.
text1.similarity(text3)

  text1.similarity(text3)


0.4782758141062681

In [None]:
# New York
# Nw Yok

## Example 2

In [None]:
# Four single-word tokens in one document; note that `text` now holds a processed
# document rather than a plain string as in the NER section.
text = nlp('cat dog horse person')

In [None]:
# Pairwise similarity between all tokens of `text`, printed as a percentage.
# The loop variables are named token_a/token_b so that they do not overwrite
# the `text1`/`text2` documents created earlier in this section.
for token_a in text:
  for token_b in text:
    similarity = token_a.similarity(token_b) * 100
    print('{} is {}% similar to {}'.format(token_a, similarity, token_b))

cat is 100.0% similar to cat
cat is 55.56725263595581% similar to dog
cat is 49.9476432800293% similar to horse
cat is 19.96726244688034% similar to person
dog is 55.56725263595581% similar to cat
dog is 100.0% similar to dog
dog is 66.69515371322632% similar to horse
dog is 35.0044310092926% similar to person
horse is 49.9476432800293% similar to cat
horse is 66.69515371322632% similar to dog
horse is 100.0% similar to horse
horse is 28.581640124320984% similar to person
person is 19.96726244688034% similar to cat
person is 35.0044310092926% similar to dog
person is 28.581640124320984% similar to horse
person is 100.0% similar to person


  similarity = text1.similarity(text2) * 100


# Tokenization

In [None]:
# Sample with three sentences, the last one starting with the abbreviation 'Ph.d'.
document1 = nlp('I am learning natural language processing. The course is in London. Ph.d John is coming')

In [None]:
# Print each token: sentence-final periods become separate tokens,
# while 'Ph.d' is kept together as a single token.
for token in document1:
  print(token)

I
am
learning
natural
language
processing
.
The
course
is
in
London
.
Ph.d
John
is
coming


In [None]:
# Naive alternative for comparison: splitting the raw string on '.' also
# breaks the abbreviation 'Ph.d' into two pieces.
document2 = (
  'I am learning natural language processing. '
  'The course is in London. Ph.d John is coming'
)
document2.split('.')

['I am learning natural language processing',
 ' The course is in London',
 ' Ph',
 'd John is coming']