# 1. Text Operations

In [1]:
# Install libraries
!pip install nltk
!pip install spacy

Collecting spacy
  Downloading spacy-3.7.4-cp39-cp39-macosx_10_9_x86_64.whl (6.9 MB)
[K     |████████████████████████████████| 6.9 MB 1.7 MB/s eta 0:00:01
[?25hCollecting spacy-loggers<2.0.0,>=1.0.0
  Downloading spacy_loggers-1.0.5-py3-none-any.whl (22 kB)
Collecting murmurhash<1.1.0,>=0.28.0
  Downloading murmurhash-1.0.10-cp39-cp39-macosx_10_9_x86_64.whl (26 kB)
Collecting pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4
  Downloading pydantic-2.7.3-py3-none-any.whl (409 kB)
[K     |████████████████████████████████| 409 kB 8.3 MB/s eta 0:00:01
[?25hCollecting langcodes<4.0.0,>=3.2.0
  Downloading langcodes-3.4.0-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 2.5 MB/s eta 0:00:01
[?25hCollecting cymem<2.1.0,>=2.0.2
  Downloading cymem-2.0.8-cp39-cp39-macosx_10_9_x86_64.whl (42 kB)
[K     |████████████████████████████████| 42 kB 1.7 MB/s eta 0:00:01
[?25hCollecting srsly<3.0.0,>=2.4.3
  Downloading srsly-2.4.8-cp39-cp39-macosx_10_9_x86_64.whl (493 kB)
[K     |

In [4]:
# Import the library
import nltk

In [10]:
# Sample passage that the text-processing cells operate on.
# NOTE(review): the second sentence contains typos ("His currently enrolles");
# it is kept verbatim because the recorded outputs depend on the exact wording.
text = (
    "Ben relocated to Paris last year to pursue his passion. "
    "His currently enrolles in a comprehensive course on Natural Language Processing"
)
text

'Ben relocated to Paris last year to pursue his passion. His currently enrolles in a comprehensive course on Natural Language Processing'

In [6]:
# Tokenizer data needed by `word_tokenize`.
# NLTK 3.9+ loads the 'punkt_tab' resource instead of the older pickled
# 'punkt' models, so fetch both to keep this notebook working across versions.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /Users/deepshah/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [11]:
# Tokenization = split the raw string into a list of word and punctuation tokens
from nltk.tokenize import word_tokenize
tokens = word_tokenize(text)
# Print the token list so the split can be inspected
print(tokens)

['Ben', 'relocated', 'to', 'Paris', 'last', 'year', 'to', 'pursue', 'his', 'passion', '.', 'His', 'currently', 'enrolles', 'in', 'a', 'comprehensive', 'course', 'on', 'Natural', 'Language', 'Processing']


In [12]:
# Stemming = reduce every token to its stem with the Porter algorithm.
# As the printed result shows, stems are lower-cased and need not be real
# words (e.g. 'Paris' -> 'pari').
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
stemmed_words = list(map(stemmer.stem, tokens))
print(stemmed_words)

['ben', 'reloc', 'to', 'pari', 'last', 'year', 'to', 'pursu', 'hi', 'passion', '.', 'hi', 'current', 'enrol', 'in', 'a', 'comprehens', 'cours', 'on', 'natur', 'languag', 'process']


In [14]:
# Tagger model needed by `pos_tag`.
# NLTK 3.9+ looks for 'averaged_perceptron_tagger_eng' rather than the older
# pickled 'averaged_perceptron_tagger', so download both for compatibility.
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/deepshah/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [15]:
# Part-of-speech tagging: pair every token with a grammatical tag
# (e.g. NNP, VBD), producing a list of (token, tag) tuples
from nltk import pos_tag
text_pos_tag = pos_tag(tokens)
# Print the (token, tag) pairs for inspection
print(text_pos_tag)

[('Ben', 'NNP'), ('relocated', 'VBD'), ('to', 'TO'), ('Paris', 'NNP'), ('last', 'JJ'), ('year', 'NN'), ('to', 'TO'), ('pursue', 'VB'), ('his', 'PRP$'), ('passion', 'NN'), ('.', '.'), ('His', 'PRP$'), ('currently', 'RB'), ('enrolles', 'VBZ'), ('in', 'IN'), ('a', 'DT'), ('comprehensive', 'JJ'), ('course', 'NN'), ('on', 'IN'), ('Natural', 'NNP'), ('Language', 'NNP'), ('Processing', 'NNP')]


In [17]:
# Look up the meaning of a Penn Treebank tag, with example words.
# NLTK 3.9+ reads the help data from the 'tagsets_json' resource instead of
# the older pickled 'tagsets' one, so download both for compatibility.
nltk.download('tagsets')
nltk.download('tagsets_json')
nltk.help.upenn_tagset("NNP")

[nltk_data] Downloading package tagsets to
[nltk_data]     /Users/deepshah/nltk_data...


NNP: noun, proper, singular
    Motown Venneboerger Czestochwa Ranzer Conchita Trumplane Christos
    Oceanside Escobar Kreisler Sawyer Cougar Yvette Ervin ODI Darryl CTCA
    Shannon A.K.C. Meltex Liverpool ...


[nltk_data]   Unzipping help/tagsets.zip.


In [21]:
# Named Entity Recognition: group the POS-tagged tokens into labelled chunks
# (the printed tree contains PERSON, GPE and ORGANIZATION nodes).
# NLTK 3.9+ loads the chunker from 'maxent_ne_chunker_tab' instead of the older
# pickled 'maxent_ne_chunker', so download both for compatibility.
nltk.download('maxent_ne_chunker')
nltk.download('maxent_ne_chunker_tab')
nltk.download('words')
entities = nltk.ne_chunk(text_pos_tag)
print(entities)

(S
  (PERSON Ben/NNP)
  relocated/VBD
  to/TO
  (GPE Paris/NNP)
  last/JJ
  year/NN
  to/TO
  pursue/VB
  his/PRP$
  passion/NN
  ./.
  His/PRP$
  currently/RB
  enrolles/VBZ
  in/IN
  a/DT
  comprehensive/JJ
  course/NN
  on/IN
  (ORGANIZATION Natural/NNP Language/NNP)
  Processing/NNP)


[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/deepshah/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /Users/deepshah/nltk_data...
[nltk_data]   Package words is already up-to-date!


# 2. Word Embeddings

In [22]:
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[K     |████████████████████████████████| 587.7 MB 18 kB/s  eta 0:00:012     |██████████████████████▎         | 409.7 MB 4.1 MB/s eta 0:00:44     |████████████████████████▎       | 446.0 MB 11.3 MB/s eta 0:00:13
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.7.1
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_lg')


In [24]:
import spacy

# Load the large English spaCy pipeline
nlp = spacy.load('en_core_web_lg')

# Process each word with the pipeline
word1, word2, word3 = (nlp(term) for term in ("king", "queen", "apple"))

# Compare "king" against "queen" and against "apple"
similarity1, similarity2 = (word1.similarity(other) for other in (word2, word3))

# Show one similarity score per line
print(similarity1, similarity2, sep="\n")

0.6108841234425123
0.19521993793686707
