In [1]:
import nltk
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Lost\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
from nltk.corpus import wordnet
from typing import List

In [3]:
color=wordnet.synset('red.n.01')

In [4]:
list(color.closure(lambda s: s.hypernyms()))

[Synset('chromatic_color.n.01'),
 Synset('color.n.01'),
 Synset('visual_property.n.01'),
 Synset('property.n.02'),
 Synset('attribute.n.02'),
 Synset('abstraction.n.06'),
 Synset('entity.n.01')]

In [5]:
def get_synonyms(word: str) -> List[str]:
    synonyms = [] 
  
    for syn in wordnet.synsets(word): 
        for l in syn.lemmas(): 
            synonyms.append(l.name()) 
    return synonyms

In [6]:
get_synonyms("data")

['data', 'information', 'datum', 'data_point']

In [7]:
print(wordnet.synsets("run")[0].examples())

['the Yankees scored 3 runs in the bottom of the 9th', 'their first tally came in the 3rd inning']


In [8]:
w1 = wordnet.synset('run.v.01')
w2 = wordnet.synset('jump.v.01')
print(w1.wup_similarity(w2))

0.2857142857142857


In [9]:
w1 = wordnet.synset('run.v.01')
w2 = wordnet.synset('climb.v.01')
print(w1.wup_similarity(w2))

0.25


In [10]:
w1 = wordnet.synset('run.v.01')
w2 = wordnet.synset('sit.v.01')
print(w1.wup_similarity(w2))

0.3333333333333333


In [11]:
w1 = wordnet.synset('run.v.01')
w2 = wordnet.synset('drive.v.01')
print(w1.wup_similarity(w2))

0.2


In [12]:
text="""Cristiano Ronaldo dos Santos Aveiro GOIH ComM (born 5 February 1985) is a Portuguese professional footballer who plays as a forward for Serie A club Juventus and captains the Portugal national team. Often considered the best player in the world and widely regarded as one of the greatest players of all time, Ronaldo has won five Ballons d'Or[note 3] and four European Golden Shoes, both of which are records for a European player. He has won 30 major trophies in his career, including seven league titles, five UEFA Champions Leagues, one UEFA European Championship, and one UEFA Nations League title. Ronaldo holds the records for the most goals (130) and assists (41) in the history of the UEFA Champions League. He is one of the few recorded players to have made over 1,000 professional career appearances and has scored over 700 senior career goals for club and country. He is also the second player to score 100 international goals, and the first European to achieve the feat."""

In [13]:
import nltk

In [14]:
tokens = text.lower().split()

In [15]:
from collections import Counter
token_cnt=Counter(tokens)

In [16]:
max(token_cnt, key=token_cnt.get)

'the'

In [17]:
tokens_frequency = nltk.FreqDist(tokens)

In [18]:
tokens_frequency.most_common()

[('the', 12),
 ('and', 8),
 ('of', 5),
 ('a', 4),
 ('for', 4),
 ('one', 4),
 ('european', 4),
 ('uefa', 4),
 ('ronaldo', 3),
 ('is', 3),
 ('in', 3),
 ('has', 3),
 ('he', 3),
 ('to', 3),
 ('professional', 2),
 ('as', 2),
 ('club', 2),
 ('player', 2),
 ('players', 2),
 ('won', 2),
 ('five', 2),
 ('records', 2),
 ('league', 2),
 ('champions', 2),
 ('goals', 2),
 ('over', 2),
 ('career', 2),
 ('cristiano', 1),
 ('dos', 1),
 ('santos', 1),
 ('aveiro', 1),
 ('goih', 1),
 ('comm', 1),
 ('(born', 1),
 ('5', 1),
 ('february', 1),
 ('1985)', 1),
 ('portuguese', 1),
 ('footballer', 1),
 ('who', 1),
 ('plays', 1),
 ('forward', 1),
 ('serie', 1),
 ('juventus', 1),
 ('captains', 1),
 ('portugal', 1),
 ('national', 1),
 ('team.', 1),
 ('often', 1),
 ('considered', 1),
 ('best', 1),
 ('world', 1),
 ('widely', 1),
 ('regarded', 1),
 ('greatest', 1),
 ('all', 1),
 ('time,', 1),
 ('ballons', 1),
 ("d'or[note", 1),
 ('3]', 1),
 ('four', 1),
 ('golden', 1),
 ('shoes,', 1),
 ('both', 1),
 ('which', 1),
 ('a

In [21]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lost\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [19]:
from nltk.corpus import stopwords

In [22]:
stop_words = stopwords.words('english')

In [23]:
cleaned_tokens = [token for token in tokens if token not in stop_words]

In [24]:
cleaned_tokens_frequency = nltk.FreqDist(cleaned_tokens)
cleaned_tokens_frequency.most_common()

[('one', 4),
 ('european', 4),
 ('uefa', 4),
 ('ronaldo', 3),
 ('professional', 2),
 ('club', 2),
 ('player', 2),
 ('players', 2),
 ('five', 2),
 ('records', 2),
 ('league', 2),
 ('champions', 2),
 ('goals', 2),
 ('career', 2),
 ('cristiano', 1),
 ('dos', 1),
 ('santos', 1),
 ('aveiro', 1),
 ('goih', 1),
 ('comm', 1),
 ('(born', 1),
 ('5', 1),
 ('february', 1),
 ('1985)', 1),
 ('portuguese', 1),
 ('footballer', 1),
 ('plays', 1),
 ('forward', 1),
 ('serie', 1),
 ('juventus', 1),
 ('captains', 1),
 ('portugal', 1),
 ('national', 1),
 ('team.', 1),
 ('often', 1),
 ('considered', 1),
 ('best', 1),
 ('world', 1),
 ('widely', 1),
 ('regarded', 1),
 ('greatest', 1),
 ('time,', 1),
 ('ballons', 1),
 ("d'or[note", 1),
 ('3]', 1),
 ('four', 1),
 ('golden', 1),
 ('shoes,', 1),
 ('player.', 1),
 ('30', 1),
 ('major', 1),
 ('trophies', 1),
 ('career,', 1),
 ('including', 1),
 ('seven', 1),
 ('titles,', 1),
 ('leagues,', 1),
 ('championship,', 1),
 ('nations', 1),
 ('title.', 1),
 ('holds', 1),
 ('

In [25]:
len(set(cleaned_tokens))

83

In [26]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [27]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

In [29]:

lemmatized_word_cleaned_tokens = [lemmatizer.lemmatize(token, pos="v") for token in cleaned_tokens]

In [30]:
cleaned_tokens

['cristiano',
 'ronaldo',
 'dos',
 'santos',
 'aveiro',
 'goih',
 'comm',
 '(born',
 '5',
 'february',
 '1985)',
 'portuguese',
 'professional',
 'footballer',
 'plays',
 'forward',
 'serie',
 'club',
 'juventus',
 'captains',
 'portugal',
 'national',
 'team.',
 'often',
 'considered',
 'best',
 'player',
 'world',
 'widely',
 'regarded',
 'one',
 'greatest',
 'players',
 'time,',
 'ronaldo',
 'five',
 'ballons',
 "d'or[note",
 '3]',
 'four',
 'european',
 'golden',
 'shoes,',
 'records',
 'european',
 'player.',
 '30',
 'major',
 'trophies',
 'career,',
 'including',
 'seven',
 'league',
 'titles,',
 'five',
 'uefa',
 'champions',
 'leagues,',
 'one',
 'uefa',
 'european',
 'championship,',
 'one',
 'uefa',
 'nations',
 'league',
 'title.',
 'ronaldo',
 'holds',
 'records',
 'goals',
 '(130)',
 'assists',
 '(41)',
 'history',
 'uefa',
 'champions',
 'league.',
 'one',
 'recorded',
 'players',
 'made',
 '1,000',
 'professional',
 'career',
 'appearances',
 'scored',
 '700',
 'senior',

In [32]:
nltk.download('punkt')
sentences = sent_tokenize(text)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lost\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [33]:
word_tokens = word_tokenize(text)

In [37]:
word_tokens = word_tokenize(text.lower())
word_cleaned_tokens = [token for token in word_tokens if token not in stop_words]

In [38]:
word_cleaned_tokens_frequency = nltk.FreqDist(word_cleaned_tokens)
word_cleaned_tokens_frequency.most_common()

[(',', 7),
 ('.', 6),
 ('one', 4),
 ('european', 4),
 ('uefa', 4),
 ('ronaldo', 3),
 ('(', 3),
 (')', 3),
 ('player', 3),
 ('career', 3),
 ('league', 3),
 ('goals', 3),
 ('professional', 2),
 ('club', 2),
 ('players', 2),
 ('five', 2),
 ('records', 2),
 ('champions', 2),
 ('cristiano', 1),
 ('dos', 1),
 ('santos', 1),
 ('aveiro', 1),
 ('goih', 1),
 ('comm', 1),
 ('born', 1),
 ('5', 1),
 ('february', 1),
 ('1985', 1),
 ('portuguese', 1),
 ('footballer', 1),
 ('plays', 1),
 ('forward', 1),
 ('serie', 1),
 ('juventus', 1),
 ('captains', 1),
 ('portugal', 1),
 ('national', 1),
 ('team', 1),
 ('often', 1),
 ('considered', 1),
 ('best', 1),
 ('world', 1),
 ('widely', 1),
 ('regarded', 1),
 ('greatest', 1),
 ('time', 1),
 ('ballons', 1),
 ("d'or", 1),
 ('[', 1),
 ('note', 1),
 ('3', 1),
 (']', 1),
 ('four', 1),
 ('golden', 1),
 ('shoes', 1),
 ('30', 1),
 ('major', 1),
 ('trophies', 1),
 ('including', 1),
 ('seven', 1),
 ('titles', 1),
 ('leagues', 1),
 ('championship', 1),
 ('nations', 1),
 (

In [40]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

print(stemmer.stem("spinning"))

spin


In [41]:
word = "spinning"
print(f'Basic vebr form for {word}: {lemmatizer.lemmatize(word, pos="v")}')
print(f'Basic adjective form for {word}: {lemmatizer.lemmatize(word, pos="a")}')

Basic vebr form for spinning: spin
Basic adjective form for spinning: spinning


In [42]:
import spacy

In [44]:
!python -m spacy download en_core_web_sm

Collecting en_core_web_sm==2.3.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz (12.0 MB)
Building wheels for collected packages: en-core-web-sm
  Building wheel for en-core-web-sm (setup.py): started
  Building wheel for en-core-web-sm (setup.py): finished with status 'done'
  Created wheel for en-core-web-sm: filename=en_core_web_sm-2.3.1-py3-none-any.whl size=12047114 sha256=e3af0061876025dc4537ab74e34e59eea33255bdf3d4571e1d01c86960673dad
  Stored in directory: C:\Users\Lost\AppData\Local\Temp\pip-ephem-wheel-cache-nsyef6dy\wheels\b7\0d\f0\7ecae8427c515065d75410989e15e5785dd3975fe06e795cd9
Successfully built en-core-web-sm
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-2.3.1
[+] Download and installation successful
You can now load the model via spacy.load('en_core_web_sm')


In [47]:
import en_core_web_sm
nlp = en_core_web_sm.load()

In [48]:
from spacy import displacy

In [58]:
doc=nlp(text.split(".")[0])
displacy.serve(doc, style="dep")


Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [53]:
for ent in doc.ents:
    if ent.label_=='PERSON':
        print(ent.text, ent.start_char, ent.end_char, ent.label_)

Cristiano Ronaldo 0 17 PERSON
dos Santos 18 28 PERSON
Ronaldo 309 316 PERSON
Ronaldo 603 610 PERSON


In [57]:
text.split(".")[0]

'Cristiano Ronaldo dos Santos Aveiro GOIH ComM (born 5 February 1985) is a Portuguese professional footballer who plays as a forward for Serie A club Juventus and captains the Portugal national team'