<a href="https://colab.research.google.com/github/AlirezaAhadipour/NLP_Pre-processing/blob/main/NLTK.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Sample paragraph used as input by the examples below.
text = (
    'Hi. My name is Alireza. '
    'I am a passionate Data Scientist. '
    'Welcome to my Github page.'
)

# Tokenization

In [None]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# Tokenizer models needed by sent_tokenize / word_tokenize.
# Older NLTK releases load 'punkt'; NLTK 3.9+ looks up 'punkt_tab' instead,
# so fetch both to keep the notebook runnable on a fresh kernel.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Split the text into sentences. Stored under its own name so that `tokens`
# always refers to the word-level tokens used by the cells further down,
# even if cells are re-run out of order.
sentences = sent_tokenize(text)
print(sentences)

['Hi.', 'My name is Alireza.', 'I am a passionate Data Scientist.', 'Welcome to my Github page.']


In [None]:
# Word-level tokenization: punctuation marks come out as separate tokens.
tokens = nltk.word_tokenize(text)
print(tokens)

['Hi', '.', 'My', 'name', 'is', 'Alireza', '.', 'I', 'am', 'a', 'passionate', 'Data', 'Scientist', '.', 'Welcome', 'to', 'my', 'Github', 'page', '.']


# Parsing (N-grams)

In [None]:
# Usage: nltk.ngrams(tokens, n) -- yields each run of n consecutive tokens as a tuple

In [None]:
# nltk.ngrams returns a one-shot iterator; materialize it so `unigram`
# is still populated after printing instead of being left exhausted.
unigram = list(nltk.ngrams(tokens, 1))
print(unigram)

[('Hi',), ('.',), ('My',), ('name',), ('is',), ('Alireza',), ('.',), ('I',), ('am',), ('a',), ('passionate',), ('Data',), ('Scientist',), ('.',), ('Welcome',), ('to',), ('my',), ('Github',), ('page',), ('.',)]


In [None]:
# nltk.bigrams returns a one-shot iterator; materialize it so `bigram`
# is still populated after printing instead of being left exhausted.
bigram = list(nltk.bigrams(tokens))
print(bigram)

[('Hi', '.'), ('.', 'My'), ('My', 'name'), ('name', 'is'), ('is', 'Alireza'), ('Alireza', '.'), ('.', 'I'), ('I', 'am'), ('am', 'a'), ('a', 'passionate'), ('passionate', 'Data'), ('Data', 'Scientist'), ('Scientist', '.'), ('.', 'Welcome'), ('Welcome', 'to'), ('to', 'my'), ('my', 'Github'), ('Github', 'page'), ('page', '.')]


In [None]:
# nltk.trigrams returns a one-shot iterator; materialize it so `trigram`
# is still populated after printing instead of being left exhausted.
trigram = list(nltk.trigrams(tokens))
print(trigram)

[('Hi', '.', 'My'), ('.', 'My', 'name'), ('My', 'name', 'is'), ('name', 'is', 'Alireza'), ('is', 'Alireza', '.'), ('Alireza', '.', 'I'), ('.', 'I', 'am'), ('I', 'am', 'a'), ('am', 'a', 'passionate'), ('a', 'passionate', 'Data'), ('passionate', 'Data', 'Scientist'), ('Data', 'Scientist', '.'), ('Scientist', '.', 'Welcome'), ('.', 'Welcome', 'to'), ('Welcome', 'to', 'my'), ('to', 'my', 'Github'), ('my', 'Github', 'page'), ('Github', 'page', '.')]


# Stemming

In [None]:
from nltk.stem import PorterStemmer

In [None]:
# Porter stemming strips suffixes by rule, so results need not be real words.
# All three calls go through the single `stm` instance created here
# (the previous `pst` name was never defined and raised NameError).
stm = PorterStemmer()
stm.stem('running'), stm.stem('corpora'), stm.stem('studies')

('run', 'corpora', 'studi')

# Lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer

# Fetch the WordNet corpus that WordNetLemmatizer looks words up in.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Lemmatize the same three words that were stemmed above for comparison.
# No `pos` argument is passed, so the lemmatizer's default part of speech applies.
lemma = WordNetLemmatizer()
tuple(lemma.lemmatize(sample) for sample in ('running', 'corpora', 'studies'))

('running', 'corpus', 'study')

# Part-of-Speech (PoS) Tagging

In [None]:
from nltk.corpus import stopwords

# Stop-word lists plus the perceptron tagger model used by nltk.pos_tag.
# Older NLTK releases load 'averaged_perceptron_tagger'; NLTK 3.9+ looks up
# 'averaged_perceptron_tagger_eng', so fetch both to survive a fresh-kernel run.
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
# Collect NLTK's English stop words into a set for fast membership tests.
stop_words = {*stopwords.words('english')}
print(stop_words)

{"shan't", "you're", 'whom', 'her', 'ours', "she's", 'aren', 'out', 'our', 'from', 'its', 'on', "mightn't", "hasn't", "that'll", 'in', 'each', 'with', 'y', 'no', 'needn', 'not', 'at', 'then', 'own', 'myself', 'mustn', 'hers', 'have', 'hadn', 'll', 'yours', 'before', 'there', 'didn', 'off', 'too', 'can', 'wasn', 'to', 'same', 'an', 'herself', 'we', 'are', 'a', "shouldn't", "you've", 'm', 'me', 'once', 'd', "wasn't", 'has', 'after', 'himself', 'when', 'nor', 'just', 'i', 'if', 'for', 'you', 'the', 'and', "you'll", 'into', 'what', 'be', 'as', 'only', 's', 'more', 're', 'their', 'ain', 'weren', 'here', 'does', 'ourselves', 'having', 'all', 'it', 'will', 'below', 'over', "couldn't", 'he', "isn't", 'by', 'o', 'some', 'or', 'his', 'any', 'during', 'such', 'hasn', 'very', "haven't", 'under', 'how', "doesn't", 'itself', 'shouldn', 'where', "don't", 'should', "aren't", 'this', "weren't", "mustn't", "you'd", 'these', 've', 'won', 'is', 'had', 'while', "won't", 'both', 'couldn', 'that', 'until', '

In [None]:
# Remove stop words. The stop-word entries are lowercase, so compare the
# lowercased token; otherwise capitalized stop words such as 'My' and 'I'
# slip through the filter. The kept tokens retain their original casing.
wordslist = [word for word in tokens if word.lower() not in stop_words]
print(wordslist)

['Hi', '.', 'My', 'name', 'Alireza', '.', 'I', 'passionate', 'Data', 'Scientist', '.', 'Welcome', 'Github', 'page', '.']


In [None]:
# Tag the complete token sequence so the tagger sees each word in its
# original context, then keep only the words that survived stop-word removal.
# Tagging the already-filtered list loses that context (e.g. 'passionate'
# was tagged as a verb once the words around it had been dropped).
kept_words = set(wordslist)
tagged_words = [(word, tag) for word, tag in nltk.pos_tag(tokens) if word in kept_words]
print(tagged_words)

[('Hi', 'NNP'), ('.', '.'), ('My', 'PRP$'), ('name', 'NN'), ('Alireza', 'NNP'), ('.', '.'), ('I', 'PRP'), ('passionate', 'VBP'), ('Data', 'NNS'), ('Scientist', 'NNP'), ('.', '.'), ('Welcome', 'NNP'), ('Github', 'NNP'), ('page', 'NN'), ('.', '.')]


# Named Entity Recognition (NER)

In [None]:
import spacy

In [None]:
# Load the 'en_core_web_sm' spaCy pipeline; the model package must already be installed.
nlp = spacy.load('en_core_web_sm')

In [None]:
# Run the spaCy pipeline over the text and list every detected entity as:
# entity text, start character offset, end character offset, entity label.
document = nlp(text)
for ent in document.ents:
    fields = (ent.text, ent.start_char, ent.end_char, ent.label_)
    print(*fields)

Alireza 15 22 PERSON
Data Scientist 42 56 ORG
Github 72 78 GPE
