In [1]:
## NLTK #####   Bigrams ###

#!pip install nltk
from nltk import ngrams

text = "Machine learning is fun".split()
list(ngrams(text, 2))

[('Machine', 'learning'), ('learning', 'is'), ('is', 'fun')]

In [5]:
text ="Machine learning is fun"
n=3
bigrams = list(ngrams(text.split(), n))

In [6]:
bigrams

[('Machine', 'learning', 'is'), ('learning', 'is', 'fun')]

## Using Spacy to find bigrams

In [3]:
### Spacy ###

nlp = spacy.load("en_core_web_sm")
text ="This is an example sentence for creating n-grams."
n=3
tokens = [token.text for token in nlp(text)]
ngrams = [tokens[i : i + n] for i in range(len(tokens) - n + 1)]
print(ngrams)

[['This', 'is', 'an'], ['is', 'an', 'example'], ['an', 'example', 'sentence'], ['example', 'sentence', 'for'], ['sentence', 'for', 'creating'], ['for', 'creating', 'n'], ['creating', 'n', '-'], ['n', '-', 'grams'], ['-', 'grams', '.']]


## Using Scikit Learn for n grams

In [22]:
## scikit Learn ##
#!pip install scikit-learn
from sklearn.feature_extraction.text import CountVectorizer
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
vectorizer.get_feature_names_out()
print(X.toarray())
vectorizer2 = CountVectorizer(analyzer='word', ngram_range=(2, 2))
X2 = vectorizer2.fit_transform(corpus)
vectorizer2.get_feature_names_out()

[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]


array(['and this', 'document is', 'first document', 'is the', 'is this',
       'second document', 'the first', 'the second', 'the third',
       'third one', 'this document', 'this is', 'this the'], dtype=object)

## Tokenize using nltk

In [46]:
import nltk
from nltk.tokenize import word_tokenize

nltk.download("punkt")

text = "Tokenization is the first step in NLP!"
tokens = word_tokenize(text)

print(tokens)

['Tokenization', 'is', 'the', 'first', 'step', 'in', 'NLP', '!']


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ashishbansal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Sentence Tokenize 

In [47]:
from nltk.tokenize import sent_tokenize

text = "Tokenization is important. It splits text into units."
sentences = sent_tokenize(text)

print(sentences)

['Tokenization is important.', 'It splits text into units.']


## Tokenize using Spacy 

In [48]:
import spacy

nlp = spacy.load("en_core_web_sm")

text = "Tokenization is the first step in NLP!"
doc = nlp(text)

tokens = [token.text for token in doc]
print(tokens)

['Tokenization', 'is', 'the', 'first', 'step', 'in', 'NLP', '!']


## sentence token 

In [49]:
sentences = [sent.text for sent in doc.sents]
print(sentences)

['Tokenization is the first step in NLP!']



## Stop word Removal using NLTK

In [21]:
#!pip install nltk
import nltk
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ashishbansal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/ashishbansal/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [12]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

text = "This is a simple example to demonstrate stopword removal using NLTK."

# Tokenize
tokens = word_tokenize(text)

# Load English stopwords
stop_words = set(stopwords.words('english'))

# Remove stopwords
filtered_tokens = [word for word in tokens if word.lower() not in stop_words]

print(filtered_tokens)

['simple', 'example', 'demonstrate', 'stopword', 'removal', 'using', 'NLTK', '.']


## Remove Punctuations and stop words

In [13]:
import string

filtered_tokens = [
    word for word in tokens
    if word.lower() not in stop_words and word not in string.punctuation
]

print(filtered_tokens)

['simple', 'example', 'demonstrate', 'stopword', 'removal', 'using', 'NLTK']


## Stop word Removal using Spacy

In [19]:
#!pip install spacy
#!python -m spacy download en_core_web_sm

In [20]:
import spacy

nlp = spacy.load("en_core_web_sm")

text = "This is a simple example to demonstrate stop word removal using spaCy."

doc = nlp(text)

filtered_tokens = [
    token.text
    for token in doc
    if not token.is_stop
]

print(filtered_tokens)

['simple', 'example', 'demonstrate', 'stop', 'word', 'removal', 'spaCy', '.']


## stop words removal using scikit learn

In [23]:
from sklearn.feature_extraction.text import CountVectorizer

corpus = [
    "This is a simple example",
    "This example shows stop word removal",
    "Stop words are removed automatically"
]

vectorizer = CountVectorizer(stop_words="english")
X = vectorizer.fit_transform(corpus)

print(vectorizer.get_feature_names_out())

['automatically' 'example' 'removal' 'removed' 'shows' 'simple' 'stop'
 'word' 'words']


### Stop words using TFidf 

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words="english")
X = vectorizer.fit_transform(corpus)

print(vectorizer.get_feature_names_out())

['automatically' 'example' 'removal' 'removed' 'shows' 'simple' 'stop'
 'word' 'words']


## Stemming using NLTK

In [26]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import nltk

nltk.download('punkt')

ps = PorterStemmer()

text = "The students were studying and running faster"
tokens = word_tokenize(text)

stemmed_words = [ps.stem(word) for word in tokens]

print(stemmed_words)

['the', 'student', 'were', 'studi', 'and', 'run', 'faster']


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ashishbansal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Lemmatization using NLTK

In [27]:
import nltk
nltk.download("wordnet")
nltk.download("omw-1.4")
nltk.download("punkt")
nltk.download("averaged_perceptron_tagger")

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ashishbansal/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/ashishbansal/nltk_data...
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ashishbansal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/ashishbansal/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [30]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

print(lemmatizer.lemmatize("cars"))     # car
print(lemmatizer.lemmatize("running"))

car
running


## Lemitization using Spacy

In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [29]:
text = "The students were running faster and eating better food"

doc = nlp(text)

lemmatized = [token.lemma_ for token in doc]
print(lemmatized)

['the', 'student', 'be', 'run', 'fast', 'and', 'eat', 'well', 'food']


## POS tagging Using nltk

In [34]:
import nltk
nltk.download("punkt")
nltk.download("averaged_perceptron_tagger_eng")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ashishbansal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/ashishbansal/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [35]:
from nltk import word_tokenize, pos_tag

text = "Apple is looking at buying a startup"
tokens = word_tokenize(text)
pos_tags = pos_tag(tokens)

print(pos_tags)

[('Apple', 'NNP'), ('is', 'VBZ'), ('looking', 'VBG'), ('at', 'IN'), ('buying', 'VBG'), ('a', 'DT'), ('startup', 'NN')]


## Pos Tagging using spacy

In [36]:
import spacy
nlp = spacy.load("en_core_web_sm")

doc = nlp("Apple is looking at buying a startup")

for token in doc:
    print(token.text, token.pos_, token.tag_)

Apple PROPN NNP
is AUX VBZ
looking VERB VBG
at ADP IN
buying VERB VBG
a DET DT
startup NOUN NN


## NER using Nltk

In [39]:
import nltk
nltk.download("maxent_ne_chunker_tab")
nltk.download("words")

[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     /Users/ashishbansal/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker_tab.zip.
[nltk_data] Downloading package words to
[nltk_data]     /Users/ashishbansal/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [40]:
from nltk import word_tokenize, pos_tag, ne_chunk

text = "Sundar Pichai is the CEO of Google"
tokens = word_tokenize(text)
pos_tags = pos_tag(tokens)
ner_tree = ne_chunk(pos_tags)

print(ner_tree)

(S
  (PERSON Sundar/NNP)
  (PERSON Pichai/NNP)
  is/VBZ
  the/DT
  (ORGANIZATION CEO/NN of/IN Google/NNP))


## Ner using spacy

In [41]:
import spacy
nlp = spacy.load("en_core_web_sm")

doc = nlp("Sundar Pichai is the CEO of Google")

for ent in doc.ents:
    print(ent.text, ent.label_)

Sundar Pichai PERSON
Google ORG


## Spacy Library for tokenizer , parser NER 

In [4]:
#!pip install -U spacy
#!python -m spacy download en_core_web_sm
import spacy

# Load English tokenizer, tagger, parser and NER
nlp = spacy.load("en_core_web_sm")

# Process whole documents
text = ("When Sebastian Thrun started working on self-driving cars at "
        "Google in 2007, few people outside of the company took him "
        "seriously. “I can tell you very senior CEOs of major American "
        "car companies would shake my hand and turn away because I wasn’t "
        "worth talking to,” said Thrun, in an interview with Recode earlier "
        "this week.")
doc = nlp(text)

# Analyze syntax
print("Noun phrases:", [chunk.text for chunk in doc.noun_chunks])
print("Verbs:", [token.lemma_ for token in doc if token.pos_ == "VERB"])

# Find named entities, phrases and concepts
for entity in doc.ents:
    print(entity.text, entity.label_)


Noun phrases: ['Sebastian Thrun', 'self-driving cars', 'Google', 'few people', 'the company', 'him', 'I', 'you', 'very senior CEOs', 'major American car companies', 'my hand', 'I', 'Thrun', 'an interview', 'Recode']
Verbs: ['start', 'work', 'drive', 'take', 'tell', 'shake', 'turn', 'talk', 'say']
Sebastian Thrun PERSON
Google ORG
2007 DATE
American NORP
Thrun GPE
Recode ORG
earlier this week DATE


## BOW using nltk

In [42]:
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter

nltk.download("punkt")

text = "I love AI and I love Python"

tokens = word_tokenize(text.lower())
bow = Counter(tokens)

print(bow)

Counter({'i': 2, 'love': 2, 'ai': 1, 'and': 1, 'python': 1})


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ashishbansal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## BOW using spacy 

In [43]:
import spacy
from collections import Counter

nlp = spacy.load("en_core_web_sm")

text = "I love AI and I love Python"
doc = nlp(text.lower())

tokens = [
    token.lemma_
    for token in doc
    if not token.is_stop and token.is_alpha
]

bow = Counter(tokens)
print(bow)

Counter({'love': 2, 'ai': 1, 'python': 1})


## BOW using scikit-learn

In [45]:
from sklearn.feature_extraction.text import CountVectorizer

corpus = [
    "I love AI",
    "I love Python",
    "AI loves Python"
]

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

print(vectorizer.get_feature_names_out())
print(X.toarray())

['ai' 'love' 'loves' 'python']
[[1 1 0 0]
 [0 1 0 1]
 [1 0 1 1]]


In [1]:
## TFidf using sklearn

from sklearn.feature_extraction.text import TfidfVectorizer

docs = [
    "machine learning is fun",
    "machine learning is powerful",
    "deep learning is powerful"
]

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(docs)

print(vectorizer.get_feature_names_out())
print(tfidf_matrix.toarray())

['deep' 'fun' 'is' 'learning' 'machine' 'powerful']
[[0.         0.66283998 0.39148397 0.39148397 0.50410689 0.        ]
 [0.         0.         0.43370786 0.43370786 0.55847784 0.55847784]
 [0.66283998 0.         0.39148397 0.39148397 0.         0.50410689]]


In [2]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.4.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp311-cp311-macosx_11_0_arm64.whl (24.5 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.5/24.5 MB[0m [31m2.3 MB/s[0m  [33m0:00:10[0ma [36m0:00:01[0mm eta [36m0:00:01[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [3]:
## word2vec

from gensim.models import Word2Vec

sentences = [
    ["machine", "learning", "is", "fun"],
    ["deep", "learning", "is", "powerful"],
    ["ai", "is", "transforming", "technology"]
]

model = Word2Vec(
    sentences,
    vector_size=100,
    window=5,
    min_count=1,
    sg=1  # 1 = Skip-gram, 0 = CBOW
)

# Get embedding
vector = model.wv["learning"]
print(vector.shape)

# Similar words
print(model.wv.most_similar("learning"))

(100,)
[('transforming', 0.06797551363706589), ('machine', 0.03364057093858719), ('powerful', 0.009391169995069504), ('ai', 0.0045030321925878525), ('is', -0.010839181020855904), ('technology', -0.023671656847000122), ('deep', -0.11410722881555557), ('fun', -0.11555544286966324)]


In [4]:
print(vector)

[-8.6196875e-03  3.6657380e-03  5.1898835e-03  5.7419385e-03
  7.4669183e-03 -6.1676754e-03  1.1056137e-03  6.0472824e-03
 -2.8400505e-03 -6.1735227e-03 -4.1022300e-04 -8.3689485e-03
 -5.6000124e-03  7.1045388e-03  3.3525396e-03  7.2256695e-03
  6.8002474e-03  7.5307419e-03 -3.7891543e-03 -5.6180597e-04
  2.3483764e-03 -4.5190323e-03  8.3887316e-03 -9.8581640e-03
  6.7646410e-03  2.9144168e-03 -4.9328315e-03  4.3981876e-03
 -1.7395747e-03  6.7113843e-03  9.9648498e-03 -4.3624435e-03
 -5.9933780e-04 -5.6956373e-03  3.8508223e-03  2.7866268e-03
  6.8910765e-03  6.1010956e-03  9.5384968e-03  9.2734173e-03
  7.8980681e-03 -6.9895042e-03 -9.1558648e-03 -3.5575271e-04
 -3.0998408e-03  7.8943167e-03  5.9385742e-03 -1.5456629e-03
  1.5109634e-03  1.7900408e-03  7.8175711e-03 -9.5101865e-03
 -2.0553112e-04  3.4691966e-03 -9.3897223e-04  8.3817719e-03
  9.0107834e-03  6.5365066e-03 -7.1162102e-04  7.7104042e-03
 -8.5343346e-03  3.2071066e-03 -4.6379971e-03 -5.0889552e-03
  3.5896183e-03  5.37033

In [5]:
import gensim.downloader as api

glove = api.load("glove-wiki-gigaword-100")

vector = glove["king"]
print(vector.shape)

print(glove.most_similar("king"))

(100,)
[('prince', 0.7682329416275024), ('queen', 0.7507690787315369), ('son', 0.7020888328552246), ('brother', 0.6985775828361511), ('monarch', 0.6977890133857727), ('throne', 0.691999077796936), ('kingdom', 0.6811409592628479), ('father', 0.680202841758728), ('emperor', 0.6712858080863953), ('ii', 0.6676074266433716)]


In [6]:
## fasttext 
#Uses character n-grams
# Great for noisy text

from gensim.models import FastText

sentences = [
    ["running", "is", "good"],
    ["runner", "runs", "fast"]
]

model = FastText(
    sentences,
    vector_size=100,
    window=3,
    min_count=1
)

print(model.wv["running"].shape)

# Works even for unseen words
print(model.wv["runing"])  # misspelling


(100,)
[ 3.2592006e-04  3.0878314e-04 -2.3986835e-03  1.4634965e-03
  1.9383548e-03  2.5352230e-03 -1.5754411e-03  7.8739109e-04
  2.8420839e-04  1.7295904e-03 -2.1287976e-03  2.5222203e-04
 -1.1413345e-03  1.1649579e-03  1.2338966e-03 -2.1058293e-04
 -4.8548801e-04 -1.0985128e-03  1.0116753e-04 -3.0819492e-03
 -8.2721905e-04 -1.4776889e-03 -8.4920652e-04  2.1289012e-03
 -1.8061429e-03 -3.0560524e-03 -2.4155995e-03  5.9602031e-04
 -2.3576571e-04 -1.5703663e-03 -4.0375376e-03  2.2614432e-04
 -7.3173677e-04  1.3818332e-03  1.3538908e-04  2.1072933e-03
  1.5878961e-03  1.3267922e-03  3.2056745e-05 -7.9966593e-04
 -5.7083782e-04  1.8891835e-03  1.6194319e-03  6.7433692e-04
 -1.3552830e-03 -2.1046721e-03 -5.0387275e-04  1.2752799e-03
 -4.3698688e-05  1.4291970e-03  1.1810130e-03 -7.3555135e-04
  3.4552196e-04  1.1330862e-03  1.1542472e-03 -6.2534423e-04
  2.7376527e-04 -9.0139161e-04  1.8991953e-03 -1.8430776e-03
 -1.7570338e-03 -2.5307743e-03  9.1046840e-04 -2.8497418e-03
 -1.9792165e-03  