<a href="https://colab.research.google.com/github/Fateme-Rahimi/Special-Topics/blob/main/Chapter_Six_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 6.6 Stemming Words

In [1]:
# Load library
from nltk.stem.porter import PorterStemmer
# Create word tokens
tokenized_words = ['i', 'am', 'humbled', 'by', 'this', 'traditional', 'meeting']
# Create stemmer
porter = PorterStemmer()
# Apply stemmer
[porter.stem(word) for word in tokenized_words]

['i', 'am', 'humbl', 'by', 'thi', 'tradit', 'meet']

# 6.7 Tagging Parts of Speech

In [7]:
!pip install nltk
import nltk
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')



[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [8]:
# Load libraries
from nltk import pos_tag
from nltk import word_tokenize
# Create text
text_data = "Chris loved outdoor running"
# Use pretrained part of speech tagger
text_tagged = pos_tag(word_tokenize(text_data))
# Show parts of speech
text_tagged

[('Chris', 'NNP'), ('loved', 'VBD'), ('outdoor', 'RP'), ('running', 'VBG')]

In [9]:
# Filter words
[word for word, tag in text_tagged if tag in ['NN','NNS','NNP','NNPS'] ]

['Chris']

In [52]:
# Load libraries
from nltk import pos_tag
from nltk import word_tokenize
# Create text
text_data = "Chris and Emmy loved outdoor running. They read book and they love cars. They are Americans."
# Use pretrained part of speech tagger
text_tagged = pos_tag(word_tokenize(text_data))
# Show parts of speech
text_tagged

[('Chris', 'NNP'),
 ('and', 'CC'),
 ('Emmy', 'NNP'),
 ('loved', 'VBD'),
 ('outdoor', 'RP'),
 ('running', 'VBG'),
 ('.', '.'),
 ('They', 'PRP'),
 ('read', 'VBD'),
 ('book', 'NN'),
 ('and', 'CC'),
 ('they', 'PRP'),
 ('love', 'VBP'),
 ('cars', 'NNS'),
 ('.', '.'),
 ('They', 'PRP'),
 ('are', 'VBP'),
 ('Americans', 'NNPS'),
 ('.', '.')]

In [53]:
# Filter words
[word for word, tag in text_tagged if tag in ['NN','NNS','NNP','NNPS'] ]

['Chris', 'Emmy', 'book', 'cars', 'Americans']

In [54]:
[tag for word, tag in text_tagged if tag in ['NN','NNS','NNP','NNPS'] ]

['NNP', 'NNP', 'NN', 'NNS', 'NNPS']

In [55]:
# Import libraries
from sklearn.preprocessing import MultiLabelBinarizer
# Create text
tweets = ["I am eating a burrito for breakfast",
          "Political science is an amazing field",
          "San Francisco is an awesome city"]
# Create list
tagged_tweets = []
# Tag each word and each tweet
for tweet in tweets:
  tweet_tag = nltk.pos_tag(word_tokenize(tweet))
  tagged_tweets.append([tag for word, tag in tweet_tag])

In [57]:
tagged_tweets

[['PRP', 'VBP', 'VBG', 'DT', 'NN', 'IN', 'NN'],
 ['JJ', 'NN', 'VBZ', 'DT', 'JJ', 'NN'],
 ['NNP', 'NNP', 'VBZ', 'DT', 'JJ', 'NN']]

In [17]:
# Use one-hot encoding to convert the tags into features
one_hot_multi = MultiLabelBinarizer()
one_hot_multi.fit_transform(tagged_tweets)

array([[1, 1, 0, 1, 0, 1, 1, 1, 0],
       [1, 0, 1, 1, 0, 0, 0, 0, 1],
       [1, 0, 1, 1, 1, 0, 0, 0, 1]])

In [18]:
print(one_hot_multi.classes_)

['DT' 'IN' 'JJ' 'NN' 'NNP' 'PRP' 'VBG' 'VBP' 'VBZ']


# 6.8 Performing Named-Entity Recognition

In [23]:
# Import libraries
import spacy
# Load the spaCy package and use it to parse the text
# make sure you have run "python -m spacy download en"
nlp = spacy.load("en_core_web_sm")
doc = nlp("Elon Musk offered to buy Twitter using $21B of his own money.")
# Print each entity
print(doc.ents)
# For each entity print the text and the entity label
for entity in doc.ents:
  print(entity.text, entity.label_, sep=",")

(Elon Musk, Twitter, 21B)
Elon Musk,PERSON
Twitter,PERSON
21B,MONEY


In [60]:
# Import libraries
import spacy
# Load the spaCy package and use it to parse the text
# make sure you have run "python -m spacy download en"
nlp = spacy.load("en_core_web_sm")
doc = nlp("Elon Musk offered to buy twitter in 2021 using $21B of his own money. White House. Bank. Trump. Iran. Microsoft")
# Print each entity
print(doc.ents)
# For each entity print the text and the entity label
for entity in doc.ents:
  print(entity.text, entity.label_, sep=",")

(Elon Musk, 2021, 21B, White House, Trump, Iran, Microsoft)
Elon Musk,PERSON
2021,DATE
21B,MONEY
White House,ORG
Trump,PERSON
Iran,GPE
Microsoft,ORG


# 6.9 Encoding Text as a Bag of Words

In [25]:
# Load library
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
# Create text
text_data = np.array(['I love Brazil. Brazil!',
                      'Sweden is best',
                      'Germany beats both'])
# Create the bag of words feature matrix
count = CountVectorizer()
bag_of_words = count.fit_transform(text_data)
# Show feature matrix
bag_of_words

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 8 stored elements and shape (3, 8)>

In [28]:
# Show feature names
count.get_feature_names_out()

array(['beats', 'best', 'both', 'brazil', 'germany', 'is', 'love',
       'sweden'], dtype=object)

In [29]:
bag_of_words.toarray()

array([[0, 0, 0, 2, 0, 0, 1, 0],
       [0, 1, 0, 0, 0, 1, 0, 1],
       [1, 0, 1, 0, 1, 0, 0, 0]])

In [33]:
# Create feature matrix with arguments
count_2gram = CountVectorizer(ngram_range=(1,2),
                              stop_words="english",
                              vocabulary=['brazil'])
bag = count_2gram.fit_transform(text_data)
# View feature matrix
bag.toarray()

array([[2],
       [0],
       [0]])

In [32]:
# View the 1-grams and 2-grams
count_2gram.vocabulary_

{'brazil': 0}

# 6.10 Weighting Word Importance

In [34]:
# Load libraries
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
# Create text
text_data = np.array(['I love Brazil. Brazil!',
                      'Sweden is best',
                      'Germany beats both'])
# Create the tf-idf feature matrix
tfidf = TfidfVectorizer()
feature_matrix = tfidf.fit_transform(text_data)
# Show tf-idf feature matrix
feature_matrix

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 8 stored elements and shape (3, 8)>

In [35]:
# Show tf-idf feature matrix as dense matrix
feature_matrix.toarray()

array([[0.        , 0.        , 0.        , 0.89442719, 0.        ,
        0.        , 0.4472136 , 0.        ],
       [0.        , 0.57735027, 0.        , 0.        , 0.        ,
        0.57735027, 0.        , 0.57735027],
       [0.57735027, 0.        , 0.57735027, 0.        , 0.57735027,
        0.        , 0.        , 0.        ]])

In [62]:
print(tfidf.get_feature_names_out())

['beats' 'best' 'both' 'brazil' 'germany' 'is' 'love' 'sweden']


In [36]:
# Show feature names
tfidf.vocabulary_

{'love': 6,
 'brazil': 3,
 'sweden': 7,
 'is': 5,
 'best': 1,
 'germany': 4,
 'beats': 0,
 'both': 2}

# 6.11 Using Text Vectors to Calculate Text Similarity in Search Query

In [40]:
# Load libraries
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
# Create searchable text data
text_data = np.array(['I love Brazil. Brazil!',
                      'Sweden is best',
                      'Germany beats both'])
# Create the tf-idf feature matrix
tfidf = TfidfVectorizer()
feature_matrix = tfidf.fit_transform(text_data)
# Create a search query and transform it into a tf-idf vector
text = "Brazil is the best"
vector = tfidf.transform([text])
# Calculate the cosine similarities between the input vector and all other vectors
cosine_similarities = linear_kernel(vector, feature_matrix).flatten()
# Get the index of the most relevent items in order
related_doc_indicies = cosine_similarities.argsort()[:-10:-1]
# Print the most similar texts to the search query along with the cosine similarity
print([(text_data[i], cosine_similarities[i]) for i in related_doc_indicies])

[(np.str_('Sweden is best'), np.float64(0.6666666666666666)), (np.str_('I love Brazil. Brazil!'), np.float64(0.5163977794943222)), (np.str_('Germany beats both'), np.float64(0.0))]


# 6.12 Using a Sentiment Analysis Classifier

In [41]:
# Import libraries
from transformers import pipeline
# Create an NLP pipeline that runs sentiment analysis
classifier = pipeline("sentiment-analysis")
# Classify some text
# (this may download some data and models the first time you run it)
sentiment_1 = classifier("I hate machine learning! It's the absolute worst.")
sentiment_2 = classifier(
    "Machine learning is the absolute"
    "bees knees I love it so much!"
)
# Print sentiment output
print(sentiment_1, sentiment_2)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cpu


[{'label': 'NEGATIVE', 'score': 0.9998020529747009}] [{'label': 'POSITIVE', 'score': 0.9995730519294739}]
