NER 

In [1]:
import spacy

In [2]:
nlp = spacy.load("en_core_web_sm")

In [3]:
text = "Apple Inc. was founded by Steve Jobs and is headquartered in California."


In [4]:
doc = nlp(text)

In [5]:
for ent in doc.ents:
    print(ent.text, "→", ent.label_)

Apple Inc. → ORG
Steve Jobs → PERSON
California → GPE


Bag Of Words

In [6]:
from nltk.tokenize import word_tokenize
from collections import Counter

In [7]:
text = "The cat sat on the mat, The dog ate The cat's food, The cat and the dog are friends."

In [8]:
counter=Counter(word_tokenize(text))

In [9]:
counter

Counter({'The': 4,
         'cat': 3,
         'the': 2,
         ',': 2,
         'dog': 2,
         'sat': 1,
         'on': 1,
         'mat': 1,
         'ate': 1,
         "'s": 1,
         'food': 1,
         'and': 1,
         'are': 1,
         'friends': 1,
         '.': 1})

preprocessing before BOW

In [10]:
article = """When I first joined Medium, our CEO would occasionally Slack me about decisions I’d made. “I’m curious why…” he’d begin, before mentioning a story I’d curated or post I’d drafted. Honestly, this freaked me out at first. I took “I’m curious” to mean “I’m suspicious” or “I vehemently disagree but am trying to be polite.” It took me a solid year to figure out that, actually, he was literally just curious — and explaining the reasoning behind my decisions made me better at my job. In retrospect, I’m glad I had to do that.

On Medium, behavioral researcher and author Maria Keckler, Ph.D., has a similar memory of a colleague (in Keckler’s case it was the VP of Operations at her former company) who led with curiosity.

“What struck me most was what she didn’t do,” Keckler writes. “She didn’t jockey for airtime. She didn’t interrupt or grandstand or push her point. […] Influence didn’t have to look like volume or certainty. It could look like curiosity, with an edge of empathy and wisdom.”

This is a pattern I’ve noticed among the friends and coworkers I respect most: they’re highly curious, but somehow generous about it, too. They ask clarifying questions so they’re sure they understand. They ask why and usually aren’t satisfied with the first answer."""

In [11]:
tokens = word_tokenize(article)

In [12]:
lower_tokens = [t.lower() for t in tokens]


In [13]:
bow_simple = Counter(lower_tokens)


In [14]:
print(bow_simple.most_common(10))


[('’', 17), ('i', 13), (',', 13), ('.', 13), ('“', 6), ('”', 6), ('me', 5), ('a', 5), ('or', 5), ('to', 5)]


In [15]:
from nltk.corpus import stopwords


In [16]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Beshoy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
tokens = [word for word in word_tokenize(text.lower()) if word.isalpha()]

In [18]:
tokens

['the',
 'cat',
 'sat',
 'on',
 'the',
 'mat',
 'the',
 'dog',
 'ate',
 'the',
 'cat',
 'food',
 'the',
 'cat',
 'and',
 'the',
 'dog',
 'are',
 'friends']

In [19]:
no_stops = [token for token in tokens if token not in stopwords.words('english')]

In [20]:
no_stops

['cat', 'sat', 'mat', 'dog', 'ate', 'cat', 'food', 'cat', 'dog', 'friends']

In [21]:
Counter(no_stops).most_common(2)

[('cat', 3), ('dog', 2)]

TF-IDF

In [22]:
from gensim.corpora.dictionary import Dictionary


In [34]:
docs = [
"The cat sat on the mat",
" The dog ate the food of the cat",
" The cat and the dog are friends"
]

In [35]:
#Tokenize each document using word_tokenize
tokenized_docs=[word_tokenize(doc.lower()) for doc in docs if word_tokenize(doc.lower())]
tokenized_docs


[['the', 'cat', 'sat', 'on', 'the', 'mat'],
 ['the', 'dog', 'ate', 'the', 'food', 'of', 'the', 'cat'],
 ['the', 'cat', 'and', 'the', 'dog', 'are', 'friends']]

In [36]:
no_stopwords = []
for doc in tokenized_docs:
    for word in doc:
        if word in stopwords.words('english'):
            doc.remove(word)
    no_stopwords.append(doc)

#Initiate the dictionary function of GENSIM

dictionary= Dictionary(no_stopwords)

#Check token ids

dictionary.token2id

{'cat': 0,
 'mat': 1,
 'sat': 2,
 'the': 3,
 'ate': 4,
 'dog': 5,
 'food': 6,
 'friends': 7}

In [37]:
corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]

In [38]:
corpus

[[(0, 1), (1, 1), (2, 1), (3, 1)],
 [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1)],
 [(0, 1), (3, 1), (5, 1), (7, 1)]]

In [40]:
from gensim.models.tfidfmodel import TfidfModel

# Create the TFIDF model in gensim
tfidf= TfidfModel(corpus)

#Print the weights of the tokens in the first document
tfidf_weights = tfidf[corpus[1]]

# Sort the weights from highest to lowest: sorted_tfidf_weights
sorted_tfidf_weights = sorted(tfidf_weights, key=lambda w: w[1], reverse=True)

# Print the top 5 weighted words
for term_id, weight in sorted_tfidf_weights[:5]:
    print(dictionary.get(term_id), weight)

ate 0.6841916012796777
food 0.6841916012796777
dog 0.2525147628886298
