In [60]:
import spacy
import nltk

### Tokenization

In [7]:
# Sample sentence for the tokenizer demo; the surrounding double quotes are
# part of the text itself, so they show up as tokens too.
string = "\"We're are moving to L.A.!\""
print(string)

"We're are moving to L.A.!"


In [10]:
 nlp = spacy.load("en_core_web_sm")
    
doc = nlp(string)
for token in doc:
    print(token)

"
We
're
are
moving
to
L.A.
!
"


In [15]:
doc = nlp("Tesla is big godamn company and is awesome!!!")

# One row per token: surface text, coarse part-of-speech tag, dependency label.
token_rows = [(tok.text, tok.pos_, tok.dep_) for tok in doc]
for row in token_rows:
    print(*row)

Tesla PROPN nsubj
is AUX ROOT
big ADJ amod
godamn NOUN compound
company NOUN attr
and CCONJ cc
is AUX conj
awesome ADJ acomp
! PUNCT punct
! PUNCT punct
! PUNCT punct


In [12]:
from spacy import displacy

doc = nlp("Apple is going to build a factory for $6 million.")

# Render the dependency parse inline; `distance` is passed through to
# displaCy as a rendering option.
dep_options = {"distance": 110}
displacy.render(doc, style="dep", jupyter=True, options=dep_options)

In [14]:
# Same document as the previous cell, rendered with the entity visualizer.
displacy.render(doc, jupyter=True, style="ent")

 #    

### Stemming

In [18]:
from nltk.stem.porter import PorterStemmer

p_stemmer = PorterStemmer()

# Print each word next to the stem the Porter stemmer produces for it.
words = ["run", "runner", "ran", "runs", "ease", "easily", "easy"]
for word, stem in zip(words, map(p_stemmer.stem, words)):
    print(word, " -> ", stem)

run  ->  run
runner  ->  runner
ran  ->  ran
runs  ->  run
ease  ->  eas
easily  ->  easili
easy  ->  easi


In [21]:
from nltk.stem.snowball import SnowballStemmer

s_stemmer = SnowballStemmer("english")

# Same word list as the Porter demo plus 'easyness', stemmed with Snowball.
words = ["run", "runner", "ran", "runs", "ease", "easily", "easy", "easyness"]
for word, stem in zip(words, map(s_stemmer.stem, words)):
    print(word, " -> ", stem)

run  ->  run
runner  ->  runner
ran  ->  ran
runs  ->  run
ease  ->  eas
easily  ->  easili
easy  ->  easi
easyness  ->  easy


#   

### Lemmatization

In [27]:
# Load the small English pipeline again so this section works on its own.
nlp = spacy.load("en_core_web_sm")

sentence = "I am a runner running in a race because I love to run since I first ran."
doc = nlp(sentence)

# Fixed-width columns: token text, coarse POS tag, lemma hash (token.lemma),
# and the readable lemma string (token.lemma_).
for tok in doc:
    print(f"{tok.text:12} {tok.pos_:6} {tok.lemma:<22} {tok.lemma_}")

I            PRON   561228191312463089     -PRON-
am           AUX    10382539506755952630   be
a            DET    11901859001352538922   a
runner       NOUN   12640964157389618806   runner
running      VERB   12767647472892411841   run
in           ADP    3002984154512732771    in
a            DET    11901859001352538922   a
race         NOUN   8048469955494714898    race
because      SCONJ  16950148841647037698   because
I            PRON   561228191312463089     -PRON-
love         VERB   3702023516439754181    love
to           PART   3791531372978436496    to
run          VERB   12767647472892411841   run
since        SCONJ  10066841407251338481   since
I            PRON   561228191312463089     -PRON-
first        ADV    11860158879560853892   first
ran          VERB   12767647472892411841   run
.            PUNCT  12646065887601541794   .


#   

### Stopwords

In [28]:
# Show the default stop-word set that ships with the loaded pipeline.
default_stop_words = nlp.Defaults.stop_words
print(default_stop_words)

{'anyone', 'were', 'upon', 'why', 'their', 'third', 'his', 'every', 'her', 'as', 'he', 'with', 'just', 'back', 'both', 'else', 'part', 'through', 'that', 'name', 'formerly', 'therein', 'would', 'only', 'not', 'empty', 'almost', 'if', 'hereafter', 'seeming', 'then', 'nobody', 'we', 'noone', 'none', '’s', 'thru', 'someone', 'a', 'serious', '’re', 'at', 'whenever', 'toward', 'whether', 'could', 'nothing', 'very', 'when', 'take', 'per', 'seem', 'under', 'myself', 'my', 'give', 'never', 'than', 'who', 'beyond', 'yourselves', 'can', 'be', 'six', '‘re', 'whereafter', 'mostly', 'go', 'few', 'either', 'do', 'beforehand', 'behind', 'even', 'our', 'without', "'re", 'already', 'an', 'somewhere', 'show', 'to', 'indeed', 'within', 'now', 'make', 'twenty', 'used', 'becomes', '‘m', 'whither', 'wherever', 'unless', 'became', 'nowhere', 'what', 'seemed', 'mine', 'other', 'these', 'last', 'hence', 'again', 'thence', 'me', 'side', 'will', 'less', 'off', 'your', 'you', 'this', 'there', 'becoming', 'herself

In [29]:
## Adding stopwords

# Add "btw" to the language defaults so it is listed as a stop word.
nlp.Defaults.stop_words.add("btw")

# Also set the flag on the vocabulary entry itself. Changing the defaults
# alone may not update a lexeme that already exists in the vocab (its flags
# appear to be computed when the lexeme is created -- see the spaCy Lexeme
# docs), so the explicit assignment makes the result independent of what
# text was processed earlier.
nlp.vocab["btw"].is_stop = True

# Display the flag to confirm the word is now treated as a stop word.
nlp.vocab["btw"].is_stop

True

#    

## Text Feature Extraction

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus for the bag-of-words demo.
msgs = ["Hey! lets go out somewhere.",
        "Call me ASAP now !",
        "Take the dog to a walk."]

vect = CountVectorizer()
cv = vect.fit_transform(msgs)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# its replacement get_feature_names_out() returns a NumPy array, so .tolist()
# keeps the printed vocabulary as a plain list of str. Older installs that
# lack the new method fall back to the original call.
if hasattr(vect, "get_feature_names_out"):
    feature_names = vect.get_feature_names_out().tolist()
else:
    feature_names = vect.get_feature_names()

print(feature_names)
# Dense document-term count matrix: one row per message, one column per term.
print(cv.toarray())

['asap', 'call', 'dog', 'go', 'hey', 'lets', 'me', 'now', 'out', 'somewhere', 'take', 'the', 'to', 'walk']
[[0 0 0 1 1 1 0 0 1 1 0 0 0 0]
 [1 1 0 0 0 0 1 1 0 0 0 0 0 0]
 [0 0 1 0 0 0 0 0 0 0 1 1 1 1]]


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus for the TF-IDF demo.
msgs = ["Hey! lets go out somewhere.",
        "Call me ASAP now !",
        "Take the dog to a walk."]

vect = TfidfVectorizer()
cv = vect.fit_transform(msgs)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# its replacement get_feature_names_out() returns a NumPy array, so .tolist()
# keeps the printed vocabulary as a plain list of str. Older installs that
# lack the new method fall back to the original call.
if hasattr(vect, "get_feature_names_out"):
    feature_names = vect.get_feature_names_out().tolist()
else:
    feature_names = vect.get_feature_names()

print(feature_names)
# Dense TF-IDF matrix: one row per message, one column per term.
print(cv.toarray())

['asap', 'call', 'dog', 'go', 'hey', 'lets', 'me', 'now', 'out', 'somewhere', 'take', 'the', 'to', 'walk']
[[0.        0.        0.        0.4472136 0.4472136 0.4472136 0.
  0.        0.4472136 0.4472136 0.        0.        0.        0.       ]
 [0.5       0.5       0.        0.        0.        0.        0.5
  0.5       0.        0.        0.        0.        0.        0.       ]
 [0.        0.        0.4472136 0.        0.        0.        0.
  0.        0.        0.        0.4472136 0.4472136 0.4472136 0.4472136]]


#    

## Word2Vec

In [42]:
# Switch to the large English pipeline; the cells below read word vectors
# from it.
VECTOR_MODEL = "en_core_web_lg"
nlp = spacy.load(VECTOR_MODEL)

In [9]:
# Vector the pipeline assigns to the one-word document "Lion".
lion_doc = nlp("Lion")
lion_doc.vector

array([ 1.8963e-01, -4.0309e-01,  3.5350e-01, -4.7907e-01, -4.3311e-01,
        2.3857e-01,  2.6962e-01,  6.4332e-02,  3.0767e-01,  1.3712e+00,
       -3.7582e-01, -2.2713e-01, -3.5657e-01, -2.5355e-01,  1.7543e-02,
        3.3962e-01,  7.4723e-02,  5.1226e-01, -3.9759e-01,  5.1333e-03,
       -3.0929e-01,  4.8911e-02, -1.8610e-01, -4.1702e-01, -8.1639e-01,
       -1.6908e-01, -2.6246e-01, -1.5983e-02,  1.2479e-01, -3.7276e-02,
       -5.7125e-01, -1.6296e-01,  1.2376e-01, -5.5464e-02,  1.3244e-01,
        2.7519e-02,  1.2592e-01, -3.2722e-01, -4.9165e-01, -3.5559e-01,
       -3.0630e-01,  6.1185e-02, -1.6932e-01, -6.2405e-02,  6.5763e-01,
       -2.7925e-01, -3.0450e-03, -2.2400e-02, -2.8015e-01, -2.1975e-01,
       -4.3188e-01,  3.9864e-02, -2.2102e-01, -4.2693e-02,  5.2748e-02,
        2.8726e-01,  1.2315e-01, -2.8662e-02,  7.8294e-02,  4.6754e-01,
       -2.4589e-01, -1.1064e-01,  7.2250e-02, -9.4980e-02, -2.7548e-01,
       -5.4097e-01,  1.2823e-01, -8.2408e-02,  3.1035e-01, -6.33

In [10]:
tokens = nlp("lions cat pet")

# Compare every ordered pair of tokens, self-pairs included.
token_pairs = [(left, right) for left in tokens for right in tokens]
for left, right in token_pairs:
    print(left.text, " ", right.text, " - ", left.similarity(right))

lions   lions  -  1.0
lions   cat  -  0.39475113
lions   pet  -  0.32804856
cat   lions  -  0.39475113
cat   cat  -  1.0
cat   pet  -  0.7505457
pet   lions  -  0.32804856
pet   cat  -  0.7505457
pet   pet  -  1.0


In [14]:
tokens = nlp("cat dog lion nngl")

# Per token: text, whether it has a vector, the vector's norm, and whether
# it is out of vocabulary ("nngl" is a made-up word).
for tok in tokens:
    vector_info = (tok.text, tok.has_vector, tok.vector_norm, tok.is_oov)
    print(*vector_info)

cat True 6.6808186 False
dog True 7.0336733 False
lion True 6.5120897 False
nngl False 0.0 True


In [58]:
# Vector Arithmetic
from scipy import spatial


def cosine_sim(v1, v2):
    """Return the cosine similarity of v1 and v2 (1 minus their cosine distance)."""
    return 1 - spatial.distance.cosine(v1, v2)


king = nlp.vocab['king'].vector
man = nlp.vocab['man'].vector
women = nlp.vocab['women'].vector

# king - man + women --> Queen, princess, highness
# NOTE(review): the usual form of this analogy uses the singular 'woman';
# confirm whether the plural 'women' is intended here.
new_vec = king - man + women

# Score every vocabulary entry that has a vector and is lowercase and
# alphabetic against the combined vector.
computed_sims = []
for lexeme in nlp.vocab:
    if not (lexeme.has_vector and lexeme.is_lower and lexeme.is_alpha):
        continue
    computed_sims.append((lexeme, cosine_sim(new_vec, lexeme.vector)))

In [59]:
# Rank the (lexeme, similarity) pairs from most to least similar and show
# the text of the ten best matches.
computed_sims = sorted(computed_sims, key=lambda pair: pair[1], reverse=True)
top_words = [lexeme.text for lexeme, _score in computed_sims[:10]]
print(top_words)

['king', 'women', 'queen', 'royal', 'princess', 'throne', 'these', 'those', 'are', 'all']


#    

## Sentiment Analysis using VADER

In [66]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# SentimentIntensityAnalyzer() loads the "vader_lexicon" resource from the
# NLTK data directory and raises LookupError when it has not been downloaded.
# Fetch it once if it is missing so the cell also runs on a fresh environment.
try:
    nltk.data.find("sentiment/vader_lexicon.zip")
except LookupError:
    nltk.download("vader_lexicon")

sa = SentimentIntensityAnalyzer()

# polarity_scores returns a dict with 'neg', 'neu', 'pos' and 'compound' keys.
review = "This is a good movie, and I very much loved it till the end"
sa.polarity_scores(review)

{'neg': 0.0, 'neu': 0.586, 'pos': 0.414, 'compound': 0.7951}

In [67]:
# Same review with an added emphatic sentence (capitals and exclamation
# marks), scored with the analyzer created in the previous cell.
review = (
    "This is a good movie, and I very much loved it till the end. "
    "It's pretty much AWESOME !!!!"
)
sa.polarity_scores(review)

{'neg': 0.0, 'neu': 0.444, 'pos': 0.556, 'compound': 0.9537}