In [8]:
import spacy
import pandas as pd
import numpy as np

In [9]:
nlp = spacy.load('en')

In [10]:
sent = 'I am Rahul Ahuja. I am working as a Senior Data Scientist in ValueFirst Digital Media. The company has its headquarters in Gurugram, whereas other offices are in Mumbai, Hyderabad and Bangalore. My hobbies are playing pool and table tennis. I am not interested in coding in languages other than python.'

In [11]:
print(sent)

I am Rahul Ahuja. I am working as a Senior Data Scientist in ValueFirst Digital Media. The company has its headquarters in Gurugram, whereas other offices are in Mumbai, Hyderabad and Bangalore. My hobbies are playing pool and table tennis. I am not interested in coding in languages other than python.


In [12]:
from spacy.tokenizer import Tokenizer
tokenizer = Tokenizer(nlp.vocab)

In [13]:
tokens = tokenizer(sent)

In [14]:
for t in tokens:
    print(t)

I
am
Rahul
Ahuja.
I
am
working
as
a
Senior
Data
Scientist
in
ValueFirst
Digital
Media.
The
company
has
its
headquarters
in
Gurugram,
whereas
other
offices
are
in
Mumbai,
Hyderabad
and
Bangalore.
My
hobbies
are
playing
pool
and
table
tennis.
I
am
not
interested
in
coding
in
languages
other
than
python.


In [15]:
doc = nlp(sent)

In [16]:
for tokens in doc:
    print(tokens.text)

I
am
Rahul
Ahuja
.
I
am
working
as
a
Senior
Data
Scientist
in
ValueFirst
Digital
Media
.
The
company
has
its
headquarters
in
Gurugram
,
whereas
other
offices
are
in
Mumbai
,
Hyderabad
and
Bangalore
.
My
hobbies
are
playing
pool
and
table
tennis
.
I
am
not
interested
in
coding
in
languages
other
than
python
.


In [17]:
df = pd.DataFrame()
txt = []
pos = []
for tokens in doc:
    txt.append(tokens.text)
    pos.append(tokens.pos_)

df['token_text'] = txt
df['token_pos'] = pos
df

Unnamed: 0,token_text,token_pos
0,I,PRON
1,am,VERB
2,Rahul,PROPN
3,Ahuja,PROPN
4,.,PUNCT
5,I,PRON
6,am,VERB
7,working,VERB
8,as,ADP
9,a,DET


In [18]:
tag = []
for tokens in doc:
    tag.append(tokens.tag_)
    
df['token_tag'] = tag
df

Unnamed: 0,token_text,token_pos,token_tag
0,I,PRON,PRP
1,am,VERB,VBP
2,Rahul,PROPN,NNP
3,Ahuja,PROPN,NNP
4,.,PUNCT,.
5,I,PRON,PRP
6,am,VERB,VBP
7,working,VERB,VBG
8,as,ADP,IN
9,a,DET,DT


In [19]:
lem = []
for tokens in doc:
    lem.append(tokens.lemma_)
    
df['token_lemma'] = lem
df

Unnamed: 0,token_text,token_pos,token_tag,token_lemma
0,I,PRON,PRP,-PRON-
1,am,VERB,VBP,be
2,Rahul,PROPN,NNP,rahul
3,Ahuja,PROPN,NNP,ahuja
4,.,PUNCT,.,.
5,I,PRON,PRP,-PRON-
6,am,VERB,VBP,be
7,working,VERB,VBG,work
8,as,ADP,IN,as
9,a,DET,DT,a


In [20]:
ner = []
for tokens in doc:
    ner.append(tokens.ent_type_)
    
df['token_ner'] = ner
df

Unnamed: 0,token_text,token_pos,token_tag,token_lemma,token_ner
0,I,PRON,PRP,-PRON-,
1,am,VERB,VBP,be,
2,Rahul,PROPN,NNP,rahul,PERSON
3,Ahuja,PROPN,NNP,ahuja,PERSON
4,.,PUNCT,.,.,
5,I,PRON,PRP,-PRON-,
6,am,VERB,VBP,be,
7,working,VERB,VBG,work,
8,as,ADP,IN,as,
9,a,DET,DT,a,


In [None]:
import nltk

In [21]:
S = 'I stay in Rahul Ahuja.'

In [27]:
from nltk.stem.porter import *  

In [23]:
a = nlp(S)

In [26]:

for tokens in a:
    print(tokens.text)
    print(tokens.ent_type_)


I

stay

in

Rahul
GPE
Ahuja
GPE
.



In [28]:
stemmer = PorterStemmer()  
stemmer.stem('offices')

'offic'

In [29]:
stemm = []
for tokens in doc:
    stemm.append(stemmer.stem(tokens.text))

df['token_porter_stem'] = stemm
df

Unnamed: 0,token_text,token_pos,token_tag,token_lemma,token_ner,token_porter_stem
0,I,PRON,PRP,-PRON-,,I
1,am,VERB,VBP,be,,am
2,Rahul,PROPN,NNP,rahul,PERSON,rahul
3,Ahuja,PROPN,NNP,ahuja,PERSON,ahuja
4,.,PUNCT,.,.,,.
5,I,PRON,PRP,-PRON-,,I
6,am,VERB,VBP,be,,am
7,working,VERB,VBG,work,,work
8,as,ADP,IN,as,,as
9,a,DET,DT,a,,a


In [30]:
import sklearn

In [31]:
from sklearn.feature_extraction.text import CountVectorizer

In [32]:
documents = ['I am Rahul Ahuja.','Rahul works in ValueFirst Digital Media',
             'He loves playing pool and he also plays table tennis', 'Rahul is a Data Scientist.',
             'He is working in Data Science field from past 4 years.',
             'You can also learn Data Science.'
            ]

In [33]:
documents

['I am Rahul Ahuja.',
 'Rahul works in ValueFirst Digital Media',
 'He loves playing pool and he also plays table tennis',
 'Rahul is a Data Scientist.',
 'He is working in Data Science field from past 4 years.',
 'You can also learn Data Science.']

In [34]:
cv = CountVectorizer()

In [35]:
cvectors = cv.fit_transform(documents).toarray()

In [36]:
cvectors.shape

(6, 29)

In [37]:
cnparray = np.asarray(cvectors)

In [38]:
cv.get_feature_names()

['ahuja',
 'also',
 'am',
 'and',
 'can',
 'data',
 'digital',
 'field',
 'from',
 'he',
 'in',
 'is',
 'learn',
 'loves',
 'media',
 'past',
 'playing',
 'plays',
 'pool',
 'rahul',
 'science',
 'scientist',
 'table',
 'tennis',
 'valuefirst',
 'working',
 'works',
 'years',
 'you']

In [39]:
len(cv.get_feature_names())

29

In [40]:
df = pd.DataFrame(cvectors)

In [41]:
df = df.T

In [42]:
df.index = cv.get_feature_names()

In [43]:
df

Unnamed: 0,0,1,2,3,4,5
ahuja,1,0,0,0,0,0
also,0,0,1,0,0,1
am,1,0,0,0,0,0
and,0,0,1,0,0,0
can,0,0,0,0,0,1
data,0,0,0,1,1,1
digital,0,1,0,0,0,0
field,0,0,0,0,1,0
from,0,0,0,0,1,0
he,0,0,2,0,1,0


In [44]:
df.T

Unnamed: 0,ahuja,also,am,and,can,data,digital,field,from,he,...,rahul,science,scientist,table,tennis,valuefirst,working,works,years,you
0,1,0,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,1,0,1,0,0
2,0,1,0,1,0,0,0,0,0,2,...,0,0,0,1,1,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,1,1,1,...,0,1,0,0,0,0,1,0,1,0
5,0,1,0,0,1,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1


In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [46]:
tfidf = TfidfVectorizer()

In [47]:
corpus = ['This is the first document.',    
          'This document is the second document.',    
          'And this is the third one.',     
          'Is this the first document?', ]

In [48]:
corpus

['This is the first document.',
 'This document is the second document.',
 'And this is the third one.',
 'Is this the first document?']

In [49]:
tfvectors = tfidf.fit_transform(corpus).toarray()

In [50]:
tfvectors.shape

(4, 9)

In [51]:
tfidfdf = pd.DataFrame(tfvectors)

In [52]:
tfidfdf = tfidfdf.T

In [53]:
tfidfdf.index = tfidf.get_feature_names()

In [54]:
tfidfdf

Unnamed: 0,0,1,2,3
and,0.0,0.0,0.511849,0.0
document,0.469791,0.687624,0.0,0.469791
first,0.580286,0.0,0.0,0.580286
is,0.384085,0.281089,0.267104,0.384085
one,0.0,0.0,0.511849,0.0
second,0.0,0.538648,0.0,0.0
the,0.384085,0.281089,0.267104,0.384085
third,0.0,0.0,0.511849,0.0
this,0.384085,0.281089,0.267104,0.384085


In [1]:
from nltk.corpus import brown
from gensim.models import Word2Vec

In [2]:
print(brown.sents())
w2v_model = Word2Vec(brown.sents(), size=128, window=5, min_count=3, workers=4)

[['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.'], ['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.'], ...]


In [3]:
len(brown.sents())

57340

In [4]:
print(w2v_model.wv['Italy'], w2v_model.wv['France'])

[-0.22169581 -0.13910195 -0.30786458  0.2097693   0.11549138 -0.07737444
 -0.09679544  0.10089014 -0.3876957  -0.15476252 -0.20673902 -0.08739273
  0.19251207  0.18009783  0.10266093  0.23435399 -0.12998484 -0.02721743
  0.07660668 -0.09696666 -0.1028555   0.16014786 -0.04605732  0.05246686
 -0.01552193 -0.10836092  0.08881146 -0.03302382 -0.10660212 -0.161224
  0.21794264  0.13222437  0.03650586 -0.1987432   0.03038349  0.11835337
 -0.01039886  0.02604607 -0.1551416  -0.19799213  0.15901403 -0.19566032
 -0.08114427 -0.29287052  0.00803889 -0.26929802  0.25528246  0.04267662
 -0.00091361 -0.20190343  0.15812123 -0.05820785  0.20390472 -0.07471545
 -0.05569005 -0.13633616  0.05745646 -0.17325091 -0.1171652   0.00708337
  0.16261218 -0.18781906 -0.09005316 -0.24241453 -0.14213951  0.16653916
 -0.23244518 -0.05819499  0.04146701  0.03489877 -0.16629408 -0.34031057
  0.22802746  0.01715963  0.06912307  0.07813231  0.07001895 -0.08398199
 -0.24305904 -0.11197342  0.16666216  0.15414856  0.1

In [5]:
print(w2v_model.wv.most_similar('Paris'))

[('Italy', 0.973966121673584), ('France', 0.9638159275054932), ('Rome', 0.9626742005348206), ('headquarters', 0.9624108672142029), ('Eugene', 0.9608141779899597), ('London', 0.9604490995407104), ('breakfast', 0.9594292044639587), ('dancing', 0.9587491154670715), ('Chicago', 0.9576694369316101), ('Harvard', 0.9569618701934814)]


  if np.issubdtype(vec.dtype, np.int):


In [6]:
print(w2v_model.wv.most_similar(positive=['woman', 'king'], negative=['man']))

[('extracting', 0.9597046971321106), ('united', 0.9547815322875977), ('duties', 0.9540290832519531), ('savage', 0.9529659152030945), ('agility', 0.9447265863418579), ('stem', 0.942992091178894), ('doings', 0.942340612411499), ('abuse', 0.9415711164474487), ('measuring', 0.941333532333374), ('successors', 0.9410267472267151)]


  if np.issubdtype(vec.dtype, np.int):


In [7]:
print(w2v_model.wv.most_similar(positive=["Rome", "France"], negative=["Italy"]))

[('earth', 0.9387699365615845), ('reputed', 0.9373488426208496), ('beach', 0.9368444085121155), ('Nation', 0.933750569820404), ('Church', 0.9312849044799805), ('sink', 0.9308393597602844), ('plantation', 0.929715633392334), ('repeated', 0.9296947121620178), ('transformed', 0.9296269416809082), ('grounds', 0.9286481142044067)]


  if np.issubdtype(vec.dtype, np.int):


In [8]:

from gensim.models.word2vec import Text8Corpus
w2v_model2 = Word2Vec(Text8Corpus('/home/rahul/Sessions/text8'), size=100, window=5, min_count=150, workers=4)

In [19]:
print(w2v_model2.wv.most_similar(positive=['bangladesh','dhaka'], negative=['pakistan']))

KeyError: "word 'dhaka' not in vocabulary"