# SPACY
# If you want to find a part of speech POS ,and you want to do  NER (Name Entity Recognization)

In [1]:
import spacy
import en_core_web_sm

### en_core_web_sm is a small English language model created by spaCy, a library for natural language processing in Python. It can be used to perform a variety of tasks, such as tokenization, part-of-speech tagging, named entity recognition, and more. The "sm" in the name stands for "small", indicating that it is a smaller version of the en_core_web_md, which is a medium-sized model, and en_core_web_lg, which is a large model. The smaller the model, the less computational resources it requires, but it may not be as accurate as a larger model.

In [2]:
lang = spacy.load('en_core_web_sm')

### spacy.load('en_core_web_sm') is a command in Python that loads the en_core_web_sm model from the spaCy library.

In [3]:
import pandas as pd
df = pd.read_csv('winemag-data-130k-v2.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


In [4]:
df.shape

(129971, 14)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129971 entries, 0 to 129970
Data columns (total 14 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Unnamed: 0             129971 non-null  int64  
 1   country                129908 non-null  object 
 2   description            129971 non-null  object 
 3   designation            92506 non-null   object 
 4   points                 129971 non-null  int64  
 5   price                  120975 non-null  float64
 6   province               129908 non-null  object 
 7   region_1               108724 non-null  object 
 8   region_2               50511 non-null   object 
 9   taster_name            103727 non-null  object 
 10  taster_twitter_handle  98758 non-null   object 
 11  title                  129971 non-null  object 
 12  variety                129970 non-null  object 
 13  winery                 129971 non-null  object 
dtypes: float64(1), int64(2), object(11)


In [6]:
df.description.max()

"“Wow” is the first word that comes to mind when tasting this superconcentrated and immensely fruity wine. It practically overwhelms the nose and the palate with ripe, jammy boysenberry and blackberry, sprinkled with licorice and black-pepper nuances that add attractive complexity. It's a big wine all the way, but not extreme in alcohol."

In [7]:
df.title.max()

'Štoka 2011 Izbrani Teran (Kras)'

In [8]:
doc = lang(df.description.max())

In [9]:
type(doc)

spacy.tokens.doc.Doc

In [10]:
for i in doc:
    print(i)

“
Wow
”
is
the
first
word
that
comes
to
mind
when
tasting
this
superconcentrated
and
immensely
fruity
wine
.
It
practically
overwhelms
the
nose
and
the
palate
with
ripe
,
jammy
boysenberry
and
blackberry
,
sprinkled
with
licorice
and
black
-
pepper
nuances
that
add
attractive
complexity
.
It
's
a
big
wine
all
the
way
,
but
not
extreme
in
alcohol
.


In [11]:
for i in doc:  # pos_ : part of speech
    print(i ,'==>', i.pos_) # pron = proper noun , for full form of pos aux , adp etc google it

“ ==> PUNCT
Wow ==> INTJ
” ==> PUNCT
is ==> AUX
the ==> DET
first ==> ADJ
word ==> NOUN
that ==> PRON
comes ==> VERB
to ==> ADP
mind ==> NOUN
when ==> SCONJ
tasting ==> VERB
this ==> DET
superconcentrated ==> VERB
and ==> CCONJ
immensely ==> ADV
fruity ==> NOUN
wine ==> NOUN
. ==> PUNCT
It ==> PRON
practically ==> ADV
overwhelms ==> VERB
the ==> DET
nose ==> NOUN
and ==> CCONJ
the ==> DET
palate ==> NOUN
with ==> ADP
ripe ==> ADJ
, ==> PUNCT
jammy ==> PROPN
boysenberry ==> PROPN
and ==> CCONJ
blackberry ==> NOUN
, ==> PUNCT
sprinkled ==> VERB
with ==> ADP
licorice ==> ADJ
and ==> CCONJ
black ==> ADJ
- ==> PUNCT
pepper ==> NOUN
nuances ==> NOUN
that ==> PRON
add ==> VERB
attractive ==> ADJ
complexity ==> NOUN
. ==> PUNCT
It ==> PRON
's ==> AUX
a ==> DET
big ==> ADJ
wine ==> NOUN
all ==> DET
the ==> DET
way ==> NOUN
, ==> PUNCT
but ==> CCONJ
not ==> PART
extreme ==> ADJ
in ==> ADP
alcohol ==> NOUN
. ==> PUNCT


In [12]:
doc[1].pos_  # integer

'INTJ'

In [13]:
doc[9:14] # part of words from entire document

to mind when tasting this

if you have very big document and you want to see how many sentences are there so it helps to see.

In [14]:
for i in doc.sents:
    print(i)

“Wow” is the first word that comes to mind when tasting this superconcentrated and immensely fruity wine.
It practically overwhelms the nose and the palate with ripe, jammy boysenberry and blackberry, sprinkled with licorice and black-pepper nuances that add attractive complexity.
It's a big wine all the way, but not extreme in alcohol.


In [15]:
df.description.min()

'"Chremisa," the ancient name of Krems, is commemorated in this wine that comes from Krems vineyards. It has tight, tangy apple-driven acidity, with a bright, light, citrusy character. Not for aging.'

In [16]:
doc2 = lang(df.description.min())

In [17]:
for i in doc2.sents:
    print(i)

"Chremisa," the ancient name of Krems, is commemorated in this wine that comes from Krems vineyards.
It has tight, tangy apple-driven acidity, with a bright, light, citrusy character.
Not for aging.


In [18]:
for i in doc:
    print(i.text , end = '|')  # for seperate each words and punctuations

“|Wow|”|is|the|first|word|that|comes|to|mind|when|tasting|this|superconcentrated|and|immensely|fruity|wine|.|It|practically|overwhelms|the|nose|and|the|palate|with|ripe|,|jammy|boysenberry|and|blackberry|,|sprinkled|with|licorice|and|black|-|pepper|nuances|that|add|attractive|complexity|.|It|'s|a|big|wine|all|the|way|,|but|not|extreme|in|alcohol|.|

In [19]:
for i in doc2.ents:  #ents means entity
    print(i)

Chremisa
Krems
Krems


In [20]:
for token in doc2.ents:
    print(token) 
    print(token.label_)
    print('------')

Chremisa
PERSON
------
Krems
GPE
------
Krems
GPE
------


sometimes spacy is fair enough to understand sometimes it not

In [21]:
# to highlight the entities
doc3 = lang('last year MGhector sold 15000 cars in india')
from spacy import displacy
displacy.render(doc3 , style = 'ent')

In [26]:
# stemming using nltk
# stemming means mapping to the rootwords
import nltk
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer(language = 'english')
word_list = ['swim','swimmer','swimming','swam','match','matches','run','running','humans','celebrities']
for i in word_list:
    print(i , 'its stemming is ==>' , stemmer.stem(i))

swim its stemming is ==> swim
swimmer its stemming is ==> swimmer
swimming its stemming is ==> swim
swam its stemming is ==> swam
match its stemming is ==> match
matches its stemming is ==> match
run its stemming is ==> run
running its stemming is ==> run
humans its stemming is ==> human
celebrities its stemming is ==> celebr


In [28]:
lang = spacy.load('en_core_web_sm')
print(lang.Defaults.stop_words) # if you want to remove the stopwords you can take the stopwords from this libraries 

{'beforehand', 'can', 'an', 'my', 'wherever', 'eleven', 'latter', 'with', 'everyone', 'they', 'never', 'under', 'front', 'already', 'across', 'at', 'former', 'during', 'keep', 'hereupon', 'since', 'regarding', 'done', '‘re', 'say', 'has', 'however', 'hers', "n't", 'those', 'moreover', 'doing', 'himself', 'less', 'still', 'seem', 'call', 'throughout', 'each', '‘ve', 'rather', 'take', 'above', 'be', 'who', 'used', 'nine', 'another', 'very', 'may', 'nevertheless', 'using', 'been', 'besides', 'per', 'which', 'due', 'this', 'amount', 'us', 'many', 'via', 'around', 'every', 'them', 'otherwise', 'full', 'bottom', 'go', 'these', 'afterwards', '‘s', 'sometimes', 'others', 'not', 'behind', 'neither', 'hundred', 'four', 'seems', 'your', 'hereby', 'whom', 'than', 'along', 'side', 'down', 'whence', 'after', 'while', 'become', 'one', 'within', 'over', 'it', 'when', '’s', '‘ll', 'onto', 'am', 'she', 'although', 'fifteen', 'were', 'together', 'all', 're', 'only', 'first', 'before', 'though', 'amongst'

In [29]:
# am--> m 
# there are many stopwords
len(lang.Defaults.stop_words)

326