[Reference](https://medium.com/towards-artificial-intelligence/nlp-zero-to-hero-with-python-2df6fcebff6e)

# Section 1: NLP Introduction and Installation Guide for spaCy and NLTK

In [1]:
!python -m spacy download en

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [2]:
# Smoke test: confirm the kernel executes code and shows output
print("Amit")

Amit


In [4]:
#keep the sample text in a variable so it can be indexed
string = "GURUGRAM"

#index 0 is the first character
print(string[0])

#several characters can be printed in one call (indices 2 and 5)
print(string[2], string[5])

#negative indices count from the end: -4 is the fourth character from the right
print(string[-4])

G
R R
G


In [5]:
# first two characters (an omitted start index means "from the beginning")
print(string[:2])

GU


In [6]:
# slice from index 1 up to, but not including, index 4
print(string[1:4])

URU


# Section 2: Basic String Operations and Regular Expressions

In [7]:
#the character to trim, and a sample sentence padded with it on both sides
removing_character = "*"
sentence = "****Hello World! I am Amit Chauhan****"

#strip() trims the given character from both ends of the text only
sentence.strip(removing_character)

'Hello World! I am Amit Chauhan'

In [8]:
# two words to glue together
str1, str2 = "Happy", "Home"
# the string that join() is called on is placed between the items
" Good ".join((str1, str2))

'Happy Good Home'

In [9]:
# to use a regular expression, we need to import re
import re

In [10]:
# Text that the regular-expression examples search through
sentence = "My computer gives a very good performance in a very short time."
# Pattern to look for. Note: this rebinds `string`, which the earlier indexing
# cells set to 'GURUGRAM' - re-running those slicing cells after this one
# would operate on "very" instead.
string = "very"

In [11]:
# search() returns a match object for the first occurrence of the pattern;
# keyword arguments make clear that the variable `string` holds the pattern
str_match = re.search(pattern=string, string=sentence)
str_match

<_sre.SRE_Match object; span=(20, 24), match='very'>

In [12]:
# (start, end) character offsets of the match; the end offset is exclusive here
str_match.span()

(20, 24)

In [13]:
# findall() collects every non-overlapping occurrence as a list of strings
find_all = re.findall(pattern="very", string=sentence)
find_all

['very', 'very']

In [14]:
# finditer() yields one match object per occurrence; show where each one sits
for occurrence in re.finditer("very", sentence):
    print(occurrence.span())

(20, 24)
(47, 51)


# Section 3: Tokenization and Stemming

In [15]:
#import library
import spacy

#Load spaCy's small English pipeline
load_en = spacy.load('en_core_web_sm')

#Example text. The backslash is written as "\\" so that Python stores a single
#literal backslash: a bare "\ " is an invalid escape sequence, which raises a
#DeprecationWarning since Python 3.6 and a SyntaxWarning since 3.12.
#The resulting string value is unchanged, so the tokens are the same.
example_string = "I'm going to meet\\ M.S. Dhoni."

#Run the text through the pipeline; the result can be iterated token by token
words = load_en(example_string)

#Print the text of each token on its own line
for token in words:
    print(token.text)

I
'm
going
to
meet\
M.S.
Dhoni
.


In [16]:
#parse a short sentence (a plain literal: the u"" prefix is redundant on Python 3)
str1 = load_en("This laptop belongs to Amit Chauhan")

#a single token by position; a cell only renders its final expression,
#so this value is evaluated but not displayed
str1[1]

#a run of tokens by slicing; as the last expression, this is what the cell shows
str1[2:6]

belongs to Amit Chauhan

In [17]:
#NLTK and its implementation of the Porter stemming algorithm
import nltk
from nltk.stem.porter import PorterStemmer

pot_stem = PorterStemmer()

#sample words to run through the stemmer
words = ['happy', 'happier', 'happiest', 'happiness', 'breathing', 'fairly']

#show every word next to the stem produced for it
for word in words:
    print(word, pot_stem.stem(word), sep='----->')

happy----->happi
happier----->happier
happiest----->happiest
happiness----->happi
breathing----->breath
fairly----->fairli


In [18]:
from nltk.stem.snowball import SnowballStemmer

# Snowball stemmer configured for English
snow_stem = SnowballStemmer('english')

# run the same word list through it for a side-by-side comparison
for word in words:
    print(word, snow_stem.stem(word), sep='----->')

happy----->happi
happier----->happier
happiest----->happiest
happiness----->happi
breathing----->breath
fairly----->fair


# Section 4: Lemmatisation and Stop words

In [19]:
#import library
import spacy

#Load the small English pipeline
load_en = spacy.load('en_core_web_sm')

#parse the example text into tokens
example_string = load_en("I'm happy in this happiest place with all happiness. It feels how happier we are")

#per token: text, coarse POS tag, numeric lemma ID, lemma text
for token in example_string:
    print(token.text, token.pos_, token.lemma, token.lemma_, sep=' \t ')

I 	 PRON 	 561228191312463089 	 -PRON-
'm 	 AUX 	 10382539506755952630 	 be
happy 	 ADJ 	 244022080605231780 	 happy
in 	 ADP 	 3002984154512732771 	 in
this 	 DET 	 1995909169258310477 	 this
happiest 	 ADJ 	 244022080605231780 	 happy
place 	 NOUN 	 7512738811199700769 	 place
with 	 ADP 	 12510949447758279278 	 with
all 	 DET 	 13409319323822384369 	 all
happiness 	 NOUN 	 2779265004918961325 	 happiness
. 	 PUNCT 	 12646065887601541794 	 .
It 	 PRON 	 561228191312463089 	 -PRON-
feels 	 VERB 	 5741770584995928333 	 feel
how 	 ADV 	 16331095434822636218 	 how
happier 	 ADJ 	 244022080605231780 	 happy
we 	 PRON 	 561228191312463089 	 -PRON-
are 	 AUX 	 10382539506755952630 	 be


In [20]:
#import library
import spacy

#Load the small English pipeline
load_en = spacy.load('en_core_web_sm')

#Print the pipeline's default stop-word collection. The output is rendered
#with set braces, so the order of the words carries no meaning.
print(load_en.Defaults.stop_words)

{'those', 'bottom', 'doing', 'last', 'serious', 'nothing', 'next', 'should', 'whom', 'and', 'alone', 'cannot', 'per', 'after', 'afterwards', 'six', 'therefore', 'most', 'this', 'go', 'nor', 'they', 'thereafter', 'only', 'anywhere', 'whither', 'side', 'meanwhile', 'part', 'am', 'mine', 'except', 'get', 'third', 'whole', 'between', 'yourselves', 'be', 'take', 'him', 'themselves', 'one', 'without', 'whose', 'has', 'whereupon', 'in', 'noone', 'whereby', 'my', 'he', 'using', 'two', '‘re', 'less', 'behind', 'within', 'thus', 'becoming', 'someone', 'front', 'make', 'on', 'across', 'yet', 'the', 'wherever', 'full', 'indeed', 'over', 'out', 'ourselves', 'former', 'as', 'off', 'who', 'herself', 'therein', 'by', 'your', 'own', 'same', 'but', 'while', 'now', 'ever', 'anyhow', 'amongst', 'that', 'very', 'empty', 'several', 'something', 'everything', 'became', 'becomes', 'more', 'since', 'twelve', 'whatever', 'somewhere', 'yours', 'see', 'ten', 'herein', 'via', 'them', 'can', 'do', 'hundred', 'was',

# Section 5: Part of Speech (POS) and Named Entity Recognition (NER)

In [21]:
#import library
import spacy

#Load the small English pipeline
load_en = spacy.load("en_core_web_sm")

#sentence used by the part-of-speech examples that follow
str1 = load_en("This laptop belongs to Amit Chauhan")

In [22]:
# token at index 1 of the parsed sentence
print(str1[1])

laptop


In [23]:
#coarse-grained part-of-speech label of token 1 (prints NOUN for this sentence)
print(str1[1].pos_)

#fine-grained tag of the same token (prints NN for this sentence)
print(str1[1].tag_)

NOUN
NN


In [24]:
# Tally the tokens by part of speech; the result maps numeric POS IDs to counts
pos_count = str1.count_by(spacy.attrs.POS)
pos_count

{85: 1, 90: 1, 92: 1, 96: 2, 100: 1}

In [25]:
# Translate a numeric ID back to its label; 90 is one of the keys of pos_count
# NOTE(review): numeric IDs may differ between spaCy versions - confirm before relying on 90
str1.vocab[90].text

'DET'

In [29]:
#import library
import spacy

#Load the small English pipeline
load_en = spacy.load('en_core_web_sm')

#parse the text whose named entities should be labelled
file = load_en(" I am living in India, Studying in IIT")

#handle the "nothing found" case first, otherwise list each entity with
#its label and the human-readable explanation of that label
if not file.ents:
    print('No Entity Found')
else:
    for entity in file.ents:
        print(entity.text, entity.label_, str(spacy.explain(entity.label_)), sep=' - ')

India - GPE - Countries, cities, states
IIT - ORG - Companies, agencies, institutions, etc.
