## Stemming and lemmatization examples using python



In [None]:
word_lis= ['programming', 'studying', 'saw', 'talked', 'happiness', 'felt', 'fairly']

###Stemming:
1. Porter stemming:

In [None]:
import nltk
from nltk.stem import PorterStemmer
nltk.download("punkt")
# Initialize Python porter stemmer
ps = PorterStemmer()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
#example words

print("{0:20}".format("Performing Porter stemming: "))
print("{0:20}{1:20}".format("Word: ","Stem: "))
for word in word_lis:
   print ("{0:20}{1:20}".format(word, ps.stem(word)))

Performing Porter stemming: 
Word:               Stem:               
programming         program             
studying            studi               
saw                 saw                 
talked              talk                
happiness           happi               
felt                felt                
fairly              fairli              


2. Snowball stemming

In [None]:
from nltk.stem.snowball import SnowballStemmer

# The Snowball Stemmer requires that you pass a language parameter
s_stemmer = SnowballStemmer(language='english')



In [None]:


print("{0:20}".format("Performing Snowball stemming: "))
print("{0:20}{1:20}".format("Word: ","Stem: "))
for word in word_lis:
   print ("{0:20}{1:20}".format(word, s_stemmer.stem(word)))

Performing Snowball stemming: 
Word:               Stem:               
programming         program             
studying            studi               
saw                 saw                 
talked              talk                
happiness           happi               
felt                felt                
fairly              fair                


3. Lancaster stemming:

In [None]:
import nltk
from nltk.stem import LancasterStemmer
Lanc_stemmer = LancasterStemmer()

In [None]:


print("{0:20}".format("Performing Lancaster stemming: "))
print("{0:20}{1:20}".format("Word: ","Stem: "))
for word in word_lis:
   print ("{0:20}{1:20}".format(word, Lanc_stemmer.stem(word)))

Performing Lancaster stemming: 
Word:               Stem:               
programming         program             
studying            study               
saw                 saw                 
talked              talk                
happiness           happy               
felt                felt                
fairly              fair                


Lemmatization:
1. Word net lemmatizer:

In [None]:
word_lis= ['programming', 'studying', 'saw', 'talked', 'happiness', 'felt', 'fairly']

In [None]:
from nltk.stem import WordNetLemmatizer
nltk.download("wordnet")
nltk.download("omw-1.4")
# Initialize wordnet lemmatizer
wnl = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:

print("{0:20}".format("Performing Word net Lemmatizer: "))
print("{0:20}{1:20}".format("Word: ","lemma: "))
for word in word_lis:
   print ("{0:20}{1:20}".format(word, wnl.lemmatize(word)))

Performing Word net Lemmatizer: 
Word:               lemma:              
programming         programming         
studying            studying            
saw                 saw                 
talked              talked              
happiness           happiness           
felt                felt                
fairly              fairly              


2. Textblob lemmatizer:


In [None]:
from textblob import TextBlob, Word




print("{0:20}".format("Performing Textblob Lemmatizer: "))
print("{0:20}{1:20}".format("Word: ","Lemma: "))
for word in word_lis:
   print ("{0:20}{1:20}".format(word, Word(word).lemmatize()))

Performing Textblob Lemmatizer: 
Word:               Lemma:              
programming         programming         
studying            studying            
saw                 saw                 
talked              talked              
happiness           happiness           
felt                felt                
fairly              fairly              


#TF-IDF with example:

###Process of performing TF -IDF
The process to find meaning of documents using TF-IDF is very similar to Bag of words:


1. Clean data / Preprocessing

2. Tokenize words with frequency
3. Find TF for words
4. Find IDF for words
5. Vectorize vocab

#from scratch

In [None]:
corpus= ['i would want to learn data science which is one of the most important fields of computer science',
         'this is the data science course on nlp',
         'data scientists work on analyzing data and running machine learning algorithms']

In [None]:
import pandas as pd
import numpy as np


In [None]:
word_set= set()
for doc in corpus:
  words= doc.split(" ")
  word_set= word_set.union(set(words))

print(word_set)

{'nlp', 'want', 'on', 'data', 'of', 'to', 'machine', 'course', 'and', 'learning', 'fields', 'running', 'analyzing', 'the', 'one', 'algorithms', 'science', 'is', 'which', 'scientists', 'this', 'work', 'learn', 'would', 'important', 'most', 'computer', 'i'}


In [None]:
print(len(word_set))

28


In [None]:
#compute the term frequency

n_doc= len(corpus)
n_word_set= len(word_set)

df_tf= pd.DataFrame(np.zeros((n_doc, n_word_set)), columns= list(word_set))
df_tf

Unnamed: 0,nlp,want,on,data,of,to,machine,course,and,learning,...,which,scientists,this,work,learn,would,important,most,computer,i
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
for i in range(n_doc):
  words= corpus[i].split(" ")
  for w in words:
    df_tf[w][i]= df_tf[w][i]+ (1/len(words))

df_tf

Unnamed: 0,nlp,want,on,data,of,to,machine,course,and,learning,...,which,scientists,this,work,learn,would,important,most,computer,i
0,0.0,0.055556,0.0,0.055556,0.111111,0.055556,0.0,0.0,0.0,0.0,...,0.055556,0.0,0.0,0.0,0.055556,0.055556,0.055556,0.055556,0.055556,0.055556
1,0.125,0.0,0.125,0.125,0.0,0.0,0.0,0.125,0.0,0.0,...,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.090909,0.181818,0.0,0.0,0.090909,0.0,0.090909,0.090909,...,0.0,0.090909,0.0,0.090909,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# idf

idf= {}

for w in word_set:
  k=0 # no of documents in which this w word is present

  for i in range(n_doc):
    if w in corpus[i].split(" "):
      k+=1

  idf[w]= np.log10(n_doc/k)

idf


{'nlp': 0.47712125471966244,
 'want': 0.47712125471966244,
 'on': 0.17609125905568124,
 'data': 0.0,
 'of': 0.47712125471966244,
 'to': 0.47712125471966244,
 'machine': 0.47712125471966244,
 'course': 0.47712125471966244,
 'and': 0.47712125471966244,
 'learning': 0.47712125471966244,
 'fields': 0.47712125471966244,
 'running': 0.47712125471966244,
 'analyzing': 0.47712125471966244,
 'the': 0.17609125905568124,
 'one': 0.47712125471966244,
 'algorithms': 0.47712125471966244,
 'science': 0.17609125905568124,
 'is': 0.17609125905568124,
 'which': 0.47712125471966244,
 'scientists': 0.47712125471966244,
 'this': 0.47712125471966244,
 'work': 0.47712125471966244,
 'learn': 0.47712125471966244,
 'would': 0.47712125471966244,
 'important': 0.47712125471966244,
 'most': 0.47712125471966244,
 'computer': 0.47712125471966244,
 'i': 0.47712125471966244}

In [None]:
df_tf_idf= df_tf.copy()

for w in word_set:
  for i in range(n_doc):
    df_tf_idf[w][i]= df_tf[w][i]* idf[w]

df_tf_idf

Unnamed: 0,nlp,want,on,data,of,to,machine,course,and,learning,...,which,scientists,this,work,learn,would,important,most,computer,i
0,0.0,0.026507,0.0,0.0,0.053013,0.026507,0.0,0.0,0.0,0.0,...,0.026507,0.0,0.0,0.0,0.026507,0.026507,0.026507,0.026507,0.026507,0.026507
1,0.05964,0.0,0.022011,0.0,0.0,0.0,0.0,0.05964,0.0,0.0,...,0.0,0.0,0.05964,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.016008,0.0,0.0,0.0,0.043375,0.0,0.043375,0.043375,...,0.0,0.043375,0.0,0.043375,0.0,0.0,0.0,0.0,0.0,0.0


#sklearn implementation

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf_idf = TfidfVectorizer()

tf_idf_vector= tf_idf.fit_transform(corpus)

tf_idf_array= tf_idf_vector.toarray()

tf_idf_array

array([[0.        , 0.        , 0.        , 0.23689484, 0.        ,
        0.13991391, 0.23689484, 0.23689484, 0.18016463, 0.23689484,
        0.        , 0.        , 0.23689484, 0.        , 0.47378968,
        0.        , 0.23689484, 0.        , 0.36032927, 0.        ,
        0.18016463, 0.        , 0.23689484, 0.23689484, 0.23689484,
        0.        , 0.23689484],
       [0.        , 0.        , 0.        , 0.        , 0.42024133,
        0.2482013 , 0.        , 0.        , 0.31960436, 0.        ,
        0.        , 0.        , 0.        , 0.42024133, 0.        ,
        0.31960436, 0.        , 0.        , 0.31960436, 0.        ,
        0.31960436, 0.42024133, 0.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.31664428, 0.31664428, 0.31664428, 0.        , 0.        ,
        0.37403043, 0.        , 0.        , 0.        , 0.        ,
        0.31664428, 0.31664428, 0.        , 0.        , 0.        ,
        0.24081614, 0.        , 0.31664428, 0.    

In [None]:
words_set= tf_idf.get_feature_names_out()
print(words_set)

['algorithms' 'analyzing' 'and' 'computer' 'course' 'data' 'fields'
 'important' 'is' 'learn' 'learning' 'machine' 'most' 'nlp' 'of' 'on'
 'one' 'running' 'science' 'scientists' 'the' 'this' 'to' 'want' 'which'
 'work' 'would']


In [None]:
print(len(words_set))

27


In [None]:
df_tf_idf= pd.DataFrame(tf_idf_array, columns= words_set)

df_tf_idf

Unnamed: 0,algorithms,analyzing,and,computer,course,data,fields,important,is,learn,...,running,science,scientists,the,this,to,want,which,work,would
0,0.0,0.0,0.0,0.236895,0.0,0.139914,0.236895,0.236895,0.180165,0.236895,...,0.0,0.360329,0.0,0.180165,0.0,0.236895,0.236895,0.236895,0.0,0.236895
1,0.0,0.0,0.0,0.0,0.420241,0.248201,0.0,0.0,0.319604,0.0,...,0.0,0.319604,0.0,0.319604,0.420241,0.0,0.0,0.0,0.0,0.0
2,0.316644,0.316644,0.316644,0.0,0.0,0.37403,0.0,0.0,0.0,0.0,...,0.316644,0.0,0.316644,0.0,0.0,0.0,0.0,0.0,0.316644,0.0


In [None]:
#

#POS tagging

The idea is to first remove stopwords and then perform the pos tagging

In [None]:
#imports
import nltk
import string

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

We tokenize the example string, and identify stopwords to be removed

In [None]:
text = "The quick brown fox jumped over the lazy dog sitting on the fence."
words = nltk.word_tokenize(text)
stopwords = nltk.corpus.stopwords.words("english")

# Extending the stopwords list
stopwords.extend(string.punctuation)



In [None]:
print(words)

['The', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog', 'sitting', 'on', 'the', 'fence', '.']


In [None]:

cleaned = [word.lower() for word in words if (word not in stopwords)]
cleaned

['the', 'quick', 'brown', 'fox', 'jumped', 'lazy', 'dog', 'sitting', 'fence']

In [None]:

# Assign POS Tags to the words
tagged = nltk.pos_tag(cleaned)
print(tagged)

[('the', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumped', 'VBD'), ('lazy', 'JJ'), ('dog', 'NN'), ('sitting', 'VBG'), ('fence', 'NN')]


Performing POS tagging using spacy

In [None]:
!pip install spacy
!python -m spacy download en_core_web_sm
import spacy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
2023-06-17 13:55:55.041447: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m82.8 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


We load the en_core_web_sm medium-sized English model trained on written web text (blogs, news, comments), that includes a tagger, a dependency parser, a lemmatizer, a named entity recognizer and a word vector table with 20k unique vectors.

In [None]:
nlp = spacy.load("en_core_web_sm")

In [None]:
print(text)

The quick brown fox jumped over the lazy dog sitting on the fence.


In [None]:
doc = nlp(text)
for token in doc:
  print(token, "|", token.pos_,"|", spacy.explain(token.pos_),"|",token.tag_, spacy.explain(token.tag_))

The | DET | determiner | DT determiner
quick | ADJ | adjective | JJ adjective (English), other noun-modifier (Chinese)
brown | ADJ | adjective | JJ adjective (English), other noun-modifier (Chinese)
fox | NOUN | noun | NN noun, singular or mass
jumped | VERB | verb | VBD verb, past tense
over | ADP | adposition | IN conjunction, subordinating or preposition
the | DET | determiner | DT determiner
lazy | ADJ | adjective | JJ adjective (English), other noun-modifier (Chinese)
dog | NOUN | noun | NN noun, singular or mass
sitting | VERB | verb | VBG verb, gerund or present participle
on | ADP | adposition | IN conjunction, subordinating or preposition
the | DET | determiner | DT determiner
fence | NOUN | noun | NN noun, singular or mass
. | PUNCT | punctuation | . punctuation mark, sentence closer


In [None]:
from spacy import displacy

sen = nlp(text)
displacy.render(sen, style='dep', jupyter=True, options={'distance': 85})

You can clearly see the dependency of each token on another along with the POS tag.

In [None]:
doc = nlp('i ike to read a book')
for token in doc:
  print(token, "|", token.pos_,"|", spacy.explain(token.pos_),"|",token.tag_, spacy.explain(token.tag_))

i | PRON | pronoun | PRP pronoun, personal
ike | VERB | verb | VBP verb, non-3rd person singular present
to | PART | particle | TO infinitival "to"
read | VERB | verb | VB verb, base form
a | DET | determiner | DT determiner
book | NOUN | noun | NN noun, singular or mass


#NER using NLTK:

In [None]:
import nltk
nltk.download('words')
nltk.download('punkt')
nltk.download('maxent_ne_chunker')
nltk.download('averaged_perceptron_tagger')
nltk.download('state_union')





[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package state_union to /root/nltk_data...
[nltk_data]   Unzipping corpora/state_union.zip.


True

In [None]:
# we use the state union corpus and the punkt tokenizer

from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

In [None]:
train_text = state_union.raw()

sample_text = state_union.raw("2006-GWBush.txt")
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)


In [None]:
len(sample_text)

33411

In [None]:
print(sample_text.split("\n")[0])

PRESIDENT GEORGE W. BUSH'S ADDRESS BEFORE A JOINT SESSION OF THE CONGRESS ON THE STATE OF THE UNION


In [None]:
# function
def get_named_entity(text):
    try:
            words = nltk.word_tokenize(text)
            tagged = nltk.pos_tag(words)
            namedEnt = nltk.ne_chunk(tagged, binary=False)
            #namedEnt.draw()
            print(namedEnt)
    except Exception as e:
      print(e)


get_named_entity(tokenized[0])

(S
  PRESIDENT/NNP
  (PERSON GEORGE/NNP W./NNP BUSH/NNP)
  'S/POS
  (ORGANIZATION ADDRESS/NNP)
  BEFORE/IN
  A/NNP
  (ORGANIZATION JOINT/NNP)
  SESSION/NNP
  OF/IN
  (ORGANIZATION THE/NNP)
  (ORGANIZATION CONGRESS/NNP)
  ON/NNP
  THE/NNP
  (ORGANIZATION STATE/NNP OF/IN)
  (ORGANIZATION THE/NNP)
  (ORGANIZATION UNION/NNP)
  January/NNP
  31/CD
  ,/,
  2006/CD
  (ORGANIZATION THE/NNP)
  PRESIDENT/NNP
  :/:
  Thank/NNP
  you/PRP
  all/DT
  ./.)


#NER with spacy

In [None]:
# command to run before code
! pip install spacy
! pip install nltk
! python -m spacy download en_core_web_sm

# imports and load spacy english language package
import spacy
from spacy import displacy
from spacy import tokenizer
nlp = spacy.load('en_core_web_sm')




Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m111.0 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [None]:

doc = nlp(sample_text)

sentences = list(doc.sents)
print(sentences[0])



PRESIDENT GEORGE W. BUSH'S ADDRESS
PRESIDENT


In [None]:
# tokenization

# print entities
ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
#print(ents)
# now we use displaycy function on doc2
displacy.render(doc, style='ent', jupyter=True)

#REGEX



re.search():

The re.search() method returns an object of the match when the pattern is found in a string or text.

re.findall()	:

The re.findall() method is used to return a string list containing all the matches.
re.split():

The re.split() method is used to divide the string on the basis of matching with the regular expression.

re.sub():

The re.sub() method is used to replace the matched string with another string.

In [None]:
import re

In [None]:
import re

string = """Hello my Number is 123456789 and
            my father's number is 987654321"""

# A sample regular expression to find digits.
regex = '\d+'

match = re.findall(regex, string)
print(match)



['123456789', '987654321']


In [None]:
p = re.compile('[a-e]')
#re.compile creates regular expressions


# findall() searches for the Regular Expression
# and return a list upon finding
print(p.findall("Awesome, said the boy"))

['e', 'e', 'a', 'd', 'e', 'b']


In [None]:
p = re.compile('\d+')
print(p.findall("I went to him at 11 A.M. on 4th July 1990"))

['11', '4', '1990']


In [None]:
p = re.compile('\w+')
print(p.findall("I went to him at 11 A.M. on 4th July 1990"))

['I', 'went', 'to', 'him', 'at', '11', 'A', 'M', 'on', '4th', 'July', '1990']


In [None]:
p = re.compile('\s+')
print(p.findall("I went to him at 11 A.M. on 4th July 1990"))

[' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ']


In [None]:
p = re.compile('w|r')
print(p.findall("I went to him at 11 A.M. on 4th July 1990"))

['w']


In [None]:
p = re.compile('\W+')
print(p.findall("I went to him at 11 A.M. on 4th July 1990"))

In [None]:
print(re.split('\W+', 'On 12th Jan 2016, at 11:02 AM'))

['On', '12th', 'Jan', '2016', 'at', '11', '02', 'AM']


In [None]:
print(re.sub('ub', '@@', 'Subject has Uber booked already'))

S@@ject has Uber booked already


In [None]:
regex = r"([a-zA-Z]+) (\d+)"

match = re.search(regex, "I was born on June 24")
if match != None:

    print ("Match at index %s, %s" % (match.start(), match.end()))

Match at index 14, 21


#Glove embeddings:


In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip

--2023-06-15 16:24:23--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2023-06-15 16:24:23--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2023-06-15 16:24:24--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [None]:
#load glove embeddings
import numpy as np
vocab = {}
with open('glove.6B.100d.txt', 'r') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vec = np.asarray(values[1:], dtype='float32')
        vocab[word] = vec
print(f'Loaded {len(vocab)} word vectors')

Loaded 400000 word vectors


In [None]:
## To find the nearest neighbors of a word
def find_nearest(word, vocab, k=5):
  distances = []
  word_vec = vocab[word]
  for w, vec in vocab.items():
    distance = np.linalg.norm(word_vec - vec)
    distances.append((w, distance))
  distances = sorted(distances, key=lambda x: x[1])
  return distances[:k]

print(find_nearest('cat', vocab))

[('cat', 0.0), ('dog', 2.6811306), ('rabbit', 3.6489706), ('cats', 3.6892002), ('monkey', 3.7469325)]


In [None]:
## To find the analogy between words
def find_analogy(a, b, c, vocab):
  a_vec = vocab[a]
  b_vec = vocab[b]
  c_vec = vocab[c]
  d_vec = b_vec - a_vec + c_vec
  distances = []
  for w, vec in vocab.items():
    distance = np.linalg.norm(d_vec - vec)
    distances.append((w, distance))
  distances = sorted(distances, key=lambda x: x[1])
  return distances[:1]

print(find_analogy('king', 'man', 'queen', vocab))
print(find_analogy('paris', 'france', 'italy', vocab))

[('woman', 4.0810785)]
[('italy', 4.627724)]


RNN& LSTM

In [2]:
from sklearn.datasets import fetch_20newsgroups

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder


In [4]:
from keras.utils import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, SimpleRNN


In [6]:
categories=['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
data= fetch_20newsgroups(subset= 'train', categories= categories, shuffle= True, random_state= 42)
data

Output hidden; open in https://colab.research.google.com to view.

In [7]:
X_train, X_test, y_train, y_test= train_test_split(data.data, data.target, test_size=0.2, random_state=42)

len(X_train)

1805

In [8]:
len(X_test)

452

In [9]:
max_features=5000
max_len=200
vector= CountVectorizer(max_features=max_features)
X_t= vector.fit_transform(X_train)
X_d= vector.transform(X_test)

In [10]:
y_train[0:5]

array([3, 0, 2, 2, 3])

In [11]:
label_encoder= LabelEncoder()
y_cat_tr= to_categorical(label_encoder.fit_transform(y_train))
y_cat_test= to_categorical(label_encoder.transform(y_test))

y_cat_tr[0:5]

array([[0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.]], dtype=float32)

In [12]:
#pad
train_pad= pad_sequences(X_t.toarray(), maxlen= max_len)
test_pad= pad_sequences(X_d.toarray(), maxlen= max_len)


In [14]:
rnn_model= Sequential()
rnn_model.add(Embedding(max_features, 128, input_length= max_len))
rnn_model.add(SimpleRNN(128))
rnn_model.add(Dense(len(categories), activation= 'softmax'))
rnn_model.compile(loss= 'categorical_crossentropy', optimizer='adam', metrics= ['accuracy'])

rnn_model.fit(train_pad, y_cat_tr, batch_size= 64, epochs= 2, validation_data=(test_pad, y_cat_test))

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fa1dad7d8a0>

In [15]:
lstm_model= Sequential()
lstm_model.add(Embedding(max_features, 128, input_length= max_len))
lstm_model.add(LSTM(128))
lstm_model.add(Dense(len(categories), activation= 'softmax'))
lstm_model.compile(loss= 'categorical_crossentropy', optimizer='adam', metrics= ['accuracy'])

lstm_model.fit(train_pad, y_cat_tr, batch_size= 64, epochs= 2, validation_data=(test_pad, y_cat_test))

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fa1d43bf7f0>

#CNN

In [16]:

from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [17]:
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
from keras.utils import to_categorical

In [19]:
digits= load_digits()
len(digits)
len(digits.images)

1797

In [20]:
X_train, X_test, y_train, y_test= train_test_split(digits.images, digits.target, test_size= 0.2, random_state=42)


In [21]:
X_train[0]

array([[ 0.,  0.,  3., 14.,  1.,  0.,  0.,  0.],
       [ 0.,  0., 12., 12.,  0.,  0.,  0.,  0.],
       [ 0.,  3., 16.,  6.,  0.,  0.,  0.,  0.],
       [ 0.,  5., 16.,  2.,  0.,  0.,  0.,  0.],
       [ 0.,  6., 16.,  2.,  5.,  2.,  0.,  0.],
       [ 0.,  4., 16.,  2., 12., 15.,  2.,  0.],
       [ 0.,  1., 14., 13.,  2., 13., 11.,  0.],
       [ 0.,  0.,  3., 11., 16., 13.,  4.,  0.]])

In [22]:
X_train= X_train/ 16.0
X_test= X_test/ 16.0

In [23]:
X_train= X_train.reshape(X_train.shape[0], X_train.shape[1], X_train.shape[2], 1)
X_test= X_test.reshape(X_test.shape[0], X_test.shape[1], X_test.shape[2], 1)

In [26]:
X_train[0].shape

(8, 8, 1)

In [24]:
y_train= to_categorical(y_train)
y_test= to_categorical(y_test)



In [29]:
model= Sequential()
model.add(Conv2D(32, kernel_size=(3,3), activation= 'relu', input_shape=(8,8,1)))
model.add(MaxPooling2D(pool_size=(2,2)))
model.add(Flatten())
model.add(Dense(64, activation= 'relu'))
model.add(Dense(10, activation= 'softmax'))



In [30]:
model.compile(loss= 'categorical_crossentropy', optimizer= 'adam', metrics= ['accuracy'])

model.fit(X_train, y_train, batch_size= 64, epochs=2, validation_data= (X_test, y_test))

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fa1d49e3580>