In [1]:
!pip install spacy
!python -m spacy download en_core_web_md
#change the last md to lg for downloading large set for more complex vectors

Collecting en-core-web-md==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.5.0/en_core_web_md-3.5.0-py3-none-any.whl (42.8 MB)
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


## Bag of Words using CountVectorizer

In [2]:
class Category:
    """String labels for the two classes used by the classifiers in this notebook."""

    BOOKS, CLOTHING = "BOOKS", "CLOTHING"

# Tiny labelled corpus: two book sentences followed by two clothing sentences.
train_x = [
    "I love the book",
    "this is a great book",
    "the fit is great",
    "I love the shoes",
]
# Labels, aligned index-for-index with train_x.
train_y = [Category.BOOKS] * 2 + [Category.CLOTHING] * 2

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words configuration:
#   * binary=True stores only whether a term is present; the default
#     (binary=False) would store the number of occurrences instead.
#   * ngram_range=(1, 2) extracts unigrams (single words) as well as bigrams
#     (pairs of adjacent words).
vectorizer = CountVectorizer(ngram_range=(1, 2), binary=True)
train_x_vectors = vectorizer.fit_transform(train_x)

# Vocabulary learned from the training sentences, one entry per vector column.
print(vectorizer.get_feature_names_out())

# Dense view of the document-term matrix, one row per training sentence.
print(train_x_vectors.toarray())

['book' 'fit' 'fit is' 'great' 'great book' 'is' 'is great' 'love'
 'love the' 'shoes' 'the' 'the book' 'the fit' 'the shoes' 'this'
 'this is']
[[1 0 0 0 0 0 0 1 1 0 1 1 0 0 0 0]
 [1 0 0 1 1 1 1 0 0 0 0 0 0 0 1 1]
 [0 1 1 1 0 1 1 0 0 0 1 0 1 0 0 0]
 [0 0 0 0 0 0 0 1 1 1 1 0 0 1 0 0]]


In [5]:
from sklearn import svm

# Linear-kernel SVM trained on the bag-of-words vectors.
clf_svm = svm.SVC(kernel="linear")
clf_svm.fit(X=train_x_vectors, y=train_y)


SVC(kernel='linear')

In [6]:
# Encode unseen sentences with the vocabulary fitted on the training data.
test_x = vectorizer.transform(
    [
        'I like the book',
        'Shoes are alright',
    ]
)

# Limitation: words that never appeared during fitting (e.g. "books", "story")
# are not in the vocabulary, so this model cannot make use of them.
clf_svm.predict(X=test_x)

array(['BOOKS', 'CLOTHING'], dtype='<U8')

## Word Vectors using spaCy
### Captures the semantic meaning of a word in a vector
### For example, color-related words end up grouped close together in the vector space

In [15]:
import spacy

# Load the medium English pipeline (includes the word vectors used below).
nlp = spacy.load(name="en_core_web_md")

In [16]:
# Show the training sentences again before they are converted to word vectors.
print(train_x)

['I love the book', 'this is a great book', 'the fit is great', 'I love the shoes']


In [22]:
# Run every training sentence through the spaCy pipeline.
doc = list(map(nlp, train_x))
# One embedding per sentence: Doc.vector averages the vectors of its tokens.
train_x_wv = [parsed.vector for parsed in doc]
# Peek at the embedding of the first sentence.
print(doc[0].vector)

[-1.3978975  -0.94314    -1.1927751  -4.3043246  -1.69008    -2.18875
  2.8568673   4.1011753  -3.6249747   4.17611     7.12575     2.2885249
 -6.495055   -0.703155    3.46178    -0.9427      4.1357403  -3.3983903
 -0.04899997  1.3973      1.5476775   1.4060001  -0.07001507 -4.591998
 -1.038875   -1.8461976  -3.6312752   0.4407499  -1.7652375   3.388475
 -0.4016     -1.471375   -0.39702505  0.24449998 -0.04592732 -1.4172026
 -1.167125    0.5882125   2.6957998  -0.5626705  -1.7447001   3.973075
 -0.671685   -1.0611899   4.576425    2.9842675  -2.49175    -2.6355624
  0.5972425   0.59040004 -0.792125   -0.590725    0.33869502 -3.42171
 -3.4163604  -0.1711675  -0.786485    1.4665233   3.89455     1.9638373
  5.5787754  -1.3022224  -0.651945    0.43172497 -2.4435027   0.596875
 -3.6072845  -5.0790253   3.3520503   3.8547673  -0.87257504  2.2705574
 -0.5900501  -2.054635    3.19281     3.36905    -2.8925076   1.6652
 -2.5049374  -2.7379746  -2.37408     0.8923325   5.0625834  -1.2852752
  1

In [24]:
from sklearn import svm

# Linear-kernel SVM trained on the averaged spaCy sentence vectors.
clf_svm_wv = svm.SVC(kernel="linear")
clf_svm_wv.fit(X=train_x_wv, y=train_y)

SVC(kernel='linear')

In [38]:
test_x = [
    'I love the story',
    'I love the hat',
    'i Love the books',
    'I love that shirt',
    'these earings hurt',
]
test_doc = list(map(nlp, test_x))
test_x_wv = [parsed.vector for parsed in test_doc]

# Notes on the word-vector approach:
#   * Each sentence vector is an average of its word vectors, so the model can
#     classify some words it never saw during training.
#   * With long sentences and many categories, averaging can wash out the
#     meaning; in those cases it may do no better than bag of words.
#   * Words that are spelled the same but mean different things are not
#     handled properly.
clf_svm_wv.predict(X=test_x_wv)

array(['BOOKS', 'CLOTHING', 'BOOKS', 'CLOTHING', 'CLOTHING'], dtype='<U8')

## Regexes