In [None]:
!pip install scikit-learn==0.21.3


In [49]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import numpy as np
#BoW
from sklearn.feature_extraction.text import CountVectorizer
#tfidf
from sklearn.feature_extraction.text import TfidfVectorizer

# *One Hot Encoding*

In [2]:
# Toy corpus: four short sentences.
documents = [
    "Dog bites man.",
    "Man bites dog.",
    "Dog eats meat.",
    "Man eats food.",
]
# Normalise every sentence: lowercase it and remove the full stops.
processed_docs = [sentence.lower().replace('.', "") for sentence in documents]
processed_docs

['dog bites man', 'man bites dog', 'dog eats meat', 'man eats food']

In [3]:
# Build the vocabulary.
# Every distinct word receives an integer ID between 1 and len(vocab),
# assigned in order of first appearance in the corpus.
vocab = {}
for doc in processed_docs:
    for word in doc.split():
        # setdefault inserts only unseen words; len(vocab) + 1 is the next free ID.
        vocab.setdefault(word, len(vocab) + 1)
count = len(vocab)  # number of IDs handed out
print(vocab)

{'dog': 1, 'bites': 2, 'man': 3, 'eats': 4, 'meat': 5, 'food': 6}


In [5]:
# IDs start at 1, so subtracting 1 gives the zero-based list position for 'bites'.
vocab['bites']-1

1

In [6]:
def one_hot_vector(string, vocabulary=None):
    """Return one one-hot vector per whitespace-separated word of ``string``.

    Args:
        string: Text to encode; it is split on whitespace.
        vocabulary: Optional mapping from word to a 1-based integer ID.
            When omitted, the notebook-level ``vocab`` dictionary is used.

    Returns:
        A list with one entry per word. Each entry is a list of
        ``len(vocabulary)`` integers that is all zeros except for a 1 at
        position ``ID - 1``. A word that is not in the vocabulary yields an
        all-zero vector.
    """
    word_to_id = vocab if vocabulary is None else vocabulary
    size = len(word_to_id)
    one_hot_encoding = []
    for word in string.split():
        temp = [0] * size  # |V|-dimensional vector of 0's for this word
        if word in word_to_id:
            # IDs are 1-based, so subtract 1 to get the list index of the word.
            temp[word_to_id[word] - 1] = 1
        one_hot_encoding.append(temp)
    return one_hot_encoding

In [11]:
# Print the first document, then display its one-hot vectors (one per word).
print(processed_docs[0])
one_hot_vector(processed_docs[0])

dog bites man


[[1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0]]

Using scikit learn

In [22]:
# The four example sentences, already lowercased and without punctuation.
S1, S2, S3, S4 = (
    'dog bites man',
    'man bites dog',
    'dog eats meat',
    'man eats food',
)

In [23]:
# One token list per sentence.
data = [sentence.split() for sentence in (S1, S2, S3, S4)]
# Flatten the four token lists into a single list of words, preserving order.
values = [token for tokens in data for token in tokens]
print("The data: ",values)


The data:  ['dog', 'bites', 'man', 'man', 'bites', 'dog', 'dog', 'eats', 'meat', 'man', 'eats', 'food']


In [24]:
# Label Encoding
# Each word w in the corpus is converted into a numeric value between 0 and
# n-1, where n is the number of unique words in the corpus.
label_encoder = LabelEncoder()
label_encoder.fit(values)  # learn the set of distinct words
integer_encoded = label_encoder.transform(values)  # map each word to its label
print("Label Encoded:",integer_encoded)


Label Encoded: [1 0 4 4 0 1 1 2 5 4 2 3]


In [31]:
#One-Hot Encoding
# NOTE(review): `data` is passed as a 4 x 3 table (one row per sentence, one
# column per word position), and the printed matrix has 8 columns rather than
# one per vocabulary word (6). The encoder appears to one-hot each word
# *position* separately (2 + 2 + 4 categories) — confirm this is the intended
# illustration.
onehot_encoder = OneHotEncoder()
onehot_encoded = onehot_encoder.fit_transform(data).toarray()
print("Onehot Encoded Matrix:\n",onehot_encoded)

Onehot Encoded Matrix:
 [[1. 0. 1. 0. 0. 0. 1. 0.]
 [0. 1. 1. 0. 1. 0. 0. 0.]
 [1. 0. 0. 1. 0. 0. 0. 1.]
 [0. 1. 0. 1. 0. 1. 0. 0.]]


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  check_array(X, dtype=np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  X = check_array(X, dtype=np.object)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  X = check_array(X, dtype=np.object)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  X_int = np.zeros((n_samples, n_features), dtype=np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  X_mask = np.ones((n_samples, n_features), dtype=np.bool)


# *Bag of Words(BOW)*



*   BoW maps each word to a unique integer ID between 1 and |V|


*   Each Document in the corpus is then converted into a vector of |V| dims

*  Each component of that vector holds the number of occurrences (count) of the corresponding word in the document



In [33]:
# Count vectorizer with default settings.
count_vect = CountVectorizer()
#Build a BOW representation for the corpus (one row per document)
bow_rep = count_vect.fit_transform(processed_docs)

#Look at the vocabulary mapping (word -> column index in the BoW vectors)
print("Our vocabulary: ", count_vect.vocabulary_)


Our vocabulary:  {'dog': 1, 'bites': 0, 'man': 4, 'eats': 2, 'meat': 5, 'food': 3}


In [34]:
#see the BOW rep for first 2 documents
# Both sentences contain the same words, so their count vectors are identical:
# BoW discards word order.
print("BoW representation for 'dog bites man': ", bow_rep[0].toarray())
print("BoW representation for 'man bites dog': ",bow_rep[1].toarray())

BoW representation for 'dog bites man':  [[1 1 0 0 1 0]]
BoW representation for 'man bites dog:  [[1 1 0 0 1 0]]


In [35]:
#Get the representation using this vocabulary, for a new text
# "and", "are" and "friends" are not in the fitted vocabulary, so they have no
# column; "dog" occurs twice, hence the count of 2 in its column.
temp = count_vect.transform(["dog and dog are friends"])
print("Bow representation for 'dog and dog are friends':", temp.toarray())

Bow representation for 'dog and dog are friends': [[0 2 0 0 0 0]]


In [36]:
# BoW with binary vectors: binary=True records presence (1) / absence (0)
# of a word instead of how many times it occurs.
count_vect = CountVectorizer(binary=True).fit(processed_docs)
temp = count_vect.transform(["dog and dog are friends"])
print("Bow representation for 'dog and dog are friends':", temp.toarray())

Bow representation for 'dog and dog are friends': [[0 1 0 0 0 0]]


# *Bag of N-Grams*



*   BoW is a special case of BoN with n=1
*   It breaks the text into chunks of n contiguous words
*   Each chunk is called n-gram
*   Corpus Vocab is collection of all unique n-grams across the text corpus
*   Each document is represented by a vector of length |V|
*   These vectors contain the frequency counts of the n-grams present in the document


In [37]:
#Ngram vectorization example with count vectorizer and uni, bi, trigrams
# ngram_range=(1,3) asks for every n-gram with n from 1 to 3.
count_vect = CountVectorizer(ngram_range=(1,3))

#Build a bag-of-n-grams representation for the corpus
bow_rep = count_vect.fit_transform(processed_docs)

#Look at the vocabulary mapping (n-gram -> column index)
print("Our vocabulary: ", count_vect.vocabulary_)

Our vocabulary:  {'dog': 3, 'bites': 0, 'man': 12, 'dog bites': 4, 'bites man': 2, 'dog bites man': 5, 'man bites': 13, 'bites dog': 1, 'man bites dog': 14, 'eats': 8, 'meat': 17, 'dog eats': 6, 'eats meat': 10, 'dog eats meat': 7, 'food': 11, 'man eats': 15, 'eats food': 9, 'man eats food': 16}


In [41]:
# Number of distinct uni-, bi- and trigrams found in the corpus.
len(count_vect.vocabulary_)

18

In [46]:
#see the bag-of-n-grams rep for the first document, together with the vector length
print("BoW representation for 'dog bites man': ", bow_rep[0].toarray(),',length of vector-',len(bow_rep[0].toarray()[0]))


BoW representation for 'dog bites man':  [[1 0 1 1 1 1 0 0 0 0 0 0 1 0 0 0 0 0]] ,length of vector- 18


In [47]:
#Get the representation using this vocabulary, for a new text
# Only the unigram "dog" (twice) matches the fitted vocabulary; none of the
# new text's bigrams or trigrams were seen during fitting.
temp = count_vect.transform(["dog and dog are friends"])
print("Bow representation for 'dog and dog are friends':", temp.toarray())

Bow representation for 'dog and dog are friends': [[0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


# *TF-IDF*



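*   TF (Term Frequency): measures how often a term t occurs in a document d; on its own it treats every term as equally important.
*   It is calculated as (number of occurrences of term t in document d) / (total number of terms in document d)
*   IDF (Inverse Document Frequency): measures the importance of term t across all documents in the corpus
*   IDF down-weights terms that are very common and up-weights terms that are rare
*   For the IDF calculation we take log(total number of documents in the corpus / number of documents containing term t)
*   Note: the IDF values printed below follow a smoothed variant, ln((1 + N) / (1 + df_t)) + 1; e.g. for 'bites' (2 of 4 documents) ln(5/3) + 1 ≈ 1.5108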



In [54]:
# Fit a TF-IDF vectorizer (default settings) on the corpus and keep the
# resulting document-term matrix.
tfidf=TfidfVectorizer()
corpus_tfidf=tfidf.fit_transform(processed_docs)

#IDF for all words in the vocabulary (one value per vocabulary column)
print("IDF for all words in the vocabulary",tfidf.idf_)

IDF for all words in the vocabulary [1.51082562 1.22314355 1.51082562 1.91629073 1.22314355 1.91629073]


In [55]:
#All words in the vocabulary.
# NOTE(review): get_feature_names() is available in the pinned scikit-learn
# 0.21.3; newer releases replace it with get_feature_names_out() — confirm
# before upgrading the pin.
print("All words in the vocabulary",tfidf.get_feature_names())

All words in the vocabulary ['bites', 'dog', 'eats', 'food', 'man', 'meat']


In [61]:
#Stop-word list configured on the vectorizer.
# No stop_words argument was passed to TfidfVectorizer() above, so this prints None.
print("Stop words in the vocabulary are ",tfidf.get_stop_words())

Stop words in the vocabulary are  None


In [56]:
#TFIDF representation for all documents in our corpus 
# One row per document, one column per vocabulary word.
print("TFIDF representation for all documents in our corpus\n",corpus_tfidf.toarray()) 
print("-"*10)

TFIDF representation for all documents in our corpus
 [[0.65782931 0.53256952 0.         0.         0.53256952 0.        ]
 [0.65782931 0.53256952 0.         0.         0.53256952 0.        ]
 [0.         0.44809973 0.55349232 0.         0.         0.70203482]
 [0.         0.         0.55349232 0.70203482 0.44809973 0.        ]]
----------


In [58]:
# Transform an unseen sentence with the fitted vectorizer. Only "dog" and "man"
# are in the vocabulary; they share the same IDF and each occurs once, so they
# get equal weights (0.7071 ~ 1/sqrt(2), consistent with a unit-length row).
temp = tfidf.transform(["dog and man are friends"])
print("Tfidf representation for 'dog and man are friends':\n", temp.toarray())

Tfidf representation for 'dog and man are friends':
 [[0.         0.70710678 0.         0.         0.70710678 0.        ]]
