# Bag of Words (BoW)

In [3]:
# Toy corpus: four short sentences.
documents = ["Dog bites man.", "Man bites dog.", "Dog eats meat.", "Man eats food."]
# Normalise each sentence: drop the full stops, then lowercase.
processed_docs = [sentence.replace(".", "").lower() for sentence in documents]
processed_docs

['dog bites man', 'man bites dog', 'dog eats meat', 'man eats food']

In [None]:
pip install scikit-learn

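* "Our vocabulary" kısmı indeksleme yapıyor: her kelimeye bir sütun indeksi atıyor.
* Vektörel çıktı: ör. 'dog bites man' → `[[1 1 0 0 1 0]]` (her sütun, o kelimenin cümlede kaç kez geçtiğini gösteriyor).
* Bag of Words anlamı (semantiği) yakalamıyor; sadece korpusta (külliyatta) hangi kelimelerin geçtiğini çözüyor.
* Özet olarak BoW indeksleme yapıyor.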

In [5]:
from sklearn.feature_extraction.text import CountVectorizer 

# Display the corpus the vectorizer is about to be fitted on.
print("Our corput:  ", processed_docs)

# Learn the vocabulary first, then encode the corpus with it
# (fit followed by transform, i.e. the two halves of fit_transform).
count_vect = CountVectorizer().fit(raw_documents=processed_docs)
bow_rep = count_vect.transform(raw_documents=processed_docs)

# Token -> column-index mapping learned from the corpus.
print("Our vocabulary:  ", count_vect.vocabulary_)

# Count vectors of the first two documents.
print("BoW representation for 'dog bites man': ", bow_rep[0].toarray())
print("BoW representation for 'man bites dog': ", bow_rep[1].toarray())

# Encode an unseen sentence with the already-fitted vocabulary.
temp = count_vect.transform(["dog and dog are friends"])
print("Bow representation for 'dog and dog are friends': ", temp.toarray())

Our corput:   ['dog bites man', 'man bites dog', 'dog eats meat', 'man eats food']
Our vocabulary:   {'dog': 1, 'bites': 0, 'man': 4, 'eats': 2, 'meat': 5, 'food': 3}
BoW representation for 'dog bites man':  [[1 1 0 0 1 0]]
BoW representation for 'man bites dog':  [[1 1 0 0 1 0]]
Bow representation for 'dog and dog are friends':  [[0 2 0 0 0 0]]


* `binary=True` iken sadece kelimenin var olup olmadığını kontrol ediyor: varsa 1, yoksa 0.

In [6]:
# Binary BoW: create the vectorizer with binary=True, fit it on the corpus,
# then encode a new sentence with the fitted vocabulary.
count_vect = CountVectorizer(binary=True).fit(processed_docs)
temp = count_vect.transform(["dog and dog are friends"])
print("Bow representation for 'dog and dog are friends':", temp.toarray())

Bow representation for 'dog and dog are friends': [[0 1 0 0 0 0]]


In [7]:
# our corpus
documents = [
    "Dog bites man.",
    "Man bites dog.",
    "Dog eats meat.",
    "Man eats food.",
]

# Lowercase every sentence and remove the full stops.
processed_docs = list(map(lambda text: text.lower().replace(".", ""), documents))
processed_docs

['dog bites man', 'man bites dog', 'dog eats meat', 'man eats food']

* n-gram, indekslenecek kelime öbeklerinin kaç kelimeden oluşacağını belirliyor (`ngram_range`).
* n-gram, bir kelimenin öncesindeki ve sonrasındaki kelimelerle bağlantısını kuruyordu.
* Ama şu an attention (dikkat) mekanizması sayesinde bu yöntem de eskidi.

In [None]:
from sklearn.feature_extraction.text import  CountVectorizer

# N-gram vectorization example with CountVectorizer: ngram_range=(1,3)
# asks for unigrams, bigrams and trigrams.
count_vect = CountVectorizer(ngram_range=(1,3))

# Build a BOW representation for the corpus
bow_rep = count_vect.fit_transform(processed_docs)

# Look at the vocabulary mapping
print("Our vocabulary: ", count_vect.vocabulary_)

# See the BOW rep for the first 2 documents
print("BoW representation for 'dog bites man': ", bow_rep[0].toarray())
print("BoW representation for 'man bites dog': ", bow_rep[1].toarray())

# Get the representation using this vocabulary, for a new text
temp = count_vect.transform(["dog and dog are friends"])

# The label previously lacked the closing quote after "friends".
print("Bow representation for 'dog and dog are friends':", temp.toarray())

# One Hot Encoding

In [9]:
# Corpus for the one-hot encoding section.
documents = [
    "Dog bites man.",
    "Man bites dog.",
    "Dog eats meat.",
    "Man eats food.",
]
# Lowercased, full-stop-free copy of every sentence.
processed_docs = list(text.lower().replace(".", "") for text in documents)
processed_docs

['dog bites man', 'man bites dog', 'dog eats meat', 'man eats food']

* `vocab` bizim için bir dict (sözlük) yapısı: her kelimeyi bir indekse eşliyor.


In [10]:
# Build the vocabulary: every new word receives the next 1-based index,
# in order of first appearance in the corpus.
vocab = {}
for doc in processed_docs:
    for word in doc.split():
        # setdefault leaves words that already have an index untouched.
        vocab.setdefault(word, len(vocab) + 1)
count = len(vocab)  # number of distinct words, i.e. the last index handed out
print(vocab)

{'dog': 1, 'bites': 2, 'man': 3, 'eats': 4, 'meat': 5, 'food': 6}


* Burada her bir kelime için bir liste oluşturuyor; bu liste kelimenin indeksini vektörel (one-hot) olarak temsil ediyor.

In [2]:
def get_onehot_vector(somestring, vocabulary=None):
    """Return one one-hot row per whitespace-separated word of ``somestring``.

    Args:
        somestring: text to encode; it is split on whitespace.
        vocabulary: optional dict mapping word -> 1-based index. When omitted,
            the notebook-level ``vocab`` dict is used, as before.

    Returns:
        A list with one list per word. Each inner list has ``len(vocabulary)``
        entries and holds a single 1 at position ``index - 1``; a word missing
        from the vocabulary yields an all-zero row.
    """
    if vocabulary is None:
        # Backward-compatible default: fall back to the global vocabulary.
        vocabulary = vocab
    width = len(vocabulary)
    onehot_encoded = []
    for word in somestring.split():
        row = [0] * width
        index = vocabulary.get(word)
        if index is not None:
            # Vocabulary indices start at 1, list positions at 0.
            row[index - 1] = 1
        onehot_encoded.append(row)
    return onehot_encoded

In [12]:
# One-hot representation for a text from our corpus: echo the second
# sentence, then display its encoding.
sample_sentence = processed_docs[1]
print(sample_sentence)
get_onehot_vector(sample_sentence)

man bites dog


[[0, 0, 1, 0, 0, 0], [0, 1, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0]]

* Çok fazla liste yapısı oluşturduğundan artık kullanışlı değil.

In [13]:
# One-hot representation for an arbitrary text, using the vocabulary built above.
# Words that are not in the vocabulary ("and", "are", "good") come out as all-zero rows.
get_onehot_vector("man and dog are good")

[[0, 0, 1, 0, 0, 0],
 [0, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0]]

In [14]:
# The four normalised corpus sentences, bound to individual names.
S1, S2, S3, S4 = (
    'dog bites man',
    'man bites dog',
    'dog eats meat',
    'man eats food',
)

In [15]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Tokenise every sentence; `values` is the flat list of all tokens in order.
data = [sentence.split() for sentence in (S1, S2, S3, S4)]
values = [token for tokens in data for token in tokens]
print("The data: ", values)

# Label Encoding: map every token of the flat list to an integer id.
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)
print("Label Encoded:", integer_encoded)

# One-Hot Encoding: the encoder is fitted on the sentence-by-position token
# table (`data`), not on the flat token list.
onehot_encoder = OneHotEncoder()
onehot_encoded = onehot_encoder.fit_transform(data).toarray()
print("Onehot Encoded Matrix:\n", onehot_encoded)

The data:  ['dog', 'bites', 'man', 'man', 'bites', 'dog', 'dog', 'eats', 'meat', 'man', 'eats', 'food']
Label Encoded: [1 0 4 4 0 1 1 2 5 4 2 3]
Onehot Encoded Matrix:
 [[1. 0. 1. 0. 0. 0. 1. 0.]
 [0. 1. 1. 0. 1. 0. 0. 0.]
 [1. 0. 0. 1. 0. 0. 0. 1.]
 [0. 1. 0. 1. 0. 1. 0. 0.]]


# TF/IDF

* Günümüzde hâlâ kullanışlı.
* Bir kelimenin çok sık geçmesi, önemli olduğunu göstermez.

In [16]:
import pandas as pd 
from sklearn.feature_extraction.text import TfidfVectorizer

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [17]:
# Two tiny example documents for the hand-rolled TF-IDF computation.
documentA, documentB = (
    'the man went out for a walk',
    'the children sat around the fire',
)

In [18]:
# Tokenise both documents on single spaces.
bagOfWordsA = documentA.split(sep=' ')
bagOfWordsB = documentB.split(sep=' ')

In [19]:
# Shared vocabulary: every distinct token that occurs in either bag.
uniqueWords = set(bagOfWordsA) | set(bagOfWordsB)
uniqueWords

{'a',
 'around',
 'children',
 'fire',
 'for',
 'man',
 'out',
 'sat',
 'the',
 'walk',
 'went'}

In [20]:
# Start every vocabulary word at zero for both documents, then count how
# often each word occurs in the corresponding bag.
numOfWordsA = dict.fromkeys(uniqueWords, 0)
numOfWordsB = dict.fromkeys(uniqueWords, 0)
for bag, counts in ((bagOfWordsA, numOfWordsA), (bagOfWordsB, numOfWordsB)):
    for word in bag:
        counts[word] = counts[word] + 1

In [21]:
from nltk.corpus import stopwords
import nltk
# Fetch the NLTK stop-word corpus, then display the English stop-word list.
nltk.download('stopwords')
english_stopwords = stopwords.words('english')
english_stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Casper\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

* TF hesaplamak için: $\mathrm{tf}(t, d) = \dfrac{\text{number of times } t \text{ appears in } d}{\text{total number of terms in } d}$

In [22]:
def computeTF(wordDict, bagofWords):
    """Compute the term frequency of every word for one document.

    Args:
        wordDict: dict mapping word -> number of occurrences in the document.
        bagofWords: list of the document's tokens; its length is the divisor.

    Returns:
        dict mapping each word of ``wordDict`` to count / len(bagofWords).
    """
    total_terms = float(len(bagofWords))
    return {term: occurrences / total_terms for term, occurrences in wordDict.items()}

In [23]:
# Term frequencies of each document over the shared vocabulary.
tfA = computeTF(wordDict=numOfWordsA, bagofWords=bagOfWordsA)
tfB = computeTF(wordDict=numOfWordsB, bagofWords=bagOfWordsB)

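* IDF formülünü hesapladık: $\mathrm{idf}(t) = \ln\left(\dfrac{N}{\mathrm{df}(t)}\right)$; burada $N$ doküman sayısı, $\mathrm{df}(t)$ ise $t$ terimini içeren doküman sayısıdır.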

In [24]:
def computeIDF(documents):
    """Compute the inverse document frequency of every term.

    Args:
        documents: list of dicts, one per document, each mapping
            term -> number of occurrences of that term in the document.

    Returns:
        dict mapping every term that appears as a key in any of the dicts to
        ln(N / df), where N is the number of documents and df is the number of
        documents whose count for the term is greater than zero. A term with
        df == 0 gets 0.0 instead of raising on the division. An empty
        ``documents`` list yields an empty dict.
    """
    import math

    N = len(documents)

    # Document frequency per term. Keys are collected from every document,
    # not only the first one, so differing key sets no longer raise KeyError.
    idfDict = {}
    for document in documents:
        for word, val in document.items():
            idfDict[word] = idfDict.get(word, 0) + (1 if val > 0 else 0)

    # Turn each document frequency into its IDF value in place.
    for word, val in idfDict.items():
        idfDict[word] = math.log(N / float(val)) if val > 0 else 0.0
    return idfDict

In [25]:
# IDF values computed over the per-document word-count dicts of both documents.
idfs = computeIDF(documents=[numOfWordsA, numOfWordsB])

* Sonra TF ve IDF değerlerini çarparak birleştirdik: $\text{tf-idf}(t, d) = \mathrm{tf}(t, d) \cdot \mathrm{idf}(t)$.

In [26]:
def computeTFIDF(tfBagOfWords, idfs):
    """Combine term frequencies with IDF values.

    Args:
        tfBagOfWords: dict mapping word -> term frequency in one document.
        idfs: dict mapping word -> inverse document frequency; it must contain
            every word of ``tfBagOfWords``.

    Returns:
        dict mapping each word of ``tfBagOfWords`` to tf * idf.
    """
    return {term: tf * idfs[term] for term, tf in tfBagOfWords.items()}

In [27]:
# TF-IDF per document, shown as a table with one row per document and one
# column per word.
tfidfA = computeTFIDF(tfBagOfWords=tfA, idfs=idfs)
tfidfB = computeTFIDF(tfBagOfWords=tfB, idfs=idfs)
df = pd.DataFrame(data=[tfidfA, tfidfB])
df

Unnamed: 0,fire,out,the,children,went,around,man,for,walk,sat,a
0,0.0,0.099021,0.0,0.0,0.099021,0.0,0.099021,0.099021,0.099021,0.0,0.099021
1,0.115525,0.0,0.0,0.115525,0.0,0.115525,0.0,0.0,0.0,0.115525,0.0


In [28]:
# Run the same two documents through TfidfVectorizer and show the weights
# as a table: one row per document, one column per feature name.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(raw_documents=[documentA, documentB])
feature_names = vectorizer.get_feature_names_out()
# Sparse result -> dense matrix -> plain nested lists for the DataFrame.
dense = vectors.todense()
denselist = dense.tolist()
df = pd.DataFrame(data=denselist, columns=feature_names)
df

Unnamed: 0,around,children,fire,for,man,out,sat,the,walk,went
0,0.0,0.0,0.0,0.42616,0.42616,0.42616,0.0,0.303216,0.42616,0.42616
1,0.407401,0.407401,0.407401,0.0,0.0,0.0,0.407401,0.579739,0.0,0.0
