In [1]:
!pip install nltk


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.1.2[0m[39;49m -> [0m[32;49m22.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
# Toy corpus: two short sentences used for the manual count-vector walkthrough.
text = ["I am a elementary school student", "And I am a boy"]

In [3]:
# Word tokenized sentence
from nltk import word_tokenize

# Split every sentence of the corpus into its list of word tokens.
text_tokenzied = list(map(word_tokenize, text))
print(text_tokenzied)

[['I', 'am', 'a', 'elementary', 'school', 'student'], ['And', 'I', 'am', 'a', 'boy']]


In [4]:
# Count the words
from collections import Counter

# Tally every token of every sentence into one corpus-wide counter.
vocab_counter = Counter(token for tokens in text_tokenzied for token in tokens)

print(vocab_counter)

Counter({'I': 2, 'am': 2, 'a': 2, 'elementary': 1, 'school': 1, 'student': 1, 'And': 1, 'boy': 1})


In [5]:
# Vocabulary: the distinct tokens, taken in the counter's key order.
vocab = list(vocab_counter)

print(vocab)

['I', 'am', 'a', 'elementary', 'school', 'student', 'And', 'boy']


In [6]:
# Count vector: replace each token with its corpus-wide frequency.
# Rows follow token order, so each row is as long as its sentence.
text_count_vector = [
    [vocab_counter[token] for token in tokens]
    for tokens in text_tokenzied
]

print(text_count_vector)

[[2, 2, 2, 1, 1, 1], [1, 2, 2, 2, 1]]


In [7]:
from sklearn.feature_extraction.text import CountVectorizer

# Same idea using scikit-learn: fit a CountVectorizer on a one-document corpus.
vector = CountVectorizer()
# NOTE: this rebinds `text` to a single string that holds both sentences.
text = ["I am a elementary school student. And I am a boy"]

# Dense document-term count matrix (one row per document in `text`).
vocab_counter_sklearn = vector.fit_transform(text).toarray()

# Per-term counts; column j belongs to the term whose index is j in `vocabulary_`.
print(vocab_counter_sklearn)

# Mapping from each learned term to its column index in the matrix above.
print('vocabulary :',vector.vocabulary_)

[[2 1 1 1 1 1]]
vocabulary : {'am': 0, 'elementary': 3, 'school': 4, 'student': 5, 'and': 1, 'boy': 2}


In [8]:
# Corpus for the TF-IDF walkthrough; "great" is repeated so one term has a count above 1.
text = ["I am a great great elementary school student", "And I am a boy"]

In [9]:
# Word tokenized sentence
from nltk import word_tokenize

# Tokenize each sentence of the current corpus into words.
text_tokenzied = list(map(word_tokenize, text))
print(text_tokenzied)

[['I', 'am', 'a', 'great', 'great', 'elementary', 'school', 'student'], ['And', 'I', 'am', 'a', 'boy']]


In [10]:
# Drop tokens that are too short: keep only those of length >= 2.
text_tokenzied2 = [
    [token for token in tokens if len(token) >= 2]
    for tokens in text_tokenzied
]

print(text_tokenzied2)

[['am', 'great', 'great', 'elementary', 'school', 'student'], ['And', 'am', 'boy']]


In [11]:
# Vocabulary list
from collections import Counter

# Corpus-wide token counts over the filtered sentences.
vocab_counter = Counter(token for tokens in text_tokenzied2 for token in tokens)

# The vocabulary is the counter's keys, in the counter's key order.
vocab = list(vocab_counter)

print(vocab)

['am', 'great', 'elementary', 'school', 'student', 'And', 'boy']


In [12]:
# Count word of each sentence
from collections import Counter

# Per-sentence term counts: one Counter per filtered sentence.
# Built without a scratch variable so the notebook-level `vocab_counter`
# (the corpus-wide counts) is not overwritten when this cell runs.
count = [Counter(tokens) for tokens in text_tokenzied2]
print(count)

[Counter({'great': 2, 'am': 1, 'elementary': 1, 'school': 1, 'student': 1}), Counter({'And': 1, 'am': 1, 'boy': 1})]


In [13]:
def TF(vocab, counter):
    """Return the term-frequency vector of one document.

    Args:
        vocab: Iterable of terms; it fixes the order of the output.
        counter: Mapping from term to its count in the document
            (a ``collections.Counter`` or a plain ``dict``).

    Returns:
        A list with one entry per term in ``vocab``: the term's count in
        ``counter``, or 0 when the term is absent.
    """
    # .get() works for any mapping: no KeyError on a plain dict, and no
    # key is inserted as a side effect of the lookup.
    return [counter.get(word, 0) for word in vocab]

# Show the vocabulary order, then the TF vector of the first sentence in that order.
print(vocab)
print(TF(vocab, count[0]))

['am', 'great', 'elementary', 'school', 'student', 'And', 'boy']
[1, 2, 1, 1, 1, 0, 0]


In [14]:
def DF(text_tokenzied2, vocab):
    """Return the document frequency of every vocabulary term.

    Args:
        text_tokenzied2: Iterable of tokenized documents (each an iterable
            of tokens).
        vocab: Iterable of terms; it fixes the order of the output.

    Returns:
        A list with one entry per term in ``vocab``: the number of
        documents that contain the term at least once.
    """
    doc_freq = Counter()
    for tokens in text_tokenzied2:
        # A set collapses repeats, so each document counts a term once.
        doc_freq.update(set(tokens))

    return [doc_freq[term] for term in vocab]

# Show the vocabulary order, then how many sentences contain each term.
print(vocab)
print(DF(text_tokenzied2, vocab))

['am', 'great', 'elementary', 'school', 'student', 'And', 'boy']
[2, 1, 1, 1, 1, 1, 1]


In [15]:
import math

def IDF(df, n):
    """Return the inverse document frequency for each document frequency.

    Each weight is ``log(n / (df + 1)) + 1`` using the natural logarithm.

    Args:
        df: Iterable of document frequencies, one per term.
        n: Total number of documents in the corpus.

    Returns:
        A list of IDF weights in the same order as ``df``.
    """
    return [math.log(n / (doc_count + 1)) + 1 for doc_count in df]

# Show the vocabulary order, then the IDF weight of each term over the corpus.
print(vocab)
print(IDF(DF(text_tokenzied2, vocab), len(text_tokenzied2)))

['am', 'great', 'elementary', 'school', 'student', 'And', 'boy']
[0.5945348918918356, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]


In [19]:
def TFIDF(tf, idf):
    """Return the element-wise product of a TF vector and an IDF vector.

    Args:
        tf: Iterable of term frequencies.
        idf: Iterable of IDF weights, aligned with ``tf``.

    Returns:
        A list of ``tf[i] * idf[i]`` values; pairing stops at the shorter
        of the two inputs.
    """
    weighted = []
    for freq, weight in zip(tf, idf):
        weighted.append(freq * weight)
    return weighted

# TF-IDF of the first sentence: its TF vector weighted element-wise by the corpus IDF.
print(vocab)
print(TFIDF(TF(vocab, count[0]), IDF(DF(text_tokenzied2, vocab), len(text_tokenzied2))))

['am', 'great', 'elementary', 'school', 'student', 'And', 'boy']
[0.5945348918918356, 2.0, 1.0, 1.0, 1.0, 0.0, 0.0]


In [17]:
# The IDF weights depend only on the corpus, so compute them once
# instead of once per sentence inside the loop.
idf_weights = IDF(DF(text_tokenzied2, vocab), len(text_tokenzied2))

# One TF-IDF row per sentence, with columns ordered like `vocab`.
tfidf = [TFIDF(TF(vocab, sentence_count), idf_weights) for sentence_count in count]

print(vocab)
print(tfidf)

['am', 'great', 'elementary', 'school', 'student', 'And', 'boy']
[[0.5945348918918356, 2.0, 1.0, 1.0, 1.0, 0.0, 0.0], [0.5945348918918356, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0]]


In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit scikit-learn's TfidfVectorizer on `text` for comparison with the manual result.
tfidfv = TfidfVectorizer().fit(text)
# One row per sentence, one column per learned vocabulary term.
print(tfidfv.transform(text).toarray())
# Term -> column index mapping for the matrix printed above.
# NOTE(review): the printed weights differ from the hand-rolled values; presumably
# due to sklearn's default idf smoothing and L2 row normalization — confirm in the docs.
print(tfidfv.vocabulary_)

[[0.25969799 0.         0.         0.36499647 0.72999294 0.36499647
  0.36499647]
 [0.44943642 0.6316672  0.6316672  0.         0.         0.
  0.        ]]
{'am': 0, 'great': 4, 'elementary': 3, 'school': 5, 'student': 6, 'and': 1, 'boy': 2}
