# Manual Method


In [1]:
with open('../DATA/One.txt') as mytext:
    first_txt = mytext.read().lower().split()
    uni_words_one = set(first_txt)

In [2]:
uni_words_one

{'a',
 'about',
 'animals',
 'are',
 'canine',
 'dogs',
 'furry',
 'is',
 'our',
 'pets',
 'story',
 'this'}

In [3]:
with open('../DATA/Two.txt') as text2:
    second_txt = text2.read().lower().split()
    uni_words_two = set(second_txt)

In [4]:
uni_words_two

{'a',
 'about',
 'catching',
 'fun',
 'is',
 'popular',
 'sport',
 'story',
 'surfing',
 'this',
 'water',
 'waves'}

In [5]:
all_uni_words = set()
all_uni_words.update(uni_words_one)
all_uni_words.update(uni_words_two)

In [6]:
all_uni_words

{'a',
 'about',
 'animals',
 'are',
 'canine',
 'catching',
 'dogs',
 'fun',
 'furry',
 'is',
 'our',
 'pets',
 'popular',
 'sport',
 'story',
 'surfing',
 'this',
 'water',
 'waves'}

In [7]:
full_vocab = dict()
i = 0

for word in all_uni_words:
    full_vocab[word] = i
    i += 1

In [8]:
full_vocab

{'is': 0,
 'story': 1,
 'waves': 2,
 'a': 3,
 'sport': 4,
 'furry': 5,
 'fun': 6,
 'are': 7,
 'water': 8,
 'about': 9,
 'our': 10,
 'canine': 11,
 'surfing': 12,
 'pets': 13,
 'dogs': 14,
 'animals': 15,
 'popular': 16,
 'catching': 17,
 'this': 18}

In [9]:
one_freq = [0] * len(full_vocab)
two_freq = [0] * len(full_vocab)
all_words = [''] * len(full_vocab)

In [10]:
with open('../DATA/One.txt') as f:
    one_text = f.read().lower().split()

In [11]:
for word in one_text:
    word_ind = full_vocab[word]
    one_freq[word_ind] += 1

In [12]:
one_freq

[1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 2, 1, 0, 0, 1]

In [13]:
with open('../DATA/Two.txt') as f:
    two_text = f.read().lower().split()

In [14]:
for word in two_text:
    word_ind = full_vocab[word]
    two_freq[word_ind] += 1

In [15]:
two_freq

[3, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 2, 0, 0, 0, 1, 1, 1]

In [16]:
for word in full_vocab:
    index = full_vocab[word]
    all_words[index] = word

In [17]:
all_words

['is',
 'story',
 'waves',
 'a',
 'sport',
 'furry',
 'fun',
 'are',
 'water',
 'about',
 'our',
 'canine',
 'surfing',
 'pets',
 'dogs',
 'animals',
 'popular',
 'catching',
 'this']

In [18]:
import pandas as pd
vec = pd.DataFrame(data= [one_freq, two_freq], columns= all_words)

In [19]:
vec

Unnamed: 0,is,story,waves,a,sport,furry,fun,are,water,about,our,canine,surfing,pets,dogs,animals,popular,catching,this
0,1,1,0,1,0,1,0,1,0,1,1,1,0,1,2,1,0,0,1
1,3,1,1,1,1,0,1,0,1,1,0,0,2,0,0,0,1,1,1


# Scikit-Learn

In [20]:
text = ['This is a line',
        'This is another line',
        'Completely different line']

In [31]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [33]:
cv = CountVectorizer()

In [34]:
matrix = cv.fit_transform(text)

In [35]:
matrix.todense()

matrix([[0, 0, 0, 1, 1, 1],
        [1, 0, 0, 1, 1, 1],
        [0, 1, 1, 0, 1, 0]])

In [36]:
cv.vocabulary_

{'this': 5, 'is': 3, 'line': 4, 'another': 0, 'completely': 1, 'different': 2}

In [38]:
tfidf = TfidfTransformer()

In [40]:
matrix2 =tfidf.fit_transform(matrix)

In [41]:
matrix2.todense()

matrix([[0.        , 0.        , 0.        , 0.61980538, 0.48133417,
         0.61980538],
        [0.63174505, 0.        , 0.        , 0.4804584 , 0.37311881,
         0.4804584 ],
        [0.        , 0.65249088, 0.65249088, 0.        , 0.38537163,
         0.        ]])

In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [49]:
tv = TfidfVectorizer()

In [50]:
tv_matrix = tv.fit_transform(text)

In [51]:
tv_matrix

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 10 stored elements and shape (3, 6)>

In [52]:
tv_matrix.todense()

matrix([[0.        , 0.        , 0.        , 0.61980538, 0.48133417,
         0.61980538],
        [0.63174505, 0.        , 0.        , 0.4804584 , 0.37311881,
         0.4804584 ],
        [0.        , 0.65249088, 0.65249088, 0.        , 0.38537163,
         0.        ]])

In [53]:
tv.vocabulary_

{'this': 5, 'is': 3, 'line': 4, 'another': 0, 'completely': 1, 'different': 2}