# NLP intro
- N = Natural
- L = Language
- P = Processing

In [43]:
import numpy as np

review1 = "I LOVE this book about love"
review2 = "No this book was okay"

# Tokenise each review: lower-case it, then split on whitespace.
# The result is one list of words per review.
all_words = [review.lower().split() for review in (review1, review2)]
print(all_words)

## Flatten the per-review lists into one list by adding them together with
## the plus operator (a nested list comprehension would work as well).
all_words = sum(all_words, [])
print(f"Flattend all words {all_words}")

# A set keeps a single copy of every word.
unique_words = {*all_words}
print(f"Unique words {unique_words}")


[['i', 'love', 'this', 'book', 'about', 'love'], ['no', 'this', 'book', 'was', 'okay']]
Flattend all words ['i', 'love', 'this', 'book', 'about', 'love', 'no', 'this', 'book', 'was', 'okay']
Unique words {'was', 'okay', 'no', 'book', 'love', 'this', 'about', 'i'}


In [62]:
# Map every distinct word to its own position (0, 1, 2, ...) in the
# term-frequency vectors.
# NOTE(review): unique_words is a set, so which index a word receives follows
# set iteration order; for strings that order can change between interpreter
# sessions, so the mapping printed here may differ after a kernel restart.
vocabulary = dict(zip(unique_words, range(len(unique_words))))
print(vocabulary)

def term_frequency_voctorizer(document, vocabulary):
    """Count how often each vocabulary word occurs in ``document``.

    The document is lower-cased and split on whitespace; every resulting
    token that is a key of ``vocabulary`` increments the count stored at
    that word's index. Tokens that are not in ``vocabulary`` are skipped
    instead of raising ``KeyError``, so text containing unseen words can
    still be vectorised.

    Parameters
    ----------
    document : str
        Raw text to vectorise.
    vocabulary : dict
        Maps each known word to its position in the output vector. Because
        the document is lower-cased before lookup, only lower-case keys can
        ever match. Positions are expected to lie in
        ``range(len(vocabulary))``.

    Returns
    -------
    numpy.ndarray
        Float array of length ``len(vocabulary)`` holding the raw count of
        each vocabulary word.
    """
    term_frequency = np.zeros(len(vocabulary))

    for word in document.lower().split():
        index = vocabulary.get(word)
        if index is None:
            # Out-of-vocabulary token: there is no slot to count it in.
            continue
        term_frequency[index] += 1

    return term_frequency

# Turn both reviews into count vectors over the shared vocabulary.
review1_term_freq, review2_term_freq = (
    term_frequency_voctorizer(review, vocabulary) for review in (review1, review2)
)

# One vector per line.
print(review1_term_freq, review2_term_freq, sep="\n")

{'was': 0, 'okay': 1, 'no': 2, 'book': 3, 'love': 4, 'this': 5, 'about': 6, 'i': 7}
[0. 0. 0. 1. 2. 1. 1. 1.]
[1. 1. 1. 1. 0. 1. 0. 0.]


In [66]:
import pandas as pd

# Bag of words as a table: one row per review, one column per vocabulary word.
# The count vectors hold floats, so the frame is cast to a small integer type.
bag_of_words = pd.DataFrame(
    data=[review1_term_freq, review2_term_freq],
    columns=vocabulary.keys(),
    dtype="int16",
)

bag_of_words

Unnamed: 0,was,okay,no,book,love,this,about,i
0,0,0,0,1,2,1,1,1
1,1,1,1,1,0,1,0,0


## Feature Extraction with sklearn

In [71]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer,TfidfVectorizer

# CountVectorizer learns the vocabulary and builds the (sparse) count matrix
# in a single fit_transform call.
count_vectorizer = CountVectorizer()
bag_of_words_sparse = count_vectorizer.fit_transform([review1, review2])

# get_feature_names() was removed from recent scikit-learn releases in favour
# of get_feature_names_out(); use whichever the installed version provides.
# list(...) keeps the displayed value a plain list of strings either way.
bag_of_words_sparse.todense(), list(
    count_vectorizer.get_feature_names_out()
    if hasattr(count_vectorizer, "get_feature_names_out")
    else count_vectorizer.get_feature_names()
)

(matrix([[1, 1, 2, 0, 0, 1, 0],
         [0, 1, 0, 1, 1, 1, 1]], dtype=int64),
 ['about', 'book', 'love', 'no', 'okay', 'this', 'was'])

In [82]:
# Same bag-of-words table as before, this time built from the sklearn counts.
# toarray() yields a regular ndarray for pandas. The column labels come from
# get_feature_names_out() when the installed scikit-learn has it, because
# get_feature_names() was removed from recent releases.
bag_of_words = pd.DataFrame(
    bag_of_words_sparse.toarray(),
    columns=(
        count_vectorizer.get_feature_names_out()
        if hasattr(count_vectorizer, "get_feature_names_out")
        else count_vectorizer.get_feature_names()
    ),
)
bag_of_words



Unnamed: 0,about,book,love,no,okay,this,was
0,1,1,2,0,0,1,0
1,0,1,0,1,1,1,1


## TF-IDF

In [83]:
# TfidfTransformer works on an existing count matrix: first learn the IDF
# weights from the counts, then re-weight those same counts.
tfid_transformer = TfidfTransformer()
tfid_transformer.fit(bag_of_words_sparse)
tfid = tfid_transformer.transform(bag_of_words_sparse)
tfid

<2x7 sparse matrix of type '<class 'numpy.float64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [84]:
# Expand the sparse TF-IDF result into a dense matrix so every weight is shown.
tfid.todense()

matrix([[0.4078241 , 0.29017021, 0.81564821, 0.        , 0.        ,
         0.29017021, 0.        ],
        [0.        , 0.35520009, 0.        , 0.49922133, 0.49922133,
         0.35520009, 0.49922133]])

In [85]:
# TfidfVectorizer creates the TF-IDF matrix in one go: it takes the raw
# review text directly, with no separate counting step beforehand.
tfid_vectorizer = TfidfVectorizer()
tfid_vectorizer.fit_transform([review1, review2]).todense()

matrix([[0.4078241 , 0.29017021, 0.81564821, 0.        , 0.        ,
         0.29017021, 0.        ],
        [0.        , 0.35520009, 0.        , 0.49922133, 0.49922133,
         0.35520009, 0.49922133]])