# NLP Workflow

In [None]:
# Sample text used throughout the notebook: two short English sentences.
sentence = "I am a master student. I love natural language processing."

### Tokenization

In [None]:
# Install NLTK from inside the notebook (the leading "!" runs the line as a shell command).
!pip install nltk

In [None]:
import nltk
# Fetch the NLTK resources used below:
#   - 'punkt' / 'punkt_tab': Punkt sentence-tokenizer models needed by
#     sent_tokenize and word_tokenize. Recent NLTK releases load 'punkt_tab'
#     rather than the older pickled 'punkt' models, so both are downloaded to
#     keep the notebook working across NLTK versions.
#   - 'stopwords': the stop-word lists read by nltk.corpus.stopwords.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

In [None]:
from nltk.tokenize import sent_tokenize

# Split the sample text into sentences; each sentence is used as one
# document in the vectorization sections further down.
sentence = "I am a master student. I love natural language processing."
documents = sent_tokenize(sentence)
# Display the resulting list of sentences.
documents

In [None]:
from nltk.tokenize import word_tokenize
from string import punctuation

# Word-level tokenization: delete every punctuation character first, then let
# NLTK split the cleaned text into word tokens.
sentence = "I am a master student. I love natural language processing."
sentence = sentence.translate(str.maketrans('', '', punctuation))
word_token = word_tokenize(sentence)
# Display the token list.
word_token

### Filtering Stop Words

In [None]:
from nltk.corpus import stopwords

# English stop words, held in a set so each membership test is a hash lookup.
stop_words = set(stopwords.words('english'))
# Keep only the tokens that are not stop words. The comparison is
# case-sensitive.
# NOTE(review): capitalised tokens such as "I" are presumably kept because
# the NLTK list is lowercase -- confirm whether that is intended.
filtered_sentence = [token for token in word_token if token not in stop_words]
print(filtered_sentence)

In [None]:
# Vocabulary: the distinct tokens left after stop-word filtering. It supplies
# the keys of the per-document count dictionaries in the Bag of Words section.
unique_words = set(filtered_sentence)
# Display the vocabulary (a set, so the order shown is not meaningful).
unique_words

## Vectorization

In [None]:
import pandas as pd

In [None]:
def tokenization(doc):
    """Turn one sentence into a list of tokens with stop words removed.

    Punctuation characters are stripped, the text is split on whitespace and
    every token found in the module-level ``stop_words`` set is dropped. The
    stop-word comparison is case-sensitive.

    Args:
        doc: The sentence to tokenize, as a string.

    Returns:
        The remaining tokens as a list, in their original order.
    """
    doc = ''.join([c for c in doc if c not in punctuation])
    # split() with no argument collapses runs of whitespace (including tabs
    # and newlines), so no empty-string tokens are produced; split(' ')
    # yielded '' for every doubled, leading or trailing space.
    return [w for w in doc.split() if w not in stop_words]

In [None]:
# Tokenize each of the two sentences separately; every sentence is one document.
document1 = tokenization(documents[0])
document2 = tokenization(documents[1])

### Bag of Words

In [None]:
# Bag of words for the first sentence: start every vocabulary word at zero,
# then add one for each occurrence of the word in the document.
num_of_words1 = {term: 0 for term in unique_words}
for word in document1:
    num_of_words1[word] = num_of_words1[word] + 1

# Same counting for the second sentence.
num_of_words2 = {term: 0 for term in unique_words}
for word in document2:
    num_of_words2[word] = num_of_words2[word] + 1

In [None]:
# Show the counts as a table: one row per document, one column per vocabulary word.
pd.DataFrame([num_of_words1,num_of_words2])

### N-grams

In [None]:
def ngrams(doc, n):
    """Build the word n-grams of a tokenized document.

    Args:
        doc: Sequence of string tokens (must support slicing).
        n: Number of consecutive tokens in each n-gram.

    Returns:
        A list of strings, each made of ``n`` consecutive tokens joined by a
        single space, in document order. The list is empty when the document
        has fewer than ``n`` tokens.
    """
    # One copy of the token sequence per position in the window, each copy
    # starting one token later. Zipping the copies yields the sliding windows
    # and stops as soon as the shortest copy is exhausted.
    shifted = [doc[offset:] for offset in range(n)]
    grams = []
    for window in zip(*shifted):
        grams.append(" ".join(window))
    return grams

In [None]:
# Bigrams (2-grams): every pair of adjacent tokens, computed per document.
ngrams1 = ngrams(document1,2)
ngrams2 = ngrams(document2,2)

In [None]:
# Display the bigrams of the first sentence.
ngrams1

In [None]:
# Display the bigrams of the second sentence.
ngrams2

### TF-IDF
Tutorial: https://towardsdatascience.com/natural-language-processing-feature-engineering-using-tf-idf-e8b9d00e7e76

#### Term Frequency (TF)

In [None]:
def compute_TF(word_dict, bag_of_words):
    """Compute the term frequency of every vocabulary word in one document.

    Args:
        word_dict: Mapping from word to its number of occurrences in the
            document.
        bag_of_words: The document's tokens. Only the length is used, as the
            normaliser.

    Returns:
        A dict mapping each key of ``word_dict`` to
        ``count / len(bag_of_words)``. For an empty document every frequency
        is ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    words_count = len(bag_of_words)
    if words_count == 0:
        # A document with no tokens (e.g. made only of stop words) contains
        # no term at all, so every frequency is zero.
        return dict.fromkeys(word_dict, 0.0)
    total = float(words_count)
    return {word: count / total for word, count in word_dict.items()}

In [None]:
# Term frequencies of each sentence, normalised by that sentence's token count.
tf1 = compute_TF(num_of_words1, document1)
tf2 = compute_TF(num_of_words2, document2)

In [None]:
# Show the term frequencies as a table: one row per document, one column per word.
pd.DataFrame([tf1,tf2])

#### Inverse Document Frequency (IDF)

In [None]:
import math
def compute_IDF(documents):
    """Compute the inverse document frequency of every word.

    Args:
        documents: A list of dicts, one per document, each mapping a word to
            its number of occurrences in that document.

    Returns:
        A dict mapping each word to ``log(N / df)``, where ``N`` is the number
        of documents and ``df`` is the number of documents in which the word's
        count is greater than zero. Words that occur in no document get
        ``0.0`` (their term frequency is zero everywhere, so their TF-IDF is
        zero either way). An empty ``documents`` list yields an empty dict.
    """
    n = len(documents)
    # Document frequency per word. Keys are gathered from every document (not
    # only the first), so differing key sets no longer raise KeyError; the
    # insertion order still starts with the first document's keys.
    doc_freq = {}
    for doc in documents:
        for word, count in doc.items():
            doc_freq[word] = doc_freq.get(word, 0) + (1 if count > 0 else 0)
    # Guard against df == 0, which would otherwise divide by zero.
    return {
        word: math.log(n / float(freq)) if freq else 0.0
        for word, freq in doc_freq.items()
    }

In [None]:
# Inverse document frequencies over the two-sentence corpus.
idf = compute_IDF([num_of_words1, num_of_words2])

In [None]:
# Display the raw word -> IDF dictionary.
idf

In [None]:
# Show the IDF values as a one-row table (transposed so each word is a column).
pd.Series(idf).to_frame().T

#### Compute TF-IDF
TF-IDF = Term Frequency (TF) * Inverse Document Frequency (IDF)

In [None]:
def compute_TFIDF(tfs, idfs):
    """Combine term frequencies with inverse document frequencies.

    Args:
        tfs: Mapping from word to its term frequency in one document.
        idfs: Mapping from word to its inverse document frequency; it must
            contain every key of ``tfs``.

    Returns:
        A dict with the keys of ``tfs``, each mapped to ``tf * idf``.
    """
    return {term: freq * idfs[term] for term, freq in tfs.items()}

In [None]:
# TF-IDF weights of each sentence: its term frequencies scaled by the corpus-wide IDF.
tfidf1 = compute_TFIDF(tf1, idf)
tfidf2 = compute_TFIDF(tf2, idf)

In [None]:
# Show the TF-IDF weights as a table: one row per document, one column per word.
pd.DataFrame([tfidf1,tfidf2])