### NLP Representations Exercise

In [1]:
# import pandas
import pandas as pd

# import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
 
# import TfidfTransformer 
from sklearn.feature_extraction.text import TfidfTransformer

# import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Toy corpus of five short sentences; each string is treated as one document.
docs = [
    "the house had a tiny little mouse",
    "the cat saw the mouse",
    "the mouse ran away from the house",
    "the cat finally ate the mouse",
    "the end of the mouse story",
]

# BagOfWords

* instantiate [CountVectorizer()](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html)

In [3]:
# Bag-of-words counter with default settings; with these defaults one-letter
# tokens such as "a" do not make it into the learned vocabulary.
cv = CountVectorizer()

* apply the fit_transform method of CountVectorizer to 'docs' and store the result in 'word_count_vector'

In [4]:
# Learn the vocabulary from `docs` and count each term per document.
word_count_vector = cv.fit_transform(docs)

* print the shape 

In [5]:
# (number of documents, number of distinct vocabulary terms)
word_count_vector.shape

(5, 16)

* create dataframe from word_count_vector

In [9]:
# Build the document-term count table in one step: densify the count matrix
# and label every column with the vocabulary term that CountVectorizer
# assigned to it, so the frame is never left with anonymous integer columns.
df_wordCount = pd.DataFrame(
    word_count_vector.toarray(),
    columns=cv.get_feature_names_out(),
)

In [26]:
# Label each count column with its vocabulary term.
df_wordCount.columns = cv.get_feature_names_out()

In [27]:
# head(10) exceeds the corpus size, so this shows all five document rows.
df_wordCount.head(10)

Unnamed: 0,ate,away,cat,end,finally,from,had,house,little,mouse,of,ran,saw,story,the,tiny
0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,1,1
1,0,0,1,0,0,0,0,0,0,1,0,0,1,0,2,0
2,0,1,0,0,0,1,0,1,0,1,0,1,0,0,2,0
3,1,0,1,0,1,0,0,0,0,1,0,0,0,0,2,0
4,0,0,0,1,0,0,0,0,0,1,1,0,0,1,2,0


# TfIdf with TfidfTransformer

* instantiate [TfidfTransformer()](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html) with the following parameters: 
    * smooth_idf = True
    * use_idf = True

In [28]:
# TfidfTransformer does not read raw text: it takes an existing term-count
# matrix and re-weights it into TF-IDF scores.
tf_transf = TfidfTransformer(smooth_idf=True, use_idf=True)

* use fit_transform method of tfidf transformer on 'word_count_vector' created above and store the result in 'tf_idf_data'

In [29]:
# Learn the IDF weights from the counts and convert them to TF-IDF in one call.
tf_idf_data = tf_transf.fit_transform(word_count_vector)

* create dataframe from 'tf_idf_data'

In [31]:
# Densify the TF-IDF matrix and label its columns up front. The names are
# taken from `cv` because `tf_transf` was fitted on an unlabeled count matrix
# and only knows positional placeholders (x0, x1, ...); column order matches
# the count matrix that `cv` produced.
tf_idf_df = pd.DataFrame(
    tf_idf_data.toarray(),
    columns=cv.get_feature_names_out(),
)

In [36]:
# The transformer was fitted on a bare count matrix, so it has no vocabulary of
# its own and reports positional placeholder names; the real terms live in `cv`.
tf_transf.get_feature_names_out()

array(['x0', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10',
       'x11', 'x12', 'x13', 'x14', 'x15'], dtype=object)

In [39]:
# Borrow the term names from `cv`; the column order matches the count matrix
# the transformer was fitted on.
tf_idf_df.columns = cv.get_feature_names_out()

* print the IDF for words in 'docs'

In [40]:
# NOTE(review): this displays the TF-IDF scores per document, not the per-term
# IDF weights the prompt asks for; those are exposed on the fitted transformer
# as `tf_transf.idf_`.
tf_idf_df.head()

Unnamed: 0,ate,away,cat,end,finally,from,had,house,little,mouse,of,ran,saw,story,the,tiny
0,0.0,0.0,0.0,0.0,0.0,0.0,0.493562,0.398203,0.493562,0.235185,0.0,0.0,0.0,0.0,0.235185,0.493562
1,0.0,0.0,0.483344,0.0,0.0,0.0,0.0,0.0,0.0,0.285471,0.0,0.0,0.599092,0.0,0.570941,0.0
2,0.0,0.457093,0.0,0.0,0.0,0.457093,0.0,0.36878,0.0,0.217807,0.0,0.457093,0.0,0.0,0.435614,0.0
3,0.513923,0.0,0.41463,0.0,0.513923,0.0,0.0,0.0,0.0,0.244887,0.0,0.0,0.0,0.0,0.489774,0.0
4,0.0,0.0,0.0,0.491753,0.0,0.0,0.0,0.0,0.0,0.234323,0.491753,0.0,0.0,0.491753,0.468646,0.0


# TfIdf with TfidfVectorizer

* instantiate [TfidfVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html) with the following parameters:
    - use_idf = True

In [67]:
# Unlike TfidfTransformer, TfidfVectorizer starts from raw text documents:
# it builds the vocabulary, counts terms and applies TF-IDF weighting itself.
tf_vectorizer = TfidfVectorizer(use_idf=True)

* fit and transform 'docs' with TfidfVectorizer and store the result in 'tfidf_vectorizer_data'

In [68]:
# Learn the vocabulary and the IDF weights from the raw documents.
tf_vectorizer.fit(docs)

In [69]:
# Encode the same documents as TF-IDF rows using the fitted vocabulary and weights.
tfidf_vectorizer_data = tf_vectorizer.transform(docs)

In [70]:
# Mapping from each learned term to its column index in the TF-IDF matrix.
tf_vectorizer.vocabulary_

{'the': 14,
 'house': 7,
 'had': 6,
 'tiny': 15,
 'little': 8,
 'mouse': 9,
 'cat': 2,
 'saw': 12,
 'ran': 11,
 'away': 1,
 'from': 5,
 'finally': 4,
 'ate': 0,
 'end': 3,
 'of': 10,
 'story': 13}

In [71]:
# One IDF weight per vocabulary term, in column-index order. 'mouse' and 'the'
# occur in every document and therefore receive the lowest weight.
tf_vectorizer.idf_

array([2.09861229, 2.09861229, 1.69314718, 2.09861229, 2.09861229,
       2.09861229, 2.09861229, 1.69314718, 2.09861229, 1.        ,
       2.09861229, 2.09861229, 2.09861229, 2.09861229, 1.        ,
       2.09861229])

In [73]:
# Vocabulary terms ordered by column index; used below as DataFrame column labels.
tf_vectorizer.get_feature_names_out()

array(['ate', 'away', 'cat', 'end', 'finally', 'from', 'had', 'house',
       'little', 'mouse', 'of', 'ran', 'saw', 'story', 'the', 'tiny'],
      dtype=object)

* create dataframe from tfidf_vectorizer_data

In [75]:
# Densify the TF-IDF matrix into a DataFrame; the columns are positional
# integers at this point because no labels are supplied.
df_tf_idf = pd.DataFrame(tfidf_vectorizer_data.toarray())

* print 'df_tf_idf'

In [76]:
# Preview with the default integer column labels (before term names are attached).
df_tf_idf.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,0.0,0.0,0.0,0.0,0.0,0.0,0.493562,0.398203,0.493562,0.235185,0.0,0.0,0.0,0.0,0.235185,0.493562
1,0.0,0.0,0.483344,0.0,0.0,0.0,0.0,0.0,0.0,0.285471,0.0,0.0,0.599092,0.0,0.570941,0.0
2,0.0,0.457093,0.0,0.0,0.0,0.457093,0.0,0.36878,0.0,0.217807,0.0,0.457093,0.0,0.0,0.435614,0.0
3,0.513923,0.0,0.41463,0.0,0.513923,0.0,0.0,0.0,0.0,0.244887,0.0,0.0,0.0,0.0,0.489774,0.0
4,0.0,0.0,0.0,0.491753,0.0,0.0,0.0,0.0,0.0,0.234323,0.491753,0.0,0.0,0.491753,0.468646,0.0


In [77]:
# Swap the positional column labels for the vocabulary terms they stand for.
df_tf_idf.columns = list(tf_vectorizer.get_feature_names_out())

* print IDF for words in 'docs'

In [78]:
# NOTE(review): this displays the TF-IDF scores per document, not the per-term
# IDF weights the prompt asks for; those are held in `tf_vectorizer.idf_`.
df_tf_idf.head()

Unnamed: 0,ate,away,cat,end,finally,from,had,house,little,mouse,of,ran,saw,story,the,tiny
0,0.0,0.0,0.0,0.0,0.0,0.0,0.493562,0.398203,0.493562,0.235185,0.0,0.0,0.0,0.0,0.235185,0.493562
1,0.0,0.0,0.483344,0.0,0.0,0.0,0.0,0.0,0.0,0.285471,0.0,0.0,0.599092,0.0,0.570941,0.0
2,0.0,0.457093,0.0,0.0,0.0,0.457093,0.0,0.36878,0.0,0.217807,0.0,0.457093,0.0,0.0,0.435614,0.0
3,0.513923,0.0,0.41463,0.0,0.513923,0.0,0.0,0.0,0.0,0.244887,0.0,0.0,0.0,0.0,0.489774,0.0
4,0.0,0.0,0.0,0.491753,0.0,0.0,0.0,0.0,0.0,0.234323,0.491753,0.0,0.0,0.491753,0.468646,0.0
