#  Vectorizing Raw Data
This is the process of converting text data into numerical data. This is done using the following techniques:
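* **CountVectorizer** : Converts a collection of text documents to a matrix of token counts (one column per distinct word).
* **TfidfVectorizer** : Converts a collection of raw documents to a matrix of TF-IDF features, which down-weight words that occur in many documents.
* **N-grams vectorization** : Converts a collection of text documents to a matrix of n-gram counts, where an n-gram is a sequence of n adjacent words (e.g. bigrams for n = 2). It uses `CountVectorizer` with the `ngram_range` parameter.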

In [1]:
import pandas as pd

In [2]:
# Load the Twitter CSV (relative path) into a pandas DataFrame.
# NOTE(review): `tweets` is not referenced by any cell shown in this section — confirm it is used elsewhere.
tweets = pd.read_csv('../assets/twitter.csv')

In [24]:
# Toy corpus of four short sentences; every vectorizer below is fitted on it.
sample_data = [
    "This is the first paper.",
    "This document is the second paper.",
    "And this is the third one.",
    "Is this the first paper?",
]

## Data Cleaning

## CountVectorizer

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

In [4]:
# Token-count vectorizer created with its default settings (no arguments passed).
count_vectorizer = CountVectorizer()

In [12]:
# Learn the vocabulary from sample_data and encode each sentence as a row of token counts.
x1 = count_vectorizer.fit_transform(sample_data)

In [14]:
# Dense view for display: one row per sentence, one column per vocabulary term.
x1.toarray()

array([[0, 0, 1, 1, 0, 1, 0, 1, 0, 1],
       [0, 1, 0, 1, 0, 1, 1, 1, 0, 1],
       [1, 0, 0, 1, 1, 0, 0, 1, 1, 1],
       [0, 0, 1, 1, 0, 1, 0, 1, 0, 1]])

In [30]:
# Learned vocabulary, in matrix-column order; as the output shows, tokens are lowercased and punctuation is dropped.
count_vectorizer.get_feature_names_out()

array(['and', 'document', 'first', 'is', 'one', 'paper', 'second', 'the',
       'third', 'this'], dtype=object)

In [32]:
# Vocabulary size, i.e. the number of columns in the count matrix.
len(count_vectorizer.get_feature_names_out())

10

## TfidfVectorizer

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [69]:
# TF-IDF vectorizer created with its default settings (no arguments passed).
tfidf_vectorizer = TfidfVectorizer()

In [70]:
# Learn the vocabulary and IDF weights from sample_data, then encode each sentence as a row of TF-IDF scores.
x2 = tfidf_vectorizer.fit_transform(sample_data)

In [71]:
# Dense view for display. Words present in every sentence ('is', 'the', 'this') receive the
# smallest non-zero weights, while rarer words such as 'first' or 'document' score higher.
x2.toarray()

array([[0.        , 0.        , 0.58028582, 0.38408524, 0.        ,
        0.46979139, 0.        , 0.38408524, 0.        , 0.38408524],
       [0.        , 0.55690079, 0.        , 0.29061394, 0.        ,
        0.35546256, 0.55690079, 0.29061394, 0.        , 0.29061394],
       [0.51184851, 0.        , 0.        , 0.26710379, 0.51184851,
        0.        , 0.        , 0.26710379, 0.51184851, 0.26710379],
       [0.        , 0.        , 0.58028582, 0.38408524, 0.        ,
        0.46979139, 0.        , 0.38408524, 0.        , 0.38408524]])

In [29]:
# Learned vocabulary, in matrix-column order; per the outputs it matches the CountVectorizer vocabulary.
tfidf_vectorizer.get_feature_names_out()

array(['and', 'document', 'first', 'is', 'one', 'paper', 'second', 'the',
       'third', 'this'], dtype=object)

In [33]:
# Vocabulary size, i.e. the number of columns in the TF-IDF matrix.
len(tfidf_vectorizer.get_feature_names_out())

10

## N-grams vectorization

In [34]:
from sklearn.feature_extraction.text import CountVectorizer

In [55]:
# ngram_range=(2, 2) sets both the minimum and maximum n to 2, so only bigrams
# (pairs of adjacent words) become features; single words are not counted.
n_gram_vectorizer = CountVectorizer(ngram_range=(2, 2))

In [56]:
# Learn the bigram vocabulary from sample_data and encode each sentence as a row of bigram counts.
x3 = n_gram_vectorizer.fit_transform(sample_data)

In [57]:
# Dense view for display: one row per sentence, one column per bigram.
x3.toarray()

array([[0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0],
       [0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0],
       [1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0],
       [0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1]])

In [58]:
# Learned bigrams, in matrix-column order.
n_gram_vectorizer.get_feature_names_out()

array(['and this', 'document is', 'first paper', 'is the', 'is this',
       'second paper', 'the first', 'the second', 'the third',
       'third one', 'this document', 'this is', 'this the'], dtype=object)

In [59]:
# Number of distinct bigrams found in sample_data.
len(n_gram_vectorizer.get_feature_names_out())

13