In [6]:
import pandas as pd
import re

## Bag of words

In [7]:
messages = "tonigh I will call you, so can you please call me a cab ?, but not forget, please call me"

In [10]:
message = re.sub(r'[^\w\s]','',messages)

In [11]:
print(message)

tonigh I will call you so can you please call me a cab  but not forget please call me


In [12]:
documento = message.split(' ')

In [14]:
print(documento)

['tonigh', 'I', 'will', 'call', 'you', 'so', 'can', 'you', 'please', 'call', 'me', 'a', 'cab', '', 'but', 'not', 'forget', 'please', 'call', 'me']


Process of converting free text to structured data is called as Text Vectorization. 

In [17]:
# instantiate CountVectorizer (vectorizer)
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()
vect.fit(documento)
vect.get_feature_names()  #unique words from text

['but',
 'cab',
 'call',
 'can',
 'forget',
 'me',
 'not',
 'please',
 'so',
 'tonigh',
 'will',
 'you']

## Transform message
   (Bag of Words)

In [22]:
documento_transformed = vect.transform(documento)
print(vect.get_feature_names())
documento_transformed.toarray()

['but', 'cab', 'call', 'can', 'forget', 'me', 'not', 'please', 'so', 'tonigh', 'will', 'you']


array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]], dtype=int64)

In [24]:
data = pd.DataFrame(documento_transformed.toarray())
data.columns = vect.get_feature_names()
print(documento)
data.head(20)

['tonigh', 'I', 'will', 'call', 'you', 'so', 'can', 'you', 'please', 'call', 'me', 'a', 'cab', '', 'but', 'not', 'forget', 'please', 'call', 'me']


Unnamed: 0,but,cab,call,can,forget,me,not,please,so,tonigh,will,you
0,0,0,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,1,0
3,0,0,1,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,1
5,0,0,0,0,0,0,0,0,1,0,0,0
6,0,0,0,1,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0,1
8,0,0,0,0,0,0,0,1,0,0,0,0
9,0,0,1,0,0,0,0,0,0,0,0,0


## TF-IDF : Term Frequency inverse document Frequency 

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfvect = TfidfVectorizer()
trans = tfvect.fit_transform(data)
pd.DataFrame(trans.toarray(), columns = tfvect.get_feature_names())

Unnamed: 0,but,cab,call,can,forget,me,not,outcome,please,so,tonigh,will,you
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
