In [1]:
import nltk
# Read the raw text inside a context manager so the file handle is closed
# as soon as its contents are loaded (the original left the file open).
with open('data.txt') as data_file:
    raw_text = data_file.read()

# Split the text into word-level tokens.
# NOTE(review): nltk.word_tokenize needs NLTK's tokenizer data to be
# downloaded beforehand — confirm the environment has it installed.
tok_txt = nltk.word_tokenize(raw_text)
tok_txt

['The',
 'product',
 'was',
 'good',
 'but',
 'the',
 'delivery',
 'package',
 'has',
 'scratches']

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus: every string in the list is treated as one document.
txt = [
    'The product was good',
    'but the delivery package has scratches',
]

# Create a bag-of-words vectorizer and learn the vocabulary from the corpus.
# The fit call is kept as the cell's last expression so its repr is displayed.
Vec = CountVectorizer()
Vec.fit(txt)


CountVectorizer()

In [3]:
# List the learned vocabulary in column order. Punctuation (commas,
# exclamation marks, etc.) is excluded and a repeated word appears only once,
# so this is not the same as the raw token list.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# use `get_feature_names_out()` when available and fall back on older installs.
if hasattr(Vec, 'get_feature_names_out'):
    # Returns a NumPy array; convert to a plain list to keep the original display.
    feature_names = Vec.get_feature_names_out().tolist()
else:
    feature_names = Vec.get_feature_names()
feature_names


['but',
 'delivery',
 'good',
 'has',
 'package',
 'product',
 'scratches',
 'the',
 'was']

In [4]:
# Mapping from each word to its column index in the vectorized matrix
# (an index position, not a weight) — this is how the computer addresses each word
Vec.vocabulary_


{'the': 7,
 'product': 5,
 'was': 8,
 'good': 2,
 'but': 0,
 'delivery': 1,
 'package': 4,
 'has': 3,
 'scratches': 6}

In [5]:
# Encode each sentence as a row of numbers (one column per vocabulary word) so the computer can work with it
X = Vec.transform(txt)

# Show the transformed sentences as a dense array
X.toarray()

# The sentences in 'txt' were as follows:
      # 'The product was good', 
      # 'but the delivery package has scratches'
# So, in the array below, the first row is:
      # [0, 0, 1, 0, 0, 1, 0, 1, 1]
# Compared with the vectorizer's feature names, each entry shows whether that word occurs in the sentence:
      # ['but', 'delivery', 'good', 'has', 'package', 'product', 'scratches', 'the', 'was']
      #    no       no        yes     no       no        yes          no        yes    yes
      #                      good                      product                 the     was
# The same goes for the second row below: [1, 1, 0, 1, 1, 0, 1, 1, 0]


array([[0, 0, 1, 0, 0, 1, 0, 1, 1],
       [1, 1, 0, 1, 1, 0, 1, 1, 0]], dtype=int64)

In [6]:
# TF  - Term Frequency: how many times a term is used in a document
# IDF - Inverse Document Frequency: down-weights terms that appear in many
#       documents (a rarity weight, not simply the opposite of TF)

from sklearn.feature_extraction.text import TfidfTransformer

# Fit the TF-IDF transformer on the count matrix X. This only re-weights the
# counts; it does not itself divide text into 'good' and/or 'bad' — presumably
# that is a later classification step (not shown here).
TfIdf = TfidfTransformer()
TfIdf.fit(X)


TfidfTransformer()

In [7]:
# Apply the fitted transformer to the count matrix to get the TF-IDF weighted matrix
tfidf_txt = TfIdf.transform(X)

In [9]:
# Inspect the count matrix produced by the vectorizer (displayed as a sparse-matrix summary)
X

<2x9 sparse matrix of type '<class 'numpy.int64'>'
	with 10 stored elements in Compressed Sparse Row format>

In [8]:
# Inspect the TF-IDF matrix derived from X (displayed as a sparse-matrix summary)
tfidf_txt

<2x9 sparse matrix of type '<class 'numpy.float64'>'
	with 10 stored elements in Compressed Sparse Row format>