Text Analytics


1. Extract Sample document and apply following document preprocessing methods:
Tokenization, POS Tagging, stop words removal, Stemming and Lemmatization. 2. Create representation of documents by calculating Term Frequency and Inverse
DocumentFrequency

In [1]:
import nltk 
import pandas as pd 
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics.pairwise import cosine_similarity 

In [6]:
nltk.download() 

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [7]:
text = "I think Amazon is making a great effort in adding engaging content but I can’t get past the ugly interface. It’s not as intuitive as other competing streaming services and if it weren’t lumped in with my Prime membership, I wouldn’t pay for the stand-alone service."

In [9]:
# word tokenization 
tokens = nltk.word_tokenize(text.lower()) 
print(tokens)

sentence_token = nltk.sent_tokenize(text) 
print(sentence_token)

['i', 'think', 'amazon', 'is', 'making', 'a', 'great', 'effort', 'in', 'adding', 'engaging', 'content', 'but', 'i', 'can', '’', 't', 'get', 'past', 'the', 'ugly', 'interface', '.', 'it', '’', 's', 'not', 'as', 'intuitive', 'as', 'other', 'competing', 'streaming', 'services', 'and', 'if', 'it', 'weren', '’', 't', 'lumped', 'in', 'with', 'my', 'prime', 'membership', ',', 'i', 'wouldn', '’', 't', 'pay', 'for', 'the', 'stand-alone', 'service', '.']
['I think Amazon is making a great effort in adding engaging content but I can’t get past the ugly interface.', 'It’s not as intuitive as other competing streaming services and if it weren’t lumped in with my Prime membership, I wouldn’t pay for the stand-alone service.']


In [10]:
stopwords = nltk.corpus.stopwords.words('english') 
text_cleaned = [] 

for word in tokens: 
    if word not in stopwords: 
        text_cleaned.append(word)  
print(text_cleaned)

['think', 'amazon', 'making', 'great', 'effort', 'adding', 'engaging', 'content', '’', 'get', 'past', 'ugly', 'interface', '.', '’', 'intuitive', 'competing', 'streaming', 'services', '’', 'lumped', 'prime', 'membership', ',', '’', 'pay', 'stand-alone', 'service', '.']


In [12]:
# stemming 

stemmer = nltk.stem.PorterStemmer()
text_stemmed = []

for word in text_cleaned:
    text_stemmed.append(stemmer.stem(word)) 

print(text_stemmed)

['think', 'amazon', 'make', 'great', 'effort', 'ad', 'engag', 'content', '’', 'get', 'past', 'ugli', 'interfac', '.', '’', 'intuit', 'compet', 'stream', 'servic', '’', 'lump', 'prime', 'membership', ',', '’', 'pay', 'stand-alon', 'servic', '.']


In [13]:
pos_tag = nltk.pos_tag(text_cleaned) 

print(pos_tag)

[('think', 'VB'), ('amazon', 'NN'), ('making', 'VBG'), ('great', 'JJ'), ('effort', 'NN'), ('adding', 'VBG'), ('engaging', 'VBG'), ('content', 'NN'), ('’', 'NNP'), ('get', 'VB'), ('past', 'JJ'), ('ugly', 'RB'), ('interface', 'NN'), ('.', '.'), ('’', 'JJ'), ('intuitive', 'JJ'), ('competing', 'VBG'), ('streaming', 'VBG'), ('services', 'NNS'), ('’', 'NNP'), ('lumped', 'VBD'), ('prime', 'JJ'), ('membership', 'NN'), (',', ','), ('’', 'JJ'), ('pay', 'NN'), ('stand-alone', 'NN'), ('service', 'NN'), ('.', '.')]


In [23]:
# lemmatization 
lemmatizer = nltk.stem.WordNetLemmatizer() 
text_lemmatized = []

for word in text_cleaned:
    text_lemmatized.append(lemmatizer.lemmatize(word))
    
print(text_lemmatized)

['think', 'amazon', 'making', 'great', 'effort', 'adding', 'engaging', 'content', '’', 'get', 'past', 'ugly', 'interface', '.', '’', 'intuitive', 'competing', 'streaming', 'service', '’', 'lumped', 'prime', 'membership', ',', '’', 'pay', 'stand-alone', 'service', '.']


In [18]:
vector = TfidfVectorizer(analyzer='word', use_idf=True, smooth_idf=True) 

text = ["I think Amazon is making a great effort in adding engaging content but I can’t get past the ugly interface. It’s not as intuitive as other competing streaming services and if it weren’t lumped in with my Prime membership, I wouldn’t pay for the stand-alone service.",] 
out = vector.fit(text)
print(out.vocabulary_)


{'think': 34, 'amazon': 2, 'is': 18, 'making': 21, 'great': 13, 'effort': 9, 'in': 15, 'adding': 0, 'engaging': 10, 'content': 8, 'but': 5, 'can': 6, 'get': 12, 'past': 26, 'the': 33, 'ugly': 35, 'interface': 16, 'it': 19, 'not': 24, 'as': 4, 'intuitive': 17, 'other': 25, 'competing': 7, 'streaming': 32, 'services': 30, 'and': 3, 'if': 14, 'weren': 36, 'lumped': 20, 'with': 37, 'my': 23, 'prime': 28, 'membership': 22, 'wouldn': 38, 'pay': 27, 'for': 11, 'stand': 31, 'alone': 1, 'service': 29}


In [19]:
tfid_out = vector.fit_transform(text) 
print(tfid_out)

  (0, 29)	0.14002800840280097
  (0, 1)	0.14002800840280097
  (0, 31)	0.14002800840280097
  (0, 11)	0.14002800840280097
  (0, 27)	0.14002800840280097
  (0, 38)	0.14002800840280097
  (0, 22)	0.14002800840280097
  (0, 28)	0.14002800840280097
  (0, 23)	0.14002800840280097
  (0, 37)	0.14002800840280097
  (0, 20)	0.14002800840280097
  (0, 36)	0.14002800840280097
  (0, 14)	0.14002800840280097
  (0, 3)	0.14002800840280097
  (0, 30)	0.14002800840280097
  (0, 32)	0.14002800840280097
  (0, 7)	0.14002800840280097
  (0, 25)	0.14002800840280097
  (0, 17)	0.14002800840280097
  (0, 4)	0.28005601680560194
  (0, 24)	0.14002800840280097
  (0, 19)	0.28005601680560194
  (0, 16)	0.14002800840280097
  (0, 35)	0.14002800840280097
  (0, 33)	0.28005601680560194
  (0, 26)	0.14002800840280097
  (0, 12)	0.14002800840280097
  (0, 6)	0.14002800840280097
  (0, 5)	0.14002800840280097
  (0, 8)	0.14002800840280097
  (0, 10)	0.14002800840280097
  (0, 0)	0.14002800840280097
  (0, 15)	0.28005601680560194
  (0, 9)	0.1400280

In [21]:
print(vector.get_feature_names_out())

['adding' 'alone' 'amazon' 'and' 'as' 'but' 'can' 'competing' 'content'
 'effort' 'engaging' 'for' 'get' 'great' 'if' 'in' 'interface' 'intuitive'
 'is' 'it' 'lumped' 'making' 'membership' 'my' 'not' 'other' 'past' 'pay'
 'prime' 'service' 'services' 'stand' 'streaming' 'the' 'think' 'ugly'
 'weren' 'with' 'wouldn']


In [22]:
data = pd.DataFrame(tfid_out.todense(), columns=vector.get_feature_names_out()) 

print(data)

     adding     alone    amazon       and        as       but       can  \
0  0.140028  0.140028  0.140028  0.140028  0.280056  0.140028  0.140028   

   competing   content    effort  ...   service  services     stand  \
0   0.140028  0.140028  0.140028  ...  0.140028  0.140028  0.140028   

   streaming       the     think      ugly     weren      with    wouldn  
0   0.140028  0.280056  0.140028  0.140028  0.140028  0.140028  0.140028  

[1 rows x 39 columns]
