In [0]:
import pandas as pd
import numpy as np
import nltk
import spacy
from nltk.tokenize.toktok import ToktokTokenizer
import re
import gensim
from contractions import CONTRACTION_MAP # will only work if you have contractions.py file in same directory
from gensim.corpora import Dictionary
from gensim.models import LdaModel

In [0]:
import warnings
import os
# Suppress every Python warning for the rest of the session so cell output stays
# readable. Comment this line out when debugging library deprecations.
warnings.filterwarnings('ignore')  # Let's not pay heed to them right now
# Render matplotlib figures inside the notebook output.
%matplotlib inline

In [0]:
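# One-time setup: uncomment on a fresh environment to fetch the NLTK data used
# below (the stop-word corpus and the POS tagger model).
#nltk.download('all')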

In [6]:
# Load the review dataset from the working directory. low_memory=False makes
# pandas parse the file in one pass rather than in chunks.
# Judging by the preview below, `raw_text` holds the English review text and
# `review_text` a translated version of it.
data_raw = pd.read_csv('assignment.csv', low_memory=False)
data_raw.head()

Unnamed: 0,unique_id,raw_text,review_text
0,0,Spiritually and mentally inspiring! A book tha...,Menginspirasi secara spiritual dan mental! Buk...
1,1,This is one my must have books,Ini adalah salah satu yang harus saya miliki buku
2,2,It is a masterpiece of spirituality,Itu adalah mahakarya spiritualitas
3,3,"I'll be the first to admit, its literary qual...","Saya akan menjadi yang pertama mengakui, kuali..."
4,4,"It is rather simplistically written, but the ...","Ini ditulis agak sederhana, tetapi pesan di ba..."


# Text Pre-processing

In [0]:
# Shared NLP resources used by the helper functions below.
# NOTE(review): the 'en' shortcut name only resolves on spaCy 2.x; spaCy 3.x
# requires the full package name (e.g. 'en_core_web_sm') - confirm the pinned version.
nlp = spacy.load('en', parse=True, tag=True, entity=True)
tokenizer = ToktokTokenizer()
# Start from NLTK's English stop words but drop the negations 'no' and 'not'
# from the list, so they are kept in the text by remove_stopwords.
stopword_list = nltk.corpus.stopwords.words('english')
stopword_list.remove('no')
stopword_list.remove('not')

**Expanding Contractions**

In [30]:
# Expanding shortened forms, e.g. "I don't" -> "I do not".
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
  """Replace each contraction in ``text`` with its expansion from ``contraction_mapping``.

  Matching is case-insensitive; the first character of the matched text is kept
  as written so sentence-initial capitals survive. Any apostrophe still left
  after expansion is removed.

  Args:
    text: String to process.
    contraction_mapping: Dict mapping a contraction to its expanded form.

  Returns:
    The expanded string with all remaining apostrophes stripped.
  """
  # Regex alternation accepts the first alternative that matches, so longer keys
  # must come first; otherwise a short key that is a prefix of a longer one wins
  # and leaves the rest of the longer contraction behind.
  # Keys are escaped because they are literal text, not regex fragments.
  ordered_keys = sorted(contraction_mapping.keys(), key=len, reverse=True)
  contractions_pattern = re.compile('({})'.format('|'.join(re.escape(key) for key in ordered_keys)),
                                    flags=re.IGNORECASE|re.DOTALL)
  def expand_match(contraction):
    match = contraction.group(0)
    expanded_contraction = contraction_mapping.get(match) or contraction_mapping.get(match.lower())
    if not expanded_contraction:
      # No usable expansion (e.g. empty mapping, or a key whose casing matches
      # neither the text nor its lower-cased form): leave the text untouched.
      return match
    # Preserve the original casing of the first character.
    return match[0] + expanded_contraction[1:]

  expanded_text = contractions_pattern.sub(expand_match, text)
  expanded_text = re.sub("'", "", expanded_text)
  return expanded_text
# Quick check on a sentence mixing several contractions.
expand_contractions("I'm checking if contractions can or can't be fixed. Haven't I?")

'I am checking if contractions can or cannot be fixed. Have not I?'

**Removing Special Characters**

In [34]:
def remove_special_characters(text, remove_digits=False):
  """Delete every character that is not an ASCII letter, digit or whitespace.

  Args:
    text: String to clean.
    remove_digits: When True, digits are deleted as well.

  Returns:
    The cleaned string. Whitespace is left as-is, so removed characters can
    leave runs of consecutive spaces behind.
  """
  # The upper-case range must be 'A-Z'. 'A-z' also spans the ASCII characters
  # [ \ ] ^ _ ` that sit between 'Z' and 'a', which would then never be removed.
  pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
  return re.sub(pattern, '', text)
# Quick check: symbols are dropped, letters, digits and spaces are kept.
remove_special_characters("Check @! special characters gets removed #! 1234")

'Check  special characters gets removed  1234'

**Lemmatization**

In [43]:
def lemmatize_text(text):
  """Return ``text`` with every token replaced by its spaCy lemma, joined by spaces.

  Tokens whose lemma is the placeholder '-PRON-' keep their original surface
  form instead of the placeholder.
  """
  pieces = []
  for token in nlp(text):
    lemma = token.lemma_
    pieces.append(token.text if lemma == '-PRON-' else lemma)
  return " ".join(pieces)
# Quick check on a sentence with several inflected verbs.
lemmatize_text("I went to village, did farming and came back where I started")

'I go to village , do farm and come back where I start'

**Removing Stop Words**

In [44]:
def remove_stopwords(text, is_lower_case=False):
  """Drop every token of ``text`` that appears in the notebook's ``stopword_list``.

  Args:
    text: String to filter; it is tokenized with the shared ``tokenizer``.
    is_lower_case: True if ``text`` is already lower-cased, in which case tokens
      are compared as-is; otherwise each token is lower-cased for the comparison
      only (the kept token retains its original casing).

  Returns:
    The surviving tokens joined by single spaces.
  """
  kept = []
  for raw_token in tokenizer.tokenize(text):
    token = raw_token.strip()
    candidate = token if is_lower_case else token.lower()
    if candidate not in stopword_list:
      kept.append(token)
  return ' '.join(kept)
# Quick check: stop words go, punctuation tokens and the kept negation 'not' stay.
remove_stopwords("The, and, if are stopwords, strategy is not.")

', , stopwords , strategy not .'

### Combining all together - Creating a Text Normalizer

In [0]:
def normalize_corpus(corpus, contraction_expansion=True, text_lemmatization=True,
                     special_char_removal=True, stopword_removal=True):
  """Apply the enabled cleaning steps, in a fixed order, to one text string.

  Order of steps: contraction expansion, lemmatization, special-character
  removal, collapsing of repeated spaces (always applied), stop-word removal.

  Args:
    corpus: The text to normalize (a single string, despite the name).
    contraction_expansion: Run expand_contractions.
    text_lemmatization: Run lemmatize_text.
    special_char_removal: Run remove_special_characters (digits are kept).
    stopword_removal: Run remove_stopwords.

  Returns:
    The normalized string.
  """
  text = corpus
  if contraction_expansion:
    text = expand_contractions(text)
  if text_lemmatization:
    text = lemmatize_text(text)
  if special_char_removal:
    # Put a space in front of each of { } . ( ) ! so that deleting them does not
    # glue neighbouring words together. Inside the character class '(-)' is a
    # range from '(' to ')', so the hyphen itself is NOT part of this set.
    text = re.sub(r'([{.(-)!}])', " \\1", text)
    text = remove_special_characters(text, remove_digits=False)
  # Collapse runs of spaces into one.
  text = re.sub(' +', ' ', text)
  return remove_stopwords(text) if stopword_removal else text

In [0]:
# Work on a copy so data_raw stays untouched, then normalize every review.
# normalize_corpus is applied row by row with its default options.
data = data_raw.copy()
data['cleaned_text'] = data['raw_text'].apply(normalize_corpus)

In [73]:
# Preview the new cleaned_text column next to the original text.
data.head()

Unnamed: 0,unique_id,raw_text,review_text,cleaned_text
0,0,Spiritually and mentally inspiring! A book tha...,Menginspirasi secara spiritual dan mental! Buk...,spiritually mentally inspiring book allow ques...
1,1,This is one my must have books,Ini adalah salah satu yang harus saya miliki buku,one must book
2,2,It is a masterpiece of spirituality,Itu adalah mahakarya spiritualitas,masterpiece spirituality
3,3,"I'll be the first to admit, its literary qual...","Saya akan menjadi yang pertama mengakui, kuali...",first admit literary quality not much
4,4,"It is rather simplistically written, but the ...","Ini ditulis agak sederhana, tetapi pesan di ba...",rather simplistically write message behind pow...


In [109]:
# Take one review (index label 140) as the running example and parse it with spaCy.
sentence = data['raw_text'][140]
sentence_nlp = nlp(sentence)

# POS tagging with NLTK. The tagger is fed whitespace-split words, so
# contractions and attached punctuation stay inside a single token.
nltk_pos_tagged = nltk.pos_tag(sentence.split())
display(pd.DataFrame(nltk_pos_tagged, columns=['Word', 'POS tag']))

Unnamed: 0,Word,POS tag
0,If,IN
1,you,PRP
2,like,VBP
3,spiritual,JJ
4,gurus,NN
5,like,IN
6,Eckhardt,NNP
7,Tolle,NNP
8,you'll,NN
9,probably,RB


In [110]:
# POS tagging with spaCy.
# One row per token: the token, its `tag_`, its `pos_`, and its entity type
# (`ent_type_`, empty when the token is not part of a named entity).
spacy_pos_tagged = [(word, word.tag_, word.pos_, word.ent_type_) for word in sentence_nlp]
display(pd.DataFrame(spacy_pos_tagged, columns=['Word', 'POS tag', 'Tag type', 'NER Type']))

Unnamed: 0,Word,POS tag,Tag type,NER Type
0,,_SP,SPACE,
1,If,IN,ADP,
2,you,PRP,PRON,
3,like,VBP,VERB,
4,spiritual,JJ,ADJ,
5,gurus,NNS,NOUN,
6,like,IN,ADP,
7,Eckhardt,NNPS,PROPN,PERSON
8,Tolle,NNP,PROPN,PERSON
9,you,PRP,PRON,


In [112]:
# Render the example sentence with its named entities highlighted.
spacy.displacy.render(sentence_nlp, style='ent', jupyter=True)

**Dependency Parsing**

In [113]:
# For every noun chunk print: the chunk text, its root token, the root's
# dependency label, and the text of the root's head token.
for chunk in sentence_nlp.noun_chunks:
  print(chunk.text, chunk.root.text, chunk.root.dep_, chunk.root.head.text)

you you nsubj like
spiritual gurus gurus dobj like
Eckhardt Tolle Tolle pobj like
you you nsubj like
Kahlil Gibran Gibran dobj like


In [115]:
# For every token print: its text, its dependency label, the head token's text,
# the head token's coarse POS, and the list of the token's direct children.
for token in sentence_nlp:
  print(token.text, token.dep_, token.head.text, token.head.pos_, [child for child in token.children])

   If ADP []
If mark like VERB [ ]
you nsubj like VERB []
like advcl like ADP [If, you, gurus]
spiritual amod gurus NOUN []
gurus dobj like VERB [spiritual, like]
like prep gurus NOUN [Tolle]
Eckhardt compound Tolle PROPN []
Tolle pobj like ADP [Eckhardt]
you nsubj like ADP []
'll aux like ADP []
probably advmod like ADP []
like ROOT like ADP [like, you, 'll, probably, Gibran]
Kahlil compound Gibran PROPN []
Gibran dobj like ADP [Kahlil]


In [117]:
# Draw the dependency tree of the example sentence; the trailing ';' hides the cell's return value.
spacy.displacy.render(sentence_nlp, style='dep', jupyter=True);

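**spaCy seems to be better at part-of-speech tagging on this sentence:** it tags the plural *gurus* as `NNS` where NLTK gives `NN`, and it splits *you'll* into separate tokens, whereas NLTK (which was fed whitespace-split words) tags the whole string as a noun.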

In [0]:
# Split every cleaned review on whitespace into a list of tokens
# (one token list per document), ready for gensim.
texts = np.array(data['cleaned_text'].str.split())

In [170]:
# Learn frequent word pairs from the tokenized reviews, then pass each document
# through the fitted Phrases model.
# NOTE(review): `texts` is overwritten with its own transformed version, so
# re-running this cell without re-running the previous one applies the phrase
# model a second time. Restart & Run All is unaffected.
bigram = gensim.models.Phrases(texts)
texts = [bigram[line] for line in texts]
texts[1]

['one', 'must', 'book']

In [0]:
# Build the token <-> id vocabulary from all documents, then convert each
# document to its bag-of-words representation for the LDA model.
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

In [173]:
# Fit a 10-topic LDA model on the bag-of-words corpus.
# LDA training is stochastic: without a fixed seed the topics differ on every
# run, so the output below could not be reproduced after a kernel restart.
ldamodel = LdaModel(corpus=corpus, num_topics=10, id2word=dictionary, random_state=42)
ldamodel.show_topics()

[(0,
  '0.032*"not" + 0.023*"get" + 0.021*"one" + 0.015*"headset" + 0.015*"use" + 0.014*"tool" + 0.014*"no" + 0.012*"say" + 0.012*"go" + 0.009*"item"'),
 (1,
  '0.034*"buy" + 0.034*"knife" + 0.025*"love" + 0.018*"purchase" + 0.016*"size" + 0.016*"small" + 0.015*"one" + 0.013*"make" + 0.012*"use" + 0.011*"happy"'),
 (2,
  '0.035*"not" + 0.027*"product" + 0.020*"price" + 0.019*"great" + 0.018*"device" + 0.017*"good" + 0.014*"like" + 0.014*"use" + 0.013*"get" + 0.011*"wear"'),
 (3,
  '0.055*"not" + 0.023*"use" + 0.014*"case" + 0.014*"keyboard" + 0.011*"power" + 0.011*"like" + 0.011*"key" + 0.011*"film" + 0.010*"color" + 0.010*"light"'),
 (4,
  '0.018*"clean" + 0.015*"carry" + 0.015*"order" + 0.015*"2" + 0.014*"charger" + 0.013*"nice" + 0.013*"movie" + 0.010*"not" + 0.009*"compare" + 0.008*"come"'),
 (5,
  '0.070*"phone" + 0.044*"not" + 0.025*"one" + 0.017*"get" + 0.015*"like" + 0.014*"really" + 0.013*"use" + 0.013*"would" + 0.012*"much" + 0.012*"time"'),
 (6,
  '0.012*"top" + 0.011*"Motor