In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [2]:
import re
import nltk
import PyDictionary
import pandas as pd
from collections import Counter

# <center><u>READING DATA</u></center>

In [3]:
df=pd.read_csv('AmazonLawnAndGardenReviews.csv',encoding='iso-8859-1')

In [4]:
df.head(2)

Unnamed: 0,reviewerID,asin,reviewerName,helpful__001,helpful__002,reviewText,overall,summary,unixReviewTime,reviewTime
0,A1JZFGZEZVWQPY,B00002N674,"Carter H ""1amazonreviewer@gmail . com""",4,4,Good USA company that stands behind their prod...,4,Great Hoses,1308614400,"06 21, 2011"
1,A32JCI4AK2JTTG,B00002N674,"Darryl Bennett ""Fuzzy342""",0,0,This is a high quality 8 ply hose. I have had ...,5,Gilmour 10-58050 8-ply Flexogen Hose 5/8-Inch ...,1402272000,"06 9, 2014"


In [5]:
df.columns

Index(['reviewerID', 'asin', 'reviewerName', 'helpful__001', 'helpful__002',
       'reviewText', 'overall', 'summary', 'unixReviewTime', 'reviewTime'],
      dtype='object')

# <center><u>UTILITIES FUNCTIONS FOR TEXT PREPROCESSING</u></center>

In [6]:
def Remove_URLs(x):
    x = x.split(' ')
    x = [i for i in x if not len(re.findall(r'[\w\.-]+@[\w\.-]+',i))]
    x = ' '.join(x)
    return x
def tokenizing(x):
    return nltk.tokenize.word_tokenize(x)
def stopwords(x):
    stop_words=nltk.corpus.stopwords.words('english')
    x=[i for i in x if i not in stop_words]
    return x
def Lemmatization(x):
    lemmatizer=nltk.stem.WordNetLemmatizer()
    x = [ lemmatizer.lemmatize(i) for i in x]
    return x
def Remove_numbers(x):
    x = [re.sub('[^A-Z,a-z]+','',i) for i in x]
    x = ' '.join(x).lower()
    return x  

# <center><u>FUNCTIONS TO USE FURTHER</u></center>

In [7]:
def preprocess(x):

    x = Remove_URLs(x)
    x = tokenizing(x)
    x = stopwords(x)
    x = Lemmatization(x)
    x = Remove_numbers(x)
    return x

In [8]:
x = df.reviewText
Remove_numbers(tokenizing(x[0]))

'good usa company that stands behind their products  i have had to warranty two hoses and they send replacements right out to you  i had one burst after awhile , you could see it buldge for weeks before it went so no suprises  the other one was winter related as i am bad and leave them out most of the time  highly reccomend  note the hundred footer is heavy and like wresting an anaconda when its time to put away , but it does have a far reach '

### Sample text before preprocessing

In [9]:
x=list(df.reviewText)
df.reviewText[0]

'Good USA company that stands behind their products. I have had to warranty two hoses and they send replacements right out to you. I had one burst after awhile, you could see it buldge for weeks before it went so no suprises. The other one was winter related as I am bad and leave them out most of the time. Highly reccomend. Note the hundred footer is heavy and like wresting an anaconda when its time to put away, but it does have a far reach.'

### Sample text after preprocessing

In [10]:
preprocess(df.reviewText[0])

'good usa company stand behind product  i warranty two hose send replacement right  i one burst awhile , could see buldge week went suprises  the one winter related i bad leave time  highly reccomend  note hundred footer heavy like wresting anaconda time put away , far reach '

### Preprocessing all review texts

In [11]:
%%time
df['Preprocess_Review'] =  df.reviewText.apply(lambda x: preprocess(x) if isinstance(x,str) else ' ')

Wall time: 1min 5s


# <center><u>FEATURE ENGINEERING</u></center>

### Extracting feature words from the reviews text

In [12]:
%%time
Word_set = set()
for i in df.Preprocess_Review:
    Word_set |= set(nltk.tokenize.word_tokenize(i))

Wall time: 12.2 s


### Parts of speech tagging to differentiate the words

In [13]:
Word_tags = pd.DataFrame(nltk.pos_tag(list(Word_set)))

In [14]:
Word_tags.head()

Unnamed: 0,0,1
0,pricepoints,NNS
1,gettinginstallation,NN
2,withtrapit,VBP
3,nonautumn,IN
4,openerthe,JJ


### List of abbreivations

In [15]:
Word_tags[1].unique()

array(['NNS', 'NN', 'VBP', 'IN', 'JJ', 'VBN', 'RB', 'VBD', 'VBG', 'NNP',
       'VBZ', 'VB', 'JJS', 'WRB', 'DT', 'WP', 'JJR', 'CC', 'RP', 'FW',
       'RBR', 'EX', 'MD', 'CD', 'PRP', 'PRP$', 'WDT', 'PDT', 'RBS', 'TO',
       'WP$', ','], dtype=object)


    CC | Coordinating conjunction |
    CD | Cardinal number |
    DT | Determiner |
    EX | Existential there |
    FW | Foreign word |
    IN | Preposition or subordinating conjunction |
    JJ | Adjective |
    JJR | Adjective, comparative |
    JJS | Adjective, superlative |
    LS | List item marker |
    MD | Modal |
    NN | Noun, singular or mass |
    NNS | Noun, plural |
    NNP | Proper noun, singular |
    NNPS | Proper noun, plural |
    PDT | Predeterminer |
    POS | Possessive ending |
    PRP | Personal pronoun |
    PRP$ | Possessive pronoun |
    RB | Adverb |
    RBR | Adverb, comparative |
    RBS | Adverb, superlative |
    RP | Particle |
    SYM | Symbol |
    TO | to |
    UH | Interjection |
    VB | Verb, base form |
    VBD | Verb, past tense |
    VBG | Verb, gerund or present participle |
    VBN | Verb, past participle |
    VBP | Verb, non-3rd person singular present |
    VBZ | Verb, 3rd person singular present |
    WDT | Wh-determiner |
    WP | Wh-pronoun |
    WP$ | Possessive wh-pronoun |
    WRB | Wh-adverb |


### Extracting the words which gives meaning out of sentence (verbs)

In [16]:
Words_verbs = Word_tags[Word_tags[1].isin(['VB','VBP','VBN','VBZ','RB'])]

In [17]:
Words_verbs.head()

Unnamed: 0,0,1
2,withtrapit,VBP
5,overtorqued,VBN
7,reloadsunfortunately,RB
18,thenoverall,VBP
20,lubricated,VBN


In [18]:
df.head(3)

Unnamed: 0,reviewerID,asin,reviewerName,helpful__001,helpful__002,reviewText,overall,summary,unixReviewTime,reviewTime,Preprocess_Review
0,A1JZFGZEZVWQPY,B00002N674,"Carter H ""1amazonreviewer@gmail . com""",4,4,Good USA company that stands behind their prod...,4,Great Hoses,1308614400,"06 21, 2011",good usa company stand behind product i warra...
1,A32JCI4AK2JTTG,B00002N674,"Darryl Bennett ""Fuzzy342""",0,0,This is a high quality 8 ply hose. I have had ...,5,Gilmour 10-58050 8-ply Flexogen Hose 5/8-Inch ...,1402272000,"06 9, 2014",this high quality ply hose i good luck gilmo...
2,A3N0P5AAMP6XD2,B00002N674,H B,2,3,It's probably one of the best hoses I've ever ...,4,Very satisfied!,1336176000,"05 5, 2012",it s probably one best hose i ve ever hadpro s...


In [19]:
df.Preprocess_Review[0]

'good usa company stand behind product  i warranty two hose send replacement right  i one burst awhile , could see buldge week went suprises  the one winter related i bad leave time  highly reccomend  note hundred footer heavy like wresting anaconda time put away , far reach '

In [20]:
[i for i in nltk.pos_tag(nltk.tokenize.word_tokenize(df.Preprocess_Review[0])) if i[1] in ['VB','VBP','VBZ','RB','VBN']]

[('warranty', 'VBP'),
 ('send', 'VBP'),
 ('see', 'VB'),
 ('suprises', 'VBZ'),
 ('related', 'VBN'),
 ('leave', 'VBP'),
 ('highly', 'RB'),
 ('footer', 'RB'),
 ('away', 'RB'),
 ('far', 'RB')]

### Columnise preprocess verbs of each review sentence

In [23]:
%%time
def func(x):
    tokens = nltk.pos_tag(nltk.tokenize.word_tokenize(x))
    verbs = [i for i in tokens if i[1] in ['VB','VBP','VBZ','RB','VBN']]
    return verbs
df['Verbs'] = df.Preprocess_Review.apply(lambda x : func(x))

Wall time: 2min 48s


## Regular Expressions in Preprocessing Raw Text 

In [24]:
x[0]

'Good USA company that stands behind their products. I have had to warranty two hoses and they send replacements right out to you. I had one burst after awhile, you could see it buldge for weeks before it went so no suprises. The other one was winter related as I am bad and leave them out most of the time. Highly reccomend. Note the hundred footer is heavy and like wresting an anaconda when its time to put away, but it does have a far reach.'

#### Extracting all occurances of Required word <B> "out" </B> in Raw Text

In [25]:
re.findall(r'[o][u][t]',x[0])

['out', 'out']

#### Extracting all <B> Integers </B> of Raw Text

In [26]:
w = x[0]+'98.4   34   4343  32.33 343.0'
re.findall(r'\d+.\d+',w)

['98.4', '4343', '32.33', '343.0']

#### Extracting all words which are ending with <B> 'ge' </B> in retrieving in <B> Contineous verbs</B>

In [27]:
[w for w in nltk.tokenize.word_tokenize(x[0]) if re.search(r'ge$',w)]

['buldge']

#### Lambda function in converting all words to <B> lower case</B>

In [29]:
list(map(lambda x: x.lower(),Word_set))

['pricepoints',
 'gettinginstallation',
 'withtrapit',
 'nonautumn',
 'openerthe',
 'overtorqued',
 'readymixed',
 'reloadsunfortunately',
 'fearof',
 'carried',
 'oneupdate',
 'cognitive',
 'humongous',
 'somedayit',
 'ironmetal',
 'bubbler',
 'whole',
 'chips',
 'thenoverall',
 'fragility',
 'lubricated',
 'uphave',
 'motif',
 'induce',
 'sprays',
 'inchesi',
 'blew',
 'deployed',
 'imploring',
 'restricter',
 'goodie',
 'capacity',
 'smokepaddle',
 'multihour',
 'onehandle',
 'gardenshrubsetc',
 'zinnias',
 'triggerdo',
 'cultivatortilleri',
 'winnerupdate',
 'litter',
 'iwill',
 'respectedif',
 'returnedlooking',
 'sizeheat',
 'hav',
 'unpainted',
 'bottomline',
 'irregularity',
 'crawler',
 'cropsthis',
 'testedhere',
 'global',
 'buzy',
 'ka',
 'eel',
 'smooththe',
 'wiresgas',
 'conflicting',
 'teething',
 'bay',
 'vigorouslythese',
 'recommendation',
 'pronged',
 'sunpros',
 'treesoverall',
 'freethe',
 'huh',
 'gelbait',
 'vibrationsound',
 'lest',
 'investmentupdate',
 'comfo

In [30]:
x[0]

'Good USA company that stands behind their products. I have had to warranty two hoses and they send replacements right out to you. I had one burst after awhile, you could see it buldge for weeks before it went so no suprises. The other one was winter related as I am bad and leave them out most of the time. Highly reccomend. Note the hundred footer is heavy and like wresting an anaconda when its time to put away, but it does have a far reach.'

#### Extracting all words of raw text which are in <B> length of 5 to 7 </B>

In [32]:
[w for w in nltk.tokenize.word_tokenize(w) if re.search(r'\w{5,7}$', w)]

['company',
 'stands',
 'behind',
 'their',
 'products',
 'warranty',
 'hoses',
 'replacements',
 'right',
 'burst',
 'after',
 'awhile',
 'could',
 'buldge',
 'weeks',
 'before',
 'suprises',
 'other',
 'winter',
 'related',
 'leave',
 'Highly',
 'reccomend',
 'hundred',
 'footer',
 'heavy',
 'wresting',
 'anaconda']

#### Extracting of all words which are in <B> length of 5 </B>

In [33]:
[w for w in nltk.tokenize.word_tokenize(w) if re.search(r'^[a-zA-F]{5}$', w)]

['their',
 'hoses',
 'right',
 'burst',
 'after',
 'could',
 'weeks',
 'other',
 'leave',
 'heavy']

#### <B> Finding all occurances </B> of given word in raw text

In [34]:
re.findall(r'[o][u][t]',x[0])

['out', 'out']

#### Extracting all words which are <b>Greater than length 3 </b>

In [35]:
wordlist = nltk.tokenize.word_tokenize(x[3])
[w for w in wordlist if re.search('^[a-z]{3,}$', w)]

['probably',
 'should',
 'have',
 'bought',
 'something',
 'bit',
 'more',
 'flexible',
 'and',
 'less',
 'rugged',
 'since',
 'constantly',
 'for',
 'washing',
 'cars',
 'but',
 'that',
 'fault',
 'not',
 'product',
 'fault']

#### Extracting all the <b> Float values </b> of Raw text

In [36]:
re.findall('\d+\.\d+', w)

['98.4', '32.33', '343.0']

In [37]:
re.search(r'[i][t]+',w)

<_sre.SRE_Match object; span=(174, 176), match='it'>

Searching for the all the required <B>combinations of words</B> which are with <B> ghi- 1st letter,mno-2nd letter,jlk-3rd letter,def-4th letter </b>

In [18]:
[w for w in nltk.tokenize.word_tokenize(x[0]) if re.search('^[ghi][mno][jlk][def]$', w)]

[]

#### Importing the words from NLTK repository of Corpus
the words are tagged with one hot encoding

In [3]:
import nltk
a=nltk.corpus.brown.tagged_words(categories='news')[:15]

In [13]:
fd = nltk.FreqDist(nltk.corpus.brown.words(categories='news'))
fd.most_common(5)

[('the', 5580), (',', 5188), ('.', 4030), ('of', 2849), ('and', 2146)]

Conditional FreqDist to visualize the words of having <B> specified length condition</B>

In [52]:
fd = nltk.ConditionalFreqDist(nltk.corpus.brown.words(categories='news'))
d =nltk.ConditionalFreqDist(nltk.pos_tag(nltk.tokenize.word_tokenize(x[0])))
d.items()


dict_items([('Good', FreqDist({'JJ': 1})), ('USA', FreqDist({'NNP': 1})), ('company', FreqDist({'NN': 1})), ('that', FreqDist({'WDT': 1})), ('stands', FreqDist({'VBZ': 1})), ('behind', FreqDist({'IN': 1})), ('their', FreqDist({'PRP$': 1})), ('products', FreqDist({'NNS': 1})), ('.', FreqDist({'.': 6})), ('I', FreqDist({'PRP': 3})), ('have', FreqDist({'VBP': 1, 'VB': 1})), ('had', FreqDist({'VBN': 1, 'VBD': 1})), ('to', FreqDist({'TO': 3})), ('warranty', FreqDist({'VB': 1})), ('two', FreqDist({'CD': 1})), ('hoses', FreqDist({'NNS': 1})), ('and', FreqDist({'CC': 3})), ('they', FreqDist({'PRP': 1})), ('send', FreqDist({'VBP': 1})), ('replacements', FreqDist({'NNS': 1})), ('right', FreqDist({'RB': 1})), ('out', FreqDist({'IN': 1, 'RP': 1})), ('you', FreqDist({'PRP': 2})), ('one', FreqDist({'CD': 1, 'NN': 1})), ('burst', FreqDist({'NN': 1})), ('after', FreqDist({'IN': 1})), ('awhile', FreqDist({'NN': 1})), (',', FreqDist({',': 2})), ('could', FreqDist({'MD': 1})), ('see', FreqDist({'VB': 1})

In [39]:
type(d.tabulate())

                ,    .   CC   CD   DT   IN   JJ  JJS   MD   NN  NNP  NNS  PRP PRP$   RB   RP   TO   VB  VBD  VBG  VBN  VBP  VBZ  WDT  WRB 
           ,    2    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
           .    0    6    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
        Good    0    0    0    0    0    0    1    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
      Highly    0    0    0    0    0    0    0    0    0    0    1    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
           I    0    0    0    0    0    0    0    0    0    0    0    0    3    0    0    0    0    0    0    0    0    0    0    0    0 
        Note    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    1    0    0    0    0    0    0    0 
         The    0    0    0

NoneType

In [26]:
cfd = nltk.ConditionalFreqDist(nltk.corpus.brown.tagged_words(categories='news')[:15])

In [30]:
pd.DataFrame(cfd.tabulate()).head()

                 AT    IN    JJ JJ-TL    NN NN-TL   NP$ NP-TL    NR   VBD 
    Atlanta's     0     0     0     0     0     0     1     0     0     0 
       County     0     0     0     0     0     1     0     0     0     0 
       Friday     0     0     0     0     0     0     0     0     1     0 
       Fulton     0     0     0     0     0     0     0     1     0     0 
        Grand     0     0     0     1     0     0     0     0     0     0 
         Jury     0     0     0     0     0     1     0     0     0     0 
          The     1     0     0     0     0     0     0     0     0     0 
           an     1     0     0     0     0     0     0     0     0     0 
     election     0     0     0     0     1     0     0     0     0     0 
investigation     0     0     0     0     1     0     0     0     0     0 
           of     0     1     0     0     0     0     0     0     0     0 
      primary     0     0     0     0     1     0     0     0     0     0 
     produced     0     0

In [31]:
 nltk.corpus.brown.tagged_sents(categories='news')

[[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ('Grand', 'JJ-TL'), ('Jury', 'NN-TL'), ('said', 'VBD'), ('Friday', 'NR'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'IN'), ("Atlanta's", 'NP$'), ('recent', 'JJ'), ('primary', 'NN'), ('election', 'NN'), ('produced', 'VBD'), ('``', '``'), ('no', 'AT'), ('evidence', 'NN'), ("''", "''"), ('that', 'CS'), ('any', 'DTI'), ('irregularities', 'NNS'), ('took', 'VBD'), ('place', 'NN'), ('.', '.')], [('The', 'AT'), ('jury', 'NN'), ('further', 'RBR'), ('said', 'VBD'), ('in', 'IN'), ('term-end', 'NN'), ('presentments', 'NNS'), ('that', 'CS'), ('the', 'AT'), ('City', 'NN-TL'), ('Executive', 'JJ-TL'), ('Committee', 'NN-TL'), (',', ','), ('which', 'WDT'), ('had', 'HVD'), ('over-all', 'JJ'), ('charge', 'NN'), ('of', 'IN'), ('the', 'AT'), ('election', 'NN'), (',', ','), ('``', '``'), ('deserves', 'VBZ'), ('the', 'AT'), ('praise', 'NN'), ('and', 'CC'), ('thanks', 'NNS'), ('of', 'IN'), ('the', 'AT'), ('City', 'NN-TL'), ('of', 'IN-TL'), ('Atlant