In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re
import gensim

In [3]:
reviews_dataset = pd.read_csv('SentimentReviews.csv')
reviews_dataset.head()

Unnamed: 0.1,Unnamed: 0,brand,categories,manufacturer,reviews.rating,reviews.text,label,reviews.text_reduce
0,0,Universal Music,"Movies, Music & Books,Music,R&b,Movies & TV,Mo...",Universal Music Group / Cash Money,5,i love this album. it's very good. more to the...,positive,i love this album. it's very good. more to the...
1,1,Lundberg,"Food,Packaged Foods,Snacks,Crackers,Snacks, Co...",Lundberg,5,Good flavor. This review was collected as part...,positive,Good flavor. This review was collected as part...
2,2,Lundberg,"Food,Packaged Foods,Snacks,Crackers,Snacks, Co...",Lundberg,5,Good flavor.,positive,Good flavor.
3,3,K-Y,"Personal Care,Medicine Cabinet,Lubricant/Sperm...",K-Y,1,I read through the reviews on here before look...,negative,I read through the reviews on here before look...
4,4,K-Y,"Personal Care,Medicine Cabinet,Lubricant/Sperm...",K-Y,1,My husband bought this gel for us. The gel cau...,negative,My husband bought this gel for us. The gel cau...


In [6]:
reviews_dataset.drop(columns=['Unnamed: 0', 'brand', 'categories', 'manufacturer', 'reviews.text', 'label'], inplace=True)

In [10]:
reviews_dataset['reviews.rating'] = reviews_dataset['reviews.rating'].replace([5,4,3,2,1],['Positive', 'Positive', 'OK', 'Negative', 'Negative'])

In [11]:
reviews_dataset.head()

Unnamed: 0,reviews.rating,reviews.text_reduce
0,Positive,i love this album. it's very good. more to the...
1,Positive,Good flavor. This review was collected as part...
2,Positive,Good flavor.
3,Negative,I read through the reviews on here before look...
4,Negative,My husband bought this gel for us. The gel cau...


In [18]:
dataset = list(reviews_dataset['reviews.text_reduce'].values)

In [20]:
lemmatizer = WordNetLemmatizer()

In [24]:
corpus = []

for sentences in dataset:
    review = re.sub('[^a-zA-Z0-9]', ' ', sentences)
    review = review.lower()
    tokens = word_tokenize(review)
    
    review = [lemmatizer.lemmatize(word) for word in tokens if word not in list(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

In [25]:
corpus

['love album good hip hop side current pop sound hype listen everyday gym give 5star rating way metaphor crazy',
 'good flavor review collected part promotion',
 'good flavor',
 'read review looking buying one couple lubricant ultimately disappointed even live review read starter neither boyfriend could notice sort enhanced captivating sensation notice however messy consistency reminiscent liquid vaseline difficult clean pleasant especially since lacked captivating sensation expecting disappointed paid much lube use could use normal personal lubricant 1 le money 2 le mess',
 'husband bought gel u gel caused irritation felt like burning skin recommend gel',
 'boyfriend bought spice thing bedroom highly disappointed product bought one absolutely love ky mine thought would similar affect absolutely nothing buy',
 'bought earlier today excited check based product description expecting something like regular ky fan expected left little disappointed',
 'bought product husband try impressed t

In [32]:
words = []

for sent in corpus:
    tokens = word_tokenize(sent)
    words.append(tokens)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [45]:
word2vec = gensim.models.Word2Vec(words, window=5, min_count=2)

In [46]:
word2vec.wv.index_to_key

['product',
 'movie',
 'great',
 'review',
 'part',
 'love',
 'promotion',
 'collected',
 'use',
 'good',
 'like',
 'one',
 'skin',
 'hair',
 'clean',
 'really',
 'would',
 'time',
 'using',
 'smell',
 'used',
 'wipe',
 'well',
 'feel',
 'get',
 'make',
 'work',
 'day',
 'easy',
 'first',
 'much',
 'year',
 'tide',
 'kid',
 'also',
 'look',
 'best',
 'clorox',
 'even',
 'family',
 'pod',
 'recommend',
 'buy',
 'go',
 'new',
 'scent',
 'little',
 'loved',
 'dry',
 'better',
 'moisturizer',
 'keep',
 'always',
 'conditioner',
 'see',
 'soft',
 'bought',
 'face',
 'free',
 'way',
 'clothes',
 'back',
 'old',
 'nice',
 'week',
 'tried',
 'try',
 'received',
 'cleaning',
 'color',
 'definitely',
 'watch',
 'line',
 'mop',
 'still',
 'effect',
 'long',
 'lot',
 'every',
 'got',
 'think',
 'funny',
 'made',
 'price',
 'need',
 'never',
 'amazing',
 'olay',
 'fresh',
 'original',
 'could',
 'thing',
 'since',
 'without',
 'shampoo',
 'bottle',
 '2',
 'say',
 'find',
 'enjoyed',
 'many',
 'know