In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import sqlite3
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from tqdm import tqdm

# Loading Data

In [4]:
# Open the SQLite file and read every row of Reviews whose Score is not 3.
con = sqlite3.connect("database.sqlite")
data = pd.read_sql_query(
    sql=""" SELECT * FROM Reviews WHERE Score != 3 """,
    con=con,
)
data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


# Data Cleaning

In [5]:
def output(x):
    """Turn a numeric score into a binary label.

    Returns 0 when ``x`` is below 3 and 1 for every other value.
    """
    return 0 if x < 3 else 1
# Derive the binary label column from Score via output().
data["Output"] = data["Score"].map(output)

In [6]:
# Remove repeated rows: two rows count as duplicates when UserId, ProfileName,
# Time and Text all match; the first occurrence is kept.
# The column labels are passed as a list (the documented form for `subset`)
# and the result is re-assigned instead of mutating the frame in place.
data = data.drop_duplicates(
    subset=["UserId", "ProfileName", "Time", "Text"],
    keep="first",
)

In [7]:
# Show the (rows, columns) of `data` at this point in the notebook.
data.shape

(364173, 11)

In [9]:
# Keep only the rows where HelpfulnessNumerator is at most HelpfulnessDenominator.
data = data.loc[data["HelpfulnessNumerator"] <= data["HelpfulnessDenominator"]]

In [10]:
# Show the (rows, columns) of `data` after the helpfulness filter.
data.shape

(364171, 11)

# Text Preprocessing

In [11]:
def decontracted(phrase):
    """Expand common English contractions in ``phrase``.

    The irregular forms "won't" and "can't" are handled first, in any
    letter case, so that a capitalised "Won't" / "Can't" is not left to the
    generic ``n't`` rule (which would produce "Wo not" / "Ca not"). The
    expansion starts with a capital letter when the matched contraction did.
    The remaining suffix rules are applied afterwards, in order, and are
    case-sensitive.

    Parameters
    ----------
    phrase : str
        Text that may contain contractions.

    Returns
    -------
    str
        The text with the recognised contractions expanded.
    """
    def _match_case(expansion):
        # Build a replacement callback that capitalises the expansion when
        # the matched contraction began with an upper-case letter.
        def _replace(match):
            if match.group(0)[0].isupper():
                return expansion[0].upper() + expansion[1:]
            return expansion
        return _replace

    # specific
    phrase = re.sub(r"won't", _match_case("will not"), phrase, flags=re.IGNORECASE)
    phrase = re.sub(r"can\'t", _match_case("can not"), phrase, flags=re.IGNORECASE)

    # general (order matters: "n't" must run before the bare "'t" rule)
    suffix_rules = (
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, expansion in suffix_rules:
        phrase = re.sub(pattern, expansion, phrase)
    return phrase

In [13]:
# Build the set of English stop words.
# The corpus reader is reached through `nltk.corpus` instead of the imported
# `stopwords` name: this assignment rebinds `stopwords` to a set, so a second
# run of the cell would otherwise fail with
# "AttributeError: 'set' object has no attribute 'words'".
stopwords = set(nltk.corpus.stopwords.words('english'))

In [15]:
# Clean every review body and collect the results in `preprocessed_text`.
preprocessed_text = []
# tqdm is for printing the status bar
for review in tqdm(data['Text'].values):
    # Strip anything that looks like a URL, then drop HTML markup.
    review = re.sub(r"http\S+", "", review)
    review = BeautifulSoup(review, 'lxml').get_text()
    # Expand contractions before punctuation is removed.
    review = decontracted(review)
    # Remove every whitespace-delimited token that contains a digit.
    # Raw string: "\S" and "\d" are invalid escapes in a normal string literal.
    review = re.sub(r"\S*\d\S*", "", review).strip()
    # Replace every run of non-letters with a single space.
    review = re.sub('[^A-Za-z]+', ' ', review)
    # https://gist.github.com/sebleier/554280
    # Only ASCII letters and spaces remain, so lower-casing once up front is
    # equivalent to lower-casing each token.
    kept_words = [word for word in review.lower().split() if word not in stopwords]
    preprocessed_text.append(' '.join(kept_words))

100%|█████████████████████████████████████████████████████████████████████████| 364171/364171 [06:13<00:00, 974.45it/s]


In [17]:
# Clean every review summary and collect the results in `preprocessed_summary`.
preprocessed_summary = []
# tqdm is for printing the status bar
for headline in tqdm(data['Summary'].values):
    # Strip anything that looks like a URL, then drop HTML markup.
    headline = re.sub(r"http\S+", "", headline)
    headline = BeautifulSoup(headline, 'lxml').get_text()
    # Expand contractions before punctuation is removed.
    headline = decontracted(headline)
    # Remove every whitespace-delimited token that contains a digit.
    # Raw string: "\S" and "\d" are invalid escapes in a normal string literal.
    headline = re.sub(r"\S*\d\S*", "", headline).strip()
    # Replace every run of non-letters with a single space.
    headline = re.sub('[^A-Za-z]+', ' ', headline)
    # https://gist.github.com/sebleier/554280
    # Only ASCII letters and spaces remain, so lower-casing once up front is
    # equivalent to lower-casing each token.
    kept_words = [word for word in headline.lower().split() if word not in stopwords]
    preprocessed_summary.append(' '.join(kept_words))

100%|████████████████████████████████████████████████████████████████████████| 364171/364171 [04:22<00:00, 1387.36it/s]


In [19]:
# Collect the cleaned text, the cleaned summary and the label in one frame.
# NOTE(review): the "Summart" key is spelled this way in the original and is
# kept as-is, since other cells may read the column under that name.
df = pd.DataFrame(
    {
        "Text": preprocessed_text,
        "Summart": preprocessed_summary,
        "Output": data["Output"],
    }
)

In [21]:
# Preview the first three rows of the cleaned frame.
df.head(3)

Unnamed: 0,Text,Summart,Output
0,bought several vitality canned dog food produc...,good quality dog food,1
1,product arrived labeled jumbo salted peanuts p...,advertised,0
2,confection around centuries light pillowy citr...,delight says,1


# AVG_W2V

In [23]:
# gensim's Word2Vec takes a corpus as a list of token lists, so each cleaned
# string is split on whitespace into its words.
text = [document.split() for document in tqdm(preprocessed_text)]
summary = [document.split() for document in tqdm(preprocessed_summary)]

100%|███████████████████████████████████████████████████████████████████████| 364171/364171 [00:06<00:00, 59505.94it/s]
100%|██████████████████████████████████████████████████████████████████████| 364171/364171 [00:02<00:00, 142830.90it/s]


In [24]:
# Train a Word2Vec model on the tokenised review texts.
#   sentences      the corpus as a list of token lists:
#                  [[w1, w2, ...], [w1, w2, ...], ...] (hence `text` above)
#   min_count=1    no word is discarded for being rare (e.g. a value of 5
#                  would keep only words occurring at least 5 times)
#   vector_size=50 every word is represented by a 50-dimensional vector
#   window=10      maximum distance between the current and the predicted
#                  word within a sentence
#   workers=4      number of worker threads used for training
w2v_model_text = Word2Vec(
    sentences=text,
    min_count=1,
    vector_size=50,
    window=10,
    workers=4,
)

In [25]:
# List the words whose vectors are closest to "computer", each paired with a
# similarity score (a higher score means more similar).
# The neighbours give a feel for the semantic relationships the model learned.
w2v_model_text.wv.most_similar("computer")

[('driver', 0.7656199932098389),
 ('windows', 0.7550932765007019),
 ('mirror', 0.7455266714096069),
 ('ceiling', 0.739784300327301),
 ('wall', 0.7203813195228577),
 ('pc', 0.713884711265564),
 ('patio', 0.7099625468254089),
 ('hook', 0.7069205045700073),
 ('keyboard', 0.7066042423248291),
 ('hall', 0.7011048793792725)]

In [26]:
# Look up the learned vector for a single word; it is a NumPy array whose
# length is the vector_size chosen when the model was trained.
# `wv` is the model's word-vector store.
w2v_model_text.wv["computer"]

array([-0.2126676 , -0.01129488,  1.00491   ,  2.170243  , -1.9453751 ,
       -0.04441053,  1.329476  ,  1.7078251 , -1.0448779 , -0.0793432 ,
        1.1371223 , -0.4501713 ,  1.4649297 ,  2.7449675 , -1.2288195 ,
       -0.997115  ,  0.89129627, -0.9795717 ,  0.77115273,  1.5371093 ,
       -0.9909169 ,  1.461338  , -0.20024483, -0.43253207, -0.84365815,
       -0.00501731, -0.09819211, -0.90709704, -0.82251865,  0.7382851 ,
        0.42454004,  0.2568354 , -0.5872567 , -0.39878687,  0.11796318,
       -0.7680922 , -1.492321  , -3.5129592 ,  0.29314142, -2.5419798 ,
        0.78526056, -0.6571092 ,  0.14415418, -2.4304802 ,  2.9333928 ,
        2.8084698 ,  0.25779676,  0.8923801 , -2.1703744 ,  1.3833836 ],
      dtype=float32)

In [27]:
# Train a second Word2Vec model, this time on the tokenised summaries, with
# the same hyper-parameters as the review-text model.
w2v_model_summary = Word2Vec(
    sentences=summary,
    min_count=1,
    vector_size=50,
    window=10,
    workers=4,
)

In [30]:
import warnings
# NOTE(review): the blanket suppression is kept unchanged in case later cells
# rely on it; the averaging below no longer performs a division by zero.
warnings.filterwarnings("ignore")


def average_word_vectors(tokenised_docs, keyed_vectors):
    """Return one averaged word vector per tokenised document.

    Parameters
    ----------
    tokenised_docs : list of list of str
        One list of tokens per document.
    keyed_vectors : gensim KeyedVectors
        Word-vector store (``model.wv``); it is indexed by token and its
        ``vector_size`` gives the length of each vector.

    Returns
    -------
    list of numpy.ndarray
        For every document, the element-wise mean of its token vectors.
        A document with no tokens gets an all-zero vector instead of the
        all-NaN vector that ``zeros / 0`` would produce.
    """
    averaged = []
    for tokens in tqdm(tokenised_docs):
        # Accumulate in a float64 zero vector sized from the model rather
        # than a hard-coded dimension.
        total = np.zeros(keyed_vectors.vector_size)
        for token in tokens:
            total = total + keyed_vectors[token]
        # Preprocessing can leave a document with no tokens at all; keep the
        # zero vector in that case rather than dividing by zero.
        averaged.append(total / len(tokens) if tokens else total)
    return averaged


vector_text = average_word_vectors(text, w2v_model_text.wv)
vector_summary = average_word_vectors(summary, w2v_model_summary.wv)

100%|████████████████████████████████████████████████████████████████████████| 364171/364171 [01:45<00:00, 3450.03it/s]
100%|███████████████████████████████████████████████████████████████████████| 364171/364171 [00:13<00:00, 26996.12it/s]


In [31]:
# Pair each document's averaged text vector and summary vector with its label.
df_vectors = pd.DataFrame(
    {
        "Text_vector": vector_text,
        "Summary_vector": vector_summary,
        "Output": df["Output"],
    }
)

In [32]:
# Preview the first rows of the vectorised frame.
df_vectors.head()

Unnamed: 0,Text_vector,Summary_vector,Output
0,"[1.1416406190913657, -0.7423293062526247, -1.0...","[-0.6148168751969934, 0.1694793924689293, -0.3...",1
1,"[-0.2960836895638042, 0.8461025146146616, -0.3...","[-0.5952607989311218, 0.15128670632839203, 0.6...",0
2,"[-0.04053393369540572, 0.5774340040516108, 0.4...","[-0.7522922456264496, 0.24452942609786987, 0.0...",1
3,"[-0.4054381677673923, 0.1373030828932921, -0.7...","[-0.10200139973312616, -0.041098836809396744, ...",0
4,"[-0.15094553321026838, -0.21456006054694837, -...","[0.32099931687116623, 0.4066705498844385, 0.77...",1
