# Word Embeddings

### Libraries

In [3]:
# import libraries

import pandas as pd
import numpy as np

from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from scipy import stats

### Datasets

In [2]:
# load review sentiment data

review_csv_path = 'review_sentiment.csv'
review_df = pd.read_csv(review_csv_path)

# keep the frame as the cell's final expression so the notebook renders it
review_df

Unnamed: 0,review_id,text,positive,neutral,negative
0,NvusujU9_5pIUbn9SZ6hMA,Stopped by to munch a burger during today's Se...,1,0,0
1,vHOeBa7aMA_na4rfS2Db5A,"Yelp doesn't allow to leave 0 star review, so ...",0,0,1
2,hG9RTxxivb0ZXzEk4JXTXA,I find it hard to believe there are so many pe...,0,0,1
3,zIVkwgahZjOneChZFUYY4g,Love this place! Almost all of their menu item...,1,0,0
4,DLczAuvMAlAnY5EeDGhTVg,Excellent customer service. I wish I could ren...,1,0,0
...,...,...,...,...,...
63446,OgoBp7fbXnLSKvsQb4O_tw,"I really loved the food and service. I mean, t...",1,0,0
63447,Q7e8EtZMmdknDrQE7huMoQ,Their Grove location was the bomb. Delicious f...,1,0,0
63448,zzMW6zbsFaQMjoGu2bGVdA,A nice ean BBQ joint right across from some ne...,1,0,0
63449,scgoa60EvhW2Mz7JMqLYGw,The perfect Hookah bar. I'm not sure what they...,1,0,0


In [None]:
# load corpus data (each line of corpus.txt becomes one entry of `corpus`)

# read inside a context manager so the file handle is closed as soon as
# the contents have been loaded
with open('corpus.txt', 'r') as corpus_file:
    corpus = corpus_file.read().split('\n')

# a file that ends with '\n' yields one trailing empty string; drop only that
# entry, so the final line is kept when the file has no terminating newline
if corpus and corpus[-1] == '':
    corpus = corpus[:-1]

In [None]:
# preview the first five corpus entries, each preceded by a dashed rule

separator = '-' * 50
for entry in corpus[:5]:
    print(separator, entry, sep='\n')

### Dense embeddings

In [None]:
def text_to_vector(embeddings, text, sequence_len, tokenizer=None):
    '''
    Convert text to a flat, fixed-length list of embedding values.

    The text is tokenized, and the vectors of the first `sequence_len` tokens
    that have an embedding are concatenated. Tokens for which
    `embeddings.get_vector` raises KeyError are skipped and do not use up a
    slot. If fewer than `sequence_len` tokens were embedded, the result is
    padded with zeros.

    Parameters
    ----------
    embeddings : object exposing `get_vector(token)` and `vector_size`
        (e.g. the `.wv` attribute of a trained gensim Word2Vec model).
    text : str
        Text to convert.
    sequence_len : int
        Maximum number of tokens to embed.
    tokenizer : callable, optional
        Function mapping `text` to an iterable of tokens. Defaults to
        whitespace splitting (`str.split`). Pass the tokenizer that was used
        to train the embeddings so that lookups match the vocabulary.

    Returns
    -------
    list
        Flat list of `sequence_len * embeddings.vector_size` values
        (empty when `sequence_len` is not positive).
    '''
    tokens = text.split() if tokenizer is None else tokenizer(text)
    vec = []
    n = 0  # number of tokens embedded so far
    for token in tokens:
        if n >= sequence_len:
            break
        try:
            vec.extend(embeddings.get_vector(token))
        except KeyError:
            # no embedding for this token: skip it without consuming a slot
            continue
        n += 1
    # zero-pad the unused slots in a single call; max() keeps a non-positive
    # sequence_len from requesting a negative-sized array
    missing = max(sequence_len - n, 0)
    vec.extend(np.zeros(embeddings.vector_size * missing))
    return vec

In [None]:
# split every preprocessed corpus entry into tokens with NLTK's word_tokenize

tokenized_corpus = list(map(word_tokenize, corpus))

In [None]:
# train Word2Vec model on the tokenized corpus

w2v_params = {
    'vector_size': 100,  # dimensionality of each word vector
    'window': 5,         # context window size
    'min_count': 1,      # keep every token, even ones seen only once
    'workers': 4,        # number of training threads
}
word2vec_model = Word2Vec(sentences=tokenized_corpus, **w2v_params)

In [None]:
# corpus statistics: whitespace-token count per corpus entry

lens = list(map(len, map(str.split, corpus)))

# min, max, mean, standard deviation and mode of the token counts
length_summary = (
    np.min(lens),
    np.max(lens),
    np.mean(lens),
    np.std(lens),
    stats.mode(lens),
)
print(*length_summary)

In [None]:
# convert corpus to embeddings

# number of token slots kept per corpus entry
SEQUENCE_LEN = 10

# Look tokens up using the same word_tokenize output the model was trained on.
# Re-splitting the raw text on whitespace can produce tokens that are not in
# the vocabulary, and text_to_vector skips those silently. Joining the tokens
# with spaces lets text_to_vector's whitespace split recover exactly these
# tokens.
embeddings_corpus = [
    text_to_vector(word2vec_model.wv, ' '.join(tokens), SEQUENCE_LEN)
    for tokens in tokenized_corpus
]

In [None]:
# TODO: save embeddings, then load in classification notebook and test

In [None]:
# NOTE(review): disabled classification experiment, kept inside a bare string
# so that it does not execute. As written it would not run in the cells shown:
# `reviews` is not defined in the visible cells (the loaded frame is named
# `review_df`), and train_test_split, LogisticRegression and
# classification_report are not among the visible imports — presumably they
# come from scikit-learn; TODO confirm before re-enabling.
'''

X = np.array(embeddings_corpus)
y = reviews['positive']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train classifier
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

print(classification_report(y_test, y_pred))

'''