# Feature engineering

### Libraries

In [1]:
# import libraries

import re

import pandas as pd
import numpy as np

from gensim.models import KeyedVectors
from scipy import stats
from scipy import sparse
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import opinion_lexicon
import nltk
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation

### Data

In [2]:
# load review sentiment data

review_df = pd.read_csv('data/review_sentiment.csv')

review_df

Unnamed: 0,review_id,text,sentiment
0,LLzom-2TITa4gasV7_fCCA,Great experience purchasing a washer and dryer...,1
1,a5JHzBrWxRd_OmIvV7znDA,Went here based on the high ratings and raves ...,-1
2,X-o--dwf0HuFMittYi4wCA,"oh Millers, how i wanted to like you. You are...",-1
3,INGNbsyo-MouZZzcxnCSGQ,This place gets two stars from me only because...,-1
4,k7VatXVLism-cTDJE8TTUw,"This place was awesome. Clean, beautiful and t...",1
...,...,...,...
11681,IlU-MQzMKc7jAHWwK5VFGQ,"To be fair, I tried them in their first week. ...",0
11682,Qt3BsRvQuJccDQfFWM1XPw,Awful place. It's dirty. Had two birthday part...,-1
11683,3CQQ8Im_UX6QqDECuXYK8A,A truly vegetarian delight! I took a Jewish f...,1
11684,ery1nBM7zKweFLBe-bT5ag,I have a 2011 Toyota Sienna Limited. During th...,-1


In [3]:
# load corpus data

corpus = open('data/corpus.txt', 'r').read()
corpus = corpus.split('\n')
corpus = corpus[:-1]

In [4]:
# see contents of corpus

for review in corpus[:5]:
    print('-' * 50)
    print(review)

--------------------------------------------------
great experience purchasing washer dryer bought new home needed new appliance iam glad decided go even though review great understand since issue whatsoever excellent selection knowledgeable salesperson timely delivery level service received beyond expected ! really felt like working hard give best price offered many option stay budget even took time check additional sale price discount could offer given honest feedback question answered happy sears would highly recommend !
--------------------------------------------------
went based high rating raf people entire experience underwhelming start finish website fairly useless want figure menu going pick location choice location drill bottom see weak description menu offering explanation price roast beef roast beef combo roast pork roast pork combo baked ham baked ham combo roast turkey roast turkey combo broccoli rabe broccoli rabe combo explanation whats combo add cheese way table place

### Dense embeddings

In [5]:
# load word2vec vectors

wv = KeyedVectors.load("embeddings/reviews_wv")

In [6]:
def text_to_vector(embeddings, text, sequence_len, strategy=None):
    '''
    Function to convert text to word embeddings
    '''
    tokens = text.split()
    vec = []
    n = 0
    i = 0
    while i < len(tokens) and n < sequence_len:
        try:
            vec.extend(embeddings.get_vector(tokens[i]))
            n += 1
        except KeyError:
            True
        finally:
            i += 1
    for _ in range(sequence_len - n):
        vec.extend(np.zeros(embeddings.vector_size,))
    if strategy == 'mean':
        vec = np.mean(vec, axis=0)
    elif strategy == 'max':
        vec = np.max(vec, axis=0)
    return vec

In [7]:
# corpus statistics

lens = [len(c.split()) for c in corpus]

print('Number of reviews:', len(corpus))
print('Minimum number of words:', np.min(lens))
print('Maximum number of words:', np.max(lens))
print('Average number of words:', np.mean(lens))
print('Standard deviation of words:', np.std(lens))
print('Mode of words:', stats.mode(lens))

Number of reviews: 11686
Minimum number of words: 0
Maximum number of words: 468
Average number of words: 55.066061954475444
Standard deviation of words: 48.93715036619566
Mode of words: ModeResult(mode=25, count=240)


In [8]:
# convert corpus into dataset with appended embeddings representation

simple_corpus = []
for review in review_df['text']:
    review = re.sub('[^a-zA-Z]', ' ', review).lower()
    simple_corpus.append(review)

embeddings_corpus = []
word_limit = 50
for review in simple_corpus:
    embeddings_corpus.append(text_to_vector(wv, review, word_limit))

### Transformations


In [9]:
# BoW

bag_of_words = CountVectorizer()
bow_features = bag_of_words.fit_transform(corpus)

sparse.save_npz('features/bag_of_words.npz', bow_features)

bow_features.shape

(11686, 20892)

In [10]:
lsa_model = TruncatedSVD(n_components=8)
lsa_topic_matrix = lsa_model.fit_transform(bow_features)

In [11]:
# 1-hot encoding

one_hot = CountVectorizer(binary=True)
features = one_hot.fit_transform(corpus)

sparse.save_npz('features/one_hot.npz', features)

features.shape

(11686, 20892)

In [12]:
# N-grams

n_grams = CountVectorizer(ngram_range=(1, 2), max_features=15000)
features = n_grams.fit_transform(corpus)

sparse.save_npz('features/n_grams.npz', features)

features.shape

(11686, 15000)

In [13]:
# TF-IDF

tf_idf = TfidfVectorizer()
features = tf_idf.fit_transform(corpus)

sparse.save_npz('features/tf_idf.npz', features)

features.shape

(11686, 20892)

In [14]:
# Word2Vec

features = np.array(embeddings_corpus)

np.save('features/word2vec.npy', features)

features.shape

(11686, 7500)

Handling negation - lexicons

In [15]:

nltk.download('opinion_lexicon')

positive_words = set(opinion_lexicon.positive())
negative_words = set(opinion_lexicon.negative())

# Extend lexicons with NOT_ prefix
extended_positive_words = positive_words.union({'NOT_' + word for word in negative_words})
extended_negative_words = negative_words.union({'NOT_' + word for word in positive_words})

[nltk_data] Downloading package opinion_lexicon to
[nltk_data]     C:\Users\bruna\AppData\Roaming\nltk_data...
[nltk_data]   Package opinion_lexicon is already up-to-date!


In [16]:
def count_sentiment_words(text, positive_words, negative_words):
    words = text.split()
    pos_count = sum(1 for word in words if word in positive_words)
    neg_count = sum(1 for word in words if word in negative_words)
    return pos_count, neg_count



In [17]:

pos_counts = []
neg_counts = []
for review in corpus:
    pos_count, neg_count = count_sentiment_words(review, positive_words, negative_words)
    print(f"Positive words: {pos_count}, Negative words: {neg_count}")
    pos_counts.append(pos_count)
    neg_counts.append(neg_count)

# Convert to numpy arrays
pos_counts = np.array(pos_counts).reshape(-1, 1)
neg_counts = np.array(neg_counts).reshape(-1, 1)

sentiment_features = np.hstack((pos_counts, neg_counts))
sentiment_features_sparse = sparse.csr_matrix(sentiment_features)

# Combine the sparse matrices
combined_features = sparse.hstack([bow_features, sentiment_features_sparse])
sparse.save_npz('features/combined_features', combined_features)

print(combined_features.shape)

Positive words: 11, Negative words: 2
Positive words: 8, Negative words: 9
Positive words: 22, Negative words: 8
Positive words: 6, Negative words: 6
Positive words: 9, Negative words: 0
Positive words: 2, Negative words: 3
Positive words: 12, Negative words: 0
Positive words: 2, Negative words: 0
Positive words: 4, Negative words: 5
Positive words: 11, Negative words: 3
Positive words: 2, Negative words: 0
Positive words: 10, Negative words: 2
Positive words: 11, Negative words: 0
Positive words: 4, Negative words: 0
Positive words: 7, Negative words: 0
Positive words: 4, Negative words: 0
Positive words: 18, Negative words: 14
Positive words: 4, Negative words: 0
Positive words: 6, Negative words: 2
Positive words: 5, Negative words: 5
Positive words: 3, Negative words: 0
Positive words: 6, Negative words: 0
Positive words: 9, Negative words: 1
Positive words: 4, Negative words: 0
Positive words: 3, Negative words: 0
Positive words: 6, Negative words: 5
Positive words: 2, Negative wo