In [33]:
import json
from sklearn.model_selection import train_test_split
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np
import pickle as pkl

In [15]:
# Path to the All_Beauty reviews file you downloaded
reviews_file_path = 'All_Beauty.jsonl'
SEED = 0

In [16]:
# Load the reviews and the ratings
texts, ratings = [], []
with open(reviews_file_path, 'r') as file:
    for line in file:
        review = json.loads(line.strip())
        texts.append(review['text'])
        ratings.append(review['rating'])

In [17]:
# Convert ratings to binary sentiment (1 for positive, 0 for negative)
sentiments = [1 if rating >= 4 else 0 for rating in ratings]

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(texts, sentiments, test_size=0.2, random_state=SEED)

In [18]:
# pkl the train and test data
train_df = pd.DataFrame({'text': X_train, 'sentiment': y_train})
test_df = pd.DataFrame({'text': X_test, 'sentiment': y_test})
train_df.to_pickle('beauty-train.pkl')
test_df.to_pickle('beauty-test.pkl')

In [24]:
len(X_train), len(X_test)

(561222, 140306)

In [27]:
# Generate embeddings
def generate_embeddings(texts):
    embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
    batch_size = 256  # Adjust based on your system's memory capacity
    embeddings = []
    print('transforming data...')
    for i in range(0, len(texts), batch_size):
        if i % 5000 == 0:
            print(i, '/', len(texts))
        batch = texts[i:i + batch_size]
        batch_embeddings = embedding_model.encode(batch)
        embeddings.extend(batch_embeddings)
    return embeddings

train_embeddings = generate_embeddings(X_train)
test_embeddings = generate_embeddings(X_test)

transforming data...
0 / 561222
160000 / 561222
320000 / 561222
480000 / 561222
transforming data...
0 / 140306


In [34]:
train_embeddings = np.array(train_embeddings)
test_embeddings = np.array(test_embeddings)

# save out as pkl
with open('beauty-train-embeddings.pkl', 'wb') as file:
    pkl.dump(train_embeddings, file)
    
with open('beauty-test-embeddings.pkl', 'wb') as file:
    pkl.dump(test_embeddings, file)

In [35]:
train_embeddings.shape, test_embeddings.shape

((561222, 384), (140306, 384))

In [32]:
# classifying the embeddings as a sanity check
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

classifier = LogisticRegression(max_iter=1000)
classifier.fit(train_embeddings, y_train)

train_predictions = classifier.predict(train_embeddings)
test_predictions = classifier.predict(test_embeddings)

train_accuracy = accuracy_score(y_train, train_predictions)
test_accuracy = accuracy_score(y_test, test_predictions)

print('Train Accuracy:', train_accuracy)
print('Test Accuracy:', test_accuracy)

Train Accuracy: 0.8677439587186532
Test Accuracy: 0.8673185751143928
