In [2]:
import json
from sklearn.model_selection import train_test_split
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np
import pickle as pkl

In [28]:
# Path to the All_Beauty reviews file you downloaded
reviews_file_path = 'All_Beauty.jsonl'
SEED = 0

In [29]:
# Load the reviews and the ratings
texts, ratings = [], []
with open(reviews_file_path, 'r') as file:
    for line in file:
        review = json.loads(line.strip())
        texts.append(review['text'])
        ratings.append(review['rating'])

In [30]:
# get indices where number of words is > 5 and number of words is < 100
indices = [i for i, text in enumerate(texts) if 5 < len(text.split()) < 100]
texts = [texts[i] for i in indices]
ratings = [ratings[i] for i in indices]

In [31]:
# Convert ratings to binary sentiment (1 for positive, 0 for negative)
sentiments = [1 if rating >= 4 else 0 for rating in ratings]

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(texts, sentiments, test_size=0.2, random_state=SEED)

In [32]:
# pkl the train and test data
train_df = pd.DataFrame({'text': X_train, 'sentiment': y_train})
test_df = pd.DataFrame({'text': X_test, 'sentiment': y_test})
train_df.to_pickle('beauty-train.pkl')
test_df.to_pickle('beauty-test.pkl')

In [33]:
len(X_train), len(X_test)

(434663, 108666)

In [34]:
# Generate embeddings
def generate_embeddings(texts):
    embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
    batch_size = 256  # Adjust based on your system's memory capacity
    embeddings = []
    print('transforming data...')
    num_batches = len(texts) // batch_size
    for i in range(0, len(texts), batch_size):
        if i % 100 == 0:
            print(f'Processing batch {i // batch_size + 1}/{num_batches}')
        batch = texts[i:i + batch_size]
        batch_embeddings = embedding_model.encode(batch)
        embeddings.extend(batch_embeddings)
    return embeddings

In [35]:
train_embeddings = generate_embeddings(X_train)
test_embeddings = generate_embeddings(X_test)

transforming data...
Processing batch 1/1697
Processing batch 26/1697
Processing batch 51/1697
Processing batch 76/1697
Processing batch 101/1697
Processing batch 126/1697
Processing batch 151/1697
Processing batch 176/1697
Processing batch 201/1697
Processing batch 226/1697
Processing batch 251/1697
Processing batch 276/1697
Processing batch 301/1697
Processing batch 326/1697
Processing batch 351/1697
Processing batch 376/1697
Processing batch 401/1697
Processing batch 426/1697
Processing batch 451/1697
Processing batch 476/1697
Processing batch 501/1697
Processing batch 526/1697
Processing batch 551/1697
Processing batch 576/1697
Processing batch 601/1697
Processing batch 626/1697
Processing batch 651/1697
Processing batch 676/1697
Processing batch 701/1697
Processing batch 726/1697
Processing batch 751/1697
Processing batch 776/1697
Processing batch 801/1697
Processing batch 826/1697
Processing batch 851/1697
Processing batch 876/1697
Processing batch 901/1697
Processing batch 926/1

In [36]:
train_embeddings = np.array(train_embeddings)
test_embeddings = np.array(test_embeddings)

# save out as pkl
with open('beauty-train-embeddings.pkl', 'wb') as file:
    pkl.dump(train_embeddings, file)
    
with open('beauty-test-embeddings.pkl', 'wb') as file:
    pkl.dump(test_embeddings, file)

In [37]:
train_embeddings.shape, test_embeddings.shape

((434663, 384), (108666, 384))

In [38]:
# classifying the embeddings as a sanity check
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

classifier = LogisticRegression(max_iter=1000)
classifier.fit(train_embeddings, y_train)

train_predictions = classifier.predict(train_embeddings)
test_predictions = classifier.predict(test_embeddings)

train_accuracy = accuracy_score(y_train, train_predictions)
test_accuracy = accuracy_score(y_test, test_predictions)

print('Train Accuracy:', train_accuracy)
print('Test Accuracy:', test_accuracy)

Train Accuracy: 0.8616767472731749
Test Accuracy: 0.861879520733256


In [41]:
import numpy as np

def generate_ordered_sentence_neighborhood(sentence, num_samples=5000):
    words = sentence.split()
    num_words = len(words)
    
    # Initialize the list to store neighborhood sentences
    neighborhood_sentences = [sentence]  # Include the original sentence as the first sample
    
    for _ in range(num_samples - 1):  # We already have the original sentence, hence num_samples - 1
        num_words_to_remove = np.random.randint(1, num_words)  # Number of words to remove
        words_to_remove = np.random.choice(range(num_words), size=num_words_to_remove, replace=False)
        perturbed_sentence = ' '.join([word for idx, word in enumerate(words) if idx not in words_to_remove])
        neighborhood_sentences.append(perturbed_sentence)
    
    return neighborhood_sentences

# Example usage:
original_sentence = "This is an example sentence to demonstrate how to randomly remove words."

# Generating 5000 sentences (including the original)
neighborhood = generate_ordered_sentence_neighborhood(original_sentence)
print(f"Generated {len(neighborhood)} sentences. Showing the first 10:")
for idx, sentence in enumerate(neighborhood[:10]):
    print(f"Sample {idx}: {sentence}")


Generated 5000 sentences. Showing the first 10:
Sample 0: This is an example sentence to demonstrate how to randomly remove words.
Sample 1: is an sentence to demonstrate to remove
Sample 2: This demonstrate
Sample 3: This demonstrate remove words.
Sample 4: demonstrate how remove words.
Sample 5: This is an example to demonstrate how to randomly remove words.
Sample 6: This an example sentence demonstrate how randomly remove words.
Sample 7: example sentence demonstrate how randomly words.
Sample 8: is an to demonstrate how to randomly remove words.
Sample 9: is to to remove
