In [9]:
import torch
import transformers
import nlpaug.augmenter.word as naw
import torch
print("NLPAug imported successfully")
import random

print(f"Torch version: {torch.__version__}")
print(f"Transformers version: {transformers.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"Current device: {torch.cuda.get_device_name(0)}")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
# Sample positive and negative reviews
positive_reviews = [
    "This product is amazing!",
    "I highly recommend this.",
    "It's the best I've ever used.",
    "Excellent quality and value.",
    "Five stars!",
    "Great customer service.",
    "Exactly what I was looking for.",
    "Very satisfied with my purchase.",
    "Outstanding performance.",
    "Worth every penny!"
]

negative_reviews = [
    "This product is terrible.",
    "I would not recommend this.",
    "It's the worst I've ever used.",
    "Poor quality and overpriced.",
    "One star!",
    "Horrible customer service.",
    "Not what I expected at all.",
    "Very disappointed with my purchase.",
    "Unreliable performance.",
    "Complete waste of money!"
]

# Generate 500 base sentences by repeating and slightly modifying the samples
all_reviews = []
for i in range(250):
    # Add some random variation to avoid exact duplicates
    pos_review = positive_reviews[i % len(positive_reviews)]
    neg_review = negative_reviews[i % len(negative_reviews)]

    # Add simple variations to make the dataset more diverse
    if random.random() > 0.5:
        pos_review = "Honestly, " + pos_review
    if random.random() > 0.5:
        neg_review = "Unfortunately, " + neg_review

    all_reviews.append(pos_review)
    all_reviews.append(neg_review)

# Initialize augmenters
aug_insert = naw.ContextualWordEmbsAug(
    model_path='bert-base-uncased',
    action="insert",
    aug_p=0.1  # Probability of augmenting each word
)

aug_sub = naw.ContextualWordEmbsAug(
    model_path='bert-base-uncased',
    action="substitute",
    aug_p=0.1,
    stopwords=['not', 'no', 'never']  # Prevent changing sentiment-critical words
)


# Augment the dataset
augmented_reviews = []
for review in all_reviews:
    try:
        # Insert words
        aug_text = aug_insert.augment(review)[0]
        augmented_reviews.append(aug_text)

        # Substitute words
        aug_text = aug_sub.augment(review)[0]
        augmented_reviews.append(aug_text)

        # Simple word deletion (manual approach)
        words = review.split()
        if len(words) > 3:  # Only delete if we have enough words
            del_idx = random.randint(0, len(words)-1)
            words.pop(del_idx)
            aug_text = " ".join(words)
            augmented_reviews.append(aug_text)

    except Exception as e:
        print(f"Error augmenting review: {review}")
        print(f"Error message: {str(e)}")
        continue

# Combine original and augmented reviews
final_reviews = all_reviews + augmented_reviews

# Print some statistics
print(f"Original reviews: {len(all_reviews)}")
print(f"Augmented reviews: {len(augmented_reviews)}")
print(f"Total reviews: {len(final_reviews)}")

# Print some examples
print("\nExample augmentations:")
for i in range(3):
    orig_idx = random.randint(0, len(all_reviews)-1)
    aug_idx = random.randint(0, len(augmented_reviews)-1)
    print(f"\nOriginal: {all_reviews[orig_idx]}")
    print(f"Augmented: {augmented_reviews[aug_idx]}")

NLPAug imported successfully
Torch version: 2.5.1+cu121
Transformers version: 4.48.1
CUDA available: True
Current device: NVIDIA GeForce RTX 4080 SUPER
cuda
Original reviews: 500
Augmented reviews: 1364
Total reviews: 1864

Example augmentations:

Original: Honestly, This product is amazing!
Augmented: unreliable belief.

Original: Unfortunately, I would not recommend this.
Augmented: i highly recommend that.

Original: Unfortunately, It's the worst I've ever used.
Augmented: unfortunately, i would so not recommend this.


In [48]:
from transformers import pipeline
import random
import torch
from tqdm import tqdm

class ReviewReconstructor:
    def __init__(self, device=None):
        self.device = device if device else ('cuda' if torch.cuda.is_available() else 'cpu')
        self.generator = self._create_generator()
        # Sentiment keywords for better context understanding
        self.positive_keywords = {'amazing', 'recommend', 'best', 'excellent', 'stars', 'great', 
                                'exactly', 'satisfied', 'outstanding', 'worth'}
        self.negative_keywords = {'terrible', 'not', 'worst', 'poor', 'horrible', 'disappointed', 
                                'unreliable', 'waste', 'unfortunately'}

    def _create_generator(self):
        return pipeline(
            "text2text-generation",
            model="t5-base",
            device=0 if self.device == 'cuda' else -1,
            clean_up_tokenization_spaces=True
        )

    def _detect_sentiment(self, text):
        """Detect sentiment based on keyword presence."""
        text_lower = text.lower()
        pos_count = sum(1 for word in self.positive_keywords if word in text_lower)
        neg_count = sum(1 for word in self.negative_keywords if word in text_lower)
        return 'positive' if pos_count > neg_count else 'negative'

    def _get_context(self, text_list, current_idx):
        # Get surrounding context
        prev_texts = [t for t in text_list[max(0, current_idx - 2):current_idx] if t.strip()]
        next_texts = [t for t in text_list[current_idx + 1:current_idx + 3] if t.strip()]
        
        # Combine context
        context_text = " ".join(prev_texts + next_texts)
        sentiment = self._detect_sentiment(context_text)
        
        # Format prompt with sentiment guidance
        prompt = f"complete {sentiment} review:"
        if prev_texts:
            prompt += f" {' '.join(prev_texts)}"
        prompt += " [MISSING]"
        if next_texts:
            prompt += f" {' '.join(next_texts)}"
            
        return prompt, sentiment

    def _clean_generated_text(self, text, sentiment):
        """Clean and validate generated text."""
        # Remove common prefix artifacts
        artifacts = [
            "review::", "negative review:", "complete positive review:",
            "complete negative review:", "positive review:", "complete review:", ":", "True"
        ]
        for artifact in artifacts:
            text = text.replace(artifact, "").strip()

        # Remove [MISSING] placeholders
        text = text.replace("[MISSING]", "").strip()

        # Ensure proper sentence structure
        if len(text.split()) < 3:
            text = "This product is excellent!" if sentiment == 'positive' else "This product is disappointing."

        # Ensure proper ending punctuation
        if not any(text.endswith(char) for char in ".!?"):
            text += "."

        return text

    def reconstruct_texts(self, reviews, missing_indices, batch_size=32):
        """Reconstruct missing texts with batched processing."""
        reconstructed = reviews.copy()

        # Process in batches
        for i in tqdm(range(0, len(missing_indices), batch_size)):
            batch_indices = missing_indices[i:i + batch_size]
            prompts = []
            sentiments = []
            
            # Prepare batch
            for idx in batch_indices:
                prompt, sentiment = self._get_context(reviews, idx)
                prompts.append(prompt)
                sentiments.append(sentiment)
            
            # Generate texts
            generated = self.generator(
                prompts,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                top_k=50,
                repetition_penalty=1.2,
                num_return_sequences=1,
                no_repeat_ngram_size=2,
                max_length=50,
                batch_size=batch_size
            )
            
            # Process generated texts
            for idx, gen, sentiment in zip(batch_indices, generated, sentiments):
                # The generated output is already a dictionary with 'generated_text' key
                text = gen['generated_text']  # Removed the [0] indexing
                cleaned_text = self._clean_generated_text(text, sentiment)
                reconstructed[idx] = cleaned_text
            
        return reconstructed

# Initialize reconstructor
reconstructor = ReviewReconstructor()
missing_percentage=0.1

# Create gaps
num_missing = int(len(final_reviews) * missing_percentage)
missing_indices = random.sample(range(len(final_reviews)), num_missing)
reviews_with_gaps = final_reviews.copy()

for idx in missing_indices:
    reviews_with_gaps[idx] = ""

# Reconstruct
reconstructed = reconstructor.reconstruct_texts(reviews_with_gaps, missing_indices)

# Print results
print(f"\nTotal reviews: {len(final_reviews)}")
print(f"Gaps created: {num_missing}")
print("\nReconstruction Examples:")

# Show some examples
sample_size = min(5, len(missing_indices))
for idx in random.sample(missing_indices, sample_size):
    print(f"Reconstructed: {reconstructed[idx]}")
    


Device set to use cuda:0
100%|██████████| 6/6 [00:03<00:00,  1.97it/s]


Total reviews: 1864
Gaps created: 186

Reconstruction Examples:
This product is disappointing.
This product is disappointing.
This product is disappointing.
This product is disappointing.
This product is disappointing.



