In [1]:
!pip install groq
!pip install vaderSentiment
!pip install textstat

Collecting groq
  Downloading groq-0.11.0-py3-none-any.whl.metadata (13 kB)
Downloading groq-0.11.0-py3-none-any.whl (106 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/106.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.5/106.5 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: groq
Successfully installed groq-0.11.0
Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2
Collecting textstat
  Downloading textstat-0.7.4-py3-none-any.whl.metadata (14 kB)
Collecting pyphen (from textstat)
  Downloading pyphen-0.17.0-py3-none-any.whl.metadata (3.2 kB)
Do

In [2]:
import os
import re
import pandas as pd
from groq import Groq

# Set up your Groq API key
os.environ["GROQ_API_KEY"] = ""  # or directly pass as Groq(api_key="your_groq_api_key")

# Initialize the Groq client
client = Groq()

# Function to generate synthetic text using multiple examples as context
def generate_synthetic_text(input_examples):
    # Format the examples into a prompt context
    context = "\n".join([f"- {text}" for text in input_examples])
    prompt = f"Based on the following examples of text with a similar sentiment:\n{context}\nGenerate a new example with the same sentiment. Please enclose the example in double quotes."

    completion = client.chat.completions.create(
        model="llama-3.1-70b-versatile",
        messages=[
            {
                "role": "user",
                "content": prompt
            }
        ],
        temperature=1,
        max_tokens=1024,
        top_p=1,
        stream=True,
        stop=None,
    )

    generated_text = ""
    for chunk in completion:
        generated_text += chunk.choices[0].delta.content or ""

    # Extract only the text enclosed in double quotes
    quoted_text = re.search(r'"(.*?)"', generated_text)
    return quoted_text.group(1) if quoted_text else None

# Load your dataset
data_path = '/content/preprocessed_twitter_data_small (1).csv'
data = pd.read_csv(data_path)

# Define the column names
sentiment_column = 'sentiment'  # Replace with the actual sentiment column name
text_column = 'text'  # Replace with the actual text column name

# Create synthetic data for each sentiment class
synthetic_data = []
sentiment_classes = data[sentiment_column].unique()  # Extract unique sentiment classes

for sentiment in sentiment_classes:
    # Select all rows of this sentiment to use as context
    sentiment_data = data[data[sentiment_column] == sentiment][text_column].tolist()

    # Generate 10 synthetic rows for this sentiment class
    for i in range(10):
        synthetic_text = generate_synthetic_text(sentiment_data[:10])  # Use the first 10 examples as context
        if synthetic_text:  # Only add if extraction was successful
            synthetic_data.append({
                "original_text": sentiment_data[i % 10],  # Store each example from the first 10 as original text
                "synthetic_text": synthetic_text,
                "sentiment": sentiment
            })

# Convert synthetic data to DataFrame
synthetic_df = pd.DataFrame(synthetic_data)

# Save synthetic data to CSV
synthetic_df.to_csv('/content/synthetic_data_llama.csv', index=False)
print("Synthetic data saved successfully.")


Synthetic data saved successfully.


# Perplexity, Lexical Diversity and Embedding Similarity

In [4]:
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load the synthetic data (make sure this file path is correct)
synthetic_data_path = '/content/synthetic_data_llama.csv'
synthetic_data = pd.read_csv(synthetic_data_path)

# Initialize GPT-2 model for perplexity calculation
gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2")
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Initialize Sentence Transformer model for embedding similarity
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to calculate perplexity
def calculate_perplexity(text):
    inputs = gpt2_tokenizer(text, return_tensors="pt")
    outputs = gpt2_model(**inputs, labels=inputs["input_ids"])
    loss = outputs.loss
    perplexity = torch.exp(loss)
    return perplexity.item()

# Function to calculate lexical diversity, handling empty text
def lexical_diversity(text):
    words = text.split()
    return len(set(words)) / len(words) if len(words) > 0 else 0

# Calculate metrics for each row in the synthetic dataset
perplexities_original = []
perplexities_synthetic = []
lexical_diversities_original = []
lexical_diversities_synthetic = []
embedding_similarities = []

for index, row in synthetic_data.iterrows():
    original_text = row['original_text']
    synthetic_text = row['synthetic_text']

    # Perplexity
    perplexities_original.append(calculate_perplexity(original_text))
    perplexities_synthetic.append(calculate_perplexity(synthetic_text))

    # Lexical Diversity
    lexical_diversities_original.append(lexical_diversity(original_text))
    lexical_diversities_synthetic.append(lexical_diversity(synthetic_text))

    # Embedding Similarity
    original_embedding = embedding_model.encode([original_text])[0]
    synthetic_embedding = embedding_model.encode([synthetic_text])[0]
    similarity = cosine_similarity([original_embedding], [synthetic_embedding])[0][0]
    embedding_similarities.append(similarity)

# Add calculated metrics to the dataframe
synthetic_data['perplexity_original'] = perplexities_original
synthetic_data['perplexity_synthetic'] = perplexities_synthetic
synthetic_data['lexical_diversity_original'] = lexical_diversities_original
synthetic_data['lexical_diversity_synthetic'] = lexical_diversities_synthetic
synthetic_data['embedding_similarity'] = embedding_similarities

# Calculate average metrics grouped by sentiment
grouped_metrics = synthetic_data.groupby('sentiment').agg({
    'perplexity_original': 'mean',
    'perplexity_synthetic': 'mean',
    'lexical_diversity_original': 'mean',
    'lexical_diversity_synthetic': 'mean',
    'embedding_similarity': 'mean'
}).reset_index()

# Display the grouped metrics
print("Comparison of Original and Synthetic Text Metrics by Sentiment Type:")
print(grouped_metrics)




Comparison of Original and Synthetic Text Metrics by Sentiment Type:
    sentiment  perplexity_original  perplexity_synthetic  \
0  Irrelevant          3805.482076             90.143864   
1    Negative           324.332158             58.703008   
2     Neutral           501.634111            117.940827   
3    Positive           247.962753             33.712287   

   lexical_diversity_original  lexical_diversity_synthetic  \
0                    0.949769                     0.954778   
1                    0.953587                     0.975301   
2                    0.964069                     0.981607   
3                    0.983687                     0.988235   

   embedding_similarity  
0              0.186310  
1              0.196073  
2              0.099976  
3              0.184331  


# VADAR Sentiment Analysis

In [5]:
# !pip install vaderSentiment
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

# Function to calculate sentiment intensity
def sentiment_strength(text):
    return analyzer.polarity_scores(text)['compound']

# Load the synthetic data (make sure this file path is correct)
synthetic_data_path = '/content/synthetic_data_llama.csv'
synthetic_data = pd.read_csv(synthetic_data_path)

real_data=pd.read_csv('/content/preprocessed_twitter_data_small (1).csv')

# Apply sentiment strength analysis on synthetic and original data
synthetic_data['sentiment_strength'] = synthetic_data['synthetic_text'].apply(sentiment_strength)
real_data['sentiment_strength'] = real_data['text'].apply(sentiment_strength)

# Compare the average sentiment strength by sentiment type
strength_comparison = synthetic_data.groupby('sentiment')['sentiment_strength'].mean()
print("Average Sentiment Strength Comparison by Sentiment Type:")
print(strength_comparison)


Average Sentiment Strength Comparison by Sentiment Type:
sentiment
Irrelevant    0.18370
Negative     -0.37350
Neutral      -0.19392
Positive      0.67157
Name: sentiment_strength, dtype: float64


# N Gram Analysis

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# Calculate bigram overlap
vectorizer = CountVectorizer(ngram_range=(2, 2))

# Transform the real data to extract bigrams
original_bigrams = vectorizer.fit_transform(real_data['text']).toarray()
synthetic_bigrams = vectorizer.transform(synthetic_data['synthetic_text']).toarray()

# Sum along the rows to get the total count of each bigram across the dataset
original_bigrams_sum = original_bigrams.sum(axis=0)
synthetic_bigrams_sum = synthetic_bigrams.sum(axis=0)

# Calculate overlap as a percentage by taking the minimum count of each bigram and dividing by the maximum
overlap = np.sum(np.minimum(original_bigrams_sum, synthetic_bigrams_sum)) / np.sum(np.maximum(original_bigrams_sum, synthetic_bigrams_sum)) * 100

print("Bigram Overlap Percentage between Original and Synthetic Texts:", overlap)


Bigram Overlap Percentage between Original and Synthetic Texts: 2.3126463700234194


# Topic Consistency

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pandas as pd

# Load combined data with original and synthetic texts
file_path = '/content/synthetic_data_llama.csv'  # Update with actual path
data = pd.read_csv(file_path)

# Initialize LDA and TF-IDF Vectorizer
n_topics = 5  # Adjust the number of topics as necessary
lda = LatentDirichletAllocation(n_components=n_topics, random_state=0)
vectorizer = TfidfVectorizer(max_features=1000)

# Function to calculate topic distribution for a specific sentiment type and dataset
def calculate_topic_distribution(data_subset, text_column):
    tfidf_matrix = vectorizer.fit_transform(data_subset[text_column])
    topic_distribution = lda.fit_transform(tfidf_matrix)
    return topic_distribution.mean(axis=0)

# Dictionary to store results
comparison_results = {}

# List of unique sentiments
sentiments = data['sentiment'].unique()

# Analyze each sentiment type
for sentiment in sentiments:
    # Filter data for the current sentiment
    original_data_sentiment = data[data['sentiment'] == sentiment]['original_text']
    synthetic_data_sentiment = data[data['sentiment'] == sentiment]['synthetic_text']

    # Calculate topic distribution for original and synthetic data for this sentiment
    original_topic_dist = calculate_topic_distribution(data[data['sentiment'] == sentiment], 'original_text')
    synthetic_topic_dist = calculate_topic_distribution(data[data['sentiment'] == sentiment], 'synthetic_text')

    # Store results
    comparison_results[sentiment] = {
        'Original Topic Distribution': original_topic_dist,
        'Synthetic Topic Distribution': synthetic_topic_dist
    }

# Display results
for sentiment, dist in comparison_results.items():
    print(f"\nSentiment: {sentiment}")
    print("Original Topic Distribution:", dist['Original Topic Distribution'])
    print("Synthetic Topic Distribution:", dist['Synthetic Topic Distribution'])



Sentiment: Irrelevant
Original Topic Distribution: [0.3565141  0.1786207  0.20403984 0.21300903 0.04781633]
Synthetic Topic Distribution: [0.12239071 0.2002812  0.20501089 0.27614345 0.19617375]

Sentiment: Negative
Original Topic Distribution: [0.12169552 0.33359285 0.12797208 0.20553899 0.21120056]
Synthetic Topic Distribution: [0.28335702 0.19844521 0.20051218 0.19824464 0.11944095]

Sentiment: Neutral
Original Topic Distribution: [0.18886666 0.12060077 0.12064597 0.2027136  0.36717301]
Synthetic Topic Distribution: [0.2024838  0.12098591 0.35576861 0.19889396 0.12186772]

Sentiment: Positive
Original Topic Distribution: [0.13622398 0.22522698 0.29807291 0.12899818 0.21147795]
Synthetic Topic Distribution: [0.12582722 0.20387084 0.20408298 0.3390149  0.12720406]


# Diversity of Generated Text

In [8]:
import pandas as pd

# Specify the path to your file containing both original and synthetic data
file_path = '/content/synthetic_data_llama.csv'  # Update with actual file path

# Load the data
combined_data = pd.read_csv(file_path)

# Calculate uniqueness within the synthetic data itself
unique_synthetic_texts = combined_data['synthetic_text'].nunique()
total_synthetic_texts = len(combined_data['synthetic_text'])
self_uniqueness_ratio = unique_synthetic_texts / total_synthetic_texts * 100

# Calculate overlap between original and synthetic data
overlapping_texts = combined_data[combined_data['synthetic_text'].isin(combined_data['original_text'])].shape[0]
original_to_synthetic_overlap_ratio = (overlapping_texts / total_synthetic_texts) * 100

# Print the results
print("Uniqueness Ratio of Synthetic Data (compared to itself):", self_uniqueness_ratio, "%")
print("Overlap Ratio between Original and Synthetic Data:", original_to_synthetic_overlap_ratio, "%")


Uniqueness Ratio of Synthetic Data (compared to itself): 100.0 %
Overlap Ratio between Original and Synthetic Data: 0.0 %


# Readability Scores

In [9]:
# Install textstat library

import pandas as pd
from textstat import flesch_reading_ease, flesch_kincaid_grade

# Load the combined dataset containing both original and synthetic text columns
file_path = '/content/synthetic_data_llama.csv'  # Update with the actual file path
combined_data = pd.read_csv(file_path)

# Calculate readability for synthetic and original texts separately
combined_data['flesch_reading_ease_original'] = combined_data['original_text'].apply(flesch_reading_ease)
combined_data['flesch_reading_ease_synthetic'] = combined_data['synthetic_text'].apply(flesch_reading_ease)

# Compare average readability by sentiment type for both original and synthetic texts
readability_comparison = combined_data.groupby('sentiment')[
    ['flesch_reading_ease_original', 'flesch_reading_ease_synthetic']
].mean()

# Print the readability comparison by sentiment
print("Average Readability Comparison by Sentiment Type:")
print(readability_comparison)


Average Readability Comparison by Sentiment Type:
            flesch_reading_ease_original  flesch_reading_ease_synthetic
sentiment                                                              
Irrelevant                        70.560                         78.012
Negative                          65.778                         65.627
Neutral                           67.858                         71.549
Positive                          95.657                         69.383
