In [None]:
import ast
import asyncio
import csv
import datetime
import json
import os
import re
import time
from collections import Counter

import google.generativeai as genai
import json5
import pandas as pd

In [None]:
# A single Gemini API key is enough; the extra keys are optional and are rotated
# by augment_comments so requests are spread across per-key rate limits.
# Keys are read from environment variables so real credentials never need to be
# saved in the notebook; the placeholder strings below remain as fallbacks and
# can still be edited by hand.
gemini_api = os.environ.get('GEMINI_API_KEY', 'your_gemini_api')
gemini_api_2 = os.environ.get('GEMINI_API_KEY_2', 'optional_gemini_api')
gemini_api_3 = os.environ.get('GEMINI_API_KEY_3', 'optional_gemini_api')
gemini_api_4 = os.environ.get('GEMINI_API_KEY_4', 'optional_gemini_api')
gemini_api_5 = os.environ.get('GEMINI_API_KEY_5', 'optional_gemini_api')

In [None]:
def escape_inner_double_quotes(s):
    """Escape stray double quotes that appear inside JSON string values.

    Intended for almost-valid JSON arrays of strings such as
    ``["He said "hi" to me", "ok"]``, where quotes inside a value were not
    escaped.

    The previous regex-based version could only match strings whose quotes
    were already escaped, so it always returned its input unchanged. This
    version scans the text once and classifies every unescaped ``"``:

    * outside a string it opens a string;
    * inside a string it closes the string only when what follows looks
      structural: optional whitespace and then ``]``, the end of the text, or
      a comma followed by the next opening quote;
    * any other quote inside a string is treated as content and escaped.

    This is a heuristic; ambiguous text (for example a value that itself
    contains ``", "``) cannot be repaired reliably.

    Args:
        s: Text to repair.

    Returns:
        The repaired text. The input is returned unchanged when it is already
        valid JSON, or when the scan ends inside an unterminated string (the
        delimiters could not be paired with confidence).
    """
    try:
        json.loads(s)
        return s  # already valid JSON: never rewrite well-formed input
    except ValueError:
        pass

    closes_string = re.compile(r'\s*(?:,\s*"|\]|\Z)')
    pieces = []
    in_string = False
    pos, length = 0, len(s)
    while pos < length:
        ch = s[pos]
        if in_string and ch == '\\' and pos + 1 < length:
            # Keep existing escape pairs (\" \\ \n ...) exactly as written.
            pieces.append(s[pos:pos + 2])
            pos += 2
            continue
        if ch == '"':
            if not in_string:
                in_string = True
            elif closes_string.match(s, pos + 1):
                in_string = False
            else:
                pieces.append('\\')  # inner quote: emit it as \"
        pieces.append(ch)
        pos += 1
    return s if in_string else ''.join(pieces)

def robust_json_loads(raw_output):
    cleaned = raw_output.strip()
    if cleaned.startswith("```") and cleaned.endswith("```"):
        cleaned = cleaned[3:-3].strip()
    lines = cleaned.splitlines()
    if lines and lines[0].strip().lower() == "json":
        cleaned = "\n".join(lines[1:]).strip()
    cleaned = cleaned.strip("`").strip()
    cleaned = re.sub(r'[\x00-\x1f]+', ' ', cleaned)
    m = re.search(r'\[.*\]', cleaned, re.DOTALL)
    candidate = m.group(0) if m else cleaned
    candidate_fixed = re.sub(r'"\s+"', '", "', candidate)
    candidate_fixed2 = escape_inner_double_quotes(candidate_fixed)
    parsers = [
        ("json.loads", json.loads),
        ("ast.literal_eval", ast.literal_eval),
        ("unicode_escape", lambda s: json.loads(s.encode('utf-8').decode('unicode_escape'))),
        ("json5.loads", json5.loads)
    ]
    for candidate_to_try in [candidate, candidate_fixed, candidate_fixed2]:
        for name, parse_func in parsers:
            try:
                result = parse_func(candidate_to_try)
                if isinstance(result, list):
                    return result
                else:
                    print(f"{name} did not return a list.")
            except Exception as e:
                print(f"Error using {name} on candidate: {e}")
    print("All parsing attempts failed. Final candidate output:")
    print(candidate)
    return None

In [None]:
# Module-level request counter used to rotate through the configured API keys.
toggle = 0
async def augment_comments(comments):
    """Ask Gemini to paraphrase a batch of comments.

    The comments are embedded in a prompt that requests a JSON array with one
    rephrased comment per input, in the same order. Each call uses the next
    API key in rotation (tracked by the module-level ``toggle`` counter).

    NOTE: ``model.generate_content`` is a synchronous call, so this coroutine
    blocks the event loop while the request is in flight.

    Args:
        comments: Non-empty list of comment strings.

    Returns:
        A list of augmented comments with the same length as ``comments``, or
        None when the input is invalid, the request fails, the reply cannot be
        parsed, or the number of returned comments does not match the input.
        Failures are reported with ``print`` rather than raised.
    """
    if not comments or not isinstance(comments, list):
        print("Invalid or empty comments list provided.")
        return None

    # The backslashes in the quoting instructions are doubled on purpose: in a
    # non-raw string \" collapses to a bare ", which would drop the backslash
    # from the instruction the model is supposed to follow.
    prompt = f'''
    You are a comment augmentation model. Your task is to take a list of clean YouTube comments and generate augmented versions of each comment. The augmented comment should:

    DO:
    - Retain the original sentiment and meaning exactly.
    - Introduce linguistic diversity by varying vocabulary, sentence structure, and phrasing.
    - Preserve all domain-specific terminology and details so that the context remains intact.
    - Produce an output that is distinctly different from the input while conveying the same sentiment.
    - Ensure the augmented output is valid and properly formatted as a JSON array.

    DON'T:
    - Alter the sentiment (a neutral comment must remain neutral).
    - Introduce any unrelated or extraneous information beyond rephrasing.
    - Use overly complex or unnatural language that doesn't match typical conversational style.

    Additional Instructions:
    - If the input comment is in a non-English language, translate it into English while preserving its original sentiment and tone.
    - **IMPORTANT:** When producing the output, ensure that each augmented comment is enclosed in double quotes (" ") and that every internal double quotes are correctly escaped (for example, use \\" for an internal quote).
    - If a comment contains a double quoted word like "word", then its quotations must be escaped like this \\"word\\" in the output.
    - Do not mix quote types (avoid starting with a single quote and ending with a double quote, or vice versa).

    Input format (comments):
    {comments}

    Response format should be exactly like the following (horizontal format):
    ["Augmented Comment 1", "Augmented Comment 2", "Augmented Comment 3", ...]

    Not like the following (vertical format):
    ["Augmented Comment 1",
    "Augmented Comment 2",
    "Augmented Comment 3",
    ...]

    Do not include any extra explanations or commentary; simply return the list of augmented comments in the exact order as provided in the input.
'''
    # Rotate through the API keys, one key per request, to spread the load
    # across per-key rate limits. Adjust the key list to match the keys you
    # actually have.
    global toggle
    api_keys = [gemini_api, gemini_api_2, gemini_api_3, gemini_api_4, gemini_api_5]
    api_key = api_keys[toggle % len(api_keys)]
    toggle += 1

    genai.configure(api_key=api_key)
    model = genai.GenerativeModel(model_name="gemini-2.0-flash-exp")
    try:
        response = model.generate_content(prompt)
        if response and response.text:
            raw_output = response.text.strip()
            try:
                # Keep only the outermost [...] span before handing it to the parser.
                aug_comment_match = re.search(r'\[.*\]', raw_output, re.DOTALL)
                if aug_comment_match:
                    aug_comments = robust_json_loads(aug_comment_match.group(0))
                else:
                    print("Raw output:", raw_output)
                    print("No augmented comments found in the response.")
                    return None
                if aug_comments is None:
                    print("robust_json_loads returned None.")
                    return None
                # A batch is only usable when every input has exactly one output.
                if len(aug_comments) == len(comments):
                    print(f'len of comments: {len(comments)}, len of augmented comments: {len(aug_comments)}')
                    return aug_comments
                else:
                    print(f"Mismatch between number of comments and augmented comments. len of comments: {len(comments)}, len of augmented comments: {len(aug_comments)}\nRaw output: {raw_output}\nAugmented comments: {aug_comments}")
                    return None
            except Exception as e:
                print(f"Error during robust JSON parsing: {e}\nRaw output: {raw_output}")
                return None
        else:
            print("Empty or invalid response from Gemini.")
            return None

    except Exception as e:
        print(f"Error during comment augmentation: {e}")
        return None

In [None]:
# Sample batch of sarcastic comments used to try out augment_comments by hand.
# Each one wraps a word in single quotes, which exercises the quoting and
# escaping instructions in the prompt.
comments = [
    "Oh great, the 'strategy' is back. Let's see how well that works out this time.",
    "Here comes the 'professional' again, ready to turn his chips into dust.",
    "'Expert' player alert—don’t hold your breath for a win though.",
    "Well, well, the 'high roller' is at it again, about to blow his last stack.",
    "Look out, everyone, the 'wizard' of blackjack is in the building… prepare for disaster.",
    "The 'mastermind' strikes once more—this should be fun to watch (for all the wrong reasons).",
    "Oh, the 'champion' is back, let's hope his luck improves this time (it won’t).",
    "A 'winning' streak? Nah, that’s not really his thing… unless you count losses.",
    "Another day, another round of 'perfect' decisions by the so-called 'pro'.",
    "'Lucky' gambler strikes again—too bad the only thing he's winning is debt.",
    "Isn’t it cute how the 'guru' always finds new ways to lose big? Classic."
]

In [None]:
# Manual check: send the sample batch in a single request and display the
# returned list (or None if the request or the parsing failed).
await augment_comments(comments)

len of comments: 11, len of augmented comments: 11


['Ah, the "strategy" returns. I\'m eager to observe its effectiveness this time around.',
 'Here comes the "professional" again, all set to convert his chips into thin air, I see.',
 '"Expert" player in the house – though I wouldn\'t expect a victory anytime soon.',
 'Well, look who it is, the "high roller" back again, probably ready to gamble away his final chips.',
 'Watch out, folks, the blackjack "wizard" has arrived… brace yourselves for a train wreck.',
 'The "mastermind" makes another appearance—this promises to be entertaining, albeit for all the wrong reasons.',
 'Oh look, the "champion" is back; let\'s hope his fortune changes this time (spoiler: it won’t).',
 'A "winning" streak? Not really his forte… unless you are counting all the losses.',
 'Another day, another set of "perfect" choices from our esteemed \'pro\'.',
 '"Lucky" gambler is back at it—too bad he\'s really just racking up more debt.',
 'Isn’t it adorable how the "guru" consistently discovers fresh methods for s

In [None]:
def clean(comment):
    """Normalize a raw comment before it is sent for augmentation.

    Removes timestamp-like tokens (``m:ss``, ``mm:ss``, ``h:mm:ss``, ...) and
    collapses every run of whitespace, including newlines and tabs, into a
    single space.

    Timestamps are removed first and whitespace is collapsed afterwards, so
    deleting a timestamp that sits between two spaces does not leave a run of
    spaces behind.

    Args:
        comment: The comment text. ``None`` and NaN (missing CSV cells) are
            treated as empty; other non-string values are converted with
            ``str``.

    Returns:
        The cleaned, stripped comment string.
    """
    # NaN is the only float that is not equal to itself.
    if comment is None or (isinstance(comment, float) and comment != comment):
        return ""
    without_timestamps = re.sub(r'\d{1,2}(:\d{2}){1,3}', ' ', str(comment))
    return re.sub(r'\s+', ' ', without_timestamps).strip()

In [None]:
def published_at():
    """Return the current UTC time formatted as ``YYYY-MM-DDTHH:MM:SSZ``.

    Uses a timezone-aware ``now(timezone.utc)`` instead of the deprecated
    ``datetime.utcnow()``; the formatted output is the same.
    """
    return datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

In [None]:
# Load the comments dataset; replace 'path_to_comments' with the real CSV path.
# The batch augmentation step reads these columns from it: ComId, Vid,
# VideoTitle, Comment, Sentiment, RegionCode and CategoryId.
df = pd.read_csv('path_to_comments')

In [None]:
# Number of rows loaded from the CSV.
len(df)

55000

In [None]:
async def augmentation_comment_batch(df, batch_size=20, output_path="df_neu_tail.csv",
                                     max_requests_per_window=45, window_seconds=90,
                                     min_sleep_seconds=10):
    """Augment every comment in ``df`` in batches and write the result to CSV.

    The rows are shuffled with a fixed seed, split into batches, cleaned with
    ``clean`` and sent to ``augment_comments``. For each successfully
    augmented batch, one new row per original row is created: it carries the
    augmented text, copies Vid, VideoTitle, Sentiment, RegionCode and
    CategoryId from the original, gets ``"_aug"`` appended to ComId,
    placeholder author fields, zeroed like/reply counts and a fresh
    ``PublishedAt`` timestamp. Batches that fail are skipped and counted.

    Args:
        df: DataFrame with the columns ComId, Vid, VideoTitle, Comment,
            Sentiment, RegionCode and CategoryId.
        batch_size: Number of comments sent per request.
        output_path: CSV file the augmented rows are written to.
        max_requests_per_window: Number of requests after which the loop
            pauses before continuing.
        window_seconds: Length of the rate-limit window; the pause lasts for
            whatever remains of it.
        min_sleep_seconds: Lower bound for the pause.

    Returns:
        None. The augmented rows are written to ``output_path`` and progress
        is reported with ``print``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # NOTE: the "negative" wording in the variable name and log message is
    # historical; the function processes whatever rows it is given.
    df_negative = df.sample(frac=1, random_state=42).reset_index(drop=True)
    print(f"Total negative comments: {len(df_negative)}")
    augmented_rows = []
    request_count = 0
    start_time = time.time()
    tot_com = 0
    failed_com = 0
    for batch_start in range(0, len(df_negative), batch_size):
        batch_df = df_negative.iloc[batch_start: batch_start + batch_size]
        original_comments = [clean(com) for com in batch_df["Comment"].tolist()]
        tot_com += len(original_comments)
        if request_count == max_requests_per_window:
            elapsed_time = time.time() - start_time
            # Wait out the rest of the window, but never less than the minimum.
            sleep_time = max(min_sleep_seconds, window_seconds - elapsed_time)
            print(f"Rate limit reached. Sleeping for {sleep_time} seconds.")
            # asyncio.sleep yields to the event loop instead of blocking it.
            await asyncio.sleep(sleep_time)
            request_count = 0
            start_time = time.time()
        # Count the attempt up front: a request that raises still consumed quota.
        request_count += 1
        try:
            augmented_comments = await augment_comments(original_comments)
        except Exception as e:
            print(f"Error during augmentation: {e}")
            failed_com += len(original_comments)
            continue
        if augmented_comments is None:
            failed_com += len(original_comments)
            continue
        # augment_comments only returns a list when its length matches the
        # batch, so rows and augmented texts pair up one-to-one.
        for (_, row), augmented_text in zip(batch_df.iterrows(), augmented_comments):
            augmented_rows.append({
                "ComId": str(row["ComId"]) + "_aug",
                "Vid": row["Vid"],
                "VideoTitle": row["VideoTitle"],
                "AuthorName": "AugmentedUser",
                "AuthorCid": "AugmentedCID",
                "Comment": augmented_text,
                "Sentiment": row["Sentiment"],
                "LikeCount": 0,
                "ReplyCount": 0,
                "PublishedAt": published_at(),
                "RegionCode": row["RegionCode"],
                "CategoryId": row["CategoryId"]
            })
        if tot_com % 200 == 0:
            print(f'Total Comment Extracted: {tot_com} Total Comment Augmented: {len(augmented_rows)} Total Failed Comments: {failed_com}')
    df_augmented = pd.DataFrame(augmented_rows)
    df_augmented.to_csv(output_path, index=False)
    print(f'Total Comment Extracted: {tot_com} Total Comment Augmented: {len(augmented_rows)} Total Failed Comments: {failed_com}')
    print("Augmentation complete!")

In [None]:
# Take rows 50000-50599 (600 rows) as the subset to augment in this run.
# NOTE(review): "neu" presumably stands for neutral-sentiment comments; confirm against the dataset.
df_neu = df.iloc[50000:50600]

In [None]:
# Confirm the size of the selected subset.
len(df_neu)

600

In [None]:
# Run the batched augmentation on the subset; the augmented rows are written to "df_neu_tail.csv".
await augmentation_comment_batch(df_neu)

Total negative comments: 600
len of comments: 20, len of augmented comments: 20
len of comments: 20, len of augmented comments: 20
len of comments: 20, len of augmented comments: 20
len of comments: 20, len of augmented comments: 20
len of comments: 20, len of augmented comments: 20
Error using json.loads on candidate: Invalid \escape: line 1 column 121 (char 120)
len of comments: 20, len of augmented comments: 20
len of comments: 20, len of augmented comments: 20
len of comments: 20, len of augmented comments: 20
len of comments: 20, len of augmented comments: 20
len of comments: 20, len of augmented comments: 20
Total Comment Extracted: 200 Total Comment Augmented: 200 Total Failed Comments: 0
len of comments: 20, len of augmented comments: 20
len of comments: 20, len of augmented comments: 20
Error using json.loads on candidate: Invalid \escape: line 1 column 137 (char 136)
Error using ast.literal_eval on candidate: unterminated string literal (detected at line 1) (<unknown>, line 1