In [None]:
import pandas as pd
import numpy as np
import os
import pickle

from umap import UMAP

from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
STOP_WORDS = list(ENGLISH_STOP_WORDS)

import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib import colors as mcolors, cm, lines as mlines
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.cm import ScalarMappable

from sklearn.metrics.pairwise import cosine_similarity as cosine
from sklearn.feature_extraction.text import TfidfTransformer

from scipy.sparse import lil_matrix, csr_matrix, hstack, vstack

import networkx as nx
from sknetwork.clustering import Leiden

#!pip install google-generativeai httpx
import google.generativeai as genai
import httpx

# Seed constant for stochastic steps. It is not referenced in the cells visible here —
# presumably consumed by later UMAP/Leiden code (TODO confirm).
RANDOM_STATE = 42

In [3]:
# Load the review-level table (a pickled pandas DataFrame, per the printed type).
# NOTE: unpickling can execute arbitrary code — only load files you produced yourself.
with open('data/amazon_reviews_with_embeddings_and_sentiment.pkl', 'rb') as reviews_file:
    amazon = pickle.load(reviews_file)
print(type(amazon))

<class 'pandas.core.frame.DataFrame'>


In [7]:
# Sanity check: dimensions first, then a preview of the leading five rows.
print(amazon.shape)
amazon.head(n=5)

(10000, 9)


Unnamed: 0,text,label,document,embedding,sentiment,polarity,2D_embeddings,cluster,cluster_probs
0,Perfect Mantra I've been looking for this mant...,1,Perfect Mantra I've been looking for this mant...,"[0.028224848210811615, 0.03228772059082985, -0...","{'Very Positive': 0.6087847948074341, 'Positiv...",0.878487,"[-2.2331846, 6.180949]",0,"[0.3138719003283342, 0.0677317084709766, 0.179..."
1,Quality I'm just a little disapointed with the...,1,Quality I'm just a little disapointed with the...,"[0.025785230100154877, -0.010898678563535213, ...","{'Negative': 0.5796696543693542, 'Neutral': 0....",-0.517629,"[0.86983263, 10.463978]",8,"[0.04463512527283897, 0.17382948287054564, 0.0..."
2,Exceptional murder-mystery writer scores big. ...,1,Exceptional murder-mystery writer scores big. ...,"[0.012370972894132137, 0.03619104251265526, -0...","{'Very Positive': 0.4494209885597229, 'Positiv...",0.858294,"[-3.6821277, 6.0072913]",2,"[0.0014503532935837857, 0.03718678093448527, 0..."
3,all the prodding questions you never wanted to...,1,all the prodding questions you never wanted to...,"[-0.001434302655979991, 0.027779487892985344, ...","{'Very Positive': 0.6956683993339539, 'Positiv...",0.831368,"[-2.8234124, 5.5528483]",2,"[0.0, 0.0, 0.709769771673625, 0.00414207689491..."
4,Best Wok ever I am so glad I bought this wok. ...,1,Best Wok ever I am so glad I bought this wok. ...,"[-0.0065714335069060326, -0.06777285784482956,...","{'Very Positive': 0.792975127696991, 'Positive...",0.891844,"[2.9735239, 10.172374]",6,"[0.0, 0.0, 0.0, 0.005072706647455209, 0.0, 0.0..."


In [9]:
# Cluster-level summary table for group A (the rows previewed below all show positive
# Polarity values).
# NOTE: unpickling can execute arbitrary code — only load files you produced yourself.
with open('data/cluster_info_A_amazon.pkl', 'rb') as cluster_file:
    amazon_A = pickle.load(cluster_file)
print(type(amazon_A))
print(amazon_A.shape)
amazon_A.head(n=5)

<class 'pandas.core.frame.DataFrame'>
(14, 8)


Unnamed: 0,Cluster,Count,Top_Words,Polarity,Centroid,2D_Centroid,Top_Representative_Docs,Top_Words_TFIDF
0,0,898,"{'album': 659, 'cd': 617, 'great': 393, 'music...",0.623137,"[0.011593475740294592, -0.009051741308382663, ...","[-6.3378177, 11.714415]",[A Great CD!! I just brouth the cd DATE and i ...,"{'cd': 0.4524236413718734, 'album': 0.18096945..."
1,1,691,"{'movie': 665, 'great': 324, 'film': 251, 'goo...",0.560485,"[0.0028552300495080555, 0.028399670222276224, ...","[-3.3686092, 8.544506]",[ORGANIZATION This is a great family movie. My...,"{'movie': 0.2605772667341845, 'location': 0.20..."
2,2,1073,"{'book': 1618, 'read': 774, 'location': 479, '...",0.613922,"[0.008959872890230298, 0.02307058386564865, -0...","[-3.1564274, 6.083331]",[Awesome story! I read this book DATE and was ...,"{'book': 0.5235569991369154, 'read': 0.2966822..."
3,3,398,"{'great': 244, 'good': 163, 'use': 149, 'produ...",0.495964,"[0.0005142235821762343, -0.009841657266474955,...","[1.3146882, 11.555882]","[pretty decent Love it, wireless works a lil f...","{'great': 0.2819436353692122, 'works': 0.13012..."
4,4,115,"{'book': 196, 'read': 93, 'good': 51, 'story':...",0.305131,"[0.0037351046676200855, 0.03597368089299973, -...","[-3.7777233, 5.4500985]",[Sad to see all of these one-star ratings I th...,"{'book': 0.39259818498534416, 'read': 0.215929..."


In [10]:
# Cluster-level summary table for group B (the rows previewed below all show negative
# Polarity values).
# NOTE: unpickling can execute arbitrary code — only load files you produced yourself.
with open('data/cluster_info_B_amazon.pkl', 'rb') as cluster_file:
    amazon_B = pickle.load(cluster_file)
print(type(amazon_B))
print(amazon_B.shape)
amazon_B.head(n=5)

<class 'pandas.core.frame.DataFrame'>
(14, 8)


Unnamed: 0,Cluster,Count,Top_Words,Polarity,Centroid,2D_Centroid,Top_Representative_Docs,Top_Words_TFIDF
0,0,511,"{'cd': 306, 'album': 295, 'like': 278, 'music'...",-0.32087,"[0.005391814288830293, 0.00656481949659372, -0...","[-5.6849732, 11.980106]",[NUM well crafted songs does NOT make a great ...,"{'cd': 0.3029982081186027, 'just': 0.196057664..."
1,1,692,"{'movie': 868, 'film': 305, 'like': 244, 'just...",-0.410263,"[-0.010680105894241518, 0.04162282063466355, -...","[-3.8498094, 9.028647]",[What the hell did I just watch. I am a true B...,"{'movie': 0.3745573527068864, 'didn': 0.138533..."
2,2,140,"{'book': 184, 'location': 109, 'read': 60, 'ju...",-0.056081,"[0.004660180020229225, 0.03213820015807869, -0...","[-3.2405305, 5.507901]","[Once again, Russo spreads lies to make a quic...","{'book': 0.40700837223395525, 'location': 0.23..."
3,3,738,"{'product': 274, 'work': 262, 'buy': 187, 'jus...",-0.441048,"[-0.012323810111746356, 0.007264140421755306, ...","[1.3935039, 11.628746]",[Won't stand by warranty ORGANIZATION shopping...,"{'unit': 0.2459720427620044, 'product': 0.1892..."
4,4,835,"{'book': 1418, 'read': 540, 'just': 301, 'like...",-0.421902,"[0.0019212170836502894, 0.04200785495278957, -...","[-3.8869593, 5.000855]",[Very Disappointing I'm not sure where to star...,"{'book': 0.49640651618480125, 'read': 0.215109..."


In [13]:
# Seed list of narrative dimensions offered to the LLM as inspiration.
# Each entry is written as "pole A – pole B" (the separator is an en dash, not a hyphen).
# NOTE(review): some entries mirror or overlap each other — "Objective – Subjective" vs
# "Subjective – Objective", and "Innovation" / "Respect" each appear twice with a
# different opposite pole. Confirm these are intentional before de-duplicating.
narrative_dimensions = [
    # Economic & Organizational (label added in review; this group was originally unlabelled)
    "Economic growth – Economic decline",
    "Financial stability – Financial instability",
    "Innovation – Tradition",
    "Market confidence – Market anxiety",
    "Risk-taking – Risk aversion",
    "Technological optimism – Technological skepticism",
    "Environmental responsibility – Environmental negligence",
    "Public health focus – Economic focus",
    "Consumer empowerment – Consumer manipulation",
    "Globalization – Nationalism",
    "Regulation – Deregulation",
    "Competition – Monopoly",
    "Efficiency – Redundancy",
    "Productivity – Laziness",
    "Sustainability – Exploitation",
    "Scalability – Inflexibility",
    "Inclusivity – Exclusivity",
    "Transparency – Obfuscation",
    "Accountability – Irresponsibility",
    "Safety – Danger",
    "Collaboration – Isolation",
    "Accessibility – Elitism",
    "Long-term thinking – Short-term thinking",
    "Innovation – Stagnation",
    
    # Moral & Ethical
    "Justice – Injustice",
    "Equity – Inequity",
    "Honesty – Deception",
    "Integrity – Corruption",
    "Altruism – Self-interest",
    "Responsibility – Negligence",
    "Loyalty – Betrayal",
    "Consent – Coercion",
    "Fairness – Bias",
    "Empathy – Apathy",
    "Gratitude – Entitlement",
    "Generosity – Greed",
    "Respect – Disregard",
    "Forgiveness – Vengeance",
    "Courage – Cowardice",
    "Authenticity – Hypocrisy",
    
    # Social & Interpersonal
    "Authority – Rebellion",
    "Agreement – Disagreement",
    "Politeness – Rudeness",
    "Assertiveness – Passivity",
    "Consensus – Polarization",
    "Trust – Distrust",
    "Leadership – Subordination",
    "Support – Opposition",
    "Closeness – Distance",
    "Mentorship – Criticism",
    "Professionalism – Informality",
    "Respect – Dismissiveness",
    
    # Emotional & Psychological
    "Optimism – Pessimism",
    "Excitement – Boredom",
    "Confidence – Doubt",
    "Calm – Anxiety",
    "Hope – Despair",
    "Happiness – Sadness",
    "Empowerment – Helplessness",
    "Passion – Indifference",
    "Resilience – Fragility",
    "Fulfillment – Frustration",
    "Enthusiasm – Apathy",
    "Pride – Shame",
    "Joy – Grief",
    "Motivation – Complacency",
    "Curiosity – Disinterest",
    "Satisfaction – Disappointment",
    
    # Tone & Style
    "Formal – Informal",
    "Objective – Subjective",
    "Concise – Wordy",
    "Direct – Indirect",
    "Sincere – Sarcastic",
    "Serious – Humorous",
    "Persuasive – Descriptive",
    "Analytical – Narrative",
    "Technical – Layperson-friendly",
    "Measured – Exaggerated",
    "Assertive – Tentative",
    "Friendly – Hostile",
    
    # Intent & Function
    "Persuasion – Information",
    "Promotion – Critique",
    "Instruction – Storytelling",
    "Reporting – Speculating",
    "Defending – Attacking",
    "Clarification – Confusion",
    "Invitation – Rejection",
    "Compliance – Resistance",
    "Escalation – De-escalation",
    "Problem framing – Solution framing",
    "Root cause analysis – Symptom description",
    "Innovation proposal – Status quo defense",
    "Apology – Justification",
    "Advocacy – Observation",
    
    # Perspective & Framing
    "First-person – Third-person",
    "Subjective – Objective",
    "Quantitative – Qualitative",
    "Macro – Micro",
    "Local – Global",
    "Personal – Institutional",
    "Insider – Outsider",
    "Predictive – Retrospective",
    "Static – Dynamic",
    "Present-focused – Future-focused",
    
    # Academic & Scientific Style
    "Evidence-based – Anecdotal",
    "Theoretical – Empirical",
    "Precise – Ambiguous",
    "Cautious – Bold",
    "Methodological – Exploratory",
    "Hypothesis-driven – Observation-driven",
    "Peer-oriented – Public-oriented",
    "Originality – Replication",
    "Novelty – Confirmation",
    "Scientific neutrality – Advocacy",
    
    # Consumer & Review
    "Value for money – Overpriced",
    "User-friendly – Complicated",
    "High quality – Poor quality",
    "Responsive – Unresponsive",
    "Durable – Fragile",
    "Reliable – Buggy",
    "Trustworthy – Misleading",
    "Helpful – Unhelpful",
    "Recommended – Not recommended"
]
# One dimension per line; this string is what the prompt builder interpolates under
# its "--- Dimensions ---" heading.
dimension_list_str = "\n".join(narrative_dimensions)

# Create the output directory first: on a fresh checkout "out/" does not exist and
# open(..., "w") would raise FileNotFoundError.
os.makedirs("out", exist_ok=True)
with open("out/narrative_dimensions.txt", "w", encoding="utf-8") as f:
    f.write(dimension_list_str)

In [14]:
def generate_narrative_comparison_prompt(text1, text2, dimensions=None):
    """Build the LLM prompt that asks for a narrative-dimension comparison of two texts.

    Parameters
    ----------
    text1, text2 :
        The two texts to compare. They are interpolated through the f-string, so a
        non-string value (e.g. a list of documents) appears as its ``str()`` form.
    dimensions : str or iterable of str, optional
        Seed dimensions shown to the model under "--- Dimensions ---". A string is
        used verbatim; any other iterable is joined with newlines. When omitted, the
        notebook-level ``dimension_list_str`` is used, which is the original behavior.

    Returns
    -------
    str
        The complete prompt (it begins with a newline).

    Raises
    ------
    NameError
        If ``dimensions`` is omitted and ``dimension_list_str`` has not been defined.
    """
    # Resolve the dimension list explicitly so the function no longer depends on
    # hidden notebook state when a caller supplies its own list.
    if dimensions is None:
        dimension_block = dimension_list_str
    elif isinstance(dimensions, str):
        dimension_block = dimensions
    else:
        dimension_block = "\n".join(dimensions)

    # NOTE(review): the prose field list below names "Perspective Shift", while the JSON
    # example uses the key "difference_confidence"; the prompt also shows a non-JSON
    # example before demanding JSON only. The text is left unchanged here because
    # downstream consumers may rely on the current JSON keys — reconcile deliberately.
    prompt_template = f"""
Role: You are a narrative analyst specialist in identifying all semantic perspectives in texts, so called narrative dimensions.
Task: Identify and compare all narrative dimensions present in the texts "text1" and "text2".

Narrative dimensions are fine-grained semantic elements that capture contrasts in topic, subtopic, perspective, intention, nuance, morality, emotion, sentiment, personality, intensity, solution/problem framing, accountability, or other discourse-level distinctions.
Use the list of dimensions below as inspiration, allow the emergence of any new one, be creative and open. Only output JSON as specified.

--- Dimensions ---
{dimension_block}

For each narrative dimension, follow this output format:
- Title: A short phrase that captures the main contrast.
- Subtitle: The two poles of the dimension (e.g., financial stability – financial instability).
- Presence: Indicate whether it appears in text1 only, text2 only, or both.
- Coverage: Percentage (0–100%) indicating how much of each text is devoted to this dimension. Together, the dimensions should account for 100% of each text. Zero coverage is allowed for one text, but not both.
- Perspective Shift: Estimate the directional change between texts as a percentage (0–100%).
- Cosine Similarity: Indicate the cosine similarity (from –1 to 1) between the text representations for this dimension.
- Evidence: Quote at least 2–3 representative sentences from each text that reflect each pole.

Example Output Format:

Dimension: Financial stability  
Subtitle: Stability – Instability  
Presence: Both  
Coverage: text1: 30%, text2: 15%  
Perspective Shift: 40%  
Cosine Similarity: –0.42  
Evidence:  
- text1: 
  • "The company faces major risks due to market downturn."  
  • "Investors are pulling out amid uncertainty."  
  • "Revenue has dropped by 20% over the last quarter."  
- text2: 
  • "Recent investments have secured long-term growth."  
  • "The financial outlook remains positive, according to the Q2 report."  
  • "Diversification strategies have reduced exposure to risk."

Repeat this for all narrative dimensions needed to fully account for both texts.

Output **only** a valid JSON list of objects in this exact format — no additional text, no commentary, no explanations.

Example output:
[
  {{
    "dimension_title": "Financial Stability",
    "polar_extremes_subtitle": "Financial instability – Financial stability",
    "presence": "both",
    "coverage_text1": 60,
    "coverage_text2": 40,
    "difference_confidence": 40,
    "cosine_similarity": 0.72,
    "text1_evidence_sentences": [
      "The company is struggling with cash flow.",
      "Debt levels are increasing rapidly.",
      "Investors have lost confidence."
    ],
    "text2_evidence_sentences": [
      "Strong profit growth is reported.",
      "Revenue streams are diversifying.",
      "The market shows consistent recovery."
    ]
  }}
]

--- Begin Text Analysis ---

text1:
{text1}

text2:
{text2}
"""
    return prompt_template


In [15]:
import google.generativeai as genai
import io
import httpx
import os
from dotenv import load_dotenv

# SECURITY: a literal API key used to be committed in this cell. Treat that key as
# compromised — revoke/rotate it in the Google console and purge it from version history.
# The key is now read from the GOOGLE_API_KEY environment variable, optionally populated
# from a local .env file (which must stay out of version control).
load_dotenv()  # harmless when no .env file is present

if not os.getenv("GOOGLE_API_KEY"):
    raise RuntimeError(
        "GOOGLE_API_KEY is not set. Export it in your shell or add it to a local .env "
        "file before running this cell."
    )

genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
client = genai.GenerativeModel("gemini-2.0-flash")

In [16]:
import json
import re
import ast
import os

# Matches either a complete, valid JSON escape (captured in group 1) or a lone backslash.
# Consuming valid escapes as a unit is what keeps an escaped backslash ("\\") followed by
# an arbitrary character from being mistaken for an invalid escape.
_JSON_ESCAPE_OR_LONE_BACKSLASH = re.compile(r'\\(["\\/bfnrt]|u[0-9a-fA-F]{4})|\\')


def _escape_invalid_escapes(s):
    """Double every backslash in ``s`` that does not start a valid JSON escape sequence."""
    return _JSON_ESCAPE_OR_LONE_BACKSLASH.sub(
        lambda m: m.group(0) if m.group(1) is not None else '\\\\',
        s,
    )


def safe_json_load(raw_response_text):
    """Best-effort parse of an LLM reply that is expected to contain a JSON list.

    Steps:
      1. Trim whitespace and drop anything before the first ``[`` / after the last ``]``
         (this also removes Markdown code fences or chatty preambles).
      2. Repair invalid backslash escapes and parse with ``json.loads``.
      3. Fall back to ``ast.literal_eval`` on the trimmed text, which accepts
         Python-style literals (single quotes, trailing commas).

    Parameters
    ----------
    raw_response_text : str or None
        Raw text returned by the model.

    Returns
    -------
    object or None
        The parsed value, or ``None`` when the input is empty or nothing could be
        parsed (a diagnostic is printed in that case; no exception is raised).
    """
    # Guard against None / blank replies instead of failing on .strip().
    if raw_response_text is None or not raw_response_text.strip():
        print("⚠️ JSON parsing failed.")
        print("JSON error: empty response")
        return None

    raw_text = raw_response_text.strip()

    # Keep only the span from the first '[' to the last ']'.
    if not raw_text.startswith('[') and '[' in raw_text:
        raw_text = raw_text[raw_text.index('['):]
    if not raw_text.endswith(']') and ']' in raw_text:
        raw_text = raw_text[:raw_text.rindex(']') + 1]

    try:
        # The repair leaves already-valid JSON untouched, so it is safe to apply always.
        return json.loads(_escape_invalid_escapes(raw_text))
    except json.JSONDecodeError as e:
        try:
            # Try ast.literal_eval as fallback (tolerates single quotes, trailing commas)
            return ast.literal_eval(raw_text)
        except Exception as fallback_error:
            print("⚠️ JSON parsing failed.")
            print("JSON error:", e)
            print("Fallback error:", fallback_error)
            return None


In [17]:
# For each row position i, compare the representative documents of amazon_A[i] with those
# of amazon_B[i], ask the model for their narrative dimensions, and save the parsed result.
os.makedirs("out", exist_ok=True)  # a fresh checkout may not have the output directory

# Rows are paired by position, so only the common prefix can be compared.
n_pairs = min(len(amazon_A), len(amazon_B))
if len(amazon_A) != len(amazon_B):
    print(
        f"⚠️ amazon_A has {len(amazon_A)} rows but amazon_B has {len(amazon_B)}; "
        f"only the first {n_pairs} rows are compared."
    )

for i in range(n_pairs):
    text1 = amazon_A['Top_Representative_Docs'].iloc[i]
    text2 = amazon_B['Top_Representative_Docs'].iloc[i]

    prompt = generate_narrative_comparison_prompt(text1, text2)
    response = client.generate_content(prompt)

    # Keep the raw response reachable as response_row{i}.
    # NOTE(review): writing into globals() is retained only because later cells may
    # reference these names; prefer collecting responses in a dict keyed by i.
    var_name = f"response_row{i}"
    globals()[var_name] = response

    # Parse the JSON response safely (returns None when parsing fails)
    parsed_llm_out = safe_json_load(response.text)

    # Always (re)write the file so a stale result from an earlier run never survives;
    # a failed parse is stored as JSON null.
    with open(f"out/amazon_ND_row{i}.json", "w", encoding="utf-8") as f:
        json.dump(parsed_llm_out, f, ensure_ascii=False, indent=2)

    # Report honestly: do not claim success when nothing usable was parsed.
    if parsed_llm_out is None:
        print(f"❌ Row {i}: response could not be parsed; wrote null to out/amazon_ND_row{i}.json")
    else:
        print(f"✅ Processed and saved row {i}")

✅ Processed and saved row 0
✅ Processed and saved row 1
✅ Processed and saved row 2
✅ Processed and saved row 3
✅ Processed and saved row 4
✅ Processed and saved row 5
✅ Processed and saved row 6
✅ Processed and saved row 7
✅ Processed and saved row 8
✅ Processed and saved row 9
✅ Processed and saved row 10
✅ Processed and saved row 11
✅ Processed and saved row 12
✅ Processed and saved row 13
