In [None]:
!pip install sentence-transformers

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

### If the graph does not appear on the first run, simply re-run the cell

In [2]:
import os
import pandas as pd
import networkx as nx
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import plotly.graph_objs as go
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


# === CONFIGURATION ===
# CSV files holding the human-side game descriptions (one file per region).
human_csvs = [
    "/content/games_americas.csv",
    "/content/games_africa.csv",
    "/content/games_european_modern.csv",
    "/content/games_south_asia.csv",
    "/content/games_sea_east_asia_central_asia.csv",
]

# CSV files holding the LLM responses.
llm_csvs = [
    "/content/games_llm.csv",
]

# Names of the columns read from every human CSV.
human_columns = {
    "text": "rules",
    "date": "date",
}

# Earlier per-model column layout, kept for reference only (not used below):
#llm_response_cols = ['Response_GPT4o', 'Response_Llama', 'Response_Mistral', 'Response_Olmo']

# Two texts are linked by an edge only when their cosine similarity exceeds this value.
threshold = 0.5
# === LOAD DATA ===

def load_llm_flat_file(file_list, future_base=2100):
    """Read LLM responses from flat CSV files.

    Every CSV is read with ``pd.read_csv`` and must provide the columns
    "Response", "Prompts", "Model" and "Ground_truth_GPT4". Rows whose
    "Response" is missing are skipped. Each kept row receives a node id
    "L<k>", where k counts kept rows consecutively across all files.

    Args:
        file_list: CSV paths, processed in the given order.
        future_base: Date value recorded for every kept row.

    Returns:
        A 6-tuple ``(texts, dates, nodes, truths, models, prompts)``.
        ``texts``, ``dates`` and ``nodes`` are lists aligned by position;
        ``truths``, ``models`` and ``prompts`` are dicts keyed by node id.
    """
    llm_texts, llm_dates, llm_nodes = [], [], []
    llm_prompts, llm_truths, llm_models = {}, {}, {}

    for path in file_list:
        frame = pd.read_csv(path)
        for row in frame.index:
            response = frame.at[row, "Response"]
            if pd.isna(response):
                # No response recorded for this row: it gets no node.
                continue

            # The next free id equals the number of nodes created so far.
            node_index = len(llm_nodes)
            node = f"L{node_index}"

            llm_prompts[node] = frame.at[row, "Prompts"]
            llm_models[node] = frame.at[row, "Model"]
            llm_truths[node] = frame.at[row, "Ground_truth_GPT4"]

            llm_texts.append(response)
            llm_dates.append(future_base)
            llm_nodes.append(node)

    return llm_texts, llm_dates, llm_nodes, llm_truths, llm_models, llm_prompts


def load_csv_data(file_list, text_col, date_col=None):
    """Load texts and their dates from CSV files as two position-aligned lists.

    Rows whose ``text_col`` value is missing are discarded, and the dates are
    taken from exactly the rows that were kept, so ``texts[k]`` and
    ``dates[k]`` always describe the same CSV row.

    Args:
        file_list: CSV paths, processed in the given order.
        text_col: Name of the column holding the text.
        date_col: Name of the column holding the date. When it is falsy or
            the column is absent from a file, ``None`` is recorded for each
            kept row of that file.

    Returns:
        Tuple ``(texts, dates)`` of equal length. A missing date inside an
        existing date column is recorded as ``0``; other dates are cast to int.
    """
    texts, dates = [], []
    for file in file_list:
        df = pd.read_csv(file)

        # Filter once and derive BOTH lists from the surviving rows. Dropping
        # missing texts while keeping every date would shift the dates
        # relative to the texts and make the lists differ in length.
        kept = df[df[text_col].notna()]
        texts.extend(kept[text_col].tolist())

        if date_col and date_col in kept.columns:
            dates.extend(kept[date_col].fillna(0).astype(int).tolist())
        else:
            # No usable date column: one placeholder per kept row.
            dates.extend([None] * len(kept))
    return texts, dates

# Human corpus: texts and dates read from the configured columns.
human_texts, human_dates = load_csv_data(
    file_list=human_csvs,
    text_col=human_columns["text"],
    date_col=human_columns["date"],
)

# LLM corpus: every response is stamped with the date 2025.
(
    llm_texts,
    llm_dates,
    llm_nodes,
    llm_truths,
    llm_models,
    llm_prompts,
) = load_llm_flat_file(file_list=llm_csvs, future_base=2025)

# === SIMILARITY & EMBEDDINGS ===
model = SentenceTransformer("all-MiniLM-L6-v2")
# Encode the human texts first, then the LLM texts.
human_emb, llm_emb = (model.encode(corpus) for corpus in (human_texts, llm_texts))

# hh_sim[i, j]: human text i vs human text j.
hh_sim = cosine_similarity(human_emb, human_emb)
# hl_sim[i, j]: human text i vs LLM text j.
hl_sim = cosine_similarity(human_emb, llm_emb)

# === BUILD GRAPH ===
# Directed graph: an edge u -> v records that u is similar to v (weight = cosine
# similarity) and, for human pairs, that u is dated strictly earlier than v.
G = nx.DiGraph()
human_nodes = [f"H{i}" for i in range(len(human_texts))]

# llm_nodes and llm_dates are the values returned by load_llm_flat_file. They are
# reused here instead of being regenerated / hard-coded, so graph node ids stay
# identical to the keys of llm_models and llm_truths, and the node date follows
# the `future_base` passed to the loader.

for i, node in enumerate(human_nodes):
    G.add_node(node, type='human', date=human_dates[i], text=human_texts[i])

for i, node in enumerate(llm_nodes):
    G.add_node(node, type='llm', date=llm_dates[i], text=llm_texts[i])

# === ADD EDGES ===
# Human -> human: only from the earlier text to the later one, above the threshold.
for i, src in enumerate(human_nodes):
    for j, dst in enumerate(human_nodes):
        if i != j and human_dates[i] < human_dates[j] and hh_sim[i, j] > threshold:
            G.add_edge(src, dst, weight=hh_sim[i, j])

# Human -> LLM: no date check, only the similarity threshold.
# hl_sim is indexed [human, llm].
for i, l in enumerate(llm_nodes):
    for j, h in enumerate(human_nodes):
        if hl_sim[j, i] > threshold:
            G.add_edge(h, l, weight=hl_sim[j, i])

# Candidate formulas mapping an edge similarity `s` to a novelty score.
formulas = {
    "entropy_decay": lambda s: 10 * (1 - s * np.log(1 + s)),
}

eval_records = []

for formula_name, formula_fn in formulas.items():
    # Reset novelty: nodes without predecessors start at 10.0; every other node
    # starts at +inf so that the first candidate from an incoming edge wins.
    for node in G.nodes:
        G.nodes[node]['novelty'] = 10.0 if G.in_degree(node) == 0 else float('inf')

    # Apply propagation with current formula. Each candidate depends only on the
    # edge weight, so a node ends up with the minimum candidate over its incoming
    # edges (clamped at 0).
    sorted_nodes = sorted(G.nodes, key=lambda n: G.nodes[n]['date'])
    for node in sorted_nodes:
        for succ in G.successors(node):
            sim = G[node][succ]['weight']
            candidate = formula_fn(sim)
            candidate = max(0, candidate)
            current = G.nodes[succ]['novelty']
            G.nodes[succ]['novelty'] = min(current, candidate)

    # Gather results for LLM nodes
    llm_novelties = []
    for node in G.nodes:
        if G.nodes[node]['type'] == 'llm':
            llm_novelties.append({
                "node": node,
                "model": llm_models[node],
                "text": G.nodes[node]['text'],
                "date": G.nodes[node]['date'],
                "novelty": G.nodes[node]['novelty'],
                "ground_truth": llm_truths.get(node, np.nan),
                "formula": formula_name
            })

    llm_df = pd.DataFrame(llm_novelties)
    llm_df.to_csv(f"llm_novelty_scores_{formula_name}.csv", index=False)

    if llm_df.empty:
        # No LLM nodes: the frame has no "model" column to group on.
        continue

    # Store evaluation metrics
    for model_name, group in llm_df.groupby("model"):
        # Rows without a ground-truth value cannot be scored, and the
        # scikit-learn regression metrics raise on NaN input, so drop them first.
        scored = group.dropna(subset=["ground_truth"])
        if scored.empty:
            metrics = {"MSE": np.nan, "MAE": np.nan, "R2": np.nan}
        else:
            y_true = scored["ground_truth"]
            y_pred = scored["novelty"]
            metrics = {
                "MSE": mean_squared_error(y_true, y_pred),
                "MAE": mean_absolute_error(y_true, y_pred),
                "R2": r2_score(y_true, y_pred),
            }
        eval_records.append({
            "formula": formula_name,
            "model": model_name,
            **metrics,
        })

# Save summary evaluation
summary_df = pd.DataFrame(eval_records)
summary_df.to_csv("novelty_formula_evaluation_summary.csv", index=False)

# === INTERACTIVE GRAPH ===
# Seeded spring layout: node -> (x, y) position.
pos = nx.spring_layout(G, seed=42)

# One line trace per edge, because the line width encodes that edge's weight.
edge_trace = [
    go.Scatter(
        x=[pos[src][0], pos[dst][0], None],
        y=[pos[src][1], pos[dst][1], None],
        mode='lines',
        line=dict(width=2 * attrs['weight'], color='gray'),
        hoverinfo='none',
    )
    for src, dst, attrs in G.edges(data=True)
]

# Per-node coordinates and colours, all in graph node order.
node_x = [pos[node][0] for node in G.nodes]
node_y = [pos[node][1] for node in G.nodes]
colors = [
    "skyblue" if G.nodes[node]['type'] == 'human' else "lightgreen"
    for node in G.nodes
]

# Per-node label text; a non-finite novelty is shown as "N/A".
hover_texts = []
for node in G.nodes:
    d = G.nodes[node]
    novelty_display = d['novelty'] if np.isfinite(d['novelty']) else "N/A"
    hover_texts.append(
        f"<b>{node}</b><br>"
        f"Type: {d['type']}<br>"
        f"Date: {d['date']}<br>"
        f"Novelty: {novelty_display}<br>"
    )

node_trace = go.Scatter(
    x=node_x,
    y=node_y,
    mode='markers+text',
    text=hover_texts,
    hoverinfo='text',
    textposition='top center',
    marker=dict(size=5, color=colors, line_width=2),
)

# Edges are drawn first so the node markers sit on top of them.
fig = go.Figure(
    data=[*edge_trace, node_trace],
    layout=go.Layout(
        title="Interactive Influence Graph with Exponential Novelty Decay",
        showlegend=False,
        hovermode='closest',
        margin=dict(b=20, l=5, r=2, t=40),
        xaxis=dict(showgrid=False, zeroline=False),
        yaxis=dict(showgrid=False, zeroline=False),
    ),
)
fig.show()

def _novelty_records(node_type):
    """Return one record (node id, text, date, novelty) per graph node of `node_type`."""
    return [
        {
            "node": node,
            "text": attrs['text'],
            "date": attrs['date'],
            "novelty": attrs['novelty'],
        }
        for node, attrs in G.nodes(data=True)
        if attrs['type'] == node_type
    ]


# Human nodes: collect their current novelty values and write them to CSV.
human_novelties = _novelty_records('human')
human_df = pd.DataFrame(human_novelties)
human_df.to_csv("human_novelty_scores.csv", index=False)

# LLM nodes: same record layout, written to a separate CSV.
llm_novelties = _novelty_records('llm')
llm_df = pd.DataFrame(llm_novelties)
llm_df.to_csv("llm_novelty_scores.csv", index=False)



