In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the two CSV inputs used by the rest of the notebook (paths are relative to the working directory)
df = pd.read_csv("df_topic.csv")
topics_info = pd.read_csv("topic_info.csv")

In [None]:
# Preview the first five rows (DataFrame.head defaults to n=5)
df.head()

In [None]:
# Display only a subset of the topic-info columns
columns_to_show = ['Name', 'Representation', 'Aspect1', 'Aspect2']
topics_info[columns_to_show]

# Only one topic per interview

In [None]:
# Row counts per topic within each experiment (index: Experiment, columns: topic name)
experiment_topic_distribution = (
    df.groupby(["Experiment", "one_topic_name"]).size().unstack(fill_value=0)
)

# Row counts per topic within each (Experiment, Condition) pair
experiment_condition_topic_distribution = (
    df.groupby(["Experiment", "Condition", "one_topic_name"]).size().unstack(fill_value=0)
)

# Plot: distribution of topics across experiments.
# The axes are created explicitly and handed to pandas via `ax=`; calling
# plt.figure() first and then DataFrame.plot() without `ax` would open a second
# figure and leave the first one empty.
fig, ax = plt.subplots(figsize=(10, 6))
experiment_topic_distribution.plot(kind='bar', stacked=True, ax=ax)
ax.set_title('Distribution of Topics Across Experiments')
ax.set_xlabel('Experiment')
ax.set_ylabel('Count')
ax.legend(title='Topics')
fig.tight_layout()
plt.show()

# Plot: distribution of topics across experiments and conditions
fig, ax = plt.subplots(figsize=(12, 8))
experiment_condition_topic_distribution.plot(kind='bar', stacked=True, ax=ax)
ax.set_title('Distribution of Topics Across Experiments and Conditions')
ax.set_xlabel('Experiment and Condition')
ax.set_ylabel('Count')
ax.legend(title='Topics')
fig.tight_layout()
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import ast

# Assume df has 'Experiment', 'Condition', 'multiple_topics_name' columns,
# and multiple_topics_name is a string that either looks like "['topicA', 'topicB']"
# or just "topicA".

# Step 1: Convert strings to lists if they represent lists
def to_list(x):
    """
    Normalise one cell of the 'multiple_topics_name' column to a list of topics.

    Accepted inputs:
      - a list or tuple (already parsed): returned as a new list, so applying
        this function to a column that was converted earlier is a no-op
        instead of an error
      - a missing value (None / NaN) or a blank string: returns []
      - a string holding a Python list literal, e.g. "['topicA', 'topicB']":
        returns the parsed list
      - a string holding any other Python literal: returns [parsed_value]
      - any other string, e.g. "topicA": returns [that_string]
      - any other non-string scalar: returns [x]
    """
    # Already-parsed values must be handled before pd.isna(), which returns an
    # array for list-likes and therefore cannot be used in a boolean test.
    if isinstance(x, (list, tuple)):
        return list(x)

    if pd.isna(x):
        return []

    # Non-string scalars have no .strip() and cannot be parsed as a literal.
    if not isinstance(x, str):
        return [x]

    if x.strip() == "":
        return []

    try:
        val = ast.literal_eval(x)
    except (ValueError, SyntaxError):
        # Not a Python literal, so treat the raw text as a single topic name.
        return [x]

    # A literal that is not a list (e.g. a quoted string) is wrapped in a list.
    return val if isinstance(val, list) else [val]

df['multiple_topics_name'] = df['multiple_topics_name'].apply(to_list)

# Step 2: Expand and assign weights
# Each row contributes a total weight of 1, split equally among its topics.
# Rows with an empty topic list produce no records (the inner loop never runs,
# so the division is never evaluated for them).
expanded_rows = [
    {
        'Experiment': experiment,
        'Condition': condition,
        'Topic': topic,
        'Weight': 1.0 / len(topics),
    }
    for experiment, condition, topics in zip(
        df['Experiment'], df['Condition'], df['multiple_topics_name']
    )
    for topic in topics
]

# Explicit columns keep the frame well-formed even when no record was produced.
expanded_df = pd.DataFrame(
    expanded_rows, columns=['Experiment', 'Condition', 'Topic', 'Weight']
)

# Step 3: Aggregate weights
agg = expanded_df.groupby(['Experiment', 'Condition', 'Topic'], as_index=False)['Weight'].sum()

# Step 4: Pivot to get topic distribution
# `agg` already holds one row per (Experiment, Condition, Topic), so the
# aggregation function only has to pass the value through; 'sum' states that
# intent instead of relying on pivot_table's default of 'mean'.
pivot = agg.pivot_table(
    index=['Experiment', 'Condition'],
    columns='Topic',
    values='Weight',
    aggfunc='sum',
    fill_value=0,
)

# Normalize rows to sum to 1
pivot = pivot.div(pivot.sum(axis=1), axis=0)

# Step 5: Plot a 100%-stacked bar chart
plot_df = pivot.reset_index()
plot_df['Group'] = plot_df['Experiment'].astype(str) + ' - ' + plot_df['Condition'].astype(str)

topic_cols = [c for c in plot_df.columns if c not in ['Experiment', 'Condition', 'Group']]

fig, ax = plt.subplots(figsize=(10, 6))

# Running height of each stacked bar; every topic is drawn on top of the previous ones.
bottom = pd.Series(0.0, index=plot_df.index)
for t in topic_cols:
    ax.bar(plot_df['Group'], plot_df[t], bottom=bottom, label=t)
    bottom = bottom + plot_df[t]

ax.set_ylabel("Proportion")
ax.set_title("Topic Distribution by Experiment-Condition")
# Restyle the tick labels that ax.bar() already created instead of calling
# set_xticklabels(), which matplotlib documents as safe only after the tick
# positions have been fixed.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.legend(title="Topic", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()


# Make network graph of topics and their relationships

In [17]:
import pandas as pd
import networkx as nx

def build_master_graph(df):
    """
    Assemble one directed graph of topic-to-topic transitions for the whole DataFrame.

    Rows are grouped by "File Name" and ordered by "utterance_index"; every pair
    of consecutive rows in a group adds (or reinforces) an edge from the first
    row's "one_topic_name" to the second row's.

    Edge attributes:
      - 'weight': number of times the transition was seen
      - 'metadata': one dict per occurrence with keys
        {file_name, experiment, condition}; experiment and condition are taken
        from the row the transition starts at

    Returns:
      nx.DiGraph
    """
    graph = nx.DiGraph()

    for participant, rows in df.groupby("File Name"):
        # Chronological order within the file
        ordered = rows.sort_values("utterance_index")

        topic_seq = ordered["one_topic_name"].tolist()
        exp_seq = ordered["Experiment"].tolist()
        cond_seq = ordered["Condition"].tolist()

        # zip stops at the shortest input (topic_seq[1:]), so each step pairs
        # row i with row i + 1 and uses row i's experiment / condition.
        for src, dst, exp, cond in zip(topic_seq, topic_seq[1:], exp_seq, cond_seq):
            occurrence = {
                "file_name": participant,
                "experiment": exp,
                "condition": cond
            }

            if not graph.has_edge(src, dst):
                # First time this transition is seen
                graph.add_edge(src, dst, weight=1, metadata=[occurrence])
                continue

            # Transition already known: bump the count and record the occurrence
            edge_attrs = graph[src][dst]
            edge_attrs['weight'] += 1
            edge_attrs['metadata'].append(occurrence)

    return graph

def filter_graph(G_master, experiment=None, condition=None, file_name=None,
                 recompute_weights=False):
    """
    Return a new graph containing only the edges of G_master whose 'metadata'
    matches the requested experiment, condition, and/or file_name.

    An edge is kept if ANY record in its metadata list satisfies all of the
    filters that are not None. Edges without a 'metadata' attribute are never
    kept. Nodes appear in the result only if they are an endpoint of a kept
    edge.

    Args:
      - experiment: e.g. "OBE1" (or None to ignore this filter)
      - condition: e.g. 1 (or None to ignore)
      - file_name: e.g. "ID 05" (or None)
      - recompute_weights: when False (default), kept edges carry the same
        'weight' and 'metadata' contents as in G_master, i.e. counts over ALL
        data rather than over the filtered records. When True, 'metadata' is
        restricted to the matching records and 'weight' is set to their
        number, so plotted weights describe the filtered data only.

    Returns:
      nx.DiGraph. Its 'metadata' lists are new list objects, so appending to
      them does not modify G_master.
    """

    def _matches(record):
        # Record is like {"file_name":..., "experiment":..., "condition":...}
        return (
            (experiment is None or record['experiment'] == experiment)
            and (condition is None or record['condition'] == condition)
            and (file_name is None or record['file_name'] == file_name)
        )

    G_sub = nx.DiGraph()

    for u, v, data in G_master.edges(data=True):
        matching = [record for record in data.get('metadata', []) if _matches(record)]
        if not matching:
            continue

        # Copy the attribute dict so the new edge does not share it with G_master.
        attrs = dict(data)
        if recompute_weights:
            attrs['metadata'] = matching
            attrs['weight'] = len(matching)
        else:
            # Same contents as the master edge, but an independent list object.
            attrs['metadata'] = list(data['metadata'])

        G_sub.add_edge(u, v, **attrs)

    # No pruning step is needed: the graph is built from edges only, so it
    # cannot contain isolated nodes.
    return G_sub

import matplotlib.pyplot as plt

def plot_graph(G, title=None):
    """
    Draw a NetworkX DiGraph: labelled nodes, arrowed edges, and each edge's
    'weight' attribute printed in red along the edge.

    Args:
      - G: the graph to draw
      - title: optional figure title; omitted when falsy
    """
    plt.figure(figsize=(10, 8))

    # Fixed seed so the same graph always gets the same layout
    layout = nx.spring_layout(G, seed=42)
    weight_by_edge = nx.get_edge_attributes(G, "weight")

    # Nodes first, then edges, then node labels, then the weight labels
    nx.draw_networkx_nodes(G, layout, node_size=1200)
    nx.draw_networkx_edges(G, layout, arrowstyle="->", arrows=True, arrowsize=15)
    nx.draw_networkx_labels(G, layout, font_size=10)
    nx.draw_networkx_edge_labels(
        G, layout, edge_labels=weight_by_edge, font_color='red'
    )

    if title:
        plt.title(title)

    plt.axis("off")
    plt.show()

In [18]:
# Build the transition graph once from the full DataFrame; later cells filter and plot it.
G_master = build_master_graph(df)

In [None]:
# 1) The complete master graph (may be large and slow to draw)
plot_graph(G_master, title="All Data")

# 2) Only transitions recorded for experiment "OBE1" under condition 1
obe1_cond1_filter = {"experiment": "OBE1", "condition": 1}
G_obe1_cond1 = filter_graph(G_master, **obe1_cond1_filter)
plot_graph(G_obe1_cond1, title="Experiment=OBE1, Condition=1")

# 3) Only transitions recorded for the file named "ID 05"
id05_filter = {"file_name": "ID 05"}
G_id05 = filter_graph(G_master, **id05_filter)
plot_graph(G_id05, title="Participant ID 05")


In [None]:
# Filter the DataFrame for one specific file name.
# The value must match the "File Name" column exactly (the comparison is case-sensitive).
file_name_to_observe = "Id 14"
subset = df[df["File Name"] == file_name_to_observe].sort_values("utterance_index")

# Create a directed graph for the specific file name:
# one edge per pair of consecutive topics, in utterance order.
G_specific = nx.DiGraph()
topics = subset["one_topic_name"].tolist()
G_specific.add_edges_from(zip(topics, topics[1:]))

if G_specific.number_of_edges() == 0:
    # Either no row matched the file name or the file has fewer than two rows;
    # say so instead of silently rendering an empty figure.
    print(
        f"No topic transitions to plot for File: {file_name_to_observe} "
        f"({len(subset)} matching row(s))"
    )
else:
    # Visualize the graph for the specific file name
    fig, ax = plt.subplots(figsize=(10, 6))
    pos = nx.spring_layout(G_specific, seed=42)  # Fixed seed: reproducible layout
    nx.draw(G_specific, pos, ax=ax, with_labels=True, node_size=3000, font_size=10, arrowsize=20)
    ax.set_title(f"Directed Graph of Topics for File: {file_name_to_observe}", fontsize=14)
    plt.show()


In [11]:
import math
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx

def plot_topic_network(H, title="Topic Network Graph", show_edge_labels=True, save_path=None):
    """
    Plot a directed network graph for topics.

    Nodes are sized by the summed 'weight' of their incoming edges and edges
    are drawn with a width that grows with their own 'weight'. Edges that have
    a counterpart in the opposite direction are drawn slightly curved so the
    two arrows do not lie on top of each other. An edge without a 'weight'
    attribute is treated as weight 1.

    Parameters:
        H (Graph): The directed graph to plot.
        title (str): The title of the plot.
        show_edge_labels (bool): Whether to show edge weight labels
            (formatted with two decimals).
        save_path (str, optional): Path to save the plot as an image file.
            If None, the plot is not saved.
    """
    size_range = [800, 2500]   # node size for the smallest / largest incoming weight
    width_range = [0.5, 3]     # line width for the smallest / largest edge weight

    # FIGURE -------------------------------------------------------------------------
    fig, ax = plt.subplots(figsize=(15, 10), dpi=300)

    # NODES positions: spring layout----------------------------------------------------
    pos = nx.spring_layout(H, seed=42, k=0.5, scale=3)

    # Node sizes: based on the total weight of incoming edges
    node_list = list(H.nodes())
    in_weight = {
        node: sum(data.get('weight', 1) for _, _, data in H.in_edges(node, data=True))
        for node in node_list
    }
    # `or 1` keeps the interpolation range valid when no node has incoming weight.
    max_in_weight = max(in_weight.values(), default=0) or 1
    node_sizes = [
        float(np.interp(in_weight[node], [0, max_in_weight], size_range))
        for node in node_list
    ]

    # Plot nodes with their names as labels
    nx.draw_networkx_nodes(
        H, pos,
        nodelist=node_list,
        node_size=node_sizes,
        node_color='#808080',
        edgecolors="white",
        linewidths=1.5,
        ax=ax
    )
    nx.draw_networkx_labels(H, pos, font_size=10, font_color='black', ax=ax)

    # EDGES attributes--------------------------------------------------------------------
    edge_weights = {(u, v): data.get('weight', 1) for u, v, data in H.edges(data=True)}
    max_edge_weight = max(edge_weights.values(), default=0) or 1

    # Draw edges one at a time so each can have its own width and curvature
    edge_labels = {}
    for (u, v), weight in edge_weights.items():
        edge_width = np.interp(weight, [0, max_edge_weight], width_range)
        nx.draw_networkx_edges(
            H, pos,
            edgelist=[(u, v)],
            nodelist=node_list,
            # The real node sizes are needed so arrows stop at the node border
            # instead of ending underneath the larger markers.
            node_size=node_sizes,
            width=edge_width,
            arrowstyle='-|>',
            arrowsize=15,
            edge_color='#3b3b3b',
            connectionstyle='arc3,rad=0.1' if H.has_edge(v, u) else 'arc3,rad=0.0',
            ax=ax
        )
        edge_labels[(u, v)] = f"{weight:.2f}"

    # Draw edge labels if enabled
    if show_edge_labels:
        nx.draw_networkx_edge_labels(
            H, pos,
            edge_labels=edge_labels,
            font_size=8,
            label_pos=0.5,
            bbox=dict(alpha=0),
            verticalalignment='center',
            ax=ax
        )

    # LEGEND BOX----------------------------------------------------------------------------
    handles = [plt.Line2D([0], [0], marker='o', color='#808080', linestyle='', markersize=10)]
    labels = ["Topics"]
    ax.legend(
        handles,
        labels,
        loc='best',
        title="Legend",
        fontsize=10,
        title_fontsize=12
    )

    # PLOT figure----------------------------------------------------------------------------
    ax.set_title(title, fontsize=16, color='#3b3b3b')
    ax.set_axis_off()
    fig.tight_layout()

    # Save the plot if save_path is provided
    if save_path:
        # Use the figure's own background colour; the previous code referenced
        # an undefined name here and raised NameError on every save.
        fig.savefig(save_path, bbox_inches='tight', dpi=1000, facecolor=fig.get_facecolor())

    plt.show()


In [None]:
# `G` is not defined anywhere at notebook scope, so this cell fails on a fresh
# kernel; plot the master transition graph built earlier instead.
# NOTE(review): if a different graph (e.g. one with normalised weights) was
# intended here, build it explicitly before this call.
plot_topic_network(G_master, title="Topic Network Graph", show_edge_labels=True)