<a href="https://colab.research.google.com/github/FredSadeghi/Amazon_CoPurchase_Network_Analysis/blob/main/BigDataAmazon.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import gzip
import csv
import re
import pandas as pd
from textblob import TextBlob
import networkx as nx
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np
import os
import time
import igraph as ig
import plotly.graph_objects as go

# Install required packages
!pip uninstall -y python-louvain community
!pip install python-louvain leidenalg python-igraph

# Probe for python-louvain (its module is named `community`) and record the
# outcome in USE_LOUVAIN so later cells can pick a community-detection backend.
try:
    from community import best_partition as louvain_best_partition
except ImportError as e:
    print(f"Failed to import python-louvain: {e}")
    print("Falling back to NetworkX's greedy_modularity_communities for community detection.")
    USE_LOUVAIN = False
else:
    USE_LOUVAIN = True
    print("Successfully imported python-louvain for community detection.")

Found existing installation: python-louvain 0.16
Uninstalling python-louvain-0.16:
  Successfully uninstalled python-louvain-0.16
[0mCollecting python-louvain
  Using cached python_louvain-0.16-py3-none-any.whl
Installing collected packages: python-louvain
Successfully installed python-louvain-0.16
Successfully imported python-louvain for community detection.


In [2]:
# Clone the project repository only when its directory is not already present.
# The dataset path configured in the next cell points inside this directory.
# NOTE(review): the existence check uses an absolute /content path while
# `git clone` writes to the current working directory — presumably both are
# /content on Colab; confirm before running elsewhere.
if not os.path.exists('/content/Amazon_CoPurchase_Network_Analysis'):
    !git clone https://github.com/FredSadeghi/Amazon_CoPurchase_Network_Analysis.git

else:
    print("Repository already cloned. Skipping.")

Repository already cloned. Skipping.


In [3]:
# --- Part 1: Preprocessing with Caching ---
# Input and output file paths
# Raw gzip-compressed metadata file read by parse_amazon_data().
input_file = '/content/Amazon_CoPurchase_Network_Analysis/amazon-meta.txt.gz'
# Intermediate CSVs written by parse_amazon_data(); the relative names resolve
# against the current working directory.
product_output = 'products_cleaned.csv'
category_output = 'categories_cleaned.csv'
review_output = 'reviews_cleaned.csv'
edge_output = 'edges.csv'
# Final cleaned CSVs written by convert_data() and reloaded from disk by
# preprocess_with_cache() when all cached files exist.
products_enriched_file = 'products_enriched.csv'
categories_expanded_file = 'categories_expanded.csv'
reviews_processed_file = 'reviews_processed.csv'

In [4]:
def parse_amazon_data():
    """Parse the raw Amazon metadata file into products, categories, reviews and edges CSVs.

    Reads the gzip-compressed text at ``input_file`` (decoded as latin-1) and
    writes ``product_output``, ``category_output``, ``review_output`` and
    ``edge_output`` (all UTF-8).

    * A product is kept only when it has an Id, an ASIN, and a Group and Title
      that are neither missing, blank, nor 'unknown' (case-insensitive).
      Categories, reviews and 'similar' links of rejected products are dropped.
    * Every 'similar' entry of a kept product adds 1 to the weight of the
      unordered ASIN pair, so a pair listed from both sides gets weight 2.
    * Edges are written only when both endpoints are kept products.
    * Review lines that lack the expected nine tokens or carry non-integer
      rating/votes/helpful values are counted as skipped reviews instead of
      aborting the whole parse.

    Prints a summary of skipped rows. Returns None.
    """
    skipped = {'products': 0, 'categories': 0, 'reviews': 0, 'edges': 0}
    edge_counts = {}     # sorted (asin, asin) pair -> number of 'similar' listings
    valid_asins = set()  # ASINs of products that were actually written

    def _is_unusable(value):
        """Return True for a Group/Title that is missing, blank or 'unknown'."""
        if value is None:
            return True
        text = str(value).strip()
        return not text or text.lower() == 'unknown'

    def _new_record(product_id=None):
        """Return an empty product record, optionally carrying its Id."""
        return {'Id': product_id, 'categories': [], 'reviews': [], 'similar': []}

    def _parse_review(line):
        """Turn one review line into a dict, or return None when it is malformed."""
        parts = line.split()
        # Values are read from token positions 0, 2, 4, 6 and 8, so at least
        # nine tokens are required.
        if len(parts) < 9:
            return None
        try:
            return {
                'date': parts[0],  # e.g., 2000-7-28
                'customer': parts[2],
                'rating': int(parts[4]),
                'votes': int(parts[6]),
                'helpful': int(parts[8])
            }
        except ValueError:
            return None

    with gzip.open(input_file, 'rt', encoding='latin-1') as f, \
         open(product_output, 'w', newline='', encoding='utf-8') as prod_out, \
         open(category_output, 'w', newline='', encoding='utf-8') as cat_out, \
         open(review_output, 'w', newline='', encoding='utf-8') as rev_out, \
         open(edge_output, 'w', newline='', encoding='utf-8') as edge_out:

        product_writer = csv.writer(prod_out)
        category_writer = csv.writer(cat_out)
        review_writer = csv.writer(rev_out)
        edge_writer = csv.writer(edge_out)

        # Write headers
        product_writer.writerow(['Id', 'ASIN', 'Title', 'Group', 'SalesRank'])
        category_writer.writerow(['ASIN', 'CategoryPath'])
        review_writer.writerow(['ASIN', 'CustomerID', 'Rating', 'Votes', 'Helpful', 'Sentiment', 'Date'])
        edge_writer.writerow(['SourceASIN', 'TargetASIN', 'Weight'])

        def _flush(record):
            """Write a finished product record and tally its 'similar' links."""
            asin = record.get('ASIN')
            if not (asin and record.get('Id')):
                return  # nothing parsed yet, or an entry without Id/ASIN

            group = record.get('group')
            title = record.get('title')
            if _is_unusable(group) or _is_unusable(title):
                skipped['products'] += 1
                return

            product_writer.writerow([record['Id'], asin, title, group, record.get('salesrank', '-1')])
            valid_asins.add(asin)

            for cat in record['categories']:
                if cat and str(cat).strip():
                    category_writer.writerow([asin, cat])
                else:
                    skipped['categories'] += 1

            for review in record['reviews']:
                review_row = [
                    asin,
                    review['customer'],
                    review['rating'],
                    review['votes'],
                    review['helpful'],
                    compute_sentiment(review['rating']),
                    review['date']
                ]
                if None in review_row or any(str(x).strip() == '' for x in review_row[:5]):
                    skipped['reviews'] += 1
                else:
                    review_writer.writerow(review_row)

            for similar_asin in record['similar']:
                if similar_asin and str(similar_asin).strip():
                    # Sort the pair so (a, b) and (b, a) share one undirected edge.
                    edge = tuple(sorted([asin, similar_asin]))
                    edge_counts[edge] = edge_counts.get(edge, 0) + 1
                else:
                    skipped['edges'] += 1

        # Start with an empty record so lines before the first "Id:" cannot
        # hit a missing 'categories'/'reviews' key.
        current = _new_record()

        for raw_line in f:
            line = raw_line.strip()

            if line.startswith("Id:"):
                # A new product begins: emit the previous one first.
                _flush(current)
                current = _new_record(line.split('Id:')[1].strip())

            elif line.startswith("ASIN:"):
                current['ASIN'] = line.split("ASIN:")[1].strip()

            # Field lines are matched by prefix so that other lines which merely
            # contain the text "title:" / "group:" / "salesrank:" cannot
            # overwrite the field.
            elif line.startswith('title:'):
                current['title'] = line[len('title:'):].strip()

            elif line.startswith('group:'):
                current['group'] = line[len('group:'):].strip()

            elif line.startswith('salesrank:'):
                current['salesrank'] = line[len('salesrank:'):].strip()

            elif line.startswith("similar:"):
                # Tokens 0 and 1 are not ASINs; everything after them is kept.
                parts = line.split()
                current['similar'] = parts[2:] if len(parts) > 2 else []

            elif line.startswith("|"):
                current['categories'].append(line)

            elif re.match(r'\d{4}-\d{1,2}-\d{1,2}', line):
                review = _parse_review(line)
                if review is None:
                    skipped['reviews'] += 1
                else:
                    current['reviews'].append(review)

        # Write the last product
        _flush(current)

        # Write edges only for valid ASINs
        for (source, target), weight in edge_counts.items():
            if source in valid_asins and target in valid_asins:
                edge_writer.writerow([source, target, weight])
            else:
                skipped['edges'] += 1

        # Print summary of skipped rows
        print(f"Skipped {skipped['products']} product rows due to missing required columns or invalid Group/Title.")
        print(f"Skipped {skipped['categories']} category rows due to missing or empty category paths.")
        print(f"Skipped {skipped['reviews']} review rows due to missing required columns.")
        print(f"Skipped {skipped['edges']} edge rows due to missing/empty ASINs or invalid products.")

In [5]:
def compute_sentiment(rating):
    """Map a star rating to a stand-in sentiment score (the data has no review text).

    Returns 0.0 for a rating of exactly 3, -1.0 for ratings of 2 or lower,
    and 1.0 for everything else.
    """
    if rating == 3:
        return 0.0  # Neutral
    # Negative at or below 2, positive otherwise
    return -1.0 if rating <= 2 else 1.0

In [6]:
def convert_data():
    """Convert the parsed CSV files into cleaned, structured pandas DataFrames.

    Reads ``product_output``, ``category_output`` and ``review_output``, then:

    * drops rows with missing key columns,
    * re-applies the Group/Title validity filter (not missing, blank or 'unknown'),
    * coerces SalesRank to int, using -1 where it is not numeric,
    * splits every category path into one row per level (``CategoryLevels``),
    * aggregates per-ASIN review statistics and left-joins them onto products,
      filling products without reviews with zeros.

    The three results are written to ``products_enriched_file``,
    ``categories_expanded_file`` and ``reviews_processed_file``.

    Returns:
        tuple: (products_enriched, categories_expanded, reviews_df)
    """
    # Identifier/text columns are read as strings explicitly: with type
    # inference a block of purely numeric ASINs can be parsed as integers,
    # which loses leading zeros and breaks joins on ASIN.
    products_df = pd.read_csv(product_output, dtype={'ASIN': str, 'Title': str, 'Group': str})
    categories_df = pd.read_csv(category_output, dtype={'ASIN': str})
    reviews_df = pd.read_csv(review_output, dtype={'ASIN': str, 'CustomerID': str})

    # Drop rows with missing values in critical columns
    products_df = products_df.dropna(subset=['Id', 'ASIN'])
    categories_df = categories_df.dropna(subset=['ASIN', 'CategoryPath'])
    reviews_df = reviews_df.dropna(subset=['ASIN', 'CustomerID', 'Rating', 'Votes', 'Helpful'])

    def _has_real_value(column):
        """Boolean mask: value is present, non-blank and not 'unknown'."""
        stripped = column.str.strip()
        return column.notna() & (stripped.str.lower() != 'unknown') & (stripped != '')

    # Double-check filtering for Group and Title
    initial_count = len(products_df)
    # .copy() makes the filtered frame independent so the column assignment
    # below does not write into a slice of the unfiltered frame.
    products_df = products_df[
        _has_real_value(products_df['Group']) & _has_real_value(products_df['Title'])
    ].copy()
    print(f"Filtered out {initial_count - len(products_df)} product rows with invalid Group or Title in convert_data.")

    # Clean products DataFrame
    products_df['SalesRank'] = pd.to_numeric(products_df['SalesRank'], errors='coerce').fillna(-1).astype(int)

    # Parse categories
    def parse_category_path(cat_path):
        """Split a '|'-separated path into level names with '[digits]' ids removed."""
        if pd.isna(cat_path):
            return []
        parts = cat_path.split("|")
        return [re.sub(r"\[\d+\]", "", part).strip() for part in parts if part]

    categories_df = categories_df.assign(
        CategoryLevels=categories_df['CategoryPath'].apply(parse_category_path)
    )
    categories_expanded = categories_df.explode('CategoryLevels')

    # Aggregate review metrics
    review_summary = reviews_df.groupby('ASIN').agg({
        'CustomerID': 'count',
        'Rating': 'mean',
        'Votes': 'sum',
        'Helpful': 'sum',
        'Sentiment': 'mean'
    }).rename(columns={
        'CustomerID': 'NumReviews',
        'Rating': 'AvgRating',
        'Votes': 'TotalVotes',
        'Helpful': 'TotalHelpful',
        'Sentiment': 'AvgSentiment'
    }).reset_index()

    # Join with products; products without any review get zeroed metrics
    products_enriched = products_df.merge(review_summary, on='ASIN', how='left')
    products_enriched = products_enriched.fillna({
        'NumReviews': 0, 'AvgRating': 0.0, 'TotalVotes': 0, 'TotalHelpful': 0, 'AvgSentiment': 0.0
    })

    # Save final cleaned data
    products_enriched.to_csv(products_enriched_file, index=False)
    categories_expanded.to_csv(categories_expanded_file, index=False)
    reviews_df.to_csv(reviews_processed_file, index=False)

    return products_enriched, categories_expanded, reviews_df

In [7]:
def preprocess_with_cache(force_reprocess=False):
    """Return the preprocessed DataFrames, reusing cached CSV files when possible.

    Args:
        force_reprocess: When True, ignore any cached files and rebuild
            everything from the raw metadata.

    Returns:
        tuple: (products_enriched, categories_expanded, reviews_processed)

    The cache counts as valid only when every intermediate and final CSV
    exists; file contents are not checked.
    """
    required_files = [
        product_output, category_output, review_output, edge_output,
        products_enriched_file, categories_expanded_file, reviews_processed_file
    ]

    all_files_exist = all(os.path.exists(f) for f in required_files)

    if all_files_exist and not force_reprocess:
        print("Cached preprocessed files found. Loading from cache...")
        start = time.time()
        # Read identifiers as strings so numeric-looking ASINs are not parsed
        # as integers (which would drop leading zeros).
        products_enriched = pd.read_csv(products_enriched_file, dtype={'ASIN': str})
        categories_expanded = pd.read_csv(categories_expanded_file, dtype={'ASIN': str})
        reviews_processed = pd.read_csv(reviews_processed_file, dtype={'ASIN': str, 'CustomerID': str})
        print(f"Loading cached data took {time.time() - start:.2f} seconds")
        return products_enriched, categories_expanded, reviews_processed

    print("Cached files not found or reprocessing forced. Running preprocessing...")
    start = time.time()
    print("Parsing Amazon metadata...")
    parse_start = time.time()
    parse_amazon_data()
    print(f"Parsing took {time.time() - parse_start:.2f} seconds")

    print("Converting and cleaning data...")
    convert_start = time.time()
    products_enriched, categories_expanded, reviews_processed = convert_data()
    print(f"Converting took {time.time() - convert_start:.2f} seconds")
    print("Done. Cleaned data saved to products_enriched.csv, categories_expanded.csv, reviews_processed.csv, and edges.csv")
    print(f"Total preprocessing took {time.time() - start:.2f} seconds")
    return products_enriched, categories_expanded, reviews_processed

In [8]:
# --- Part 2: Graph Construction & Analysis ---
# Output file paths
# Inputs of the graph stage; these are the same file names as `edge_output`
# and `products_enriched_file` produced in Part 1.
edges_file = 'edges.csv'
products_file = 'products_enriched.csv'
# CSV outputs written by perform_graph_analysis().
graph_metrics_file = 'graph_metrics.csv'
influential_nodes_file = 'influential_nodes.csv'
# Image outputs written by plot_degree_distribution() and visualize_graph().
degree_dist_plot = 'degree_distribution.png'
graph_plot = 'copurchase_graph.png'

In [9]:
def build_copurchase_graph(edges_df, products_df=None):
    """Construct an undirected, weighted graph from co-purchasing relationships.

    Args:
        edges_df: DataFrame with 'SourceASIN', 'TargetASIN' and 'Weight' columns.
        products_df: Optional DataFrame with an 'ASIN' column. When given, only
            edges whose two endpoints both appear in it are added.

    Returns:
        networkx.Graph whose edges carry a 'weight' attribute.
    """
    G = nx.Graph()

    kept_edges = edges_df
    if products_df is not None:
        # Filter with a vectorized mask instead of testing row by row.
        valid_asins = set(products_df['ASIN'])
        both_valid = (edges_df['SourceASIN'].isin(valid_asins)
                      & edges_df['TargetASIN'].isin(valid_asins))
        kept_edges = edges_df[both_valid]

    # tolist() yields plain Python scalars, so edge weights are ordinary numbers.
    G.add_edges_from(
        (source, target, {'weight': weight})
        for source, target, weight in zip(kept_edges['SourceASIN'].tolist(),
                                          kept_edges['TargetASIN'].tolist(),
                                          kept_edges['Weight'].tolist())
    )
    print(f"Graph constructed with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges")
    return G

In [10]:
def add_node_attributes(G, products_df):
    """Attach product metadata to the nodes of G, modifying G in place.

    Does nothing when ``products_df`` is None. Otherwise every node found in
    ``products_df`` receives Title, Group, SalesRank, AvgRating, NumReviews and
    AvgSentiment attributes, and every node without a product row is removed.
    """
    if products_df is None:
        return

    # Restrict the product table to ASINs that are present as graph nodes
    in_graph = products_df[products_df['ASIN'].isin(G.nodes)]

    attributes = {
        record['ASIN']: {
            'Title': record['Title'],
            'Group': record['Group'],
            'SalesRank': record.get('SalesRank', -1),
            'AvgRating': record.get('AvgRating', 0.0),
            'NumReviews': record.get('NumReviews', 0),
            'AvgSentiment': record.get('AvgSentiment', 0.0)
        }
        for _, record in in_graph.iterrows()
    }
    nx.set_node_attributes(G, attributes)

    # Nodes without a product row should not exist after proper filtering,
    # but drop them if they do
    orphans = [node for node in G.nodes if node not in attributes]
    if orphans:
        print(f"Removing {len(orphans)} nodes that lack attributes (not in products_df).")
        G.remove_nodes_from(orphans)

In [11]:
def analyze_graph_structure(G, sample_size=1000):
    """Compute and print basic structural metrics of the graph.

    Args:
        G: The graph to analyze.
        sample_size: Number of nodes (the first ones in ``G.nodes`` order) whose
            induced subgraph is used for the clustering coefficient. ``None``
            uses the whole graph.

    Returns:
        tuple: (metrics, degrees) where ``metrics`` maps 'num_nodes',
        'num_edges', 'avg_clustering', 'num_components' and 'avg_degree' to
        their values and ``degrees`` lists the degree of every node.

    Note: 'avg_clustering' is measured on the sampled subgraph only, so it is
    an estimate rather than the clustering coefficient of the full graph.
    """
    sampled_graph = G.subgraph(list(G.nodes)[:sample_size])

    # Averaging over zero nodes would divide by zero, so report 0.0 for an
    # empty graph/sample instead.
    if sampled_graph.number_of_nodes() > 0:
        avg_clustering = nx.average_clustering(sampled_graph)
    else:
        avg_clustering = 0.0

    metrics = {
        'num_nodes': G.number_of_nodes(),
        'num_edges': G.number_of_edges(),
        'avg_clustering': avg_clustering,
        'num_components': nx.number_connected_components(G)
    }
    degrees = [d for _, d in G.degree()]
    metrics['avg_degree'] = np.mean(degrees) if degrees else 0

    print("Graph Metrics:")
    for key, value in metrics.items():
        print(f"{key}: {value}")

    return metrics, degrees

In [12]:
def plot_degree_distribution(degrees):
    """Plot the degree distribution on log-log axes with a power-law fit.

    Args:
        degrees: Iterable with the degree of every node.

    The fit is a least-squares line through (log10 degree, log10 frequency)
    and is computed and drawn only over degrees greater than zero, because
    log10(0) is undefined. The figure is saved to ``degree_dist_plot`` and closed.
    """
    plt.figure(figsize=(10, 6))
    counts = Counter(degrees)
    degree_values = np.array(list(counts.keys()))
    frequencies = np.array(list(counts.values()))
    plt.scatter(degree_values, frequencies, color='blue', alpha=0.5, label='Degree')

    mask = (degree_values > 0) & (frequencies > 0)
    log_degrees = np.log10(degree_values[mask])
    log_freq = np.log10(frequencies[mask])
    if len(log_degrees) > 1:
        coeffs = np.polyfit(log_degrees, log_freq, 1)
        # Draw the line over the same positive degrees used for the fit,
        # sorted so the line is traced once from left to right.
        fit_degrees = np.sort(degree_values[mask])
        plt.plot(fit_degrees, 10**(coeffs[1] + coeffs[0] * np.log10(fit_degrees)), 'k--',
                 label=f'Power-law fit (γ={-coeffs[0]:.2f})')
    plt.xscale('log')
    plt.yscale('log')
    plt.xlabel('Degree')
    plt.ylabel('Frequency')
    plt.title('Degree Distribution of Co-Purchasing Network')
    plt.legend()
    plt.savefig(degree_dist_plot)
    plt.close()
    print(f"Degree distribution plot saved to {degree_dist_plot}")

In [13]:
def find_influential_nodes(G):
    """Score every node by degree centrality and weighted PageRank.

    Returns:
        tuple: (centrality_df, influential_nodes) where ``centrality_df`` has
        one row per node with 'ASIN', 'DegreeCentrality' and 'PageRank', and
        ``influential_nodes`` holds the 10 rows with the highest PageRank.
    """
    print("Computing degree centrality...")
    degree_scores = nx.degree_centrality(G)

    print("Computing PageRank...")
    pagerank_scores = nx.pagerank(G, weight='weight', max_iter=50)

    nodes = list(degree_scores)
    centrality_df = pd.DataFrame({
        'ASIN': nodes,
        'DegreeCentrality': [degree_scores[node] for node in nodes],
        # Nodes without a PageRank entry default to 0
        'PageRank': [pagerank_scores.get(node, 0) for node in nodes]
    })

    ranked = centrality_df.sort_values(by='PageRank', ascending=False)
    influential_nodes = ranked.head(10)

    return centrality_df, influential_nodes

In [14]:
def detect_communities(G):
    """Detect communities and store each node's id in its 'community' attribute.

    Backends are tried in this order:

    1. Leiden via igraph (modularity objective, weighted by 'weight'),
    2. Louvain via python-louvain, when Leiden fails,
    3. NetworkX greedy modularity, when the igraph conversion or Louvain fails,
       or when ``USE_LOUVAIN`` is False (the flag gates both Leiden and Louvain).

    Returns:
        dict: node -> community id, whichever backend produced the result.
    """
    def _apply(partition, method):
        """Store the partition on G's nodes, report it and hand it back."""
        for node, comm in partition.items():
            G.nodes[node]['community'] = comm
        num_communities = len(set(partition.values()))
        print(f"Detected {num_communities} communities using {method}")
        return partition

    def _greedy_partition():
        """Node -> community id from NetworkX greedy modularity maximisation."""
        communities = nx.algorithms.community.greedy_modularity_communities(G)
        return {node: comm_id
                for comm_id, comm_nodes in enumerate(communities)
                for node in comm_nodes}

    if USE_LOUVAIN:
        try:
            g = ig.Graph.from_networkx(G)
            try:
                clustering = g.community_leiden(objective_function='modularity', weights='weight')
                # The vertex attribute '_nx_name' carries the original node
                # label; map it to the community id of each vertex.
                partition = dict(zip(g.vs['_nx_name'], clustering.membership))
                return _apply(partition, "Leiden method")
            except Exception as leiden_error:
                # Catch Exception (not a bare except) so KeyboardInterrupt and
                # SystemExit still propagate, and say why Leiden was abandoned.
                print(f"Leiden failed ({leiden_error}); trying Louvain")
                return _apply(louvain_best_partition(G), "Louvain method")
        except Exception as e:
            print(f"Error in Louvain/Leiden: {e}, falling back to NetworkX")

    return _apply(_greedy_partition(), "NetworkX greedy modularity")

In [15]:
def analyze_by_category(G, products_df):
    """Count and print edges that stay inside a product group versus those crossing groups.

    Nodes whose ASIN is absent from ``products_df`` are treated as group
    'Unknown'. Cross-group edges are keyed by the sorted pair of group names.
    """
    group_of = dict(zip(products_df['ASIN'], products_df['Group']))
    within = Counter()
    between = Counter()

    for u, v in G.edges():
        group_u, group_v = group_of.get(u, 'Unknown'), group_of.get(v, 'Unknown')
        if group_u != group_v:
            between[tuple(sorted([group_u, group_v]))] += 1
        else:
            within[group_u] += 1

    # Convert to plain dicts so the printed form is an ordinary dict literal
    print("Edges within groups:", dict(within))
    print("Edges between groups:", dict(between))

In [16]:
def analyze_sentiment_centrality(centrality_df, products_df):
    """Print how PageRank correlates with average sentiment and with review count.

    The two frames are joined on 'ASIN' (rows present in only one of them are
    dropped) before the correlations are computed.
    """
    review_columns = products_df[['ASIN', 'AvgSentiment', 'NumReviews']]
    merged = centrality_df.merge(review_columns, on='ASIN')
    pagerank = merged['PageRank']

    # Compute both correlations first, then report them
    correlations = {
        column: pagerank.corr(merged[column])
        for column in ('AvgSentiment', 'NumReviews')
    }
    for column, value in correlations.items():
        print(f"Correlation between PageRank and {column}: {value:.3f}")

In [17]:
def visualize_graph(G, products_df):
    """Draw the 20 highest-PageRank nodes of the largest community as a static image.

    Nodes are coloured by product group and sized by PageRank, edge widths
    scale with edge weight, and the five top-ranked nodes are labelled. The
    figure is saved to ``graph_plot`` and a table describing the drawn nodes
    is written to 'visualized_nodes_summary.csv' and printed.

    ``products_df`` is accepted but not read by this function; the nodes of
    ``G`` must already carry a 'community' attribute.
    """
    pagerank_scores = nx.pagerank(G, weight='weight')
    centrality_df = pd.DataFrame.from_dict(pagerank_scores, orient='index', columns=['PageRank'])

    # Locate the community with the most members
    node_to_community = nx.get_node_attributes(G, 'community')
    community_sizes = Counter(node_to_community.values())
    largest_community = max(community_sizes, key=community_sizes.get)
    print(f"Largest community: Community {largest_community} with {community_sizes[largest_community]} nodes")

    members = [node for node, comm in node_to_community.items() if comm == largest_community]
    community_df = centrality_df.loc[members]
    top_nodes = community_df.nlargest(20, 'PageRank').index.tolist()
    subgraph = G.subgraph(top_nodes)

    plt.figure(figsize=(15, 10))
    pos = nx.kamada_kawai_layout(subgraph, scale=2)

    # One colour per product group
    groups = [subgraph.nodes[node].get('Group', 'Unknown') for node in subgraph.nodes()]
    unique_groups = list(set(groups))
    palette = plt.cm.Set2(np.linspace(0, 1, len(unique_groups)))
    group_to_color = dict(zip(unique_groups, palette))
    node_colors = [group_to_color[group] for group in groups]

    # Node area grows with PageRank
    node_sizes = [centrality_df.loc[node, 'PageRank'] * 50000 for node in subgraph.nodes()]

    # Edge widths are weights rescaled so the heaviest edge has width 5
    edge_weights = [subgraph[u][v]['weight'] for u, v in subgraph.edges()]
    heaviest = max(edge_weights) if edge_weights else 1
    edge_widths = [weight / heaviest * 5 for weight in edge_weights]

    nx.draw(subgraph, pos, node_size=node_sizes, node_color=node_colors,
            edge_color='gray', width=edge_widths, alpha=0.6)

    # Label only the five highest-ranked nodes to keep the picture readable
    labels = {}
    for node in community_df.nlargest(5, 'PageRank').index.tolist():
        node_data = subgraph.nodes[node]
        short_title = node_data.get('Title', 'Unknown')[:20]
        labels[node] = f"{short_title}\n({node_data.get('Group', 'Unknown')})"
    nx.draw_networkx_labels(subgraph, pos, labels, font_size=10, font_color='black', font_weight='bold')

    from matplotlib.lines import Line2D
    legend_elements = []
    for group in unique_groups:
        legend_elements.append(Line2D([0], [0], marker='o', color='w', label=group,
                                      markerfacecolor=group_to_color[group], markersize=10))
    plt.legend(handles=legend_elements, title="Product Groups", loc='best')

    plt.title(f'Co-Purchasing Network (Top Nodes in Community {largest_community})')
    plt.savefig(graph_plot)
    plt.close()
    print(f"Graph visualization saved to {graph_plot}")

    # Tabular summary of the drawn nodes
    summary_df = pd.DataFrame([
        {
            'ASIN': node,
            'Title': subgraph.nodes[node].get('Title', 'Unknown'),
            'Group': subgraph.nodes[node].get('Group', 'Unknown'),
            'PageRank': centrality_df.loc[node, 'PageRank'],
            'AvgRating': subgraph.nodes[node].get('AvgRating', 0.0),
            'NumReviews': subgraph.nodes[node].get('NumReviews', 0),
            'Community': subgraph.nodes[node].get('community', -1)
        }
        for node in top_nodes
    ])
    summary_df.to_csv('visualized_nodes_summary.csv', index=False)
    print("Summary of visualized nodes saved to visualized_nodes_summary.csv")
    print(summary_df)

In [18]:
def visualize_graph_interactive(G, products_df):
    """Render the 20 highest-PageRank nodes of the largest community as an interactive Plotly page.

    Nodes are coloured by product group, sized by PageRank, and show their
    details on hover. The result is written to
    'copurchase_graph_interactive.html'.

    ``products_df`` is accepted but not read by this function; the nodes of
    ``G`` must already carry a 'community' attribute.
    """
    pagerank_scores = nx.pagerank(G, weight='weight')
    centrality_df = pd.DataFrame.from_dict(pagerank_scores, orient='index', columns=['PageRank'])

    # Pick the top-ranked members of the biggest community
    node_to_community = nx.get_node_attributes(G, 'community')
    community_sizes = Counter(node_to_community.values())
    largest_community = max(community_sizes, key=community_sizes.get)
    members = [node for node, comm in node_to_community.items() if comm == largest_community]
    community_df = centrality_df.loc[members]
    top_nodes = community_df.nlargest(20, 'PageRank').index.tolist()
    subgraph = G.subgraph(top_nodes)

    pos = nx.kamada_kawai_layout(subgraph, scale=2)

    # Each edge contributes two points followed by None, which breaks the line
    # so that all edges can share a single trace
    edge_x, edge_y = [], []
    for u, v in subgraph.edges():
        (x0, y0), (x1, y1) = pos[u], pos[v]
        edge_x += [x0, x1, None]
        edge_y += [y0, y1, None]

    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        line=dict(width=0.5, color='#888'),
        hoverinfo='none',
        mode='lines')

    drawn_nodes = list(subgraph.nodes())
    node_x = [pos[node][0] for node in drawn_nodes]
    node_y = [pos[node][1] for node in drawn_nodes]

    # One 'rgb(r,g,b)' colour string per product group
    groups = [subgraph.nodes[node].get('Group', 'Unknown') for node in drawn_nodes]
    unique_groups = list(set(groups))
    palette = plt.cm.Set2(np.linspace(0, 1, len(unique_groups)))
    rgb_strings = [f'rgb({int(c[0]*255)},{int(c[1]*255)},{int(c[2]*255)})' for c in palette]
    group_to_color = dict(zip(unique_groups, rgb_strings))
    node_colors = [group_to_color[group] for group in groups]

    node_sizes = [centrality_df.loc[node, 'PageRank'] * 50000 for node in drawn_nodes]

    # Hover text: one '<br>'-separated line per attribute
    node_text = []
    for node in drawn_nodes:
        node_data = subgraph.nodes[node]
        title = node_data.get('Title', 'Unknown')
        group = node_data.get('Group', 'Unknown')
        pagerank = centrality_df.loc[node, 'PageRank']
        avg_rating = node_data.get('AvgRating', 0.0)
        num_reviews = node_data.get('NumReviews', 0)
        community = node_data.get('community', -1)
        node_text.append("<br>".join([
            f"ASIN: {node}",
            f"Title: {title}",
            f"Group: {group}",
            f"PageRank: {pagerank:.6f}",
            f"AvgRating: {avg_rating:.1f}",
            f"NumReviews: {num_reviews}",
            f"Community: {community}",
        ]))

    node_trace = go.Scatter(
        x=node_x, y=node_y,
        mode='markers',
        hoverinfo='text',
        text=node_text,
        marker=dict(
            showscale=False,
            color=node_colors,
            size=node_sizes,
            line_width=2))

    layout = go.Layout(
        title=f'Co-Purchasing Network (Top Nodes in Community {largest_community})',
        showlegend=False,
        hovermode='closest',
        margin=dict(b=20, l=5, r=5, t=40),
        xaxis=dict(showgrid=False, zeroline=False),
        yaxis=dict(showgrid=False, zeroline=False))
    fig = go.Figure(data=[edge_trace, node_trace], layout=layout)

    fig.write_html('copurchase_graph_interactive.html')
    print("Interactive graph saved to copurchase_graph_interactive.html")

In [19]:
def perform_graph_analysis(interactive=False):
    """Build the co-purchase graph from the preprocessed CSVs and run every analysis step.

    Args:
        interactive: When True, write the interactive Plotly visualization;
            otherwise write the static matplotlib image.

    Raises:
        FileNotFoundError: If ``edges_file`` does not exist.

    Side effects: writes ``graph_metrics_file`` and ``influential_nodes_file``
    (the latter holds the centrality scores of all nodes), plus the plots
    produced by the individual steps. When ``products_file`` is missing the
    graph is built without product filtering or node attributes, and the
    category and sentiment analyses, which need product data, are skipped.
    """
    def _timed(announce, label, func, *args):
        """Announce a step, run it, report its duration and return its result."""
        print(announce)
        started = time.time()
        result = func(*args)
        print(f"{label} took {time.time() - started:.2f} seconds")
        return result

    if not os.path.exists(edges_file):
        raise FileNotFoundError(f"Edges file {edges_file} not found. Preprocessing must complete first.")
    # Read ASINs as strings so numeric-looking identifiers are not parsed as
    # integers and edge endpoints keep matching product ASINs.
    edges_df = pd.read_csv(edges_file, dtype={'SourceASIN': str, 'TargetASIN': str})

    products_df = None
    if os.path.exists(products_file):
        products_df = pd.read_csv(products_file, dtype={'ASIN': str})
        print(f"Loaded products data with {len(products_df)} records")

    G = _timed("Building co-purchasing graph...", "Graph construction",
               build_copurchase_graph, edges_df, products_df)

    if products_df is not None:
        _timed("Adding node attributes...", "Adding attributes",
               add_node_attributes, G, products_df)

    metrics, degrees = _timed("Analyzing graph structure...", "Structure analysis",
                              analyze_graph_structure, G)

    _timed("Plotting degree distribution...", "Degree plotting",
           plot_degree_distribution, degrees)

    centrality_df, influential_nodes = _timed("Identifying influential nodes...",
                                              "Influential nodes computation",
                                              find_influential_nodes, G)

    # The returned partition is not needed here: communities are stored on
    # the nodes of G.
    _timed("Detecting communities...", "Community detection", detect_communities, G)

    if products_df is not None:
        _timed("Analyzing by category...", "Category analysis",
               analyze_by_category, G, products_df)
        _timed("Analyzing sentiment and reviews...", "Sentiment analysis",
               analyze_sentiment_centrality, centrality_df, products_df)
    else:
        print(f"Skipping category and sentiment analysis: {products_file} not found.")

    metrics_df = pd.DataFrame([metrics])
    metrics_df.to_csv(graph_metrics_file, index=False)
    print(f"Graph metrics saved to {graph_metrics_file}")

    centrality_df.to_csv(influential_nodes_file, index=False)
    print(f"Influential nodes saved to {influential_nodes_file}")
    print("\nTop 10 Influential Nodes (by PageRank):")
    print(influential_nodes)

    visualize = visualize_graph_interactive if interactive else visualize_graph
    _timed("Visualizing graph...", "Graph visualization", visualize, G, products_df)

In [20]:
# --- Main Execution ---
if __name__ == "__main__":
    print("Part 1: Preprocessing Amazon Metadata")
    # No timer is started here: preprocess_with_cache() prints its own timings.
    # force_reprocess=True always rebuilds the CSVs, bypassing any cached files.
    products_enriched, categories_expanded, reviews_processed = preprocess_with_cache(force_reprocess=True)

    print("\nPart 2: Graph Construction and Analysis")
    start = time.time()
    perform_graph_analysis(interactive=True)
    print(f"Graph analysis took {time.time() - start:.2f} seconds")
    print("Graph analysis complete.")

Part 1: Preprocessing Amazon Metadata
Cached files not found or reprocessing forced. Running preprocessing...
Parsing Amazon metadata...
Skipped 5870 product rows due to missing required columns or invalid Group/Title.
Skipped 0 category rows due to missing or empty category paths.
Skipped 0 review rows due to missing required columns.
Skipped 557325 edge rows due to missing/empty ASINs or invalid products.
Parsing took 127.20 seconds
Converting and cleaning data...
Filtered out 0 product rows with invalid Group or Title in convert_data.
Converting took 184.15 seconds
Done. Cleaned data saved to products_enriched.csv, categories_expanded.csv, reviews_processed.csv, and edges.csv
Total preprocessing took 311.36 seconds

Part 2: Graph Construction and Analysis
Loaded products data with 542682 records
Building co-purchasing graph...
Graph constructed with 366987 nodes and 987903 edges
Graph construction took 69.76 seconds
Adding node attributes...
Adding attributes took 27.78 seconds
Anal