<a href="https://colab.research.google.com/github/FredSadeghi/Amazon_CoPurchase_Network_Analysis/blob/main/BigDataAmazon.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import gzip
import csv
import re
import pandas as pd
from textblob import TextBlob
import networkx as nx
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np
import os
import time

# Uninstall and reinstall python-louvain to ensure a clean installation
!pip uninstall -y python-louvain community
!pip install python-louvain

# Try importing the Louvain method. The python-louvain distribution installed
# above is imported under the module name `community`.
try:
    from community import best_partition as louvain_best_partition
    # Flag read by detect_communities() to pick the community-detection algorithm.
    USE_LOUVAIN = True
    print("Successfully imported python-louvain for community detection.")
except ImportError as e:
    # Not fatal: detect_communities() falls back to NetworkX when the flag is False.
    print(f"Failed to import python-louvain: {e}")
    print("Falling back to NetworkX's greedy_modularity_communities for community detection.")
    USE_LOUVAIN = False

Found existing installation: python-louvain 0.16
Uninstalling python-louvain-0.16:
  Successfully uninstalled python-louvain-0.16
[0mCollecting python-louvain
  Using cached python_louvain-0.16-py3-none-any.whl
Installing collected packages: python-louvain
Successfully installed python-louvain-0.16
Successfully imported python-louvain for community detection.


In [2]:
# Clone the project repository only when its directory is not already present,
# so re-running this cell is a no-op. The `input_file` path configured in the
# next cell points inside this directory.
if not os.path.exists('/content/Amazon_CoPurchase_Network_Analysis'):
    !git clone https://github.com/FredSadeghi/Amazon_CoPurchase_Network_Analysis.git

else:
    print("Repository already cloned. Skipping.")

Repository already cloned. Skipping.


In [3]:
# --- Part 1: Preprocessing ---
# Input and output file paths (module-level; read by parse_amazon_data() and convert_data())
# Gzip-compressed raw metadata dump, opened with gzip.open() in parse_amazon_data().
input_file = '/content/Amazon_CoPurchase_Network_Analysis/amazon-meta.txt.gz'
# Intermediate CSVs written by parse_amazon_data() and read back by convert_data().
product_output = 'products_cleaned.csv'
category_output = 'categories_cleaned.csv'
review_output = 'reviews_cleaned.csv'
# Weighted edge list written by parse_amazon_data(); same file name as `edges_file` in Part 2.
edge_output = 'edges.csv'

In [4]:
def _flush_product(current, product_writer, category_writer, review_writer, edge_counts):
    """Write one parsed product record to the CSV writers and tally its edges.

    Records that lack an 'Id' or an 'ASIN' are skipped. Each entry of the
    product's 'similar' list increments the count of the (sorted) ASIN pair in
    ``edge_counts``, which is mutated in place.
    """
    asin = current.get('ASIN')
    if not (asin and current.get('Id')):
        return

    product_writer.writerow([
        current['Id'],
        asin,
        current.get('title', 'Unknown'),
        current.get('group', 'Unknown'),
        current.get('salesrank', '-1'),
    ])
    for cat in current.get('categories', []):
        category_writer.writerow([asin, cat])
    for review in current.get('reviews', []):
        review_writer.writerow([
            asin, review['customer'], review['rating'],
            review['votes'], review['helpful'],
            compute_sentiment(review['rating']), review['date'],
        ])
    for similar_asin in current.get('similar', []):
        # Sort the pair so that A-B and B-A map to the same undirected edge.
        edge = tuple(sorted((asin, similar_asin)))
        edge_counts[edge] = edge_counts.get(edge, 0) + 1


def parse_amazon_data(input_path=None, product_path=None, category_path=None,
                      review_path=None, edge_path=None):
    """Parse raw Amazon metadata into separate CSV files for products, categories, reviews, and weighted edges.

    Args:
        input_path: Gzip-compressed metadata file to read. Defaults to the
            module-level ``input_file``.
        product_path: Destination CSV for product rows. Defaults to ``product_output``.
        category_path: Destination CSV for category paths. Defaults to ``category_output``.
        review_path: Destination CSV for review rows. Defaults to ``review_output``.
        edge_path: Destination CSV for weighted edges. Defaults to ``edge_output``.

    The input is scanned line by line. A line starting with "Id:" begins a new
    product and flushes the previous one. The edge weight is the number of
    product records that listed the (unordered) ASIN pair in their "similar:"
    line. Review lines with fewer than nine whitespace-separated tokens are
    skipped instead of raising IndexError.
    """
    input_path = input_file if input_path is None else input_path
    product_path = product_output if product_path is None else product_path
    category_path = category_output if category_path is None else category_path
    review_path = review_output if review_path is None else review_path
    edge_path = edge_output if edge_path is None else edge_path

    review_line = re.compile(r'\d{4}-\d{1,2}-\d{1,2}')
    # Scalar "key: value" fields; lines are stripped first, so a prefix test is
    # enough and cannot misfire on a category or title that merely contains the key.
    scalar_fields = ('title', 'group', 'salesrank')

    with gzip.open(input_path, 'rt', encoding='latin-1') as f, \
         open(product_path, 'w', newline='', encoding='utf-8') as prod_out, \
         open(category_path, 'w', newline='', encoding='utf-8') as cat_out, \
         open(review_path, 'w', newline='', encoding='utf-8') as rev_out, \
         open(edge_path, 'w', newline='', encoding='utf-8') as edge_out:

        product_writer = csv.writer(prod_out)
        category_writer = csv.writer(cat_out)
        review_writer = csv.writer(rev_out)
        edge_writer = csv.writer(edge_out)

        # Write headers
        product_writer.writerow(['Id', 'ASIN', 'Title', 'Group', 'SalesRank'])
        category_writer.writerow(['ASIN', 'CategoryPath'])
        review_writer.writerow(['ASIN', 'CustomerID', 'Rating', 'Votes', 'Helpful', 'Sentiment', 'Date'])
        edge_writer.writerow(['SourceASIN', 'TargetASIN', 'Weight'])

        # Start with empty containers so lines seen before the first "Id:" are harmless.
        current = {'categories': [], 'reviews': [], 'similar': []}
        edge_counts = {}  # (asin_a, asin_b) -> number of listings of that pair

        for line in f:
            line = line.strip()

            # New product entry: emit the one collected so far, then reset.
            if line.startswith("Id:"):
                _flush_product(current, product_writer, category_writer,
                               review_writer, edge_counts)
                current = {'categories': [], 'reviews': [], 'similar': []}
                current['Id'] = line.split('Id:')[1].strip()

            elif line.startswith("ASIN:"):
                current['ASIN'] = line.split("ASIN:")[1].strip()

            elif line.startswith(tuple(key + ':' for key in scalar_fields)):
                key, _, value = line.partition(':')
                current[key] = value.strip()

            elif line.startswith("similar:"):
                # parts[0] is the label and parts[1] the count; the ASINs follow.
                parts = line.split()
                current['similar'] = parts[2:] if len(parts) > 2 else []

            elif line.startswith("|"):
                current['categories'].append(line)

            elif review_line.match(line):
                parts = line.split()
                # The highest index read below is parts[8], so nine tokens are required.
                if len(parts) >= 9:
                    current['reviews'].append({
                        'date': parts[0],  # e.g., 2000-7-28
                        'customer': parts[2],
                        'rating': int(parts[4]),
                        'votes': int(parts[6]),
                        'helpful': int(parts[8]),
                    })

        # Write the last product (no trailing "Id:" line triggers its flush).
        _flush_product(current, product_writer, category_writer,
                       review_writer, edge_counts)

        # Write weighted edges
        for (source, target), weight in edge_counts.items():
            edge_writer.writerow([source, target, weight])

In [5]:
def compute_sentiment(rating):
    """Map a star rating onto a coarse sentiment score.

    No review text is available, so the rating stands in for sentiment:
    a rating of exactly 3 is neutral (0.0), ratings of 2 or below are
    negative (-1.0), and every other value is positive (1.0).
    """
    if rating == 3:
        return 0.0   # Neutral
    # Negative for 2 and below, positive otherwise.
    return -1.0 if rating <= 2 else 1.0

In [6]:
def convert_data(product_path=None, category_path=None, review_path=None, output_dir='.'):
    """Convert CSV files into cleaned, structured pandas DataFrames.

    Args:
        product_path: Products CSV to read. Defaults to the module-level ``product_output``.
        category_path: Categories CSV to read. Defaults to ``category_output``.
        review_path: Reviews CSV to read. Defaults to ``review_output``.
        output_dir: Directory that receives 'products_enriched.csv',
            'categories_expanded.csv' and 'reviews_processed.csv'.
            Defaults to the current working directory.

    Returns:
        Tuple ``(products_enriched, categories_expanded, reviews_df)``:
        products joined with per-ASIN review aggregates, one row per category
        level, and the review table as loaded.

    Identifier columns (ASIN, CustomerID) are read as strings. Without that,
    pandas may infer an integer dtype for all-digit ASINs and drop their
    leading zeros, which breaks joins against the edge list.
    """
    product_path = product_output if product_path is None else product_path
    category_path = category_output if category_path is None else category_path
    review_path = review_output if review_path is None else review_path

    products_df = pd.read_csv(product_path, dtype={'ASIN': str})
    categories_df = pd.read_csv(category_path, dtype={'ASIN': str})
    reviews_df = pd.read_csv(review_path, dtype={'ASIN': str, 'CustomerID': str})

    # Clean products DataFrame: non-numeric or missing ranks become -1.
    products_df['SalesRank'] = pd.to_numeric(products_df['SalesRank'], errors='coerce').fillna(-1).astype(int)
    products_df['Title'] = products_df['Title'].fillna('Unknown')
    products_df['Group'] = products_df['Group'].fillna('Unknown')

    # Parse categories
    def parse_category_path(cat_path):
        """Split a '|'-delimited path into level names, dropping '[digits]' id suffixes."""
        if pd.isna(cat_path):
            return []
        parts = cat_path.split("|")
        # Empty segments (e.g. from the leading '|') are discarded.
        return [re.sub(r"\[\d+\]", "", part).strip() for part in parts if part]

    categories_df['CategoryLevels'] = categories_df['CategoryPath'].apply(parse_category_path)
    # One output row per (ASIN, category level).
    categories_expanded = categories_df.explode('CategoryLevels')

    # Aggregate review metrics
    review_summary = reviews_df.groupby('ASIN').agg({
        'CustomerID': 'count',
        'Rating': 'mean',
        'Votes': 'sum',
        'Helpful': 'sum',
        'Sentiment': 'mean'
    }).rename(columns={
        'CustomerID': 'NumReviews',
        'Rating': 'AvgRating',
        'Votes': 'TotalVotes',
        'Helpful': 'TotalHelpful',
        'Sentiment': 'AvgSentiment'
    }).reset_index()

    # Join with products; products without any review get zeroed aggregates.
    products_enriched = products_df.merge(review_summary, on='ASIN', how='left')
    products_enriched = products_enriched.fillna({
        'NumReviews': 0, 'AvgRating': 0.0, 'TotalVotes': 0, 'TotalHelpful': 0, 'AvgSentiment': 0.0
    })

    # Save final cleaned data
    products_enriched.to_csv(os.path.join(output_dir, 'products_enriched.csv'), index=False)
    categories_expanded.to_csv(os.path.join(output_dir, 'categories_expanded.csv'), index=False)
    reviews_df.to_csv(os.path.join(output_dir, 'reviews_processed.csv'), index=False)

    return products_enriched, categories_expanded, reviews_df

In [7]:
# --- Part 2: Graph Construction & Analysis ---
# Output file paths (module-level; read by the Part 2 functions below)
# Inputs to perform_graph_analysis(): the edge list and the enriched product table.
# 'products_enriched.csv' is the file name convert_data() writes.
edges_file = 'edges.csv'
products_file = 'products_enriched.csv'
# CSV outputs written by perform_graph_analysis().
graph_metrics_file = 'graph_metrics.csv'
# NOTE: perform_graph_analysis() writes the full centrality table here, not only the top 10.
influential_nodes_file = 'influential_nodes.csv'
# Figure saved by plot_degree_distribution().
degree_dist_plot = 'degree_distribution.png'
# Figure saved by visualize_graph().
graph_plot = 'copurchase_graph.png'

In [8]:
def build_copurchase_graph(edges_df):
    """Construct an undirected graph with weighted edges from co-purchasing relationships.

    Args:
        edges_df: DataFrame with 'SourceASIN', 'TargetASIN' and 'Weight' columns.

    Returns:
        networkx.Graph whose edges carry the row's weight in the 'weight' attribute.

    Edges are added in one bulk call rather than row by row with
    ``DataFrame.iterrows``, which builds a Series object for every row and is
    the main cost on large edge lists.
    """
    G = nx.Graph()  # Use undirected graph
    # .tolist() yields plain Python scalars, matching what row-wise access stored.
    weighted_edges = zip(
        edges_df['SourceASIN'].tolist(),
        edges_df['TargetASIN'].tolist(),
        edges_df['Weight'].tolist(),
    )
    G.add_weighted_edges_from(weighted_edges, weight='weight')
    print(f"Graph constructed with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges")
    return G

In [9]:
def add_node_attributes(G, products_df):
    """Copy product columns onto the matching graph nodes.

    For every row of ``products_df`` whose 'ASIN' is a node of ``G``, the node
    receives 'Title', 'Group', 'SalesRank', 'AvgRating', 'NumReviews' and
    'AvgSentiment'. A column missing from the frame falls back to the default
    listed below. Does nothing when ``products_df`` is None.
    """
    if products_df is None:
        return

    # (attribute name, value used when the column is absent)
    attribute_defaults = (
        ('Title', 'Unknown'),
        ('Group', 'Unknown'),
        ('SalesRank', -1),
        ('AvgRating', 0.0),
        ('NumReviews', 0),
        ('AvgSentiment', 0.0),
    )

    for _index, record in products_df.iterrows():
        asin = record['ASIN']
        if asin not in G.nodes:
            continue  # product never appears in the edge list
        node_data = G.nodes[asin]
        for attribute, fallback in attribute_defaults:
            node_data[attribute] = record.get(attribute, fallback)

In [10]:
def analyze_graph_structure(G):
    """Compute and print basic structural metrics of the graph.

    Returns:
        Tuple ``(metrics, degrees)``: ``metrics`` is a dict with the keys
        num_nodes, num_edges, density, avg_clustering, num_components and
        avg_degree; ``degrees`` is the list of node degrees.
    """
    metrics = {}
    metrics['num_nodes'] = G.number_of_nodes()
    metrics['num_edges'] = G.number_of_edges()
    metrics['density'] = nx.density(G)
    metrics['avg_clustering'] = nx.average_clustering(G)
    # Connected components are defined for undirected graphs.
    metrics['num_components'] = nx.number_connected_components(G)

    degrees = [node_degree for _node, node_degree in G.degree()]
    if degrees:
        metrics['avg_degree'] = np.mean(degrees)
    else:
        metrics['avg_degree'] = 0

    print("Graph Metrics:")
    for key in metrics:
        value = metrics[key]
        print(f"{key}: {value}")

    return metrics, degrees

In [11]:
def plot_degree_distribution(degrees):
    """Plot the degree distribution with a power-law fit.

    Args:
        degrees: Iterable of node degrees.

    Draws a log-log scatter of degree against frequency, overlays a straight
    line fitted by least squares in log10 space when at least two positive
    degrees are present, and saves the figure to ``degree_dist_plot``.
    """
    plt.figure(figsize=(10, 6))
    counts = Counter(degrees)
    degree_values = np.array(list(counts.keys()))
    frequencies = np.array(list(counts.values()))
    plt.scatter(degree_values, frequencies, color='blue', alpha=0.5, label='Degree')

    # Fit power-law on strictly positive points only, to avoid log(0).
    mask = (degree_values > 0) & (frequencies > 0)
    log_degrees = np.log10(degree_values[mask])
    log_freq = np.log10(frequencies[mask])
    if len(log_degrees) > 1:  # Need at least 2 points to fit
        coeffs = np.polyfit(log_degrees, log_freq, 1)
        # Evaluate the fit on the same masked degrees (sorted so the line is drawn
        # left to right); using the unmasked array would take log10 of a zero degree.
        fit_degrees = np.sort(degree_values[mask])
        plt.plot(fit_degrees, 10**(coeffs[1] + coeffs[0] * np.log10(fit_degrees)), 'k--',
                 label=f'Power-law fit (γ={-coeffs[0]:.2f})')
    plt.xscale('log')
    plt.yscale('log')
    plt.xlabel('Degree')
    plt.ylabel('Frequency')
    plt.title('Degree Distribution of Co-Purchasing Network')
    plt.legend()
    plt.savefig(degree_dist_plot)
    plt.close()
    print(f"Degree distribution plot saved to {degree_dist_plot}")

In [12]:
def find_influential_nodes(G, k=100, seed=None):
    """Compute centrality metrics to identify influential nodes.

    Args:
        G: Graph whose edges carry a 'weight' attribute (used by PageRank).
        k: Number of source nodes sampled for the approximate betweenness
            computation. When the graph has ``k`` nodes or fewer, betweenness
            is computed over all nodes instead, since sampling more nodes
            than exist raises an error.
        seed: Optional random seed for the betweenness sample; pass a fixed
            value for reproducible results.

    Returns:
        Tuple ``(centrality_df, influential_nodes)``: one row per node with
        ASIN, DegreeCentrality, BetweennessCentrality and PageRank, and the
        10 rows with the highest PageRank.
    """
    degree_centrality = nx.degree_centrality(G)

    # Sample only when the graph is larger than the requested sample size.
    sample_size = k if (k is not None and k < G.number_of_nodes()) else None
    betweenness_centrality = nx.betweenness_centrality(G, k=sample_size, seed=seed)
    pagerank = nx.pagerank(G, weight='weight')  # Use weighted PageRank

    centrality_df = pd.DataFrame({
        'ASIN': list(degree_centrality.keys()),
        'DegreeCentrality': list(degree_centrality.values()),
        'BetweennessCentrality': [betweenness_centrality.get(node, 0) for node in degree_centrality],
        'PageRank': [pagerank.get(node, 0) for node in degree_centrality]
    })
    influential_nodes = centrality_df.sort_values(by='PageRank', ascending=False).head(10)

    return centrality_df, influential_nodes

In [13]:
def detect_communities(G):
    """Assign every node a community id and return the node -> id mapping.

    Uses python-louvain's ``best_partition`` when ``USE_LOUVAIN`` is set, and
    NetworkX's ``greedy_modularity_communities`` otherwise. The id is also
    stored on each node under the 'community' attribute.
    """
    if USE_LOUVAIN:
        # python-louvain already returns a node -> community id dict.
        partition = louvain_best_partition(G)
        num_communities = len(set(partition.values()))
        summary = f"Detected {num_communities} communities using Louvain method"
    else:
        # NetworkX returns node sets; number them in the order given.
        communities = list(nx.algorithms.community.greedy_modularity_communities(G))
        partition = {
            node: comm_id
            for comm_id, comm_nodes in enumerate(communities)
            for node in comm_nodes
        }
        num_communities = len(communities)
        summary = f"Detected {num_communities} communities using NetworkX greedy modularity"

    for node, comm in partition.items():
        G.nodes[node]['community'] = comm
    print(summary)
    return partition

In [14]:
def analyze_by_category(G, products_df):
    """Analyze edges within and between product groups.

    Args:
        G: Graph whose ``edges()`` yields (u, v) ASIN pairs.
        products_df: DataFrame with 'ASIN' and 'Group' columns, or None. With
            None (no product table available) every node is counted under the
            'Unknown' group, the same fallback used for ASINs missing from the table.

    Returns:
        Dict with two tallies: ``'within'`` maps a group to the number of
        edges joining two of its products, and ``'between'`` maps a sorted
        (group_a, group_b) tuple to the number of edges joining the two groups.
        Both tallies are also printed.
    """
    if products_df is None:
        asin_to_group = {}
    else:
        asin_to_group = dict(zip(products_df['ASIN'], products_df['Group']))

    group_edges = {'within': {}, 'between': {}}
    for u, v in G.edges():
        group_u = asin_to_group.get(u, 'Unknown')
        group_v = asin_to_group.get(v, 'Unknown')
        if group_u == group_v:
            group_edges['within'][group_u] = group_edges['within'].get(group_u, 0) + 1
        else:
            # Sort so (A, B) and (B, A) share one counter.
            edge = tuple(sorted([group_u, group_v]))
            group_edges['between'][edge] = group_edges['between'].get(edge, 0) + 1
    print("Edges within groups:", group_edges['within'])
    print("Edges between groups:", group_edges['between'])
    return group_edges

In [15]:
def analyze_sentiment_centrality(centrality_df, products_df):
    """Analyze correlation between centrality and sentiment/reviews.

    Args:
        centrality_df: DataFrame with 'ASIN' and 'PageRank' columns.
        products_df: DataFrame with 'ASIN', 'AvgSentiment' and 'NumReviews'
            columns, or None when no product table is available.

    Returns:
        Tuple ``(corr_pagerank_sentiment, corr_pagerank_reviews)`` computed
        with ``Series.corr`` over the rows whose ASIN is present in both
        frames. Both values are NaN when ``products_df`` is None.
    """
    if products_df is None:
        # Nothing to correlate against; report it instead of failing on None[...].
        print("No product data available; skipping sentiment/centrality correlation.")
        return float('nan'), float('nan')

    merged_df = centrality_df.merge(products_df[['ASIN', 'AvgSentiment', 'NumReviews']], on='ASIN')
    corr_pagerank_sentiment = merged_df['PageRank'].corr(merged_df['AvgSentiment'])
    corr_pagerank_reviews = merged_df['PageRank'].corr(merged_df['NumReviews'])
    print(f"Correlation between PageRank and AvgSentiment: {corr_pagerank_sentiment:.3f}")
    print(f"Correlation between PageRank and NumReviews: {corr_pagerank_reviews:.3f}")
    return corr_pagerank_sentiment, corr_pagerank_reviews

In [16]:
def visualize_graph(G):
    """Draw the subgraph induced by the 50 highest-degree nodes and save it.

    Nodes are coloured by their 'Group' attribute ('Unknown' when absent) and
    sized by their degree inside the subgraph; edge width follows the 'weight'
    attribute. The figure is written to ``graph_plot``.
    """
    ranked_by_degree = sorted(G.degree(), key=lambda pair: pair[1], reverse=True)
    hub_nodes = [name for name, _degree in ranked_by_degree[:50]]
    subgraph = G.subgraph(hub_nodes)

    plt.figure(figsize=(12, 8))
    layout = nx.spring_layout(subgraph)

    # One colour per distinct group present in the subgraph.
    node_groups = []
    for name in subgraph.nodes():
        node_groups.append(subgraph.nodes[name].get('Group', 'Unknown'))
    distinct_groups = list(set(node_groups))
    palette = plt.cm.tab10(np.linspace(0, 1, len(distinct_groups)))
    colour_for_group = dict(zip(distinct_groups, palette))
    node_colours = [colour_for_group[label] for label in node_groups]

    # Marker area scales with the degree inside the subgraph.
    node_sizes = [10 * subgraph.degree(name) for name in subgraph.nodes()]
    # Line width follows the stored edge weight.
    line_widths = [subgraph[a][b]['weight'] for a, b in subgraph.edges()]

    nx.draw(
        subgraph,
        layout,
        node_size=node_sizes,
        node_color=node_colours,
        edge_color='gray',
        width=line_widths,
        with_labels=True,
        font_size=8,
    )
    plt.title('Co-Purchasing Network (Sampled Subgraph)')
    plt.savefig(graph_plot)
    plt.close()
    print(f"Graph visualization saved to {graph_plot}")

In [17]:
def _run_timed(start_message, label, func, *args):
    """Print ``start_message``, call ``func(*args)``, report the elapsed time under ``label`` and return the result."""
    print(start_message)
    start = time.time()
    result = func(*args)
    print(f"{label} took {time.time() - start:.2f} seconds")
    return result


def perform_graph_analysis():
    """Execute graph construction and analysis using preprocessed data.

    Reads ``edges_file`` (required) and ``products_file`` (optional), builds
    the co-purchase graph, runs each analysis step with a timing report, and
    writes ``graph_metrics_file`` and ``influential_nodes_file``. When the
    products file is absent, the steps that need product data (node
    attributes, category analysis, sentiment correlation) are skipped.

    Raises:
        FileNotFoundError: if ``edges_file`` does not exist.
    """
    if not os.path.exists(edges_file):
        raise FileNotFoundError(f"Edges file {edges_file} not found. Preprocessing must complete first.")
    # Read ASINs as strings so all-digit identifiers keep their leading zeros
    # and match between the edge list and the product table.
    edges_df = pd.read_csv(edges_file, dtype={'SourceASIN': str, 'TargetASIN': str})

    products_df = None
    if os.path.exists(products_file):
        products_df = pd.read_csv(products_file, dtype={'ASIN': str})
        print(f"Loaded products data with {len(products_df)} records")

    G = _run_timed("Building co-purchasing graph...", "Graph construction",
                   build_copurchase_graph, edges_df)

    if products_df is not None:
        _run_timed("Adding node attributes...", "Adding attributes",
                   add_node_attributes, G, products_df)

    metrics, degrees = _run_timed("Analyzing graph structure...", "Structure analysis",
                                  analyze_graph_structure, G)

    _run_timed("Plotting degree distribution...", "Degree plotting",
               plot_degree_distribution, degrees)

    centrality_df, influential_nodes = _run_timed(
        "Identifying influential nodes...", "Influential nodes computation",
        find_influential_nodes, G)

    # Called for its side effect of tagging each node with a 'community' attribute.
    _run_timed("Detecting communities...", "Community detection",
               detect_communities, G)

    if products_df is not None:
        _run_timed("Analyzing by category...", "Category analysis",
                   analyze_by_category, G, products_df)
        _run_timed("Analyzing sentiment and reviews...", "Sentiment analysis",
                   analyze_sentiment_centrality, centrality_df, products_df)
    else:
        print(f"Products file {products_file} not found; skipping category and sentiment analysis.")

    metrics_df = pd.DataFrame([metrics])
    metrics_df.to_csv(graph_metrics_file, index=False)
    print(f"Graph metrics saved to {graph_metrics_file}")

    # The full centrality table is saved; only the top 10 rows are printed below.
    centrality_df.to_csv(influential_nodes_file, index=False)
    print(f"Influential nodes saved to {influential_nodes_file}")
    print("\nTop 10 Influential Nodes (by PageRank):")
    print(influential_nodes)

    _run_timed("Visualizing graph...", "Graph visualization",
               visualize_graph, G)

In [18]:
# --- Main Execution ---
# Runs the whole pipeline: Part 1 turns the raw metadata into CSV files,
# Part 2 builds the graph from those files and analyses it.
if __name__ == "__main__":
    print("Part 1: Preprocessing Amazon Metadata")
    start = time.time()  # start of Part 1 as a whole
    print("Parsing Amazon metadata...")
    parse_start = time.time()
    parse_amazon_data()
    print(f"Parsing took {time.time() - parse_start:.2f} seconds")

    print("Converting and cleaning data...")
    convert_start = time.time()
    # The returned frames are kept as globals; Part 2 re-reads its inputs from disk.
    products_enriched, categories_expanded, reviews_processed = convert_data()
    print(f"Converting took {time.time() - convert_start:.2f} seconds")
    print("Done. Cleaned data saved to products_enriched.csv, categories_expanded.csv, reviews_processed.csv, and edges.csv")
    print(f"Total preprocessing took {time.time() - start:.2f} seconds")

    print("\nPart 2: Graph Construction and Analysis")
    start = time.time()  # reused: now marks the start of Part 2
    perform_graph_analysis()
    print(f"Graph analysis took {time.time() - start:.2f} seconds")
    print("Graph analysis complete.")

Part 1: Preprocessing Amazon Metadata
Parsing Amazon metadata...
Parsing took 116.80 seconds
Converting and cleaning data...
Converting took 182.06 seconds
Done. Cleaned data saved to products_enriched.csv, categories_expanded.csv, reviews_processed.csv, and edges.csv
Total preprocessing took 298.86 seconds

Part 2: Graph Construction and Analysis
Loaded products data with 548552 records
Building co-purchasing graph...
Graph constructed with 554789 nodes and 1545228 edges
Graph construction took 101.43 seconds
Adding node attributes...
Adding attributes took 40.60 seconds
Analyzing graph structure...
Graph Metrics:
num_nodes: 554789
num_edges: 1545228
density: 1.0040784922418376e-05
avg_clustering: 0.3013179862039754
num_components: 5614
avg_degree: 5.570506985538646
Structure analysis took 26.49 seconds
Plotting degree distribution...
Degree distribution plot saved to degree_distribution.png
Degree plotting took 0.67 seconds
Identifying influential nodes...
Influential nodes computati