<a href="https://colab.research.google.com/github/FredSadeghi/Amazon_CoPurchase_Network_Analysis/blob/main/BigDataAmazon.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [24]:
import gzip
import csv
import re
import pandas as pd
from textblob import TextBlob  # For potential NLP if review text is available
import os # Import the os module

In [25]:
if not os.path.exists('/content/Amazon_CoPurchase_Network_Analysis'):
    !git clone https://github.com/FredSadeghi/Amazon_CoPurchase_Network_Analysis.git
else:
    print("Repository already cloned. Skipping.")

Repository already cloned. Skipping.


In [26]:
# Gzip-compressed raw metadata file that parse_amazon_data() reads.
input_file = '/content/Amazon_CoPurchase_Network_Analysis/amazon-meta.txt.gz'

# Intermediate CSVs written by parse_amazon_data(), in this order:
# products, categories, reviews, and product-to-similar-product edges.
# convert_data() reads the first three back in.
product_output, category_output, review_output, edge_output = (
    'products_cleaned.csv',
    'categories_cleaned.csv',
    'reviews_cleaned.csv',
    'edges.csv',
)

In [27]:
def _new_product_record(product_id=None):
    """Return an empty accumulator for one product entry.

    The list-valued keys are always present so that category, review and
    similar-product lines can be appended without a prior existence check.
    """
    record = {'categories': [], 'reviews': [], 'similar': []}
    if product_id is not None:
        record['Id'] = product_id
    return record


def _write_product_record(record, product_writer, category_writer, review_writer, edge_writer):
    """Write one accumulated product to the four CSV writers.

    Records lacking either an 'Id' or an 'ASIN' are silently skipped; this also
    covers the empty placeholder record that exists before the first entry.
    """
    if not (record.get('ASIN') and record.get('Id')):
        return

    asin = record['ASIN']
    product_writer.writerow([
        record['Id'],
        asin,
        record.get('title', 'Unknown'),   # Handle missing title
        record.get('group', 'Unknown'),   # Handle missing group
        record.get('salesrank', '-1'),    # Handle missing salesrank
    ])
    for category_path in record['categories']:
        category_writer.writerow([asin, category_path])
    for review in record['reviews']:
        review_writer.writerow([
            asin, review['customer'], review['rating'],
            review['votes'], review['helpful'],
            compute_sentiment(review['rating']),
        ])
    for similar_asin in record['similar']:
        edge_writer.writerow([asin, similar_asin])


def parse_amazon_data():
    """Parse raw Amazon metadata into separate CSV files for products, categories, reviews, and edges.

    Reads the gzip-compressed text file at ``input_file`` (decoded as latin-1)
    line by line and writes four UTF-8 CSV files, each with a header row:

    * ``product_output``  -- Id, ASIN, Title, Group, SalesRank
    * ``category_output`` -- ASIN, CategoryPath (one row per ``|``-prefixed line)
    * ``review_output``   -- ASIN, CustomerID, Rating, Votes, Helpful, Sentiment
    * ``edge_output``     -- SourceASIN, TargetASIN (one row per similar product)

    A line starting with ``Id:`` opens a new product; the previous product is
    written out at that point, and the final product is written after the loop.
    Every line is stripped before matching, so field labels are tested as
    prefixes of the stripped line.

    Returns:
        None. The function is used for its file-writing side effects.
    """
    with gzip.open(input_file, 'rt', encoding='latin-1') as f, \
         open(product_output, 'w', newline='', encoding='utf-8') as prod_out, \
         open(category_output, 'w', newline='', encoding='utf-8') as cat_out, \
         open(review_output, 'w', newline='', encoding='utf-8') as rev_out, \
         open(edge_output, 'w', newline='', encoding='utf-8') as edge_out:

        product_writer = csv.writer(prod_out)
        category_writer = csv.writer(cat_out)
        review_writer = csv.writer(rev_out)
        edge_writer = csv.writer(edge_out)
        writers = (product_writer, category_writer, review_writer, edge_writer)

        # Write headers
        product_writer.writerow(['Id', 'ASIN', 'Title', 'Group', 'SalesRank'])
        category_writer.writerow(['ASIN', 'CategoryPath'])
        review_writer.writerow(['ASIN', 'CustomerID', 'Rating', 'Votes', 'Helpful', 'Sentiment'])
        edge_writer.writerow(['SourceASIN', 'TargetASIN'])

        # Placeholder record: lines seen before the first "Id:" are accumulated
        # here and never written, because the record has no Id/ASIN.
        current = _new_product_record()

        for line in f:
            line = line.strip()

            if line.startswith("Id:"):
                # A new entry begins: flush the one collected so far.
                _write_product_record(current, *writers)
                current = _new_product_record(line[len("Id:"):].strip())

            elif line.startswith("ASIN:"):
                current['ASIN'] = line[len("ASIN:"):].strip()

            # Prefix tests (not substring tests) so that a category path or any
            # other line that merely contains "title:", "group:" or
            # "salesrank:" is not mistaken for that field.
            elif line.startswith("title:"):
                current['title'] = line[len("title:"):].strip()

            elif line.startswith("group:"):
                current['group'] = line[len("group:"):].strip()

            elif line.startswith("salesrank:"):
                current['salesrank'] = line[len("salesrank:"):].strip()

            elif line.startswith("similar:"):
                # Token 0 is the label and token 1 is skipped; the rest are ASINs.
                current['similar'] = line.split()[2:]

            elif line.startswith("|"):
                current['categories'].append(line)

            elif re.match(r'\d{4}-\d{1,2}-\d{1,2}', line):  # Match review date
                parts = line.split()
                # Values are read from token positions 2, 4, 6 and 8, so at
                # least nine tokens are required; shorter lines are skipped.
                if len(parts) >= 9:
                    current['reviews'].append({
                        'customer': parts[2],
                        'rating': int(parts[4]),
                        'votes': int(parts[6]),
                        'helpful': int(parts[8]),
                    })

        # Write the last product
        _write_product_record(current, *writers)

In [28]:
def compute_sentiment(rating):
    """Derive a stand-in sentiment score from a rating (no review text is used).

    Args:
        rating: Numeric rating to classify.

    Returns:
        -1.0 when ``rating <= 2``, 0.0 when ``rating == 3``, and 1.0 for every
        other value.
    """
    negative, neutral, positive = -1.0, 0.0, 1.0
    if rating <= 2:
        return negative
    return neutral if rating == 3 else positive

In [29]:
def convert_data(*, products_path=None, categories_path=None, reviews_path=None, output_dir=None):
    """Convert CSV files into cleaned, structured pandas DataFrames.

    Args:
        products_path: Products CSV to read. Defaults to ``product_output``.
        categories_path: Categories CSV to read. Defaults to ``category_output``.
        reviews_path: Reviews CSV to read. Defaults to ``review_output``.
        output_dir: Directory for the three result CSVs. When omitted, the
            files are written under their bare names (current directory).

    All arguments are keyword-only; calling ``convert_data()`` with none
    reads the module-level paths and writes to the current directory.

    Returns:
        A tuple ``(products_enriched, categories_expanded, reviews_df)``:

        * ``products_enriched`` -- products joined (left) with per-ASIN review
          aggregates ``NumReviews``, ``AvgRating``, ``TotalVotes``,
          ``TotalHelpful`` and ``AvgSentiment``; products without reviews get 0.
        * ``categories_expanded`` -- one row per category level, with the
          bracketed numeric id removed from each level name.
        * ``reviews_df`` -- the reviews table as loaded.

    Side effects:
        Writes ``products_enriched.csv``, ``categories_expanded.csv`` and
        ``reviews_processed.csv``.
    """
    products_path = products_path or product_output
    categories_path = categories_path or category_output
    reviews_path = reviews_path or review_output

    def output_path(file_name):
        return os.path.join(output_dir, file_name) if output_dir else file_name

    # Load CSVs. Identifier and text columns are read as strings: with dtype
    # inference an all-digit ASIN such as "0827229534" would be parsed as an
    # integer and lose its leading zero, corrupting the join key.
    products_df = pd.read_csv(products_path, dtype={'ASIN': str, 'Title': str, 'Group': str})
    categories_df = pd.read_csv(categories_path, dtype={'ASIN': str, 'CategoryPath': str})
    reviews_df = pd.read_csv(reviews_path, dtype={'ASIN': str, 'CustomerID': str})

    # Clean products DataFrame
    products_df['SalesRank'] = pd.to_numeric(products_df['SalesRank'], errors='coerce').fillna(-1).astype(int)
    products_df['Title'] = products_df['Title'].fillna('Unknown')
    products_df['Group'] = products_df['Group'].fillna('Unknown')

    # Parse categories
    def parse_category_path(cat_path):
        if pd.isna(cat_path):
            return []
        parts = cat_path.split("|")
        # Drop empty segments (the path starts with "|") and strip "[digits]" ids.
        return [re.sub(r"\[\d+\]", "", part).strip() for part in parts if part]

    categories_df['CategoryLevels'] = categories_df['CategoryPath'].apply(parse_category_path)
    categories_expanded = categories_df.explode('CategoryLevels')

    # Aggregate review metrics
    review_summary = reviews_df.groupby('ASIN').agg(
        NumReviews=('CustomerID', 'count'),
        AvgRating=('Rating', 'mean'),
        TotalVotes=('Votes', 'sum'),
        TotalHelpful=('Helpful', 'sum'),
        AvgSentiment=('Sentiment', 'mean'),
    ).reset_index()

    # Join with products
    products_enriched = products_df.merge(review_summary, on='ASIN', how='left')
    products_enriched = products_enriched.fillna({
        'NumReviews': 0, 'AvgRating': 0.0, 'TotalVotes': 0, 'TotalHelpful': 0, 'AvgSentiment': 0.0
    })
    # The left join introduces NaN for products without reviews, which turns
    # the count columns into floats; restore integer counts after filling.
    products_enriched = products_enriched.astype({
        'NumReviews': int, 'TotalVotes': int, 'TotalHelpful': int
    })

    # Save final cleaned data
    products_enriched.to_csv(output_path('products_enriched.csv'), index=False)
    categories_expanded.to_csv(output_path('categories_expanded.csv'), index=False)
    reviews_df.to_csv(output_path('reviews_processed.csv'), index=False)

    return products_enriched, categories_expanded, reviews_df

In [30]:
# Execute preprocessing
# Stage 1: read the raw metadata file and write the four intermediate CSVs
# (products, categories, reviews, edges). This must run before stage 2,
# because convert_data() reads the CSV files this call writes.
print("Parsing Amazon metadata...")
parse_amazon_data()
# Stage 2: load the intermediate CSVs, clean them, aggregate review metrics per
# ASIN, and write the final CSVs. The three returned DataFrames are kept as
# notebook-level variables for later cells.
print("Converting and cleaning data...")
products_enriched, categories_expanded, reviews_processed = convert_data()
# edges.csv comes from parse_amazon_data(); convert_data() neither reads nor writes it.
print("Done. Cleaned data saved to products_enriched.csv, categories_expanded.csv, reviews_processed.csv, and edges.csv")

Parsing Amazon metadata...
Converting and cleaning data...
Done. Cleaned data saved to products_enriched.csv, categories_expanded.csv, reviews_processed.csv, and edges.csv
