<a href="https://colab.research.google.com/github/Annieshilpha07/Product_Recommendation/blob/main/Product_Dataset_generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Convert Amazon Fashion JSONL Datasets to a Combined CSV with Selected Columns

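This notebook parses the Amazon Fashion review and metadata JSONL files, keeps a selected set of columns from each, and inner-joins them on the **parent_asin** column (restricted to at most 100,000 product IDs that appear in both files). The result links each user review to its product details and is saved as a single combined CSV for analysis.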


In [None]:
!pip install jsonlines

Collecting jsonlines
  Downloading jsonlines-4.0.0-py3-none-any.whl.metadata (1.6 kB)
Downloading jsonlines-4.0.0-py3-none-any.whl (8.7 kB)
Installing collected packages: jsonlines
Successfully installed jsonlines-4.0.0


In [None]:
import json
import jsonlines
import pandas as pd


def parse_jsonl(path):
    """Load a JSON Lines file into a DataFrame, one row per decoded line.

    Parameters
    ----------
    path : str or path-like
        Location of the JSONL file; each non-blank line is decoded on its own.

    Returns
    -------
    pandas.DataFrame
        Built from the successfully decoded lines, in file order.

    Notes
    -----
    A line that is not valid JSON is reported on stdout and skipped, so one
    damaged record does not abort the whole load. Blank lines carry no record
    and are skipped without a message.
    """
    records = []
    # Read explicitly as UTF-8 so the result does not depend on the platform's
    # default text encoding.
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # Empty/whitespace-only line: nothing to decode, not an error.
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as e:
                print(f"Skipping line due to JSON error: {e}")
    return pd.DataFrame(records)


def _keep_available(df, wanted, label):
    """Return `df` restricted to the `wanted` columns it actually has.

    Requested columns that are absent are listed on stdout instead of being
    dropped silently, so a misnamed column is visible in the cell output.
    """
    absent = [col for col in wanted if col not in df.columns]
    if absent:
        print(f"Note: {label} columns not found and skipped: {absent}")
    return df[[col for col in wanted if col in df.columns]]


# Upper bound on how many shared product IDs are carried into the merge.
MAX_PARENT_ASINS = 100000

# ----------- 1. Parse Fashion Reviews ----------- #
review_path = "/Amazon_Fashion.jsonl"
review_df = parse_jsonl(review_path)

# Keep only necessary review columns.
# The review body is requested under two names: the recorded run of this
# notebook produced 19 merged columns, i.e. only 8 of the 9 review columns
# originally requested existed -- presumably 'reviewText' was the absent one
# (the 2023 release of the dataset appears to call it 'text'; verify against
# your file). Whichever of the two names is absent is reported and skipped.
review_columns = [
    'rating', 'title', 'text', 'reviewText', 'asin', 'parent_asin',
    'user_id', 'timestamp', 'helpful_vote', 'verified_purchase'
]
review_df = _keep_available(review_df, review_columns, "review")

# ----------- 2. Parse Fashion Metadata ----------- #
meta_path = "/meta_Amazon_Fashion.jsonl"
meta_df = parse_jsonl(meta_path)

# Keep only necessary metadata columns
meta_columns = [
    'main_category', 'title', 'average_rating', 'rating_number', 'features',
    'description', 'price', 'images', 'store', 'categories',
    'details', 'parent_asin'
]
meta_df = _keep_available(meta_df, meta_columns, "metadata")

# ----------- 3. Filter Common `parent_asin` IDs ----------- #
# Missing IDs are dropped first so a NaN can never count as a "shared" key.
common_parent_asins = set(review_df['parent_asin'].dropna()).intersection(
    meta_df['parent_asin'].dropna()
)
# Slice a sorted list rather than the raw set: the iteration order of a set of
# strings can differ between interpreter runs, which would make the selected
# subset (and therefore the CSV) change from run to run.
limited_parent_asins = sorted(common_parent_asins, key=str)[:MAX_PARENT_ASINS]

# Filter both DataFrames by limited parent_asins
review_filtered = review_df[review_df['parent_asin'].isin(limited_parent_asins)]
meta_filtered = meta_df[meta_df['parent_asin'].isin(limited_parent_asins)]

# ----------- 4. Merge Both on `parent_asin` ----------- #
# Both frames carry a 'title' column; with pandas' default suffixes the merged
# frame exposes them as 'title_x' (review title) and 'title_y' (product title).
combined_df = pd.merge(review_filtered, meta_filtered, on='parent_asin', how='inner')

# Save final combined DataFrame
combined_df.to_csv("fashion_combined_by_parent_asin.csv", index=False)
print("✅ Combined CSV saved:", combined_df.shape)


Skipping line due to JSON error: Unterminated string starting at: line 1 column 177 (char 176)
Skipping line due to JSON error: Unterminated string starting at: line 1 column 46 (char 45)
✅ Combined CSV saved: (224987, 19)


In [None]:
# Offer the merged CSV as a browser download. The google.colab package only
# exists inside a Colab runtime, so outside Colab the download is skipped
# (the CSV stays on disk) instead of failing the whole run with an ImportError.
try:
    from google.colab import files
except ImportError:
    print("Not running in Colab; CSV left at fashion_combined_by_parent_asin.csv")
else:
    files.download("fashion_combined_by_parent_asin.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>