In [1]:
# Download the dataset (~4 GB) and extract it into the ./datasets folder
import requests
import os
import zipfile

# Fetch the REGEN archive and unpack it under ./datasets, unless a
# datasets/REGEN directory already exists (in which case nothing is downloaded).
os.makedirs('datasets', exist_ok=True)

url = "https://www.kaggle.com/api/v1/datasets/download/googleai/regen-reviews-enhanced-with-generative-narratives"

regen_path = os.path.join('datasets', 'REGEN')
if os.path.exists(regen_path):
    print("REGEN folder found, please delete or rename the directory. If you already have the dataset, please ignore this message.")
else:
    zip_path = os.path.join('datasets', 'archive.zip')
    # stream=True keeps the multi-gigabyte archive out of memory: the body is
    # written to disk chunk by chunk instead of being buffered in response.content.
    # The timeout prevents the cell from hanging forever on a stalled connection.
    with requests.get(url, stream=True, timeout=60) as response:
        if response.status_code == 200:
            try:
                with open(zip_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=1024 * 1024):
                        f.write(chunk)

                with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                    zip_ref.extractall('datasets')
            finally:
                # Never leave a partial or already-extracted archive behind,
                # even when the download or the extraction fails midway.
                if os.path.exists(zip_path):
                    os.remove(zip_path)
            print("Dataset downloaded and extracted to ./datasets")
        else:
            print(f"Failed to download. Status code: {response.status_code}")

REGEN folder found, please delete or rename the directory. If you already have the dataset, please ignore this message.


In [2]:
import pandas as pd
import numpy as np
import json

In [3]:
def _clean_text(value):
    """Return `value` reduced to alphanumerics separated by single spaces.

    Every character that is neither alphanumeric nor whitespace is removed
    and runs of whitespace (including newlines) collapse to one space.
    Non-string input and strings that end up empty yield NaN.
    """
    if not isinstance(value, str):
        return np.nan
    kept = ''.join(ch for ch in value if ch.isalnum() or ch.isspace())
    return ' '.join(kept.split()) or np.nan


def _to_float(value):
    """Convert `value` to float; None or unparseable input yields NaN."""
    if value is None:
        return np.nan
    try:
        return float(value)
    except (TypeError, ValueError):
        return np.nan


def _flatten_record(json_obj):
    """Flatten one parsed JSONL record into a dict with dotted column names."""
    flattened = {'properties.reviewer_id': json_obj.get('reviewer_id', np.nan)}

    # One group of columns per purchase, numbered from 1.
    for idx, purchase in enumerate(json_obj.get('purchase_history') or [], start=1):
        prefix = f'properties.purchase_history.{idx}.'
        item = purchase.get('item') or {}
        review = purchase.get('review') or {}

        # Item details
        flattened[f'{prefix}item.asin'] = item.get('asin', np.nan)
        flattened[f'{prefix}item.title'] = _clean_text(item.get('title'))
        flattened[f'{prefix}item.category'] = _clean_text(item.get('category'))
        flattened[f'{prefix}item.description'] = _clean_text(item.get('description'))
        flattened[f'{prefix}item.price'] = _to_float(item.get('price'))

        # Review details
        flattened[f'{prefix}review.summary'] = _clean_text(review.get('summary'))
        flattened[f'{prefix}review.rating'] = _to_float(review.get('rating'))
        flattened[f'{prefix}review.text'] = _clean_text(review.get('text'))
        flattened[f'{prefix}review.timestamp'] = _to_float(review.get('unix_time'))

    # Remaining top-level fields are kept under the same 'properties.' prefix.
    for key, value in json_obj.items():
        if key in ('purchase_history', 'reviewer_id'):
            continue
        if isinstance(value, str):
            value = _clean_text(value)
        elif isinstance(value, (int, float)):
            # A numeric zero is valid data and must not be turned into NaN.
            value = float(value)
        elif not value:
            # None and empty containers carry no information.
            value = np.nan
        flattened[f'properties.{key}'] = value

    return flattened


def read_large_jsonl(file_path, lines=1000):
    """Read up to `lines` lines of a JSONL file into a flat, filtered DataFrame.

    Each record is flattened into columns named
    ``properties.purchase_history.<n>.item.*`` / ``.review.*`` plus
    ``properties.<field>`` for every other top-level field. Text values are
    stripped of non-alphanumeric characters; numeric values become floats.
    Rows in which any non-numeric column contains ``();``, ``www``, ``html``
    or ``javascript`` (case-insensitive) are dropped.

    Parameters
    ----------
    file_path : str
        Path to a UTF-8 encoded JSON-lines file.
    lines : int, default 1000
        Maximum number of physical lines to read (blank lines count towards
        the limit but are skipped).

    Returns
    -------
    pandas.DataFrame
        One row per retained record; the original row index is preserved.
        An empty DataFrame is returned when no record was read.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    records = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if i >= lines:
                break
            line = line.strip()
            if not line:
                continue
            records.append(_flatten_record(json.loads(line)))

    # Build the frame once from all records instead of concatenating one
    # single-row frame per record; nested dict values are still flattened.
    df = pd.json_normalize(records)
    if df.empty:
        return df

    # A single combined pattern marks every unwanted row in one pass per
    # column, with a mask that is always aligned to the unfiltered frame.
    banned_words = ['www', 'html', 'javascript']
    pattern = r'\(\);|' + '|'.join(banned_words)

    # Everything that is not numeric is treated as text, so string columns
    # are covered whether pandas stores them as object or as a string dtype.
    text_columns = df.select_dtypes(exclude=['number']).columns
    mask = pd.Series(False, index=df.index)
    for col in text_columns:
        mask = mask | df[col].astype(str).str.contains(pattern, case=False, na=False)

    return df[~mask]

def _format_example(value):
    """Render one example cell value as text for the stats report."""
    if isinstance(value, (dict, list)):
        return str(value)
    if isinstance(value, str):
        return value
    try:
        return f"{float(value):.1f}"
    except (TypeError, ValueError):
        # Values that cannot be read as a number (tuples, timestamps, ...)
        # are reported with their plain string form instead of crashing.
        return str(value)


def export_dataframe_stats(df, output_file):
    """Write a plain-text summary of `df` to `output_file`.

    The report contains, in order: the frame's shape, the column names
    (three per line inside brackets), the ``describe()`` statistics in
    groups of three columns, and for every column its dtype together with
    the first non-null value as an example.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. A frame without columns produces a report with
        empty column, statistics and example sections.
    output_file : str
        Destination path; the file is overwritten (UTF-8, unencodable
        characters replaced).

    Notes
    -----
    Sets the global pandas option ``display.float_format`` to three
    decimals. The option stays in effect after the call, so later printed
    frames in the same session use the same format.
    """
    with open(output_file, 'w', encoding='utf-8', errors='replace') as f:
        # Write shape
        f.write(f"Shape: {df.shape}\n\n")

        # Write columns, 3 per line, keeping brackets
        columns = list(df.columns)
        f.write("[\n")
        for i in range(0, len(columns), 3):
            line = "    " + ", ".join(f"'{col}'" for col in columns[i:i+3])
            if i + 3 < len(columns):
                line += ","
            f.write(line + "\n")
        f.write("]\n\n")

        # Write statistics in groups of 3 columns
        f.write("Statistics:\n")
        # describe() raises ValueError on a frame without columns.
        desc = df.describe() if columns else pd.DataFrame()
        pd.set_option('display.float_format', lambda x: '%.3f' % x)

        for i in range(0, len(desc.columns), 3):
            f.write(desc.iloc[:, i:i+3].to_string())
            f.write("\n\n")

        f.write("\n")

        # Write column info and first non-null examples
        f.write("Column Examples:\n\n")
        for col in df.columns:
            f.write(f"{col}: {df[col].dtype}\n")
            non_null_values = df[col].dropna()
            if len(non_null_values) > 0:
                f.write(f"Example: {_format_example(non_null_values.iloc[0])}\n")
            else:
                f.write("Example: No non-null values found\n")
            f.write("\n")

# Load at most 150000 lines of the clothing split and discard the reviewer id
# column straight away.
df = read_large_jsonl("datasets/REGEN/clothing.jsonl", lines=150000).drop(
    columns=['properties.reviewer_id']
)

# Persist the text report and the cleaned table next to the source data.
export_dataframe_stats(df, "datasets/REGEN/clothing_described.txt")
df.to_csv("datasets/REGEN/clothing_cleaned.csv", index=False)

# Quick console overview; the trailing expression renders the first rows.
print(f"Shape: {df.shape}", df.describe(), df.columns, sep="\n")
df.head()

Shape: (144910, 468)
       properties.purchase_history.1.item.price  \
count                                144910.000   
mean                                     28.454   
std                                      45.269   
min                                       0.000   
25%                                       0.000   
50%                                      12.990   
75%                                      37.964   
max                                     999.990   

       properties.purchase_history.1.review.rating  \
count                                   144910.000   
mean                                         4.241   
std                                          1.157   
min                                          1.000   
25%                                          4.000   
50%                                          5.000   
75%                                          5.000   
max                                          5.000   

       properties.purchase_histo

Unnamed: 0,properties.purchase_history.1.item.asin,properties.purchase_history.1.item.title,properties.purchase_history.1.item.category,properties.purchase_history.1.item.description,properties.purchase_history.1.item.price,properties.purchase_history.1.review.summary,properties.purchase_history.1.review.rating,properties.purchase_history.1.review.text,properties.purchase_history.1.review.timestamp,properties.purchase_history.2.item.asin,...,properties.purchase_history.49.review.timestamp,properties.purchase_history.50.item.asin,properties.purchase_history.50.item.title,properties.purchase_history.50.item.category,properties.purchase_history.50.item.description,properties.purchase_history.50.item.price,properties.purchase_history.50.review.summary,properties.purchase_history.50.review.rating,properties.purchase_history.50.review.text,properties.purchase_history.50.review.timestamp
0,B00NIXF4NO,LifeStride Womens Fran Wedge Pump,Clothing Shoes Jewelry Women Shoes Pumps,The fantastic Fran will make you light up the ...,19.52,This is a really comfortable shoe Its a sleek ...,4.0,This is a really comfortable shoe Its a sleek ...,1455494400.0,B00G4SHVVI,...,,,,,,,,,,
1,B003VRK1XU,Tartanista 165 Scottish Kilt Skirts Huge Choic...,Clothing Shoes Jewelry Women Clothing Skirts,Tartanista 165 Tartan Mini Kilt Skirt Free Pin...,24.95,GREAT ITEM,5.0,LOVE this item It worked GREAT for kilt night ...,1460764800.0,B0141GUG8E,...,,,,,,,,,,
2,B000ZHFCPQ,Dreamgirl Womens Silicone Lace Top ThighHigh S...,Clothing Shoes Jewelry Women Clothing Socks Ho...,The Dreamgirl Sheer Thigh High With Stay Up Si...,22.66,I like them a lot,5.0,I like them a lot Super sexy Really long I was...,1425513600.0,B000ZHFCPQ,...,,,,,,,,,,
3,B017IV3EHE,Dicksons The Reunion Heart Memorial Wedding Ri...,Clothing Shoes Jewelry Women Jewelry Necklaces,Heart Memorial Ring Holder PendantUntil We Mee...,6.58,This is very pretty It holds my rings without ...,5.0,This is very pretty It holds my rings without ...,1475452800.0,B00JZSJAP4,...,,,,,,,,,,
4,B002VJJVT4,Womens Stainless Steel Cubic Zirconia Matte Fi...,Clothing Shoes Jewelry Women Jewelry Rings,Cubic Zirconia stones reveal the same brillian...,0.0,Beautiful ring Great price,5.0,I really love this ring It is so unique and ve...,1382400000.0,B002VEDFB4,...,,,,,,,,,,
