In [12]:
#imports
import json
import pandas as pd
from tqdm import tqdm
import os
print(os.getcwd())

c:\Users\Ariff1422\Documents\Verity\notebooks


In [13]:
# File paths
REVIEW_FILE = f'../data/raw/review-California_10.json'
META_FILE = f'../data/raw/meta-California.json'
OUTPUT_FILE = f'../data/california_reviews_merged.csv'

In [14]:
#Generator function to read in a large set of data especially the k-core dataset for California
def read_json_lines(path, desc="Processing file"):
    """Reads a JSON file where each line is a separate JSON object."""
    with open(path, 'r', encoding='utf-8') as f:
        # Use tqdm to show a progress bar for large files
        for line in tqdm(f, desc=desc):
            try:
                # Safely load each line as a JSON object
                yield json.loads(line)
            except json.JSONDecodeError:
                # Skip any lines that are not valid JSON
                continue

In [15]:
# --- Step 1: Process Reviews Data ---
print("Step 1: Processing California review data...")
review_fields = ['text', 'rating', 'time', 'user_id', 'gmap_id']
reviews_data = []

# Iterate over the generator and build a list of dictionaries
for review in read_json_lines(REVIEW_FILE, desc="Reading reviews"):
    # Extract only the relevant fields to save memory
    reviews_data.append({k: review.get(k) for k in review_fields})

# Convert the list of dictionaries into a DataFrame
reviews_df = pd.DataFrame(reviews_data)
print(f"Reviews DataFrame created with {len(reviews_df)} rows.")


Step 1: Processing California review data...


Reading reviews: 44476890it [03:03, 242931.17it/s]


Reviews DataFrame created with 44476890 rows.


In [16]:
# --- Step 2: Process Metadata Data ---
print("\nStep 2: Processing business metadata...")
meta_fields = ['gmap_id', 'category', 'description', 'avg_rating', 'num_of_reviews']
metas_data = []

# The metadata file is much smaller, so loading it at once is usually fine
for meta in read_json_lines(META_FILE, desc="Reading metadata"):
    metas_data.append({k: meta.get(k) for k in meta_fields})

meta_df = pd.DataFrame(metas_data)
print(f"Metadata DataFrame created with {len(meta_df)} rows.")


Step 2: Processing business metadata...


Reading metadata: 515961it [00:21, 23999.54it/s] 


Metadata DataFrame created with 515961 rows.


In [18]:
# --- Step 3: Merge and Clean Data ---
print("\nStep 3: Merging and cleaning data...")

merged_df = pd.merge(reviews_df, meta_df, on='gmap_id', how='left')
print(f"Initial merged DataFrame has {len(merged_df)} rows.")

# FIX FOR THE ERROR: Convert the 'category' list to a string
merged_df['category'] = merged_df['category'].astype(str)

# Drop duplicate rows to ensure a clean dataset
merged_df.drop_duplicates(inplace=True)
print(f"After dropping duplicates, DataFrame has {len(merged_df)} rows.")

# Drop rows where the 'text' or 'category' is missing
merged_df.dropna(subset=['text', 'category'], inplace=True)
print(f"After dropping rows with missing text or category, DataFrame has {len(merged_df)} rows.")



Step 3: Merging and cleaning data...
Initial merged DataFrame has 44504776 rows.
After dropping duplicates, DataFrame has 43968127 rows.
After dropping rows with missing text or category, DataFrame has 23258034 rows.


In [19]:
# --- Step 4: Save the Final Dataset ---
print("\nStep 4: Saving the final processed file...")
merged_df.to_csv(OUTPUT_FILE, index=False)
print(f"Success! Final cleaned dataset saved to {OUTPUT_FILE}")


Step 4: Saving the final processed file...
Success! Final cleaned dataset saved to ../data/california_reviews_merged.csv
