# Yelp Dataset: Data Loading and Neo4j Import

This notebook processes Yelp dataset JSON files (Business, User, Review, Tip), cleans the data, transforms it into CSV format suitable for Neo4j's `neo4j-admin import` tool, and imports it into a Neo4j database.

## 1. Setup: Imports and Constants

In [1]:
# Core Libraries
import json
import csv
import uuid
import os
import regex as re
import subprocess
import datetime
import shutil
from pathlib import Path
from time import sleep
from typing import List, Tuple, Set

# Data Handling & Graph
import numpy as np
import pandas as pd
import reverse_geocoder as rg
from py2neo import Graph
# Import base Neo4jError and specific retryable errors
from neo4j.exceptions import ServiceUnavailable, TransientError, Neo4jError

In [2]:
# --- Constants: Node Labels ---
BUSINESS_NODE = "Business"
USER_NODE = "User"
REVIEW_NODE = "Review"
TIP_NODE = "Tip" # Added for tips
CATEGORY_NODE = "Category"
CITY_NODE = "City"
STATE_NODE = "State"  # Changed from Area
COUNTRY_NODE = "Country"

# --- Constants: Relationship Types ---
IN_CATEGORY = "IN_CATEGORY"
IN_CITY = "IN_CITY"
IN_STATE = "IN_STATE"    # Changed from IN_AREA
IN_COUNTRY = "IN_COUNTRY"
FRIENDS = "FRIENDS"
REVIEWS = "REVIEWS" # Business <-[:REVIEWS]- Review/Tip
WROTE = "WROTE"     # User -[:WROTE]-> Review/Tip

## 2. Configuration: File Paths and Neo4j Connection

In [3]:
# --- Input Data Files ---
# Adjusted path to point one level up and into 'yelp_dataset'
data_folder = "../cleaned_yelp_data" 
business_json_file = os.path.join(data_folder, "cleaned_yelp_academic_dataset_business.json")
review_json_file = os.path.join(data_folder, "cleaned_yelp_academic_dataset_review.json")
user_json_file = os.path.join(data_folder, "cleaned_yelp_academic_dataset_user.json")
tip_json_file = os.path.join(data_folder, "cleaned_yelp_academic_dataset_tip.json") # Added tip file

# List of raw input files
list_raw_files = [business_json_file, review_json_file, user_json_file, tip_json_file] # Added tip file

# --- Intermediate Fixed Data Files ---
# These will be created in the same directory as the raw files
fixed_business_json_file = os.path.join(data_folder, "fixed_yelp_academic_dataset_business.json")
fixed_review_json_file = os.path.join(data_folder, "fixed_yelp_academic_dataset_review.json")
fixed_user_json_file = os.path.join(data_folder, "fixed_yelp_academic_dataset_user.json")
fixed_tip_json_file = os.path.join(data_folder, "fixed_yelp_academic_dataset_tip.json") # Added fixed tip file

# List of fixed intermediate files
list_fixed_data = [fixed_business_json_file, fixed_review_json_file, fixed_user_json_file, fixed_tip_json_file] # Added fixed tip file

# --- Output CSV Files for Neo4j Import ---
# These will be generated in the current working directory (assumed project root)
business_nodes_csv_file = "business_nodes.csv"
category_nodes_csv_file = "category_nodes.csv"
city_nodes_csv_file = "city_nodes.csv"
state_nodes_csv_file = "state_nodes.csv"  # Changed from area_nodes.csv
country_nodes_csv_file = "country_nodes.csv"
user_nodes_csv_file = "user_nodes.csv"
review_nodes_csv_file = "review_nodes.csv"
tip_nodes_csv_file = "tip_nodes.csv" # Added tip nodes file
relationship_csv_file = "relationships.csv"

# List of node CSV files (updated)
nodes_files = [
    business_nodes_csv_file, category_nodes_csv_file, city_nodes_csv_file, 
    state_nodes_csv_file, country_nodes_csv_file, user_nodes_csv_file, 
    review_nodes_csv_file, tip_nodes_csv_file # Added tip nodes file
]

# --- Neo4j Configuration ---
graph_name = "neo4j" # Default graph name
SERVER_ADDRESS = "bolt://localhost:7687"
SERVER_AUTH = ("neo4j","password") # Replace with your Neo4j credentials

# --- Neo4j Installation Path (Manual Configuration Required for Neo4j 5+) ---
# !! IMPORTANT !! Set this variable to the root directory of your Neo4j installation.
# Example for Windows: neo4j_home = r"C:\Users\YourUser\.Neo4jDesktop\relate-data\dbmss\dbms-abc12345-def67890"
# Example for Linux/macOS: neo4j_home = "/path/to/neo4j-community-5.x.x"
neo4j_home = "C:/Users/wiztu/.Neo4jDesktop/relate-data/dbmss/dbms-d9cecb50-c260-4dd0-b3e0-6b387c58d5ec" 

if neo4j_home is None:
    print("WARNING: 'neo4j_home' variable is not set.")
    print("         Please edit this cell and set it to your Neo4j installation path.")
    print("         The script needs this path to find 'neo4j-admin' and 'neo4j' commands.")

# --- Utility Function ---
def delete_files(files: List[str]):
    """Safely deletes a list of files if they exist."""
    for one_file in files:
        one_path = Path(one_file)
        if one_path.is_file():
            try:
                one_path.unlink()
                # print(f"Deleted file: {one_file}")
            except OSError as e:
                print(f"Error deleting file {one_file}: {e}")

# --- Initial Check: Raw Data Files ---
if not all(Path(f).is_file() for f in list_raw_files):
    missing = [f for f in list_raw_files if not Path(f).is_file()]
    raise FileNotFoundError(f"Missing raw Yelp JSON files: {missing}. Expected in '{data_folder}'.")

## 3. Preprocessing Raw Data

**Data Challenges Addressed:**
1.  **Non-Unique IDs:** `business_id`, `user_id`, `review_id`, and potentially tip identifiers (if they existed) can overlap across files. Prefixes (`b-`, `u-`, `r-`, `t-`) are added to ensure global uniqueness for Neo4j import. Tips require UUID generation as they lack a source ID.
2.  **Dangling Friendships:** Users might list friends who don't exist in the dataset. These non-existent friend references are removed.
3.  **Duplicate Categories:** Businesses might list the same category multiple times. Duplicates are removed.
4.  **String vs. Array:** `friends` (in User) and `categories` (in Business) are stored as comma-separated strings instead of proper JSON arrays. While fixing IDs/categories, these are handled appropriately.

In [4]:
# --- Preprocessing Functions ---

def remove_unknown_friends(raw_user_path: str, output_path: str) -> None:
    """Reads raw user data, identifies all valid user IDs, 
       and writes a new file containing only users with valid friend lists."""
    print(f"Reading user IDs from {raw_user_path}...")
    user_ids = set()
    try:
        with open(raw_user_path, "r", encoding="utf-8") as rf:
            for i, line in enumerate(rf):
                if len(line.strip()) > 0:
                    try:
                        json_node = json.loads(line)
                        user_ids.add(json_node["user_id"])
                    except json.JSONDecodeError:
                        print(f"Warning: Skipping invalid JSON on line {i+1} in {raw_user_path}")
    except FileNotFoundError:
        print(f"Error: File not found {raw_user_path}")
        raise
    print(f"Found {len(user_ids)} unique user IDs.")

    print(f"Writing users with cleaned friend lists to {output_path}...")
    try:
        with open(raw_user_path, "r", encoding="utf-8") as rf, open(output_path, mode="w", encoding="utf-8") as of:
            for i, line in enumerate(rf):
                if len(line.strip()) > 0:
                    try:
                        json_node = json.loads(line)
                        friends_str = json_node.get("friends", "")
                        if friends_str and friends_str.lower() != 'none' and len(friends_str.strip()) > 0:
                            friends_arr = re.split(r"\s*,\s*", friends_str.strip())
                            # Filter out friends not present in the dataset
                            friends_exist_arr = [f for f in friends_arr if f in user_ids]
                            json_node["friends"] = ', '.join(friends_exist_arr)
                        else:
                            json_node["friends"] = "" # Ensure field exists but is empty
                        of.write(f'{json.dumps(json_node)}\n')
                    except json.JSONDecodeError:
                        # Warning already printed during ID collection
                        pass 
    except FileNotFoundError:
        print(f"Error: File not found {raw_user_path} during writing phase.")
        raise
    print("Finished cleaning friend lists.")

def make_ids_unique(input_path: str, output_path: str, func_to_modify) -> None:
    """Applies a modification function (e.g., adding ID prefixes) 
       to each JSON object in the input file and writes to the output file."""
    print(f"Applying ID modifications to {input_path} -> {output_path}...")
    temp_output_file = output_path + ".tmp-" + str(uuid.uuid4())
    try:
        with open(input_path, mode="r", encoding="utf-8") as in_f, open(temp_output_file, mode="w", encoding="utf-8") as ou_f:
            for i, line in enumerate(in_f):
                if len(line.strip()) > 0:
                    try:
                        json_node = json.loads(line)
                        json_node = func_to_modify(json_node)
                        ou_f.write(f'{json.dumps(json_node)}\n')
                    except json.JSONDecodeError:
                         print(f"Warning: Skipping invalid JSON on line {i+1} in {input_path}")
    except FileNotFoundError:
        print(f"Error: File not found {input_path}")
        # Clean up temp file if it exists
        if Path(temp_output_file).is_file():
            Path(temp_output_file).unlink()
        raise
        
    # Replace original file with modified temp file
    try:
        os.replace(temp_output_file, output_path)
        print(f"Successfully updated {output_path}.")
    except OSError as e:
        print(f"Error replacing file {output_path} with {temp_output_file}: {e}")
        # Attempt cleanup again
        if Path(temp_output_file).is_file():
            try: Path(temp_output_file).unlink()
            except OSError: pass
        raise

In [5]:
# --- Preprocessing Execution ---

# Check if fixed files already exist. If not, create them.
if not all(Path(f).is_file() for f in list_fixed_data):
    print("Fixed data files not found. Starting preprocessing...")
    # Remove any potentially incomplete fixed files first
    delete_files(list_fixed_data)

    # --- Define Modification Functions ---
    def fix_user(json_node):
        # Prefix user_id
        original_user_id = json_node["user_id"]
        json_node["user_id"] = "u-" + original_user_id
        # Prefix friend_ids (already cleaned in remove_unknown_friends)
        friends_str = json_node.get("friends", "")
        if friends_str and len(friends_str.strip()) > 0:
            friends_arr = re.split(r"\s*,\s*", friends_str.strip())
            # Prefix each valid friend ID
            friends_arr = ["u-" + f_id for f_id in friends_arr]
            json_node["friends"] = ', '.join(friends_arr)
        else:
             json_node["friends"] = ""
        return json_node

    def fix_business(json_node):
        # Prefix business_id
        json_node["business_id"] = "b-" + json_node["business_id"]
        # Deduplicate and clean categories
        categories_str = json_node.get("categories", None)
        if categories_str and len(categories_str.strip()) > 0:
            categories_arr = re.split(r"\s*,\s*", categories_str.strip())
            # Filter out empty strings and trim whitespace, then deduplicate
            categories_set = set(cat.strip() for cat in categories_arr if cat and len(cat.strip()) > 0)
            json_node["categories"] = ', '.join(sorted(list(categories_set))) # Sort for consistency
        else:
            json_node["categories"] = ""
        return json_node

    def fix_review(json_node):
        # Prefix review_id, user_id, business_id
        json_node["review_id"] = "r-" + json_node["review_id"]
        json_node["user_id"] = "u-" + json_node["user_id"]
        json_node["business_id"] = "b-" + json_node["business_id"]
        return json_node
    
    def fix_tip(json_node):
        # Prefix user_id, business_id and generate tip_id
        json_node["tip_id"] = "t-" + str(uuid.uuid4()) # Generate unique ID for tip
        json_node["user_id"] = "u-" + json_node["user_id"]
        json_node["business_id"] = "b-" + json_node["business_id"]
        return json_node

    # --- Apply Fixes ---
    # 1. Clean user friends (reads raw, writes to fixed path)
    remove_unknown_friends(user_json_file, fixed_user_json_file)
    # 2. Make user IDs unique (reads fixed, modifies in-place)
    make_ids_unique(fixed_user_json_file, fixed_user_json_file, fix_user)
    # 3. Make business IDs unique and clean categories (reads raw, writes to fixed path)
    make_ids_unique(business_json_file, fixed_business_json_file, fix_business)
    # 4. Make review IDs unique (reads raw, writes to fixed path)
    make_ids_unique(review_json_file, fixed_review_json_file, fix_review)
    # 5. Make tip IDs unique (reads raw, writes to fixed path)
    make_ids_unique(tip_json_file, fixed_tip_json_file, fix_tip) # Added tip processing
    
    print("Preprocessing complete. Fixed data files created.")
else:
    print("Fixed data files already exist. Skipping preprocessing.")

Fixed data files not found. Starting preprocessing...
Reading user IDs from ../cleaned_yelp_data\cleaned_yelp_academic_dataset_user.json...
Found 1677963 unique user IDs.
Writing users with cleaned friend lists to ../cleaned_yelp_data\fixed_yelp_academic_dataset_user.json...
Finished cleaning friend lists.
Applying ID modifications to ../cleaned_yelp_data\fixed_yelp_academic_dataset_user.json -> ../cleaned_yelp_data\fixed_yelp_academic_dataset_user.json...
Successfully updated ../cleaned_yelp_data\fixed_yelp_academic_dataset_user.json.
Applying ID modifications to ../cleaned_yelp_data\cleaned_yelp_academic_dataset_business.json -> ../cleaned_yelp_data\fixed_yelp_academic_dataset_business.json...
Successfully updated ../cleaned_yelp_data\fixed_yelp_academic_dataset_business.json.
Applying ID modifications to ../cleaned_yelp_data\cleaned_yelp_academic_dataset_review.json -> ../cleaned_yelp_data\fixed_yelp_academic_dataset_review.json...
Successfully updated ../cleaned_yelp_data\fixed_yel

## 4. Generating CSV Files for Neo4j Import Tool

This section reads the preprocessed (`fixed_...json`) files and generates CSV files formatted according to the requirements of `neo4j-admin import`.
- Node files contain headers like `nodeId:ID`, `propertyName`, `:LABEL`.
- The relationship file contains headers `:START_ID`, `:END_ID`, `:TYPE`.

In [6]:
# --- CSV Generation Functions ---

# Global sets to store unique nodes and relationships across functions
business_lat_lon = {}
state_nodes: Set[Tuple[str, str]] = set() # Changed from area_nodes
city_nodes: Set[Tuple[str, str]] = set()
country_nodes: Set[str] = set()
categories_nodes: Set[str] = set()
valid_user_ids: Set[str] = set() # Added to track valid users for tips
valid_business_ids: Set[str] = set() # Added to track valid businesses for tips

# Relationship sets (tuples: start_id, end_id, type)
in_city_relationships: Set[Tuple[str, str, str]] = set()
in_state_relationships: Set[Tuple[str, str, str]] = set() # Changed from in_area
in_country_relationships: Set[Tuple[str, str, str]] = set()
in_category_relationships: Set[Tuple[str, str, str]] = set()
friend_relationships: Set[Tuple[str, str, str]] = set()
wrote_relationships: Set[Tuple[str, str, str]] = set() # User->Review
reviews_relationships: Set[Tuple[str, str, str]] = set() # Review->Business
tip_wrote_relationships: Set[Tuple[str, str, str]] = set() # User->Tip (Added)
tip_reviews_relationships: Set[Tuple[str, str, str]] = set() # Tip->Business (Added)

def process_business_data():
    """Reads fixed business data, populates business nodes, categories, 
       lat/lon mapping, IN_CATEGORY relationships, and valid_business_ids set."""
    print(f"Processing business data from {fixed_business_json_file}...")
    global categories_nodes, business_lat_lon, in_category_relationships, valid_business_ids
    processed_count = 0
    valid_business_ids.clear() # Ensure set is empty before processing
    try:
        with open(fixed_business_json_file, "r", encoding="utf-8") as bjf:
            # Prepare writer for business nodes CSV
            with open(business_nodes_csv_file, mode="w", encoding="utf-8", newline="") as business_csv:
                fieldnames = ["business_id:ID", "name", "address", ":LABEL"]
                writer = csv.DictWriter(business_csv, fieldnames=fieldnames, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
                writer.writeheader()
                
                for line in bjf:
                    line = line.strip()
                    if len(line) > 0:
                        try:
                            json_node = json.loads(line)
                            business_id = json_node["business_id"] # Already prefixed
                            
                            # Write business node
                            writer.writerow({
                                "business_id:ID": business_id,
                                "name": json_node.get("name", ""),
                                "address": json_node.get("address", ""),
                                ":LABEL": BUSINESS_NODE
                            })
                            
                            # Add to valid IDs set
                            valid_business_ids.add(business_id)
                            
                            # Store lat/lon for location processing
                            if json_node.get("latitude") is not None and json_node.get("longitude") is not None:
                                business_lat_lon[business_id] = (json_node["latitude"], json_node["longitude"])
                            
                            # Process categories
                            categories_str = json_node.get("categories", "")
                            if categories_str and len(categories_str.strip()) > 0:
                                cur_categories = re.split(r"\s*,\s*", categories_str.strip()) # Already cleaned/deduplicated
                                categories_nodes.update(cur_categories)
                                for category in cur_categories:
                                    in_category_relationships.add((business_id, category, IN_CATEGORY))
                            processed_count += 1
                        except json.JSONDecodeError:
                            print(f"Warning: Skipping invalid JSON line in {fixed_business_json_file}")
                        except KeyError as e:
                            print(f"Warning: Missing key {e} in business record: {line[:100]}...")
                            
    except FileNotFoundError:
        print(f"Error: File not found {fixed_business_json_file}")
        raise
    print(f"Finished processing {processed_count} businesses. Found {len(valid_business_ids)} valid business IDs.")

def process_location_data():
    """Uses reverse_geocoder on business lat/lons to create City, State, Country nodes 
       and IN_CITY, IN_STATE, IN_COUNTRY relationships."""
    print("Processing location data using reverse geocoding...")
    global city_nodes, state_nodes, country_nodes, in_city_relationships, in_state_relationships, in_country_relationships
    
    if not business_lat_lon:
        print("Warning: No business latitude/longitude data found. Skipping location processing.")
        return
        
    # Prepare coordinates for batch reverse geocoding
    business_ids = list(business_lat_lon.keys())
    coordinates = list(business_lat_lon.values())
    
    print(f"Performing reverse geocoding for {len(coordinates)} coordinates...")
    try:
        location_results = rg.search(coordinates)
    except Exception as e:
        print(f"Error during reverse geocoding: {e}")
        print("Ensure 'reverse_geocoder' library and its data are installed correctly.")
        raise
        
    print("Processing geocoding results...")
    # Process results and build node/relationship sets
    for business_id, loc_info in zip(business_ids, location_results):
        city = loc_info.get('name', 'UnknownCity')
        state = loc_info.get('admin1', 'UnknownState') # admin1 is typically state/province
        country = loc_info.get('cc', 'UnknownCountry') # cc is country code
        
        # Create unique IDs
        # Ensure IDs don't clash if names are identical across different levels
        country_id = country
        state_id = f"{state}-{country_id}"
        city_id = f"{city}-{state_id}"
        
        # Add nodes (sets handle uniqueness)
        country_nodes.add(country_id)
        state_nodes.add((state_id, state))
        city_nodes.add((city_id, city))
        
        # Add relationships (sets handle uniqueness)
        in_city_relationships.add((business_id, city_id, IN_CITY))
        in_state_relationships.add((city_id, state_id, IN_STATE))
        in_country_relationships.add((state_id, country_id, IN_COUNTRY))
        
    print(f"Finished processing locations: {len(city_nodes)} cities, {len(state_nodes)} states, {len(country_nodes)} countries.")

def process_user_data():
    """Reads fixed user data, populates user nodes, FRIEND relationships, and valid_user_ids set."""
    print(f"Processing user data from {fixed_user_json_file}...")
    global friend_relationships, valid_user_ids
    processed_count = 0
    valid_user_ids.clear() # Ensure set is empty before processing
    try:
        with open(fixed_user_json_file, "r", encoding="utf-8") as ujf:
            # Prepare writer for user nodes CSV
            with open(user_nodes_csv_file, mode="w", encoding="utf-8", newline="") as user_csv:
                fieldnames = ["user_id:ID", "name", "yelping_since", ":LABEL"]
                writer = csv.DictWriter(user_csv, fieldnames=fieldnames, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
                writer.writeheader()
                
                for line in ujf:
                    line = line.strip()
                    if len(line) > 0:
                        try:
                            json_node = json.loads(line)
                            user_id = json_node["user_id"] # Already prefixed
                            
                            # Write user node
                            writer.writerow({
                                "user_id:ID": user_id,
                                "name": json_node.get("name", ""),
                                "yelping_since": json_node.get("yelping_since", ""),
                                ":LABEL": USER_NODE
                            })
                            
                            # Add to valid IDs set
                            valid_user_ids.add(user_id)
                            
                            # Process friends
                            friends_str = json_node.get("friends", "")
                            if friends_str and len(friends_str.strip()) > 0:
                                friends_arr = re.split(r"\s*,\s*", friends_str.strip()) # Already cleaned and prefixed
                                for friend_id in friends_arr:
                                    # Ensure relationship uniqueness (u1 < u2)
                                    u1 = min(user_id, friend_id)
                                    u2 = max(user_id, friend_id)
                                    if u1 != u2: # Avoid self-loops if data error exists
                                        friend_relationships.add((u1, u2, FRIENDS))
                            processed_count += 1
                        except json.JSONDecodeError:
                            print(f"Warning: Skipping invalid JSON line in {fixed_user_json_file}")
                        except KeyError as e:
                            print(f"Warning: Missing key {e} in user record: {line[:100]}...")
                            
    except FileNotFoundError:
        print(f"Error: File not found {fixed_user_json_file}")
        raise
    print(f"Finished processing {processed_count} users. Found {len(valid_user_ids)} valid user IDs.")

def process_review_data():
    """Reads fixed review data, populates review nodes, WROTE and REVIEWS relationships."""
    print(f"Processing review data from {fixed_review_json_file}...")
    global wrote_relationships, reviews_relationships
    processed_count = 0
    try:
        with open(fixed_review_json_file, "r", encoding="utf-8") as rjf:
            # Prepare writer for review nodes CSV
            with open(review_nodes_csv_file, mode="w", encoding="utf-8", newline="") as review_csv:
                # Note: 'text' can contain quotes/commas, ensure proper quoting
                fieldnames = ["review_id:ID", "stars", "date", "text", ":LABEL"]
                writer = csv.DictWriter(review_csv, fieldnames=fieldnames, delimiter=",", quotechar='"', quoting=csv.QUOTE_ALL) # QUOTE_ALL for safety
                writer.writeheader()
                
                for line in rjf:
                    line = line.strip()
                    if len(line) > 0:
                        try:
                            json_node = json.loads(line)
                            review_id = json_node["review_id"] # Already prefixed
                            user_id = json_node["user_id"]     # Already prefixed
                            business_id = json_node["business_id"] # Already prefixed
                            
                            # Write review node
                            writer.writerow({
                                "review_id:ID": review_id,
                                "stars": json_node.get("stars", 0),
                                "date": json_node.get("date", ""),
                                "text": json_node.get("text", ""),
                                ":LABEL": REVIEW_NODE
                            })
                            
                            # Add relationships
                            wrote_relationships.add((user_id, review_id, WROTE))
                            reviews_relationships.add((review_id, business_id, REVIEWS))
                            processed_count += 1
                        except json.JSONDecodeError:
                            print(f"Warning: Skipping invalid JSON line in {fixed_review_json_file}")
                        except KeyError as e:
                            print(f"Warning: Missing key {e} in review record: {line[:100]}...")
                            
    except FileNotFoundError:
        print(f"Error: File not found {fixed_review_json_file}")
        raise
    print(f"Finished processing {processed_count} reviews.")

def process_tip_data():
    """Reads fixed tip data, populates tip nodes, WROTE and REVIEWS relationships, 
       only including tips where user and business exist."""
    print(f"Processing tip data from {fixed_tip_json_file}...")
    global tip_wrote_relationships, tip_reviews_relationships
    processed_count = 0
    skipped_count = 0
    
    # Ensure valid ID sets are populated before running this
    if not valid_user_ids or not valid_business_ids:
        raise RuntimeError("Valid user and business ID sets are empty. Ensure process_user_data() and process_business_data() run first.")
        
    try:
        with open(fixed_tip_json_file, "r", encoding="utf-8") as tjf:
            # Prepare writer for tip nodes CSV
            with open(tip_nodes_csv_file, mode="w", encoding="utf-8", newline="") as tip_csv:
                fieldnames = ["tip_id:ID", "text", "date", "compliment_count", ":LABEL"]
                writer = csv.DictWriter(tip_csv, fieldnames=fieldnames, delimiter=",", quotechar='"', quoting=csv.QUOTE_ALL) # QUOTE_ALL for safety
                writer.writeheader()
                
                for line in tjf:
                    line = line.strip()
                    if len(line) > 0:
                        try:
                            json_node = json.loads(line)
                            tip_id = json_node["tip_id"]         # Already generated & prefixed
                            user_id = json_node["user_id"]       # Already prefixed
                            business_id = json_node["business_id"] # Already prefixed
                            
                            # --- Validation Check ---
                            if user_id in valid_user_ids and business_id in valid_business_ids:
                                # Write tip node
                                writer.writerow({
                                    "tip_id:ID": tip_id,
                                    "text": json_node.get("text", ""),
                                    "date": json_node.get("date", ""),
                                    "compliment_count": json_node.get("compliment_count", 0),
                                    ":LABEL": TIP_NODE
                                })
                                
                                # Add relationships
                                tip_wrote_relationships.add((user_id, tip_id, WROTE))
                                tip_reviews_relationships.add((tip_id, business_id, REVIEWS))
                                processed_count += 1
                            else:
                                skipped_count += 1
                                # Optionally log skipped tips if needed for debugging
                                # print(f"Skipping tip {tip_id} due to missing user ({user_id not in valid_user_ids}) or business ({business_id not in valid_business_ids})")
                                
                        except json.JSONDecodeError:
                            print(f"Warning: Skipping invalid JSON line in {fixed_tip_json_file}")
                        except KeyError as e:
                            print(f"Warning: Missing key {e} in tip record: {line[:100]}...")
                            
    except FileNotFoundError:
        print(f"Error: File not found {fixed_tip_json_file}")
        raise
    print(f"Finished processing tips. Added {processed_count} tips, skipped {skipped_count} due to missing user/business.")

def write_category_nodes_to_file():
    """Writes the collected unique category nodes to a CSV file."""
    print(f"Writing {len(categories_nodes)} category nodes to {category_nodes_csv_file}...")
    with open(category_nodes_csv_file, mode="w", encoding="utf-8", newline="") as category_csv:
        fieldnames = ["category_id:ID", ":LABEL"]
        writer = csv.DictWriter(category_csv, fieldnames=fieldnames, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
        writer.writeheader()
        for category_id in sorted(list(categories_nodes)):
            writer.writerow({"category_id:ID": category_id, ":LABEL": CATEGORY_NODE})
    print("Finished writing category nodes.")

def write_location_nodes_to_file():
    """Writes the collected unique City, State, Country nodes to CSV files."""
    # Write City Nodes
    print(f"Writing {len(city_nodes)} city nodes to {city_nodes_csv_file}...")
    with open(city_nodes_csv_file, mode="w", encoding="utf-8", newline="") as city_csv:
        fieldnames = ["city_id:ID", "name", ":LABEL"]
        writer = csv.DictWriter(city_csv, fieldnames=fieldnames, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
        writer.writeheader()
        for city_id, city_name in sorted(list(city_nodes)):
            writer.writerow({"city_id:ID": city_id, "name": city_name, ":LABEL": CITY_NODE})
            
    # Write State Nodes (Changed from Area)
    print(f"Writing {len(state_nodes)} state nodes to {state_nodes_csv_file}...")
    with open(state_nodes_csv_file, mode="w", encoding="utf-8", newline="") as state_csv:
        fieldnames = ["state_id:ID", "name", ":LABEL"]
        writer = csv.DictWriter(state_csv, fieldnames=fieldnames, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
        writer.writeheader()
        for state_id, state_name in sorted(list(state_nodes)):
            writer.writerow({"state_id:ID": state_id, "name": state_name, ":LABEL": STATE_NODE})
            
    # Write Country Nodes
    print(f"Writing {len(country_nodes)} country nodes to {country_nodes_csv_file}...")
    with open(country_nodes_csv_file, mode="w", encoding="utf-8", newline="") as country_csv:
        fieldnames = ["country_id:ID", ":LABEL"]
        writer = csv.DictWriter(country_csv, fieldnames=fieldnames, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
        writer.writeheader()
        for country_id in sorted(list(country_nodes)):
            writer.writerow({"country_id:ID": country_id, ":LABEL": COUNTRY_NODE})
    print("Finished writing location nodes.")

def write_relationships_to_file():
    """Writes all collected relationships to a single CSV file."""
    all_relationships = (
        in_category_relationships | in_city_relationships | 
        in_state_relationships | in_country_relationships | 
        friend_relationships | wrote_relationships | reviews_relationships | 
        tip_wrote_relationships | tip_reviews_relationships # Added tip relationships
    )
    print(f"Writing {len(all_relationships)} relationships to {relationship_csv_file}...")
    with open(relationship_csv_file, mode="w", encoding="utf-8", newline="") as rel_csv:
        fieldnames = [":START_ID", ":END_ID", ":TYPE"]
        writer = csv.DictWriter(rel_csv, fieldnames=fieldnames, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
        writer.writeheader()
        # Sort for deterministic output (optional but good practice)
        for start_id, end_id, rel_type in sorted(list(all_relationships)):
            writer.writerow({":START_ID": start_id, ":END_ID": end_id, ":TYPE": rel_type})
    print("Finished writing relationships.")

In [7]:
# --- CSV Generation Execution ---

# Check if CSV files already exist. If not, generate them.
if not all(Path(f).is_file() for f in nodes_files) or not Path(relationship_csv_file).is_file():
    print("Import CSV files not found or incomplete. Starting generation...")
    # Delete any existing CSV files first
    delete_files(nodes_files) # Now includes tip_nodes_csv_file
    delete_files([relationship_csv_file])
    
    # Process data and populate node/relationship sets
    # IMPORTANT: Process Users and Businesses FIRST to populate valid ID sets
    process_user_data() 
    process_business_data()
    process_location_data() # Depends on business_lat_lon
    process_review_data()
    process_tip_data() # Process tips AFTER users/businesses
    
    # Write node sets to their respective CSV files
    write_category_nodes_to_file()
    write_location_nodes_to_file()
    # Business, User, Review, Tip nodes written during processing steps
    
    # Write all relationships to the relationship CSV file
    write_relationships_to_file()
    
    # Optional: Clean up large intermediate sets to free memory
    del business_lat_lon, categories_nodes, city_nodes, state_nodes, country_nodes
    del valid_user_ids, valid_business_ids # Added cleanup
    del in_category_relationships, in_city_relationships, in_state_relationships, in_country_relationships
    del friend_relationships, wrote_relationships, reviews_relationships
    del tip_wrote_relationships, tip_reviews_relationships # Added cleanup
    
    print("CSV file generation complete.")
else:
    print("Import CSV files already exist. Skipping generation.")

Import CSV files not found or incomplete. Starting generation...
Processing user data from ../cleaned_yelp_data\fixed_yelp_academic_dataset_user.json...
Finished processing 1677963 users. Found 1677963 valid user IDs.
Processing business data from ../cleaned_yelp_data\fixed_yelp_academic_dataset_business.json...
Finished processing 68696 businesses. Found 68696 valid business IDs.
Processing location data using reverse geocoding...
Performing reverse geocoding for 68696 coordinates...
Loading formatted geocoded file...
Processing geocoding results...
Finished processing locations: 620 cities, 14 states, 2 countries.
Processing review data from ../cleaned_yelp_data\fixed_yelp_academic_dataset_review.json...
Finished processing 5084370 reviews.
Processing tip data from ../cleaned_yelp_data\fixed_yelp_academic_dataset_tip.json...
Finished processing tips. Added 741599 tips, skipped 0 due to missing user/business.
Writing 878 category nodes to category_nodes.csv...
Finished writing categor

## 5. Data Integrity Check

Verify basic integrity of the generated CSV files before attempting import:
- Check for duplicate IDs within each node file.
- Check for duplicate relationships (same start, end, and type).

In [8]:
num_nodes_total = 0

def check_nodes_relationships_csv_files_integrity():
    """Validates uniqueness of node IDs and relationships in generated CSVs."""
    global num_nodes_total
    num_nodes_total = 0 # Reset count
    print("Checking integrity of generated CSV files...")
    
    # Check Node Files
    for one_node_file in nodes_files:
        print(f"  Checking node file: {one_node_file}...")
        try:
            temp_df = pd.read_csv(one_node_file, header="infer", sep=",", encoding="utf-8", low_memory=False)
            ids = temp_df.iloc[:, 0] # First column should be the ID (e.g., 'node_id:ID')
            if len(ids) != len(ids.unique()):
                duplicates = ids[ids.duplicated()].unique()
                raise ValueError(f"Duplicate node IDs found in [{one_node_file}]: {list(duplicates)[:5]}...")
            num_nodes_total += len(ids)
            print(f"    Found {len(ids)} unique nodes.")
        except FileNotFoundError:
            print(f"Error: Node file not found: {one_node_file}")
            raise
        except Exception as e:
            print(f"Error checking node file {one_node_file}: {e}")
            raise
            
    # Check Relationship File
    print(f"  Checking relationship file: {relationship_csv_file}...")
    try:
        temp_df = pd.read_csv(relationship_csv_file, header="infer", sep=",", encoding="utf-8", low_memory=False)
        # Combine start, end, type for uniqueness check
        # Ensure columns are treated as strings to avoid type issues
        rel_signatures = temp_df.iloc[:, 0].astype(str) + "|" + temp_df.iloc[:, 1].astype(str) + "|" + temp_df.iloc[:, 2].astype(str)
        if len(rel_signatures) != len(rel_signatures.unique()):
            duplicates = rel_signatures[rel_signatures.duplicated()].unique()
            raise ValueError(f"Duplicate relationships found in [{relationship_csv_file}]: {list(duplicates)[:5]}...")
        print(f"    Found {len(rel_signatures)} unique relationships.")
    except FileNotFoundError:
        print(f"Error: Relationship file not found: {relationship_csv_file}")
        raise
    except Exception as e:
        print(f"Error checking relationship file {relationship_csv_file}: {e}")
        raise
        
    print(f"Integrity check passed. Total unique nodes expected: {num_nodes_total}")

# Run the check
try:
    check_nodes_relationships_csv_files_integrity()
except Exception as e:
    print(f"\n--- DATA INTEGRITY CHECK FAILED ---: {e}")
    print("Import aborted. Please check the CSV generation process and input data.")
    # Optionally raise to stop notebook execution
    # raise

Checking integrity of generated CSV files...
  Checking node file: business_nodes.csv...
    Found 68696 unique nodes.
  Checking node file: category_nodes.csv...
    Found 878 unique nodes.
  Checking node file: city_nodes.csv...
    Found 620 unique nodes.
  Checking node file: state_nodes.csv...
    Found 14 unique nodes.
  Checking node file: country_nodes.csv...
    Found 2 unique nodes.
  Checking node file: user_nodes.csv...
    Found 1677963 unique nodes.
  Checking node file: review_nodes.csv...
    Found 5084370 unique nodes.
  Checking node file: tip_nodes.csv...
    Found 741599 unique nodes.
  Checking relationship file: relationships.csv...
    Found 19333704 unique relationships.
Integrity check passed. Total unique nodes expected: 7574142


## 6. Importing Data into Neo4j

This section uses the `neo4j-admin import` tool to load the generated CSV files into the Neo4j database. 

**Process:**
1.  Stop the Neo4j Database Service.
2.  Delete the existing database files (effectively resetting the graph).
3.  Run `neo4j-admin import` using the generated CSV files.
4.  Start the Neo4j Database Service.
5.  Verify the import by checking the node count.

**Important Notes:**
- Requires `neo4j_home` to be correctly identified earlier.
- Requires sufficient permissions to stop/start the Neo4j service and modify its data directories.
- This will **completely overwrite** the target Neo4j database (`graph_name`).

In [9]:
# --- Neo4j Import Functions ---

def command_neo4j_database_service(cmd: str):
    """Executes start/stop commands for the Neo4j service."""
    if neo4j_home is None or not Path(neo4j_home).is_dir():
        raise ValueError("Neo4j home directory ('neo4j_home') is not set or invalid. Please configure it in cell [2].")
        
    neo4j_cmd_path = os.path.join(neo4j_home, "bin", "neo4j.bat") # Assuming Windows .bat
    if not Path(neo4j_cmd_path).is_file():
         neo4j_cmd_path = os.path.join(neo4j_home, "bin", "neo4j") # Try Linux/macOS script
         if not Path(neo4j_cmd_path).is_file():
             raise FileNotFoundError(f"Neo4j command script not found in {os.path.join(neo4j_home, 'bin')}")
             
    if cmd in ["stop", "start"]:
        print(f"Attempting to {cmd} Neo4j service...")
        try:
            # Use shell=True cautiously, ensure neo4j_cmd_path is safe.
            # Capture output to check success.
            cmd_res = subprocess.run([neo4j_cmd_path, cmd], capture_output=True, text=True, check=False, shell=True)
            
            print(f"  STDOUT: {cmd_res.stdout.strip()}")
            print(f"  STDERR: {cmd_res.stderr.strip()}")
            
            # Check output for success indicators (these might vary slightly by Neo4j version/OS)
            if cmd == "stop":
                # Check if it's already stopped or successfully stopped
                if "stopped" not in cmd_res.stdout.lower() and "not running" not in cmd_res.stderr.lower():
                     # If stop failed, maybe it needs more time or manual intervention
                     print(f"Warning: Neo4j stop command finished, but output doesn't confirm stop. Check service status manually.")
                     # Allow script to continue, but import might fail if service still running
                else:
                    print("Neo4j service appears stopped.")
                    sleep(5) # Give some time for files to be released
            elif cmd == "start":
                if "started" not in cmd_res.stdout.lower():
                    raise RuntimeError(f"Failed to start Neo4j service. Check logs. STDERR: {cmd_res.stderr.strip()}")
                else:
                    print("Neo4j service appears started. Waiting for it to become available...")
                    sleep(15) # Give Neo4j time to initialize
                    
        except FileNotFoundError:
            raise FileNotFoundError(f"Neo4j command '{neo4j_cmd_path}' not found.")
        except subprocess.CalledProcessError as e:
            raise RuntimeError(f"Error executing Neo4j {cmd} command: {e}. STDOUT: {e.stdout}. STDERR: {e.stderr}")
        except Exception as e:
            raise RuntimeError(f"An unexpected error occurred during Neo4j {cmd}: {e}")
    else:
        raise ValueError(f'Unknown command for Neo4j service: [{cmd}]')

def reset_neo4j_database():
    """Deletes the data directories for the target graph."""
    if neo4j_home is None or not Path(neo4j_home).is_dir():
        raise ValueError("Neo4j home directory ('neo4j_home') is not set or invalid. Please configure it in cell [2].")
        
    # Construct paths based on standard Neo4j directory structure
    neo4j_db_dir = Path(neo4j_home) / "data" / "databases" / graph_name
    neo4j_tx_dir = Path(neo4j_home) / "data" / "transactions" / graph_name
    
    print(f"Attempting to delete database directory: {neo4j_db_dir}")
    if neo4j_db_dir.exists() and neo4j_db_dir.is_dir():
        try:
            shutil.rmtree(neo4j_db_dir)
            print("  Database directory deleted.")
        except OSError as e:
            raise OSError(f"Error deleting database directory {neo4j_db_dir}: {e}. Check permissions and ensure Neo4j is stopped.")
    else:
        print("  Database directory does not exist, skipping deletion.")
        
    print(f"Attempting to delete transactions directory: {neo4j_tx_dir}")
    if neo4j_tx_dir.exists() and neo4j_tx_dir.is_dir():
        try:
            shutil.rmtree(neo4j_tx_dir)
            print("  Transactions directory deleted.")
        except OSError as e:
            # This might sometimes fail if logs are held; often okay if db dir was removed.
            print(f"Warning: Could not delete transactions directory {neo4j_tx_dir}: {e}. Import might still succeed.")
    else:
        print("  Transactions directory does not exist, skipping deletion.")

import os
import subprocess
from pathlib import Path

# Assume neo4j_home, graph_name, nodes_files, relationship_csv_file are defined correctly

def import_data():
    # Ensure neo4j_home is valid before proceeding
    if not neo4j_home or not Path(neo4j_home).is_dir():
         raise ValueError(f"Neo4j home directory ('neo4j_home') is not set or invalid: {neo4j_home}")

    import_tool = Path(neo4j_home) / "bin" / ("neo4j-admin.bat" if os.name == "nt" else "neo4j-admin")
    if not import_tool.is_file():
        raise FileNotFoundError(f"neo4j-admin executable not found at: {import_tool}")

    csv_dir = Path.cwd() # where the notebook generated the CSVs

    args = [
        str(import_tool),
        "database", "import", "full",
        # Removed --database=graph_name from here
        "--overwrite-destination=true",
        "--multiline-fields=true",       # Keep this if your text fields might have newlines
        "--skip-duplicate-nodes=true",
        "--skip-bad-relationships=true",
        # Add other flags as needed, BEFORE nodes/relationships
    ]

    # Add each node file with an **absolute** path
    for f in nodes_files:
        # Ensure the path is absolute and correctly formatted for the OS
        node_file_path = (csv_dir / f).resolve()
        args.append(f"--nodes={node_file_path}")

    # Add the relationship file with an **absolute** path
    rel_file_path = (csv_dir / relationship_csv_file).resolve()
    args.append(f"--relationships={rel_file_path}")

    # --- FIX ---
    # Add the database name as the VERY LAST positional argument
    # args.append(graph_name)
    # --- END FIX ---

    print("Running:", " ".join(map(str, args))) # Use map(str,..) in case paths are Path objects

    env = os.environ.copy()
    # Ensure JAVA_HOME is correctly set if needed by your environment/neo4j-admin script
    # Double-check this path is correct for your system where the JDK is installed
    java_home_path = "C:/Program Files/Eclipse Adoptium/jdk-21.0.7.6-hotspot"
    if Path(java_home_path).is_dir():
         env["JAVA_HOME"] = java_home_path
         print(f"Setting JAVA_HOME to: {java_home_path}")
    else:
         print(f"Warning: Specified JAVA_HOME path not found: {java_home_path}. neo4j-admin might fail.")
         # You might remove the JAVA_HOME setting if neo4j-admin finds it automatically,
         # but explicitly setting it is often more reliable if needed.

    # Use shell=False (safer and often handles arguments better)
    # Ensure neo4j service is STOPPED before running this
    try:
        res = subprocess.run(args, text=True, capture_output=True, env=env, check=False, shell=False) # Use check=False to manually check returncode

        if res.returncode != 0:
            # Print command again for easier debugging on failure
            print("\n--- FAILED COMMAND ---")
            print(" ".join(map(str, args)))
            print("----------------------\n")
            raise RuntimeError(f"Import failed ({res.returncode})\nSTDOUT:\n{res.stdout}\nSTDERR:\n{res.stderr}")
        else:
             print("\n--- Import Command Output ---")
             print(res.stdout)
             print("---------------------------\n")
             if res.stderr: # Print stderr even on success, might contain warnings
                 print("--- Import Command STDERR (Warnings/Info) ---")
                 print(res.stderr)
                 print("---------------------------------------------\n")

    except FileNotFoundError as fnf_error:
         print(f"Error: Command not found. Ensure neo4j-admin path is correct. Details: {fnf_error}")
         raise
    except Exception as e:
         print(f"An unexpected error occurred running the import command: {e}")
         raise # Re-raise the caught exception

In [10]:
# --- Import Execution ---

# Check if import is necessary by comparing expected node count with current graph
run_import = False
try:
    # Ensure num_nodes_total was calculated
    if 'num_nodes_total' not in globals() or num_nodes_total == 0:
         print("Warning: Expected node count not calculated. Running integrity check first.")
         check_nodes_relationships_csv_files_integrity()
         
    print(f"Checking current graph '{graph_name}' node count...")
    graph = Graph(SERVER_ADDRESS, auth=SERVER_AUTH) # Reconnect
    current_node_count = graph.nodes.match().count()
    print(f"  Current node count: {current_node_count}")
    print(f"  Expected node count from CSVs: {num_nodes_total}")
    
    if current_node_count != num_nodes_total or num_nodes_total == 0:
        print("Node count mismatch or expected count is zero. Proceeding with import.")
        run_import = True
    else:
        print("Node count matches expected count. Skipping import.")
        
# Catch specific retryable errors, general driver errors, and OS-level connection errors
except (ServiceUnavailable, TransientError, Neo4jError, OSError) as e:
    print(f"Could not connect to Neo4j to check node count ({e}). Assuming import is needed.")
    run_import = True
except NameError:
     print("Expected node count variable 'num_nodes_total' not defined. Assuming import is needed.")
     run_import = True
except Exception as e:
    print(f"An unexpected error occurred checking node count: {e}. Assuming import is needed.")
    run_import = True

# Execute the import process if needed
if run_import:
    try:
        print("\n--- Starting Neo4j Import Process ---")
        #command_neo4j_database_service("stop")
        reset_neo4j_database()
        import_data()
        #command_neo4j_database_service("start")
        print("--- Neo4j Import Process Finished ---")
    except Exception as e:
        print(f"\n--- IMPORT PROCESS FAILED ---: {e}")
        print("Attempting to start Neo4j service if it was stopped...")
        try:
            # Try to start it anyway, might fail if already running or other issues
            #command_neo4j_database_service("start") 
            print("whatever")
        except Exception as start_err:
            print(f"Could not restart Neo4j service after failed import: {start_err}")
        # Re-raise the original exception
        # raise e 
else:
    print("\nImport skipped as node count matches.")

Checking current graph 'neo4j' node count...
An unexpected error occurred checking node count: Cannot open connection to ConnectionProfile('bolt://localhost:7687'). Assuming import is needed.

--- Starting Neo4j Import Process ---
Attempting to delete database directory: C:\Users\wiztu\.Neo4jDesktop\relate-data\dbmss\dbms-d9cecb50-c260-4dd0-b3e0-6b387c58d5ec\data\databases\neo4j
  Database directory deleted.
Attempting to delete transactions directory: C:\Users\wiztu\.Neo4jDesktop\relate-data\dbmss\dbms-d9cecb50-c260-4dd0-b3e0-6b387c58d5ec\data\transactions\neo4j
  Transactions directory deleted.
Running: C:\Users\wiztu\.Neo4jDesktop\relate-data\dbmss\dbms-d9cecb50-c260-4dd0-b3e0-6b387c58d5ec\bin\neo4j-admin.bat database import full --overwrite-destination=true --multiline-fields=true --skip-duplicate-nodes=true --skip-bad-relationships=true --nodes=C:\Users\wiztu\OneDrive\Documents\Data Science and Management\Project\scripts\business_nodes.csv --nodes=C:\Users\wiztu\OneDrive\Documents

## 7. Final Verification

Attempt to connect to the Neo4j database and verify the node count again after the import process.

In [11]:
yelp_graph_ready = False
final_node_count = -1

def check_if_importing_is_successful():
    """Tries to connect to Neo4j and checks if the node count matches the expected count."""
    global graph, yelp_graph_ready, final_node_count
    
    # Ensure expected count is available
    if 'num_nodes_total' not in globals() or num_nodes_total == 0:
        try:
             print("Recalculating expected node count for final verification...")
             check_nodes_relationships_csv_files_integrity()
        except Exception as e:
             print(f"Error recalculating expected node count: {e}. Cannot verify import accurately.")
             return # Cannot verify
             
    print(f"\nVerifying import results (expected nodes: {num_nodes_total})...")
    num_tries = 30 # Try for 30 seconds (adjust as needed)
    wait_interval = 2 # Wait 2 seconds between tries
    
    for one_try in range(num_tries):
        try:
            print(f"Attempt {one_try + 1}/{num_tries}: Connecting to Neo4j...")
            graph = Graph(SERVER_ADDRESS, auth=SERVER_AUTH)
            final_node_count = graph.nodes.match().count()
            print(f"  Successfully connected. Found {final_node_count} nodes.")
            
            if final_node_count == 0 and num_nodes_total > 0:
                print("  Warning: Graph is empty, but expected nodes. Import might have failed silently or service is still starting.")
                # Continue trying for a bit longer
            elif final_node_count != num_nodes_total:
                print(f"  Warning: Node count mismatch! Expected {num_nodes_total}, found {final_node_count}.")
                # Consider this potentially successful but log warning
                yelp_graph_ready = True # Mark as ready but with a warning
                return
            else: # Counts match
                print("  Node count matches expected count. Import successful!")
                yelp_graph_ready = True
                return # Success
                
        # Catch specific retryable errors, general driver errors, and OS-level connection errors
        except (ServiceUnavailable, TransientError, Neo4jError, OSError) as e:
            print(f"  Connection failed ({e}). Neo4j might still be starting. Retrying in {wait_interval}s...")
        except Exception as e:
            print(f"  An unexpected error occurred during verification: {e}")
            # Stop trying on unexpected errors
            break 
            
        # Wait before next try only if not successful yet
        if not yelp_graph_ready:
             sleep(wait_interval)
             
    # If loop finishes without success
    if not yelp_graph_ready:
        print(f"\nVerification failed after {num_tries} attempts. Could not confirm successful import.")
        print(f"Last recorded node count: {final_node_count}")

# Run the final check
check_if_importing_is_successful()

if not yelp_graph_ready:
    print("\n--- FINAL VERIFICATION FAILED ---")
    # Optional: raise Exception("Failed to verify Neo4j import success.")
else:
     print("\n--- FINAL VERIFICATION COMPLETE --- ")


Verifying import results (expected nodes: 7574142)...
Attempt 1/30: Connecting to Neo4j...
  An unexpected error occurred during verification: Cannot open connection to ConnectionProfile('bolt://localhost:7687')

Verification failed after 30 attempts. Could not confirm successful import.
Last recorded node count: -1

--- FINAL VERIFICATION FAILED ---
