In [2]:
%pip install neo4j

Collecting neo4j
  Downloading neo4j-5.28.1-py3-none-any.whl.metadata (5.9 kB)
Downloading neo4j-5.28.1-py3-none-any.whl (312 kB)
Installing collected packages: neo4j
Successfully installed neo4j-5.28.1
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [13]:
from neo4j import GraphDatabase

uri = "bolt://localhost:7687"  # Change if needed
user = "neo4j"
password = "your_password"

driver = GraphDatabase.driver(uri, auth=(user, password))
print(f"Connected to Neo4j at {uri}")


Connected to Neo4j at bolt://localhost:7687


In [20]:


# Correct database name (from SHOW DATABASES result)
database_name = "neo4j"

# Neo4j connection details
uri = "bolt://localhost:7687"
user = "neo4j"
password = "123456789"

# Initialize driver
driver = GraphDatabase.driver(uri, auth=(user, password))

# Function to test connection
def test_connection():
    with driver.session(database=database_name) as session:
        result = session.run("RETURN 'Connection successful' AS msg")
        print(result.single()["msg"])

test_connection()




Connection successful


In [28]:
import pandas as pd

# Load CSV
csv_path = r"C:\Users\doris\Desktop\Data management\archive\dataset\Amazon Sale Report.csv"
df = pd.read_csv(csv_path)

# Display column names
print("Columns in CSV:", df.columns)

# Check data types
print("\nData Types:")
print(df.dtypes)

# Show first 10 rows to inspect Amount values
print("\nFirst 10 rows:")
print(df.head(10))



Columns in CSV: Index(['index', 'Order ID', 'Date', 'Status', 'Fulfilment', 'Sales Channel ',
       'ship-service-level', 'Style', 'SKU', 'Category', 'Size', 'ASIN',
       'Courier Status', 'Qty', 'currency', 'Amount', 'ship-city',
       'ship-state', 'ship-postal-code', 'ship-country', 'promotion-ids',
       'B2B', 'fulfilled-by', 'Unnamed: 22'],
      dtype='object')

Data Types:
index                   int64
Order ID               object
Date                   object
Status                 object
Fulfilment             object
Sales Channel          object
ship-service-level     object
Style                  object
SKU                    object
Category               object
Size                   object
ASIN                   object
Courier Status         object
Qty                     int64
currency               object
Amount                float64
ship-city              object
ship-state             object
ship-postal-code      float64
ship-country           object
promotion-i

  df = pd.read_csv(csv_path)


In [38]:
import os
import pandas as pd
from neo4j import GraphDatabase
import re

# Neo4j Connection
uri = "bolt://localhost:7687"
user = "neo4j"
password = "123456789"
database_name = "neo4j"

driver = GraphDatabase.driver(uri, auth=(user, password))

# Directory containing CSV files
csv_directory = r"C:\Users\doris\Desktop\Data management\archive\dataset"

# Function to execute Cypher queries
def run_query(query, parameters=None):
    with driver.session(database=database_name) as session:
        session.run(query, parameters or {})

# Get all CSV files in the directory
csv_files = [f for f in os.listdir(csv_directory) if f.endswith('.csv')]

# Process each CSV file
for file in csv_files:
    file_path = os.path.join(csv_directory, file)

    print(f"Processing: {file}...")

    # Read CSV with mixed type handling
    df = pd.read_csv(file_path, header=0, dtype=str, low_memory=False)

    # Extract filename as node label
    node_label = re.sub(r'[^a-zA-Z0-9_]', '_', os.path.splitext(file)[0])

    # Clean column names
    df.columns = [re.sub(r'[^a-zA-Z0-9_]', '_', col.strip()) for col in df.columns]

    # ✅ Only convert `Amount` if the column exists
    if "Amount" in df.columns:
        df["Amount"] = pd.to_numeric(df["Amount"], errors="coerce")  # Convert text to float
        df = df.dropna(subset=["Amount"])  # Remove NaN values

    # Get cleaned column names
    col_names = df.columns.tolist()

    # ✅ Insert data in **batches** to improve performance
    batch_size = 1000
    total_rows = len(df)
    for i in range(0, total_rows, batch_size):
        batch_df = df.iloc[i:i+batch_size]
        batch_data = batch_df.to_dict(orient="records")

        with driver.session(database=database_name) as session:
            for row in batch_data:
                properties = ", ".join([f"{col}: ${col}" for col in col_names])
                query = f"CREATE (:{node_label} {{{properties}}});"
                session.run(query, row)

    print(f"✅ Finished uploading {file}")

# Close Neo4j connection
driver.close()
print("🎉 All CSV files successfully uploaded to Neo4j!")


Processing: Amazon Sale Report.csv...
✅ Finished uploading Amazon Sale Report.csv
Processing: International sale Report.csv...
✅ Finished uploading International sale Report.csv
Processing: Sale Report.csv...
✅ Finished uploading Sale Report.csv
🎉 All CSV files successfully uploaded to Neo4j!


In [40]:

# **Step 2: Create Relationships**
print("Creating relationships...")

relationship_query = """
MATCH (asr:Amazon_Sale_Report), (sr:Sale_Report)
WHERE asr.SKU = sr.SKU_Code
MERGE (asr)-[:MATCHES]->(sr);
"""
run_query(relationship_query)

# ✅ FIXED LABEL: Use `International_sale_Report` (lowercase 's')
relationship_query2 = """
MATCH (asr:Amazon_Sale_Report), (isr:International_sale_Report)
WHERE asr.SKU = isr.SKU
MERGE (asr)-[:INTERNATIONAL_MATCH]->(isr);
"""
batch_size = 1000  # Adjust the batch size based on available memory

while True:
    relationship_query2 = f"""
    MATCH (asr:Amazon_Sale_Report), (isr:International_sale_Report)
    WHERE asr.SKU = isr.SKU
    WITH asr, isr LIMIT {batch_size}
    MERGE (asr)-[:INTERNATIONAL_MATCH]->(isr)
    RETURN COUNT(*);
    """
    result = run_query(relationship_query2)
    
    # If no more matches, break the loop
    if not result or result.single()[0] == 0:
        break

print("Relationships created successfully!")





Creating relationships...


  with driver.session(database=database_name) as session:


Relationships created successfully!


In [42]:
# Define Cypher Query
query = """
MATCH (asr:Amazon_Sale_Report)
OPTIONAL MATCH (asr)-[:INTERNATIONAL_MATCH]->(isr:International_sale_Report)
WITH isr.CUSTOMER AS customer, toFloat(asr.Amount) AS amount
WHERE customer IS NOT NULL AND amount IS NOT NULL AND amount > 0
RETURN customer, SUM(amount) AS sum_sales
ORDER BY sum_sales DESC
LIMIT 10;


"""

# Function to Execute Query and Return Results
def get_top_customers():
    with driver.session(database=database_name) as session:
        result = session.run(query)
        return [{"customer": record["customer"], "sum_sales": record["sum_sales"]} for record in result]

# Run Query and Print Results
top_customers = get_top_customers()
for row in top_customers:
    print(row)


{'customer': 'MULBERRIES BOUTIQUE', 'sum_sales': 56450.28999999999}
{'customer': 'COTTON CLOSET LTD', 'sum_sales': 54445.28}
{'customer': 'RINO SANDARAN', 'sum_sales': 43290.57}
{'customer': 'AMANI CONCEPT TRADING LLC (KAPDA)', 'sum_sales': 37017.0}
{'customer': 'VAHARSHA BOUTIQUE', 'sum_sales': 35884.0}
{'customer': 'RIVAAN LLC', 'sum_sales': 34214.86}
{'customer': 'FUSION FASHIONS CORP.', 'sum_sales': 31442.0}
{'customer': 'KOGILA SELLAPPAN', 'sum_sales': 29909.0}
{'customer': 'VISHAL DARSHAN BOUTIQUE', 'sum_sales': 25927.0}
{'customer': 'THANA MARIMUTHU', 'sum_sales': 23002.0}


  with driver.session(database=database_name) as session:
