# Question 1: EDA

### Step 1: Load Data

In [57]:
import pandas as pd

url = 'https://storage.googleapis.com/bdt-trx/punks.csv'
df = pd.read_csv(url)

### Step 2: Initial Inspection

In [58]:
# inspect the data, its types aswel as for missing values

print(df.head())
print(df.dtypes)
print(df.isnull().sum())

   Unnamed: 0     Type      From        To        Amount           Txn  Punk
0           0  Claimed       NaN  0xc352b5           NaN  Jun 23, 2017     0
1           1     Bid   0x948a10       NaN  0.67Ξ ($219)  Jun 23, 2017     0
2           2     Bid   0x00d7c9       NaN  0.98Ξ ($320)  Jun 23, 2017     0
3           3    Sold   0xc352b5  0x00d7c9  0.98Ξ ($320)  Jun 23, 2017     0
4           4     Bid   0x717403       NaN   0.05Ξ ($14)  Jun 25, 2017     0
Unnamed: 0     int64
Type          object
From          object
To            object
Amount        object
Txn           object
Punk           int64
dtype: object
Unnamed: 0        0
Type              0
From           9447
To            13891
Amount         5380
Txn               0
Punk              0
dtype: int64


### Step 3: Data Cleaning

In [59]:
df['Txn'] = pd.to_datetime(df['Txn'])
df['Amount'] = df['Amount'].str.extract(r'([\d\.]+)').astype(float)

### Step 4: Inspect the transaction types

In [60]:
# check types and amount of transcations 
print(df['Type'].unique())
print(df['Type'].value_counts())

['Claimed' 'Bid ' 'Sold ' 'Transfer' 'Offered' 'Bid Withdrawn'
 'Offer Withdrawn' '(Wrap)' '(Unwrap)' 'Bid *']
Type
Offered            6787
Bid                3812
Bid Withdrawn      2340
Transfer           2286
Sold               1679
Claimed            1635
Offer Withdrawn    1025
(Wrap)              234
(Unwrap)            200
Bid *                 4
Name: count, dtype: int64


### Step 5: Check current Punk Owners


In [61]:
# sort dataframe by date
df = df.sort_values('Txn')

# initialize a dictionary to store current owners
punk_owners = {}

for index, row in df.iterrows():
    punk = row['Punk']
    txn_type = row['Type']
    from_wallet = row['From']
    to_wallet = row['To']
    
    if txn_type == 'Claimed':
        if pd.notnull(to_wallet):
            punk_owners[punk] = to_wallet
    elif txn_type in ['Sold', 'Transfer']:
        if pd.notnull(to_wallet):
            punk_owners[punk] = to_wallet

# convert dictionary to dataframe
punk_owner_df = pd.DataFrame(list(punk_owners.items()), columns=['Punk', 'Owner'])
print(punk_owner_df.head())

   Punk     Owner
0     0  0xf5099e
1   661  0x1f52dc
2   662  0xc352b5
3   663  0xc352b5
4   664  0xc352b5


### Step 6: Top five Punk owners

In [62]:
ownership_counts = punk_owner_df['Owner'].value_counts()
print(ownership_counts.head(5))

Owner
0xc352b5    406
0xb88f61    174
0x4d8e16     47
0x577ebc     39
0x31a5ff     32
Name: count, dtype: int64


# Question 2: Data Model

Below is the data model diagram for our Neo4j database:

![Data Model Diagram](Cryptopunk.png)

# Question 3: Load data into Neo4j

In [63]:
import pandas as pd
import numpy as np
from neo4j import GraphDatabase

# connect to Neo4j
uri = "bolt://localhost:7687"
driver = GraphDatabase.driver(uri, auth=("neo4j", "your_password"))

# create uniqueness constraints: https://neo4j.com/docs/cypher-manual/current/constraints/
with driver.session() as session:
    session.run("CREATE CONSTRAINT IF NOT EXISTS FOR (w:Wallet) REQUIRE w.address IS UNIQUE")
    session.run("CREATE CONSTRAINT IF NOT EXISTS FOR (t:Transaction) REQUIRE t.txn_id IS UNIQUE")
    session.run("CREATE CONSTRAINT IF NOT EXISTS FOR (p:Cryptopunk) REQUIRE p.punk_id IS UNIQUE")

# sort by transaction date and rest id index
df = df.sort_values('Txn').reset_index(drop=True)
df['txn_id'] = df.index.astype(str)

# define function for batch processing
def create_nodes_and_relationships(tx, records):
    for record in records:
        txn_id = record['txn_id']
        txn_type = record['Type']
        amount = record['Amount']
        date = record['Txn'].isoformat()
        from_wallet = record['From']
        to_wallet = record['To']
        punk_id = int(record['Punk'])
        
        # merge Cryptopunk node
        tx.run("""
            MERGE (p:Cryptopunk {punk_id: $punk_id})
        """, punk_id=punk_id)
        
        # merge Transaction node
        tx.run("""
            MERGE (t:Transaction {txn_id: $txn_id})
            SET t.type = $txn_type, t.amount = $amount, t.date = datetime($date)
        """, txn_id=txn_id, txn_type=txn_type, amount=amount, date=date)
        
        # merge Wallet nodes and create relationships
        if pd.notnull(from_wallet):
            tx.run("""
                MERGE (from_w:Wallet {address: $from_wallet})
                MERGE (from_w)-[:INITIATED]->(t)
            """, from_wallet=from_wallet, txn_id=txn_id)
        
        if pd.notnull(to_wallet):
            tx.run("""
                MERGE (to_w:Wallet {address: $to_wallet})
                MERGE (t)-[:TO]->(to_w)
            """, to_wallet=to_wallet, txn_id=txn_id)
        
        # create INVOLVES relationship
        tx.run("""
            MATCH (t:Transaction {txn_id: $txn_id}), (p:Cryptopunk {punk_id: $punk_id})
            MERGE (t)-[:INVOLVES]->(p)
        """, txn_id=txn_id, punk_id=punk_id)
        
        # update CURRENTLY_OWNS relationship
        if txn_type in ['Claimed', 'Sold', 'Transfer']:
            # delete existing CURRENTLY_OWNS relationships for this Cryptopunk
            tx.run("""
                MATCH (p:Cryptopunk {punk_id: $punk_id})<-[r:CURRENTLY_OWNS]-()
                DELETE r
            """, punk_id=punk_id)
            
            if pd.notnull(to_wallet):
                # create new CURRENTLY_OWNS relationship
                tx.run("""
                    MATCH (to_w:Wallet {address: $to_wallet}), (p:Cryptopunk {punk_id: $punk_id})
                    MERGE (to_w)-[:CURRENTLY_OWNS]->(p)
                """, to_wallet=to_wallet, punk_id=punk_id)

# batch process data
batch_size = 1000
num_batches = int(np.ceil(len(df) / batch_size))

with driver.session() as session:
    for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = min((i + 1) * batch_size, len(df))
        batch = df.iloc[start_idx:end_idx].to_dict('records')
        
        session.write_transaction(create_nodes_and_relationships, batch)
        print(f"Processed batch {i + 1} of {num_batches}")


  session.write_transaction(create_nodes_and_relationships, batch)


Processed batch 1 of 21


  session.write_transaction(create_nodes_and_relationships, batch)


Processed batch 2 of 21


  session.write_transaction(create_nodes_and_relationships, batch)


Processed batch 3 of 21


  session.write_transaction(create_nodes_and_relationships, batch)


Processed batch 4 of 21


  session.write_transaction(create_nodes_and_relationships, batch)


Processed batch 5 of 21


  session.write_transaction(create_nodes_and_relationships, batch)


Processed batch 6 of 21


  session.write_transaction(create_nodes_and_relationships, batch)


Processed batch 7 of 21


  session.write_transaction(create_nodes_and_relationships, batch)


Processed batch 8 of 21


  session.write_transaction(create_nodes_and_relationships, batch)


Processed batch 9 of 21


  session.write_transaction(create_nodes_and_relationships, batch)


Processed batch 10 of 21


  session.write_transaction(create_nodes_and_relationships, batch)


Processed batch 11 of 21


  session.write_transaction(create_nodes_and_relationships, batch)


Processed batch 12 of 21


  session.write_transaction(create_nodes_and_relationships, batch)


Processed batch 13 of 21


  session.write_transaction(create_nodes_and_relationships, batch)


Processed batch 14 of 21


  session.write_transaction(create_nodes_and_relationships, batch)


Processed batch 15 of 21


  session.write_transaction(create_nodes_and_relationships, batch)


Processed batch 16 of 21


  session.write_transaction(create_nodes_and_relationships, batch)


Processed batch 17 of 21


  session.write_transaction(create_nodes_and_relationships, batch)


Processed batch 18 of 21


  session.write_transaction(create_nodes_and_relationships, batch)


Processed batch 19 of 21


  session.write_transaction(create_nodes_and_relationships, batch)


Processed batch 20 of 21
Processed batch 21 of 21


  session.write_transaction(create_nodes_and_relationships, batch)


# Question 4: 

In [64]:
with driver.session() as session:
    result = session.run("""
        MATCH (w:Wallet)-[:CURRENTLY_OWNS]->(p:Cryptopunk)
        RETURN w.address AS wallet, COUNT(p) AS num_punks
        ORDER BY num_punks DESC
        LIMIT 5
    """)
    print("Top five wallets holding the largest number of Cryptopunks:")
    for record in result:
        print(f"Wallet: {record['wallet']}, Number of Cryptopunks: {record['num_punks']}")



Top five wallets holding the largest number of Cryptopunks:
Wallet: 0xc352b5, Number of Cryptopunks: 406
Wallet: 0xb88f61, Number of Cryptopunks: 174
Wallet: 0x4d8e16, Number of Cryptopunks: 47
Wallet: 0x577ebc, Number of Cryptopunks: 39
Wallet: 0x31a5ff, Number of Cryptopunks: 32
