In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import seaborn as sns

# BEFORE YOU RUN
## Required Project Structure
To ensure that all notebooks and scripts run correctly, please maintain the following structure at the **project root**. Note that the `data/` folder must be at the same level as the `notebooks/` and `src/` folders.

```text
.
├── api/                                  # FastAPI/Flask code
├── data/                                 # [IGNORED BY GIT - LOCAL ONLY]
│   ├── cyber_dataset/
│   │   ├── auth.txt.gz                   # Authentication logs (Large)
│   │   └── redteam.txt.gz                # Redteam labels
│   ├── elliptic_bitcoin_dataset/
│   │   ├── elliptic_txs_classes.csv
│   │   ├── elliptic_txs_edgelist.csv
│   │   └── elliptic_txs_features.csv
│   └── keystroke_dynamics_dataset/
│       └── DSL-StrongPasswordData.csv
├── notebooks/
│   └── EDA.ipynb                         # Current Notebook
├── src/                                  # Source code for training/API
├── vis/                                  # Visualization assets
├── .gitignore
├── README.md
└── requirements.txt
```

About the Data:
- Elliptic is already a graph. About 2% of the transactions are labeled illicit and roughly 21% licit; the remaining ~77% are unlabeled ("unknown").
- Few things to check out: is the data clustered in specific time windows or not?

- For device/auth signals, this data is pretty massive. We need to downsample it to a manageable number of users.
- We can explore how many users log into a single machine. Maybe a "normal" number of devices per user, things like that

- Behavioral data provides micro-features for the nodes. We can analyze the variance in keystroke hold times. Maybe there is some sort of behavioral fingerprint for a user that changes when an account is compromised.

Mapping the Data
- These datasets obviously aren't natively connected, so we need to link them ourselves. For example, User_ID -> Wallet_ID and Device_ID -> Login_Session, or something like that.
- This could help us model a scenario such as: User A logs in from Device B (unusual) with a typing cadence that doesn't match their profile, then immediately sends Bitcoin to an illicit node in the Elliptic graph.

In [None]:
# Load the three Elliptic CSV files (paths are relative to the notebooks/ folder).

# Node features: the file is read without a header row (header=None), so
# columns are named in a later cell. 166 features for each transaction.
df_features = pd.read_csv(
    '../data/elliptic_bitcoin_dataset/elliptic_txs_features.csv',
    header=None,
)

# Edge list: the adjacency list [who sent money to who].
df_edges = pd.read_csv('../data/elliptic_bitcoin_dataset/elliptic_txs_edgelist.csv')

# Labels: maps transaction IDs to licit, illicit, unknown.
df_labels = pd.read_csv('../data/elliptic_bitcoin_dataset/elliptic_txs_classes.csv')

In [None]:
# Step 1: give the header-less features table readable column names.
# Column 0 is the transaction ID, column 1 is the time step, and the
# remaining 165 columns are named feat_0 ... feat_164.
df_features.columns = [
    'txId',
    'time_step',
    *(f'feat_{idx}' for idx in range(165)),
]

# Step 2: attach the labels to the features to get one master node table.
# Default inner join on txId: only transactions present in both tables are kept.
df_nodes = df_features.merge(df_labels, on='txId')

In [None]:
# Recode the string labels as integers. In the Elliptic labels '1' is illicit
# and '2' is licit; unlabeled transactions ('unknown') become -1.
# NOTE: this cell is not idempotent. Re-running it without re-running the merge
# above maps the already-integer labels to NaN, because they are no longer keys
# of class_map.
class_map = {'1': 1, '2': 2, 'unknown': -1}
df_nodes['class'] = df_nodes['class'].map(class_map)

label_counts = df_nodes['class'].value_counts()
print(label_counts)
# There are a lot of unknown nodes we can work with: train on the known ones
# and see if we can classify the unknowns later.

In [None]:
# Preview the first five rows of the merged node table.
df_nodes.head(5)

In [None]:
# Preview the first five rows of the renamed features table.
df_features.head(5)

In [None]:
# Preview the first five rows of the edge list.
df_edges.head(5)

In [None]:
# Temporal distribution: number of transactions (non-null labels) per time step.
tx_per_step = df_nodes.groupby('time_step')['class'].count()
tx_per_step.plot()

In [None]:
# Graph density: observed edges divided by the number of possible ordered
# node pairs, n * (n - 1).
num_nodes = len(df_nodes)
num_edges = len(df_edges)
max_possible_edges = num_nodes * (num_nodes - 1)
density = num_edges / max_possible_edges
print(f"Graph Density: {density:.6f}")
# A sparse graph is expected for real-world networks.

In [None]:
# Edges per node, i.e. the average out-degree (equal to the average in-degree).
# The average *total* degree (in + out) is twice this value.
df_edges.shape[0] / df_nodes.shape[0]

In [None]:
# Total degree of each transaction = out-degree + in-degree.
# Out-degree: how many edges list this tx in the txId1 column.
# In-degree: how many edges list this tx in the txId2 column.
out_degree = df_edges['txId1'].value_counts()
in_degree = df_edges['txId2'].value_counts()

# A plain `out_degree + in_degree` aligns on the index and yields NaN for every
# node that appears in only one of the two columns, which silently removes
# those nodes from the histogram. fill_value=0 treats the missing side as 0.
degrees = out_degree.add(in_degree, fill_value=0).astype(int)

plt.figure(figsize=(10, 5))
plt.hist(degrees, bins=50, log=True, color='skyblue', edgecolor='black')
plt.title("Degree Distribution (Log Scale)")
plt.xlabel("Number of Connections (Degree)")
plt.ylabel("Count of Nodes")
plt.show()

Interpreting with some context here:
- The average number of edges per node is about 1.15 (i.e. the average in-/out-degree; the average total degree is about twice that), meaning that any node with a degree of 10, 20, or more is likely a major outlier and may represent a mixing service or consolidation point.
- Because of the sparsity, we should use GraphSAGE or GCN, because they aggregate neighbors' information without being overwhelmed by noise.

- Another thing: isolated clusters can be highly suspicious. There are a lot of disjoint clusters in this data, so most nodes cannot reach each other. Because of this, we need to rely on local neighborhood features!

In [None]:
# Build an undirected graph from the edge list (edge direction is ignored here).
G = nx.from_pandas_edgelist(
    df_edges,
    source='txId1',
    target='txId2',
    create_using=nx.Graph(),
)

# Collect every connected component (as a set of node ids), largest first.
components = sorted(nx.connected_components(G), key=len, reverse=True)

# The Largest Connected Component is the first entry after sorting.
lcc = G.subgraph(components[0])

print(f"Total nodes in full graph: {G.number_of_nodes()}")
print(f"Nodes in the Largest Connected Component: {lcc.number_of_nodes()}")
print(f"Percentage of nodes in LCC: {100 * lcc.number_of_nodes() / G.number_of_nodes():.2f}%")

In [None]:
# Create a mapping of txId to class for quick lookup
node_class_dict = dict(zip(df_nodes['txId'], df_nodes['class']))

# Integer labels produced by class_map: '1' -> 1 (illicit / fraud),
# '2' -> 2 (licit), 'unknown' -> -1.
ILLICIT_LABEL, LICIT_LABEL, UNKNOWN_LABEL = 1, 2, -1

# Analyze the top 10 largest components
for i, comp in enumerate(components[:10]):
    # Look up the class of every node; nodes missing from df_nodes count as unknown.
    classes_in_comp = [node_class_dict.get(node, UNKNOWN_LABEL) for node in comp]

    fraud_count = classes_in_comp.count(ILLICIT_LABEL)
    # Licit is encoded as 2 (not 0); counting 0 always reported zero licit nodes.
    licit_count = classes_in_comp.count(LICIT_LABEL)
    unknown_count = classes_in_comp.count(UNKNOWN_LABEL)

    print(f"Component {i+1} | Size: {len(comp)} | Fraud: {fraud_count} | Licit: {licit_count} | Unknown: {unknown_count}")

In [None]:
# we could use this to create risk scores for unknown nodes based on their proximity to known fraudulent nodes in the graph.

### NEXT DATA
KEYSTROKES

In [None]:
# Load the keystroke dynamics dataset.
# The path is relative to the notebooks/ folder, like the Elliptic loads above;
# the previous 'data/...' path only resolved when run from the project root.
df_bio = pd.read_csv('../data/keystroke_dynamics_dataset/DSL-StrongPasswordData.csv')

In [None]:
# Preview the first five rows of the keystroke table.
df_bio.head(5)

In [None]:
# List every column name in the keystroke table.
df_bio.columns

In [None]:
# Basic stats: number of distinct subjects and rows recorded per subject.
num_users = df_bio['subject'].nunique()

# Row count per subject; only the first subject's count is reported, which
# assumes every subject has the same number of samples (TODO confirm).
rows_per_subject = df_bio.groupby('subject').size()
samples_per_user = rows_per_subject.iloc[0]

print(f"Total Users: {num_users}")
print(f"Samples per User: {samples_per_user}")

In [None]:
# Select the first five subjects (in order of appearance) to compare.
sample_users = df_bio['subject'].unique()[:5]
df_sample = df_bio[df_bio['subject'].isin(sample_users)]

# 'H.period' is the hold time of the period (".") key in the dataset's
# H.<key> naming scheme, not of a key named "H"; the title reflects that.
plt.figure(figsize=(12, 6))
sns.boxplot(x='subject', y='H.period', data=df_sample)
plt.title('Period (".") Key Hold Time Distribution across 5 Users')
plt.ylabel('Time (seconds)')
plt.xticks(rotation=45)
plt.show()

We could try some synthetic cross-channel linking of this data, done artificially. This could look like s002 (keystrokes) -> txID_12345 (Elliptic). Some issues with this...
- There are only 51 users and about 200k transactions, so the data would be very repetitive.
- We could limit where we merge (active components, ignoring unknown nodes, etc.) and only work with those small subsections.

The next dataset is harder to work with due to file sizes:
- auth.txt.gz, we NEED this one. It has auth events and is the device/access layer 
- redteam.txt.gz, pretty much need this. Contains confirmed malicious activity. can see timestamps/users to simulate the start of a fraud event in the graph
- proc.txt.gz, probably DONT need this. It has start/stop events and can detect bot behavior, maybe a nice to have later
- dns.txt.gz, probably DON'T need this. DNS lookup events, useful for network security, but harder to map to financial fraud. Can ignore.

We can join with src_user and src_comp to make a heterogeneous graph where edges are "Sent Money To" "Logged Into" or "Typed"

I'm waiting for this data to load. Once it has, I will upload a sample of it for us to use for the project, before we expand how much data we pull from the auth file.