# **EDA for the Elliptic Dataset**

This project analyzes Bitcoin transactions using the provided datasets (`elliptic_txs_classes.csv`, `elliptic_txs_features.csv`, `elliptic_txs_edgelist.csv`). The code performs data analysis, graph visualizations, and exploration of key metrics. All plots are saved in the `images` folder.


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import os
import random

# Ensure the images directory exists.
# exist_ok=True makes this a single call that is safe to re-run and avoids
# the gap between "check" and "create" of an exists()/makedirs() pair.
os.makedirs("images", exist_ok=True)

# All three CSV files are read from the same Kaggle input directory.
DATA_DIR = '/kaggle/input/elliptic-data-set/elliptic_bitcoin_dataset'

# Load datasets
# The features file is read with header=None, so pandas labels its columns
# with integer positions (0, 1, 2, ...).
classes_df = pd.read_csv(f'{DATA_DIR}/elliptic_txs_classes.csv')
features_df = pd.read_csv(f'{DATA_DIR}/elliptic_txs_features.csv', header=None)
edges_df = pd.read_csv(f'{DATA_DIR}/elliptic_txs_edgelist.csv')

# Rename columns for clarity: column 0 is used as the transaction id and
# column 1 as the time step throughout the rest of the notebook.
features_df = features_df.rename(columns={0: 'tx_id', 1: 'time_step'})



## **Analysis Steps**

### **1. Transaction Class Distribution**
- A pie chart is created to show the proportion of legal (`licit`) and illegal (`illicit`) transactions.

In [None]:
# 1. Transaction Class Distribution
# Number of transactions carrying each value of the 'class' column.
class_counts = classes_df['class'].value_counts()

# One wedge per class value, each annotated with its percentage share.
fig, ax = plt.subplots(figsize=(8, 6))
ax.pie(
    class_counts,
    labels=class_counts.index,
    autopct='%1.1f%%',
    startangle=140,
)
ax.set_title("Transaction Class Distribution")
fig.savefig("images/class_distribution.png")
plt.close(fig)

### **2. Temporal Analysis**
- A line plot shows the number of transactions over time (`time_step`) for each class.

In [None]:
# 2. Temporal Analysis
# Join each transaction's time step with its class label on the transaction id.
tx_time_steps = features_df[['tx_id', 'time_step']]
merged_df = tx_time_steps.merge(classes_df, left_on='tx_id', right_on='txId')

# Rows are time steps, columns are classes, cells are transaction counts
# (0 where a class has no transactions in a time step).
group_sizes = merged_df.groupby(['time_step', 'class']).size()
time_class_counts = group_sizes.unstack(fill_value=0)

# One line per class over the time steps.
ax = time_class_counts.plot(kind='line', figsize=(12, 6))
ax.set_title('Number of Transactions Over Time by Class')
ax.set_xlabel('Time Step')
ax.set_ylabel('Number of Transactions')
ax.legend(title='Class')
ax.figure.savefig("images/temporal_analysis.png")
plt.close(ax.figure)

### **3. Graph Visualizations**

#### **3.1. Node Degree Distribution**
- A histogram of node degree distribution is plotted.
- The y-axis uses a logarithmic scale for better visualization of distributions.

In [None]:
# 3. Node Degree Distribution
# Build the transaction graph G used by this cell and by the graph cells
# that follow. The first two columns of the edge list are taken as the two
# endpoints of each edge.
# NOTE(review): this builds an undirected nx.Graph (the networkx default);
# pass create_using=nx.DiGraph() if edge direction should be preserved.
source_col, target_col = edges_df.columns[:2]
G = nx.from_pandas_edgelist(edges_df, source=source_col, target=target_col)

# G.degree() yields (node, degree) pairs; only the degree is needed here.
degrees = [degree for _, degree in G.degree()]

plt.figure(figsize=(8, 6))
plt.hist(degrees, bins=50, color='blue', alpha=0.7)
plt.title("Node Degree Distribution")
plt.xlabel("Degree")
plt.ylabel("Frequency")
# Log-scaled y-axis so bins with few nodes remain visible next to large ones.
plt.yscale('log')
plt.savefig("images/degree_distribution.png")
plt.close()


#### **3.2. Transaction Network for Time Step 10**
- Visualizes a subgraph of transactions for a specific time step (`time_step=10`).
- **Output file**: `images/transaction_network.png`.

In [None]:
# Transaction Network Visualization (Filtered by Time Step 10)

# Keep only the transactions that belong to the chosen time step.
time_step_to_filter = 10
in_time_step = features_df['time_step'] == time_step_to_filter
filtered_nodes = features_df.loc[in_time_step, 'tx_id'].tolist()

# Restrict the full graph G to those transactions.
subgraph = G.subgraph(filtered_nodes)

# Drawing options: small blue unlabeled nodes with thin black edges.
draw_options = {
    "node_color": "blue",
    "node_size": 50,
    "alpha": 0.9,
    "edge_color": "black",
    "width": 0.5,
    "with_labels": False,
}

plt.figure(figsize=(14, 14))
nx.draw(subgraph, **draw_options)
plt.title(f"Transaction Network for Time Step {time_step_to_filter}")
plt.savefig("images/transaction_network.png")
plt.close()


#### **3.3. Legal Transactions**
- A subgraph visualizing only legal transactions (`class=2`) for a specific time step (`time_step=10`).
- Nodes are colored green.

In [None]:
# Licit-only transaction network for a single time step.
time_step_to_filter = 10

# Transactions that belong to the chosen time step.
filtered_nodes = features_df[features_df['time_step'] == time_step_to_filter]['tx_id'].tolist()

# Of those, keep the ones whose class label is the string '2'
# (the label this notebook treats as licit).
licit_nodes = classes_df[
    (classes_df['class'] == '2') & (classes_df['txId'].isin(filtered_nodes))
]['txId'].tolist()
licit_subgraph = G.subgraph(licit_nodes)

plt.figure(figsize=(14, 14))
nx.draw(
    licit_subgraph,
    node_color="green",
    node_size=50,
    alpha=0.9,
    edge_color="black",
    width=0.5,
    with_labels=False
)
# Title spelling corrected ("Licit"); it is rendered into the saved image.
plt.title(f"Licit Transactions (Time Step {time_step_to_filter})")
plt.savefig("images/transaction_network_licit_filtered.png")
plt.close()

#### **3.4. Illegal Transactions**
- A subgraph visualizing only illegal transactions (`class=1`) for a specific time step (`time_step=10`).
- Nodes are colored red.

In [None]:
# Illicit-only transaction network.
# Reuses filtered_nodes and time_step_to_filter set by an earlier cell.
# Keep the transactions in that time step whose class label is the string '1'
# (the label this notebook treats as illicit).
illicit_nodes = classes_df[
    (classes_df['class'] == '1') & (classes_df['txId'].isin(filtered_nodes))
]['txId'].tolist()
illicit_subgraph = G.subgraph(illicit_nodes)

plt.figure(figsize=(14, 14))
nx.draw(
    illicit_subgraph,
    node_color="red",
    node_size=50,
    alpha=0.9,
    edge_color="black",
    width=0.5,
    with_labels=False
)
# Title spelling corrected ("Illicit"); it is rendered into the saved image.
plt.title(f"Illicit Transactions (Time Step {time_step_to_filter})")
plt.savefig("images/transaction_network_illicit_filtered.png")
plt.close()


### **4. Feature Correlation Heatmap**
- Computes the correlation matrix of transaction features and visualizes it as a heatmap.

In [None]:
# 4. Feature Correlation Heatmap
# Pairwise correlations of every column after the first two
# (tx_id and time_step are excluded).
feature_columns = features_df.iloc[:, 2:]
corr_matrix = feature_columns.corr()

# Color scale pinned to [-1, 1] so the colormap covers the full correlation range.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
ax.set_title("Feature Correlation Heatmap")
fig.savefig("images/correlation_heatmap.png")
plt.close(fig)


### **5. PCA of Transaction Features**
- Transaction features are standardized and reduced to two principal components using PCA.
- A scatter plot visualizes the two components, highlighting clusters of legal and illegal transactions.

In [None]:
# 5. PCA of Features
# Standardize every column after tx_id and time_step, then project onto the
# first two principal components.
features_scaled = StandardScaler().fit_transform(features_df.iloc[:, 2:])
pca = PCA(n_components=2)
principal_components = pca.fit_transform(features_scaled)
pca_df = pd.DataFrame(data=principal_components, columns=['PC1', 'PC2'])

# Attach labels by transaction id instead of by row position, so each point
# gets the right class even if the two CSV files list transactions in a
# different order. Rows of pca_df follow the row order of features_df.
# drop_duplicates keeps the lookup index unique; transactions without a
# label end up as NaN.
class_by_tx = classes_df.drop_duplicates(subset='txId').set_index('txId')['class']
pca_df['class'] = features_df['tx_id'].map(class_by_tx).to_numpy()

plt.figure(figsize=(10, 8))
sns.scatterplot(data=pca_df, x='PC1', y='PC2', hue='class', palette="deep")
plt.title("PCA of Features")
plt.savefig("images/pca_scatter.png")
plt.close()