In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import os

# Reaching this line means every import above succeeded (a failed import raises first).
print("Libraries imported successfully!")

In [None]:
# Define the path to your chosen edges file
edges_file = 'data/raw/256497288/256497288.edges'

# Check that the path exists AND is a regular file (os.path.exists would also
# accept a directory, which the later open()/read_csv calls cannot read).
if os.path.isfile(edges_file):
    print(f"✅ File found: {edges_file}")
else:
    print(f"❌ File not found. Looking for: {edges_file}")
    # List what's in the directory to debug.
    # `or '.'` covers a bare filename, for which dirname() returns ''.
    dir_path = os.path.dirname(edges_file) or '.'
    if os.path.isdir(dir_path):
        print(f"Files in {dir_path}:")
        print(os.listdir(dir_path))
    else:
        # Without this guard os.listdir() raises FileNotFoundError and the
        # debugging aid itself crashes the cell.
        print(f"Directory does not exist: {dir_path} (current working directory: {os.getcwd()})")

In [None]:
# Preview the first lines of the file and count all lines in a single pass.
# Iterating the file object (instead of calling readline() a fixed 10 times)
# means a file shorter than 10 lines does not produce empty "Line N:" entries.
print("First 10 lines of the file:")
total_lines = 0  # stays 0 when the file is empty and the loop body never runs
with open(edges_file, 'r') as f:
    for total_lines, line in enumerate(f, start=1):
        if total_lines <= 10:
            print(f"Line {total_lines}: {line.strip()}")

# Every line of the file is counted as one edge.
print(f"\nTotal edges in file: {total_lines}")

In [None]:
# Read the edge list into a two-column DataFrame.
# The file carries no header row; fields are separated by a single space and
# are named follower / followed, in that order.
edge_columns = ['follower', 'followed']
df_edges = pd.read_csv(edges_file, sep=' ', header=None, names=edge_columns)

print("✅ Data loaded successfully!")
print(f"Shape: {df_edges.shape}")

# Show both ends of the frame as a quick sanity check on the parse.
for caption, preview in (("First 5 rows", df_edges.head()),
                         ("Last 5 rows", df_edges.tail())):
    print(f"\n{caption}:")
    print(preview)

In [None]:
print("=== BASIC STATISTICS ===")
print(f"Total edges: {len(df_edges)}")

# Distinct ids on each side of the relation, and across both columns combined.
follower_col = df_edges['follower']
followed_col = df_edges['followed']
followers = follower_col.unique()
followed = followed_col.unique()
all_users = pd.concat([follower_col, followed_col]).unique()

for label, ids in (("Unique followers", followers),
                   ("Unique followed users", followed),
                   ("Total unique users", all_users)):
    print(f"{label}: {len(ids)}")

# Rows whose follower and followed ids are equal.
is_self_follow = follower_col == followed_col
self_follows = df_edges[is_self_follow]
print(f"Self-follow edges: {len(self_follows)}")

In [None]:
# Occurrences of each id in the 'followed' column (how often a user is followed)
# and in the 'follower' column (how many users each user follows).
followed_counts = df_edges['followed'].value_counts()
follower_counts = df_edges['follower'].value_counts()

print("\n=== MOST FOLLOWED USERS ===")
print("Top 10 most followed users:")
print(followed_counts.iloc[:10])

print("\n=== MOST ACTIVE FOLLOWERS ===")
print("Top 10 most active followers:")
print(follower_counts.iloc[:10])

In [None]:
# For MVP, work on a small subgraph: only the first SAMPLE_EDGES rows of the edge list.
SAMPLE_EDGES = 100
df_small = df_edges.head(SAMPLE_EDGES)

# Build the directed graph straight from the DataFrame instead of looping over
# iterrows(); each row becomes one edge pointing follower -> followed.
G = nx.from_pandas_edgelist(
    df_small,
    source='follower',
    target='followed',
    create_using=nx.DiGraph,
)

print(f"Small graph created with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges")

In [None]:
# Simple visualization - might be messy if network is large
plt.figure(figsize=(10, 8))

# Use a layout algorithm. spring_layout starts from random initial positions,
# so the seed is pinned to make the figure identical on every re-run.
LAYOUT_SEED = 42
pos = nx.spring_layout(G, k=0.5, iterations=50, seed=LAYOUT_SEED)

# Draw nodes, arrows and node-id labels at the computed positions
nx.draw(G, pos,
        with_labels=True,
        node_size=300,
        node_color='lightblue',
        font_size=8,
        arrowsize=10)

plt.title("Small Sample of Twitter Follower Network")
plt.show()