# Run Conversation Pipeline

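This notebook demonstrates how to use the conversation pipeline: it loads a CSV of tweets, feeds it to `ConversationExtractor` in chunks, and then prints summary statistics and a few sample conversations.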

In [1]:
import os
import pandas as pd
import logging
import sys

# Make the project's `src` package importable: take the parent of the current
# working directory as the project root and put it at the front of sys.path.
# The membership check keeps re-runs of this cell from adding duplicate entries.
# NOTE(review): assumes the kernel's working directory sits exactly one level
# below the project root (e.g. a notebooks/ folder) - confirm.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.insert(0, project_root)

# This import has to come after the sys.path change above, otherwise `src`
# is not resolvable from the notebook's working directory.
from src.pipeline.conversation_pipeline import ConversationExtractor, Tweet

# Configure the root logger: INFO level, with timestamp, logger name, level
# and message on every record.
# NOTE(review): logging.basicConfig does nothing when the root logger already
# has handlers; if no log lines show up in the notebook, check whether the
# kernel installed a handler before this cell ran.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Visible confirmation that the cell ran to the end without an import error.
print("Imports successful!")

Imports successful!


In [2]:
# Function to load the tweet data and validate its columns
def load_tweet_data(file_path, required_columns=None):
    """Load a tweet CSV into a DataFrame and check that it has the needed columns.

    The file is read with ``pd.read_csv`` using default parsing; no rows or
    columns are modified or dropped. Progress and the column list are printed.

    Args:
        file_path: Path of the CSV file to read.
        required_columns: Optional iterable of column names that must be
            present. When omitted, the seven columns used by this notebook
            are required: tweet_id, full_text, screen_name, created_at,
            reply_to_id, reply_to_user, expandedURL.

    Returns:
        pandas.DataFrame: every column of the file, as parsed by ``read_csv``.

    Raises:
        FileNotFoundError: propagated from ``pd.read_csv`` when the file does
            not exist (callers use this to fall back to another file).
        ValueError: if one or more required columns are missing.
    """
    if required_columns is None:
        required_columns = ['tweet_id', 'full_text', 'screen_name', 'created_at',
                            'reply_to_id', 'reply_to_user', 'expandedURL']

    print(f"Loading data from: {file_path}")
    # NOTE(review): with default dtypes, an integer ID column that contains
    # missing values (reply_to_id is a likely candidate) is parsed as float64,
    # which cannot represent every 64-bit ID exactly - confirm whether the
    # pipeline needs IDs read as strings or nullable integers.
    df = pd.read_csv(file_path)

    # Report every missing column at once instead of failing on the first one.
    missing_cols = [col for col in required_columns if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing required columns: {missing_cols}")

    print(f"Successfully loaded {len(df)} tweets")
    print("\nDataset columns:")
    print(df.columns.tolist())
    return df

In [3]:
# Input locations - edit the file names here to point at other datasets
data_dir = os.path.join(project_root, 'data')
test_file, full_file = (
    os.path.join(data_dir, file_name)
    for file_name in ('df_test_10k.csv',
                      'Kopie von FolloweeIDs2_tweets_df_AugustPull.csv')
)

# Extractor that will accumulate conversations across all processed chunks
extractor = ConversationExtractor()

# Prefer the small test sample; fall back to the full export only when the
# sample file does not exist. Any other error is left to propagate.
try:
    df = load_tweet_data(test_file)
except FileNotFoundError:
    print(f"Test file not found: {test_file}")
    print("Trying full dataset...")
    df = load_tweet_data(full_file)

Loading data from: /Users/mogen/Desktop/Research/data/df_test_10k.csv
Successfully loaded 10000 tweets

Dataset columns:
['full_text', 'tweet_id', 'created_at', 'screen_name', 'original_user_id', 'retweeted_user_ID', 'collected_at', 'reply_to_id', 'reply_to_user', 'expandedURL']


In [None]:
# Feed the DataFrame to the extractor in fixed-size row slices
chunk_size = 1000
n_tweets = len(df)
total_chunks = -(-n_tweets // chunk_size)  # ceiling division

print(f"Processing {n_tweets} tweets in chunks of {chunk_size}")
print(f"Total chunks: {total_chunks}")

for current_chunk, start in enumerate(range(0, n_tweets, chunk_size), start=1):
    chunk = df.iloc[start:start + chunk_size]
    extractor.process_chunk(chunk)

    # Report progress after every chunk; the last chunk may be shorter,
    # so the running total uses the actual slice length.
    print(f"Processed chunk {current_chunk}/{total_chunks} ({start + len(chunk)} tweets)")

print("\nProcessing complete!")

In [None]:
# Counters reported by the extractor, one per line
stats = extractor.get_stats()
print("\nProcessing Statistics:")
for stat_name, stat_value in stats.items():
    print(f"{stat_name}: {stat_value}")

# Distribution of the number of tweets per conversation
conversation_lengths = list(map(len, extractor.conversations.values()))
conv_stats = pd.Series(conversation_lengths).describe()

print("\nConversation Length Statistics:", conv_stats, sep="\n")

In [None]:
# Display a few sample conversations, tweets ordered by timestamp
MAX_SAMPLES = 3       # number of conversations to print
PREVIEW_CHARS = 100   # maximum characters of tweet text shown per line

print("Sample Conversations:")
for conv_id, tweets in list(extractor.conversations.items())[:MAX_SAMPLES]:
    print(f"\nConversation {conv_id} ({len(tweets)} tweets):")
    for tweet in sorted(tweets, key=lambda t: t.timestamp):
        text = tweet.text
        # Only mark the text with "..." when it was actually cut off, so a
        # short tweet is not displayed as if it had been truncated.
        preview = text if len(text) <= PREVIEW_CHARS else text[:PREVIEW_CHARS] + "..."
        print(f"  {tweet.timestamp}: {tweet.author}: {preview}")
    print("-" * 80)