# Preprocessor Development and Testing

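This notebook is for developing and testing the `DataPreprocessor` class. It builds a small sample tweet dataset, exercises data loading, post/reply splitting, tweet filtering and the full processing pipeline, and finally removes the test files the pipeline generated.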

In [1]:
# Standard library
import os
import re
import logging
from typing import Tuple, Dict
from datetime import datetime
import unittest
import tempfile

# Third-party packages
import pandas as pd
import emoji

In [2]:
import sys
import os

# Get the parent directory (Research_Case root)
project_root = os.path.dirname(os.getcwd())  # This gets parent of notebooks directory
print(f"Project root: {project_root}")

# Add to Python path if not already there
if project_root not in sys.path:
    sys.path.append(project_root)
    print(f"Added to sys.path: {project_root}")

from research_case.processors.preprocess import DataPreprocessor
import pandas as pd

Project root: /Users/mogen/Desktop/Research_Case
Added to sys.path: /Users/mogen/Desktop/Research_Case


In [3]:
sample_data = {
        'full_text': [
            'This is a normal tweet with content',
            'https://example.com',  # URL only
            '@user1 @user2',  # Mentions only
            '🎉 🎊 🎈',  # Emojis only
            'Short',  # Too short
            'This is a valid tweet with a link https://example.com',
            'Reply to someone with good content'  # Valid reply
        ],
        'tweet_id': [1, 2, 3, 4, 5, 6, 7],
        'created_at': [
            '2024-01-01 10:00:00',
            '2024-01-01 10:01:00',
            '2024-01-01 10:02:00',
            '2024-01-01 10:03:00',
            '2024-01-01 10:04:00',
            '2024-01-01 10:05:00',
            '2024-01-01 10:06:00'
        ],
        'screen_name': [
            'user1', 'user2', 'user3', 'user4', 
            'user5', 'user6', 'user7'
        ],
        'original_user_id': [
            101, 102, 103, 104, 105, 106, 107
        ],
        'retweeted_user_ID': [
            None, None, None, None, None, None, None
        ],
        'collected_at': [
            '2024-12-21 16:00:00',
            '2024-12-21 16:00:00',
            '2024-12-21 16:00:00',
            '2024-12-21 16:00:00',
            '2024-12-21 16:00:00',
            '2024-12-21 16:00:00',
            '2024-12-21 16:00:00'
        ],
        'reply_to_id': [
            None, None, 123, None, None, None, 456
        ],
        'reply_to_user': [
            None, None, '@original_user', None, None, None, '@another_user'
        ],
        'expandedURL': [
            None, 'https://example.com', None, None, None, 
            'https://example.com', None
        ]
    }
    
   
df = pd.DataFrame(sample_data)
df.to_csv('/Users/mogen/Desktop/Research_Case/research_case/test/test_data/sample_tweets.csv', index=False)

# Initialize preprocessor
preprocessor = DataPreprocessor('/Users/mogen/Desktop/Research_Case/research_case/test/test_data/sample_tweets.csv')

In [4]:
# Test loading data: read the sample CSV into the preprocessor and report
# the dimensions of the resulting frame
preprocessor.load_data()
initial_shape = preprocessor.df.shape
print("Initial data shape:", initial_shape)

2024-12-21 16:13:26,871 - INFO - Loading data from /Users/mogen/Desktop/Research_Case/research_case/test/test_data/sample_tweets.csv
2024-12-21 16:13:26,874 - INFO - Loaded 7 rows


Initial data shape: (7, 10)


In [5]:
# Test splitting posts and replies: the preprocessor returns the two
# frames as a pair; report the dimensions of each
posts_df, replies_df = preprocessor.split_posts_replies()
posts_shape, replies_shape = posts_df.shape, replies_df.shape
print("Posts shape:", posts_shape)
print("Replies shape:", replies_shape)

2024-12-21 16:13:27,010 - INFO - Split data into 5 posts and 2 replies


Posts shape: (5, 10)
Replies shape: (2, 10)


In [6]:
# Test tweet filtering: run the filter on the posts frame and show only
# the text column of the rows that were kept
filtered_posts = preprocessor.filter_tweets(posts_df)
print("\nFiltered posts:", filtered_posts[['full_text']], sep="\n")


Filtered posts:
                                           full_text
0                This is a normal tweet with content
5  This is a valid tweet with a link https://exam...


In [9]:
# Test full processing pipeline: process(test=True) yields the four output
# paths, which are unpacked for reporting here and for the cleanup cell below
output_paths = preprocessor.process(test=True)
posts_file, replies_file, users_file, conversations_file = output_paths
print(f"\nProcessed files saved to:\n{posts_file}\n{replies_file}")

2024-12-21 16:14:56,192 - INFO - Created output directory: Tests/test_data_20241221_161456
2024-12-21 16:14:56,195 - INFO - Split data into 5 posts and 2 replies
2024-12-21 16:14:56,195 - INFO - Filtering posts...
2024-12-21 16:14:56,196 - INFO - Retained 2 valid posts after filtering
2024-12-21 16:14:56,197 - INFO - Filtering replies...
2024-12-21 16:14:56,197 - INFO - Retained 1 valid replies after filtering
2024-12-21 16:14:56,198 - INFO - Grouping posts by user ID
2024-12-21 16:14:56,200 - INFO - Grouped posts for 2 unique users
2024-12-21 16:14:56,201 - INFO - Grouping conversations
2024-12-21 16:14:56,201 - INFO - Grouped 1 conversations
2024-12-21 16:14:56,204 - INFO - Saved all processed files to Tests/test_data_20241221_161456



Processed files saved to:
Tests/test_data_20241221_161456/test_posts.csv
Tests/test_data_20241221_161456/test_replies.csv


In [12]:
def cleanup_test_files(posts_file: str, replies_file: str, users_file: str, conversations_file: str) -> None:
    """
    Clean up test files and directories after testing.

    Cleanup is best-effort: every path is handled independently, so a
    missing (None/empty) path or a failure on one file does not prevent the
    remaining files and directories from being removed. Problems are
    reported with print() and never raised to the caller.

    After the files are deleted, each directory that contained one of them
    is removed if it is now empty; its parent is removed as well when that
    parent is named "Tests" and is empty.

    Args:
        posts_file: Path to the test posts CSV file
        replies_file: Path to the test replies CSV file
        users_file: Path to the test users JSON file
        conversations_file: Path to the test conversations JSON file
    """
    # Ignore outputs that were never produced (None or empty string)
    candidate_paths = [
        path
        for path in (posts_file, replies_file, users_file, conversations_file)
        if path
    ]

    # Directories that held test files, in first-seen order without duplicates.
    # Collected from every path so cleanup still works when posts_file is missing.
    test_dirs = []

    # Remove individual files
    for file_path in candidate_paths:
        try:
            test_dir = os.path.dirname(file_path)
            if test_dir and test_dir not in test_dirs:
                test_dirs.append(test_dir)

            if os.path.exists(file_path):
                os.remove(file_path)
                print(f"Removed test file: {file_path}")
        except (OSError, TypeError, ValueError) as e:
            # Report and continue with the next file instead of aborting
            print(f"Error during cleanup: {e}")

    # Remove the now-empty test directories
    for test_dir in test_dirs:
        try:
            if os.path.exists(test_dir) and not os.listdir(test_dir):
                os.rmdir(test_dir)
                print(f"Removed empty test directory: {test_dir}")

                # Try to remove parent "Tests" directory if it's empty
                parent_dir = os.path.dirname(test_dir)
                if os.path.basename(parent_dir) == "Tests" and not os.listdir(parent_dir):
                    os.rmdir(parent_dir)
                    print(f"Removed empty Tests directory: {parent_dir}")
        except (OSError, TypeError, ValueError) as e:
            print(f"Error during cleanup: {e}")

# Usage example: remove the files written by the pipeline run above.
# Any failure (for instance the pipeline cell was not executed, so the
# path variables are undefined) is reported instead of raised.
try:
    generated_files = (posts_file, replies_file, users_file, conversations_file)
    cleanup_test_files(*generated_files)
except Exception as e:
    print(f"Error: {e}")

Removed test file: Tests/test_data_20241221_161456/test_posts.csv
Removed test file: Tests/test_data_20241221_161456/test_replies.csv
Removed test file: Tests/test_data_20241221_161456/test_users.json
Removed test file: Tests/test_data_20241221_161456/test_conversations.json
Removed empty test directory: Tests/test_data_20241221_161456
