In [1]:
import os
import json
import glob
import random
import re
import datetime
from collections import defaultdict
from datetime import datetime, timedelta
import nltk
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split

# Download the tokenizer data that word_tokenize needs.
# Older NLTK releases load the pickled 'punkt' models, while NLTK >= 3.8.2
# loads the 'punkt_tab' resource instead; fetch both so tokenization works
# regardless of the installed NLTK version.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

def clean_text(text):
    """Normalize raw article text into plain, space-separated words.

    HTML tags and URLs are deleted, every character that is neither a word
    character nor whitespace is replaced by a space, digit runs are replaced
    by a space, and whitespace is collapsed. Returns "" when *text* is None,
    empty, or whitespace-only.
    """
    if not text or not text.strip():
        return ""

    # Applied in this order. Tags and URLs are removed outright; punctuation
    # and digits become spaces so the words around them stay separated.
    substitutions = (
        (r'<.*?>', ''),
        (r'http\S+|www\S+|https\S+', ''),
        (r'[^\w\s]', ' '),
        (r'\d+', ' '),
        (r'\s+', ' '),
    )

    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()

def tokenize_and_pad(text, max_tokens=512):
    """Tokenize *text* and return at most *max_tokens* tokens joined by spaces.

    Despite the name, nothing is padded: a text with fewer tokens than
    *max_tokens* is returned in full.
    """
    kept_tokens = word_tokenize(text)[:max_tokens]
    return ' '.join(kept_tokens)

def is_duplicate_or_headline_only(text):
    """Return True when *text* is empty or too short to be a full article.

    A text with fewer than 20 tokens is treated as headline-only. Despite
    the function name, no duplicate detection is performed here.
    """
    if text:
        # Fewer than 20 tokens is taken to mean there is no article body.
        token_count = len(word_tokenize(text))
        return token_count < 20

    return True

def extract_date_from_published(published_str):
    """Parse the calendar date at the start of a timestamp string.

    Only the part before the first 'T' is parsed, using the YYYY-MM-DD
    format; whatever follows (time, offset) is ignored. Returns the parsed
    datetime, or None when the input has no ``partition`` method (e.g. None)
    or its leading part is not a valid date.
    """
    try:
        date_part = published_str.partition('T')[0]
        parsed = datetime.strptime(date_part, '%Y-%m-%d')
    except (ValueError, AttributeError, IndexError):
        parsed = None
    return parsed

def _find_json_files(input_directory):
    """Return (month_folders, json_files) found under *input_directory*.

    Month folders are directories at any depth whose name starts with
    "2018_"; JSON files are all files ending in '.json' beneath them.
    """
    month_folders = []
    for root, dirs, _files in os.walk(input_directory):
        for directory in dirs:
            if directory.startswith("2018_"):
                month_folders.append(os.path.join(root, directory))

    json_files = []
    for month_folder in month_folders:
        for root, _dirs, files in os.walk(month_folder):
            for file_name in files:
                if file_name.endswith('.json'):
                    json_files.append(os.path.join(root, file_name))

    return month_folders, json_files


def _load_article(json_file):
    """Parse one file into a (date_key, article) pair, or None to skip it.

    A file is skipped when it is not valid JSON, lacks 'thread' or a
    non-empty 'text', is too short after cleaning, or has no parsable
    published date. Other errors (I/O, decoding, unexpected JSON shape)
    propagate to the caller.
    """
    with open(json_file, 'r', encoding='utf-8') as f:
        try:
            data = json.load(f)
        except json.JSONDecodeError:
            # Non-JSON content (e.g. an HTML page saved as .json) is ignored.
            return None

    if 'thread' not in data or 'text' not in data or not data['text']:
        return None

    text = clean_text(data['text'])
    # This helper only checks length; no duplicate detection happens.
    if is_duplicate_or_headline_only(text):
        return None

    # Prefer the article-level timestamp, fall back to the thread-level one.
    published = data.get('published') or data.get('thread', {}).get('published')
    date_obj = extract_date_from_published(published)
    if date_obj is None:
        return None

    article = {
        'title': data.get('title', ''),
        'text': text,
        'published': published,
        'date': date_obj,
        'source': data.get('thread', {}).get('site', '')
    }
    return date_obj.strftime('%Y-%m-%d'), article


def _find_valid_windows(articles_by_date, consecutive_days, articles_per_day):
    """Return every run of *consecutive_days* calendar days with enough articles.

    Each window is a list of 'YYYY-MM-DD' strings starting at a date present
    in *articles_by_date*; every day in it must be present and hold at least
    *articles_per_day* articles. Windows may overlap.
    """
    windows = []
    for window_start in sorted(articles_by_date):
        first_day = datetime.strptime(window_start, "%Y-%m-%d")
        window_dates = [
            (first_day + timedelta(days=offset)).strftime("%Y-%m-%d")
            for offset in range(consecutive_days)
        ]
        # Membership is tested explicitly so the defaultdict is not grown.
        if all(
            day in articles_by_date
            and len(articles_by_date[day]) >= articles_per_day
            for day in window_dates
        ):
            windows.append(window_dates)
    return windows


def _build_example(example_id, window_id, window_dates, articles_by_date,
                   articles_per_day, rng):
    """Build one example by sampling *articles_per_day* articles per date."""
    example = {
        'id': example_id,
        'window_id': window_id,
        'dates': window_dates,
        'articles_by_date': {}
    }

    for date in window_dates:
        available_articles = articles_by_date[date]
        if len(available_articles) > articles_per_day:
            selected_articles = rng.sample(available_articles, articles_per_day)
        else:
            selected_articles = available_articles

        processed_articles = []
        for article in selected_articles:
            processed_text = tokenize_and_pad(article['text'])
            if processed_text:  # Drop articles left empty by tokenization
                processed_articles.append({
                    'title': article['title'],
                    'text': processed_text,
                    'source': article['source']
                })

        example['articles_by_date'][date] = processed_articles

    return example


def _write_examples(split_dir, examples):
    """Write each example as a directory holding one text file per date."""
    for example in examples:
        example_dir = os.path.join(split_dir, f"example{example['id']}")
        os.makedirs(example_dir, exist_ok=True)

        for date, articles in example['articles_by_date'].items():
            date_file = os.path.join(example_dir, f"{date}.txt")

            with open(date_file, 'w', encoding='utf-8') as f:
                for idx, article in enumerate(articles, start=1):
                    f.write(f"--- Article {idx} ---\n")
                    f.write(f"Title: {article['title']}\n")
                    f.write(f"Source: {article['source']}\n")
                    f.write(f"Date: {date}\n")
                    f.write("Text:\n")
                    f.write(article['text'])
                    f.write("\n\n")


def process_dataset(input_directory, output_directory, num_examples_per_window=3, num_windows=150, consecutive_days=10, articles_per_day=10, seed=None):
    """Build train/test/val example folders from the financial news dataset.

    Articles are read from the JSON files under the "2018_*" folders of
    *input_directory*, cleaned, and grouped by published date. Windows of
    consecutive days that all have enough articles are located, examples are
    sampled from them, shuffled, split 80/10/10, and written below
    *output_directory* as ``<split>/example<id>/<date>.txt``.

    Args:
        input_directory: Root directory searched for "2018_*" folders.
        output_directory: Directory that receives the train/test/val folders.
        num_examples_per_window: Examples sampled from each selected window.
        num_windows: Maximum number of windows used; a random subset is
            taken when more valid windows exist.
        consecutive_days: Length of each window in calendar days.
        articles_per_day: Articles sampled per day; also the minimum a day
            needs for its window to be valid.
        seed: Optional seed for reproducible sampling and shuffling. When
            None, the module-level ``random`` state is used.

    Returns:
        None. Progress and summary messages are printed; the function
        returns early when no dated articles or no valid windows are found.
    """
    rng = random.Random(seed) if seed is not None else random

    # Create output directory structure
    os.makedirs(output_directory, exist_ok=True)
    for split in ['train', 'test', 'val']:
        os.makedirs(os.path.join(output_directory, split), exist_ok=True)

    month_folders, all_json_files = _find_json_files(input_directory)
    print(f"Found {len(month_folders)} month folders.")
    print(f"Found {len(all_json_files)} JSON files in the dataset.")

    # Parse all articles and organize by date
    articles_by_date = defaultdict(list)
    for file_number, json_file in enumerate(all_json_files, start=1):
        try:
            loaded = _load_article(json_file)
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole run.
            print(f"Error processing file {json_file}: {str(e)}")
        else:
            if loaded is not None:
                date_key, article = loaded
                articles_by_date[date_key].append(article)

        # Reported for every file, including skipped ones, so no progress
        # message is lost when the 1000th file happens to be skipped.
        if file_number % 1000 == 0:
            print(f"Processed {file_number} files...")

    print(f"Found articles for {len(articles_by_date)} unique dates.")

    if not articles_by_date:
        print("No valid dates found. Check the dataset structure and file formats.")
        return

    valid_windows = _find_valid_windows(
        articles_by_date, consecutive_days, articles_per_day
    )
    print(f"Found {len(valid_windows)} valid {consecutive_days}-day windows.")

    if not valid_windows:
        print("Could not find a valid window of consecutive days with sufficient articles.")
        return

    # Limit to requested number of windows if needed
    if len(valid_windows) > num_windows:
        valid_windows = rng.sample(valid_windows, num_windows)
        print(f"Randomly selected {num_windows} windows.")

    # Create examples for each window
    all_examples = []
    example_id = 1
    for window_id, window_dates in enumerate(valid_windows, start=1):
        print(f"Processing window {window_id}/{len(valid_windows)}: {window_dates[0]} to {window_dates[-1]}")

        for _ in range(num_examples_per_window):
            all_examples.append(_build_example(
                example_id, window_id, window_dates,
                articles_by_date, articles_per_day, rng,
            ))
            example_id += 1

    print(f"Generated {len(all_examples)} total examples.")

    # NOTE(review): examples are shuffled individually, so examples drawn
    # from the same window (same dates) can end up in different splits.
    rng.shuffle(all_examples)

    # 80% train, 10% test, remainder validation.
    total = len(all_examples)
    train_end = int(total * 0.8)
    test_end = train_end + int(total * 0.1)
    splits = {
        'train': all_examples[:train_end],
        'test': all_examples[train_end:test_end],
        'val': all_examples[test_end:]
    }

    print(f"Split distribution: Train: {len(splits['train'])}, Test: {len(splits['test'])}, Val: {len(splits['val'])}")

    for split_name, examples in splits.items():
        _write_examples(os.path.join(output_directory, split_name), examples)

    print(f"Dataset processing complete. Output saved to {output_directory}")

if __name__ == "__main__":
    # Kaggle locations: the raw dataset (which holds the month folders) and
    # the writable working directory that receives the processed output.
    input_directory = "/kaggle/input/us-financial-news-articles"
    output_directory = "/kaggle/working/processed_dataset_v2"

    sampling_options = {
        'num_examples_per_window': 3,   # examples drawn from each window
        'num_windows': 150,             # upper bound; fewer are used if fewer exist
        'consecutive_days': 10,         # length of each window in days
        'articles_per_day': 10,         # articles selected for every day
    }

    # Run the full pipeline with the settings above.
    process_dataset(
        input_directory=input_directory,
        output_directory=output_directory,
        **sampling_options,
    )

Found 5 month folders.
Found 306242 JSON files in the dataset.
Processed 2000 files...
Processed 3000 files...
Processed 4000 files...
Processed 5000 files...
Processed 6000 files...
Processed 7000 files...
Processed 8000 files...
Processed 9000 files...
Processed 10000 files...
Processed 11000 files...
Processed 12000 files...
Processed 13000 files...
Processed 14000 files...
Processed 15000 files...
Processed 16000 files...
Processed 17000 files...
Processed 18000 files...
Processed 19000 files...
Processed 20000 files...
Processed 21000 files...
Processed 22000 files...
Processed 24000 files...
Processed 25000 files...
Processed 26000 files...
Processed 27000 files...
Processed 28000 files...
Processed 29000 files...
Processed 30000 files...
Processed 32000 files...
Processed 33000 files...
Processed 34000 files...
Processed 35000 files...
Processed 36000 files...
Processed 37000 files...
Processed 38000 files...
Processed 39000 files...
Processed 40000 files...
Processed 41000 file