In [None]:
import pandas as pd
import os

# Load the cleaned data with multiple-company markings
input_path = r'C:\Users\abbra\Documents\Research Code\Koval Paper\Data\Output\paired_mda_reports_CLEANEDV4.csv'
df = pd.read_csv(input_path)

print("\nInitial Data Check:")
print(f"Total rows: {len(df)}")
print(f"Total rows with multiple companies: {df['has_multiple_companies'].sum()}")
print(f"Unique CIKs with multiple companies: {df[df['has_multiple_companies']]['cik_number'].nunique()}")

# Convert dates to datetime
df['current_filing_date'] = pd.to_datetime(df['current_filing_date'])
df['next_filing_date'] = pd.to_datetime(df['next_filing_date'])

# Add year-month columns for transition handling
df['current_ym'] = pd.to_datetime(df['current_filing_date'].dt.strftime('%Y-%m-01'))
df['next_ym'] = pd.to_datetime(df['next_filing_date'].dt.strftime('%Y-%m-01'))

# Define period boundaries
TRAIN_END = pd.Timestamp('2015-06-30')    # Training cutoff
VAL_END = pd.Timestamp('2017-06-30')      # Validation cutoff

# Identify all CIKs with multiple companies
multiple_company_ciks = set(df[df['has_multiple_companies']]['cik_number'])

# Create the DAPT dataset (2000-2010 + all multiple companies)
dapt_data = df[
    (df['current_filing_date'].dt.year <= 2010) |
    (df['cik_number'].isin(multiple_company_ciks))
]

# Remaining data (excluding DAPT data)
remaining_data = df[
    ~(df['current_filing_date'].dt.year <= 2010) &
    ~df['cik_number'].isin(multiple_company_ciks)
]

# Split remaining data with clean cutoffs
train_data = remaining_data[
    (remaining_data['current_filing_date'] <= TRAIN_END)
]

val_data = remaining_data[
    (remaining_data['current_filing_date'] > TRAIN_END) &
    (remaining_data['current_filing_date'] <= VAL_END)
]

test_data = remaining_data[
    (remaining_data['current_filing_date'] > VAL_END)
]

# Print comprehensive statistics
print("\nDataset Statistics:")
print(f"Total pairs in dataset: {len(df)}")
print(f"Total CIKs with multiple companies: {len(multiple_company_ciks)}")
print(f"DAPT data (2000-2010 + all multiple companies): {len(dapt_data)} pairs")
print(f"Training data (2011-2015.06, single company only): {len(train_data)} pairs")
print(f"Validation data (2015.07-2017.06, single company only): {len(val_data)} pairs")
print(f"Test data (2017.07+, single company only): {len(test_data)} pairs")

def check_date_ranges(dataset, name):
    if len(dataset) > 0:
        print(f"\n{name} date range:")
        print(f"Current MDAs: {dataset['current_filing_date'].min()} to {dataset['current_filing_date'].max()}")
        print(f"Next MDAs: {dataset['next_filing_date'].min()} to {dataset['next_filing_date'].max()}")
        print(f"Number of unique companies: {dataset['company_name'].nunique()}")
        print(f"Number of pairs with multiple companies: {dataset['has_multiple_companies'].sum()}")
        print("\nYear-Month distribution for current filings:")
        print(dataset.groupby([dataset['current_filing_date'].dt.year, 
                             dataset['current_filing_date'].dt.month]).size().sort_index())
        print("\nYear-Month distribution for next filings:")
        print(dataset.groupby([dataset['next_filing_date'].dt.year, 
                             dataset['next_filing_date'].dt.month]).size().sort_index())
    else:
        print(f"\n{name} is empty")

# Check each split
check_date_ranges(dapt_data, "DAPT data")
check_date_ranges(train_data, "Training data")
check_date_ranges(val_data, "Validation data")
check_date_ranges(test_data, "Test data")

# Verify no temporal overlap
print("\nVerifying no temporal overlap between splits...")
def verify_no_overlap(df1, df1_name, df2, df2_name):
    next_mdas_1 = set(df1['current_filing_date'])
    current_mdas_2 = set(df2['current_filing_date'])
    overlap = next_mdas_1.intersection(current_mdas_2)
    if overlap:
        print(f"Warning: Found {len(overlap)} overlapping dates between {df1_name} and {df2_name}")
        print("Sample overlapping dates:", sorted(overlap)[:5], "...")
    else:
        print(f"No overlap between {df1_name} and {df2_name}")

verify_no_overlap(train_data, "Training", val_data, "Validation")
verify_no_overlap(val_data, "Validation", test_data, "Test")

# Create output directory if it doesn't exist
output_base = r'C:\Users\abbra\Documents\Research Code\Koval Paper\Data\Output\Model Data'
os.makedirs(output_base, exist_ok=True)

# Save the splits
dapt_data.to_csv(os.path.join(output_base, 'dapt_data.csv'), index=False)
train_data.to_csv(os.path.join(output_base, 'train_data.csv'), index=False)
val_data.to_csv(os.path.join(output_base, 'val_data.csv'), index=False)
test_data.to_csv(os.path.join(output_base, 'test_data.csv'), index=False)

# Verify all multiple-company pairs are in DAPT
multiple_company_pairs = df['has_multiple_companies'].sum()
multiple_company_pairs_in_dapt = dapt_data['has_multiple_companies'].sum()
assert multiple_company_pairs == multiple_company_pairs_in_dapt, "Not all multiple-company pairs are in DAPT dataset"
print(f"\nVerification: All {multiple_company_pairs} multiple-company pairs are in DAPT dataset")

# Verify all data is assigned
total_assigned = len(dapt_data) + len(train_data) + len(val_data) + len(test_data)
print(f"\nTotal rows: {len(df)}")
print(f"Total assigned: {total_assigned}")
if total_assigned != len(df):
    print(f"Warning: {len(df) - total_assigned} rows unassigned!")
else:
    print("All rows assigned successfully!")

In [11]:
import os
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
import pandas as pd
import random
from tqdm import tqdm
import re

# Modified path for input
dapt_data = pd.read_csv(r'C:\Users\abbra\Documents\Research Code\Koval Paper\Data\Output\Model Data\dapt_data.csv')

def clean_text(text):
    """Clean text before sentence splitting"""
    if not isinstance(text, str):
        return ""

    # Remove table-like content
    text = re.sub(r'=+', '', text)
    text = re.sub(r'-+', '', text)
    text = re.sub(r'Table \d+.*?\n', '', text, flags=re.IGNORECASE)

    # Remove parenthetical numbers and incomplete year references
    text = re.sub(r'\(\d+\)', '', text)
    text = re.sub(r'(?<=\d{4}),\s*(?!\d{4})', ' ', text)

    # Clean up financial notation
    text = re.sub(r'\$\s*', '$', text)
    text = re.sub(r'(?<=\d),(?=\d{3})', '', text)

    # Remove special formatting and headers
    text = re.sub(r'\s*\(table\s*of\s*contents\)\s*', '', text, flags=re.IGNORECASE)
    text = re.sub(r'Summary of.*?Table \d+', '', text, flags=re.IGNORECASE)

    # Standardize whitespace
    text = re.sub(r'\s+', ' ', text)

    return text.strip()

def split_into_sentences(text):
    """Split text into sentences with more lenient filtering"""
    text = clean_text(text)
    sentences = sent_tokenize(text)

    cleaned_sentences = []
    for s in sentences:
        s = s.strip()

        # Skip sentences that:
        if (
            # More lenient length constraints
            not (30 < len(s) < 500) or  # Changed from 40-400
            len(s.split()) < 6 or       # Changed from 8
            len(s.split()) > 60 or      # Changed from 50

            # Basic pattern matching (removed many constraints)
            re.match(r'^(table|figure)\s+\d+', s.lower()) or
            'Amount Percent' in s or

            # Structure checks (simplified)
            s.count('$') > 5 or         # Changed from 3
            re.search(r'\d{6,}', s)     # Only filter very long numbers
        ):
            continue

        cleaned_sentences.append(s)

    return cleaned_sentences

# Process DAPT data for sentence-level training
all_sentences = []
sentence_pairs = []  # For tracking consecutive sentences

print("Processing MDAs into sentences...")
for _, row in tqdm(dapt_data.iterrows(), total=len(dapt_data), desc="Processing MDAs"):
    # Process current MDA
    current_sentences = split_into_sentences(row['current_mda_content'])

    # Process next MDA
    next_sentences = split_into_sentences(row['next_mda_content'])

    # Add all sentences to main list
    all_sentences.extend(current_sentences)
    all_sentences.extend(next_sentences)

    # Create pairs of consecutive sentences (for contextual similarity)
    for doc_sentences in [current_sentences, next_sentences]:
        for i in range(len(doc_sentences) - 1):
            # Only create pairs if both sentences are meaningful
            if len(doc_sentences[i].split()) >= 8 and len(doc_sentences[i+1].split()) >= 8:
                sentence_pairs.append({
                    'sentence1': doc_sentences[i],
                    'sentence2': doc_sentences[i + 1],
                    'company': row['company_name'],
                    'filing_date': row['current_filing_date']
                })

# Create DataFrames
sentences_df = pd.DataFrame({
    'sentence': all_sentences
})

pairs_df = pd.DataFrame(sentence_pairs)

# Modified paths for output
output_base = r'C:\Users\abbra\Documents\Research Code\Koval Paper\Data\Output\Model Data\DAPT Data'
os.makedirs(output_base, exist_ok=True)  # Create directory if it doesn't exist


sentences_df.to_csv(os.path.join(output_base, 'dapt_sentences.csv'), index=False)
pairs_df.to_csv(os.path.join(output_base, 'dapt_sentence_pairs.csv'), index=False)

print("\nDataset Statistics:")
print(f"Total individual sentences: {len(sentences_df)}")
print(f"Total sentence pairs: {len(pairs_df)}")

# Print some quality checks
print("\nQuality Check:")
print("Average sentence length:", sum(len(s.split()) for s in all_sentences) / len(all_sentences))
print("Shortest sentence:", min(all_sentences, key=len))
print("Longest sentence:", max(all_sentences, key=len))

# Sample random sentences
print("\nRandom Sample of Clean Sentences:")
sample_sentences = random.sample(all_sentences, min(5, len(all_sentences)))
for i, sentence in enumerate(sample_sentences, 1):
    print(f"{i}. {sentence}")

# Previous Dataset Statistics:
# Total individual sentences: 4129608
# Total sentence pairs: 4111984

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\abbra\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Processing MDAs into sentences...


Processing MDAs: 100%|██████████| 10719/10719 [07:43<00:00, 23.14it/s]



Dataset Statistics:
Total individual sentences: 4895131
Total sentence pairs: 4733770

Quality Check:
Average sentence length: 25.236633095212365
Shortest sentence: See page 10 for further detail.
Longest sentence: Manufacture of electromagnetic interference and radio frequency interference shielding for primarily communications, computer and aerospace applications Santa Barbara Infrared, Inc. (SBIR)............... Design and manufacture of aerospace and defense electronically controlled infrared simulation and test equipment Trilectron Industries, Inc., formerly a part of the ETG which designed and manufactured electronically controlled ground support equipment for aircraft, was sold in September 2000.

Random Sample of Clean Sentences:
1. In connection with these actions, the Company took charges of 18.5 million and 7.7 million, respectively.
2. The estimated fair value of our net assets is calculated based on the difference between the fair value of our assets and the fair value of

In [None]:
import pandas as pd

# Load the CSV files
train_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Research/Paper Implementations/Koval Paper/Data/train_data.csv')
val_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Research/Paper Implementations/Koval Paper/Data/val_data.csv')
test_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Research/Paper Implementations/Koval Paper/Data/test_data.csv')

# Convert dates to datetime
def print_data_stats(df, name):
    # Convert date column to datetime
    df['current_filing_date'] = pd.to_datetime(df['current_filing_date'])

    print(f"\n{name} Statistics:")
    print(f"Number of samples: {len(df)}")
    print(f"Date range: {df['current_filing_date'].min()} to {df['current_filing_date'].max()}")
    print(f"Number of unique companies: {df['company_name'].nunique()}")
    print(f"Average MDA length (current): {df['current_mda_content'].str.len().mean():.0f} chars")
    print(f"Average MDA length (next): {df['next_mda_content'].str.len().mean():.0f} chars")

# Print statistics for each dataset
print_data_stats(train_data, "Training Data")
print_data_stats(val_data, "Validation Data")
print_data_stats(test_data, "Test Data")