In [None]:
# MDA Extraction Test 1 

import os
import re
import csv
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import traceback
import logging
from datetime import datetime, timedelta
import yfinance as yf
import pandas as pd
import numpy as np
from yahooquery import Ticker
import time
from requests.exceptions import RequestException
import requests

# NOTE(review): the bare string literal below is an expression statement with no
# runtime effect. It looks like a stray credential (possibly an API key) that was
# pasted into the file — confirm, rotate it if real, and load secrets from the
# environment instead of committing them.
"IFQWK03BBAJDH85H"

# Define the path for the log file
log_directory = r'C:\Users\abbra\Documents\Research Code\Koval Paper\Data\Output'
log_file = os.path.join(log_directory, 'mda_extraction.log')

# Ensure the log directory exists (must happen before basicConfig opens the file)
os.makedirs(log_directory, exist_ok=True)

# Configure the root logger to write INFO-and-above records to the log file
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    filename=log_file,
    filemode='w'  # 'w' mode overwrites the file, use 'a' for append
)

# Add a StreamHandler for tqdm to work correctly:
# only ERROR-and-above records are echoed to the console, so routine INFO
# logging goes to the file and does not interleave with the progress bar.
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.ERROR)  # Only show ERROR level logs in the console
logging.getLogger().addHandler(console_handler)


# ================================
# 1. Define Regular Expressions
# ================================


# MDA Start Patterns
# Matches an "Item 7" heading (but not "Item 7A" / "Item 7.A") that is followed,
# optionally after a dash, by some spelling of "Management" / "Management's",
# including a few common misspellings.
ITEM7_PATTERNS = [
    r'\b(?:Item|ITEM)\s*7\b(?!A|\.A)\.?(?:\s*[-–—])?\s*(?:Management[\'\s]?s?|Managements?|Managment|Manag[ae]?ment[\'\s]?s?)\b'
]

# Compiled case-insensitively, so the explicit Item/ITEM alternation above is
# already covered by the flag.
ITEM7_REGEX = re.compile('|'.join(ITEM7_PATTERNS), re.IGNORECASE)

# MDA End Patterns
# The MD&A is considered to end at the next "Item 7A" or "Item 8" heading.
# NOTE(review): because the combined regex is compiled with re.IGNORECASE, the
# Item/ITEM pairs are duplicates, and since alternatives are tried left to
# right the short 'Item 7A' pattern always wins over the longer
# "Quantitative and Qualitative Disclosures" variants listed after it.
ITEM7A_OR_8_PATTERNS = [
    r'\bItem\s*7A\.?\s*',
    r'\bITEM\s*7A\.?\s*',
    r'\bItem\s*7A\.?\s*Quantitative\s+and\s+Qualitative\s+Disclosures\s+About\s+Market\s+Risk\b',
    r'\bITEM\s*7A\.?\s*Quantitative\s+and\s+Qualitative\s+Disclosures\s+About\s+Market\s+Risk\b',
    r'\bItem\s*8\.?\s*',
    r'\bITEM\s*8\.?\s*',
]
ITEM7A_OR_8_REGEX = re.compile('|'.join(ITEM7A_OR_8_PATTERNS), re.IGNORECASE)


# ================================
# 1. Helper Functions
# ================================

# Dictionary to store CIK to ticker mappings.
# A value of None means "looked up and the company has no ticker".
cik_ticker_cache = {}


def cik_to_ticker(cik):
    """
    Looks up the ticker symbols registered for a CIK via the SEC submissions API.

    Results are memoised in ``cik_ticker_cache``. Only definitive answers are
    cached (a successful response, or a 404 for an unknown CIK); network errors
    and other HTTP statuses are treated as transient so that one failed request
    does not permanently mark a company as having no ticker.

    Parameters:
    - cik (str): The company's CIK; it is zero-padded to 10 digits for the URL.

    Returns:
    - list or None: Non-empty list of ticker symbols, or None if none were found
      or the lookup failed.
    """
    if cik in cik_ticker_cache:
        return cik_ticker_cache[cik]

    url = f"https://data.sec.gov/submissions/CIK{str(cik).zfill(10)}.json"
    headers = {
        'User-Agent': 'FinResearch/1.0 (Contact: abbraga04@gmail.com)'
    }
    try:
        # A timeout keeps a stalled connection from blocking a worker forever.
        response = requests.get(url, headers=headers, timeout=30)
    except requests.exceptions.RequestException as exc:
        logging.warning(f"Ticker lookup failed for CIK {cik}: {exc}")
        return None
    finally:
        time.sleep(1)  # Add a delay of 1 second between requests

    if response.status_code == 200:
        try:
            # Normalise an empty ticker list to None so callers and the cache
            # share a single "no ticker" representation.
            tickers = response.json().get('tickers') or None
        except ValueError:
            logging.warning(f"Ticker lookup for CIK {cik} returned invalid JSON.")
            return None
        cik_ticker_cache[cik] = tickers
        return tickers

    if response.status_code == 404:
        # Unknown CIK: a definitive negative answer, safe to cache.
        cik_ticker_cache[cik] = None
    else:
        logging.warning(f"Ticker lookup for CIK {cik} returned HTTP {response.status_code}.")
    return None

import requests
import pandas as pd
from datetime import datetime, timedelta
import time

# Alpha Vantage API key
API_KEY = 'YOUR_ALPHA_VANTAGE_API_KEY'  # Replace with your actual API key


def get_stock_data(cik, start_date, end_date, max_retries=5, delay=12):
    """
    Fetch daily stock price data for a given CIK and date range using Alpha Vantage.

    Parameters:
    - cik (str or int): Company CIK; resolved to a ticker via cik_to_ticker.
    - start_date (str): Inclusive lower bound, 'YYYY-MM-DD'.
    - end_date (str): Inclusive upper bound, 'YYYY-MM-DD'.
    - max_retries (int): Maximum number of request attempts.
    - delay (int or float): Seconds to wait between attempts.

    Returns:
    - pandas.Series: Daily '4. close' prices as floats indexed by date, or an
      empty float Series when no ticker or no data is available.
    """
    tickers = cik_to_ticker(str(cik))
    if not tickers:
        print(f"No ticker found for CIK: {cik}")
        return pd.Series(dtype=float)

    ticker = tickers[0]  # Use the first ticker if multiple are returned

    # Passing params lets requests URL-encode the ticker (e.g. symbols with '&').
    params = {
        'function': 'TIME_SERIES_DAILY_ADJUSTED',
        'symbol': ticker,
        'apikey': API_KEY,
        'outputsize': 'full',
    }

    for attempt in range(max_retries):
        last_attempt = attempt >= max_retries - 1
        try:
            response = requests.get('https://www.alphavantage.co/query', params=params, timeout=30)
            data = response.json()

            if 'Time Series (Daily)' in data:
                df = pd.DataFrame(data['Time Series (Daily)']).T
                df.index = pd.to_datetime(df.index)
                df = df.sort_index()
                df = df[(df.index >= start_date) & (df.index <= end_date)]

                if not df.empty:
                    return df['4. close'].astype(float)
                print(f"No data found for ticker {ticker} (CIK: {cik}) in the specified date range")
                return pd.Series(dtype=float)

            # NOTE(review): Alpha Vantage is understood to signal throttling with a
            # JSON body containing a "Note" key — confirm against the current API.
            # Waiting and retrying is what the `delay` parameter is for.
            if 'Note' in data and not last_attempt:
                print(f"Rate limit reached for ticker {ticker} (CIK: {cik}); retrying in {delay} seconds")
                time.sleep(delay)
                continue

            print(f"No data found for ticker {ticker} (CIK: {cik})")
            return pd.Series(dtype=float)
        except Exception as e:
            # Deliberately broad: any network or parsing failure is retried, and
            # the caller always receives a Series rather than an exception.
            print(f"Attempt {attempt + 1} failed for CIK {cik}: {str(e)}")
            if not last_attempt:
                time.sleep(delay)
            else:
                print(f"Max retries reached for CIK {cik}")
                return pd.Series(dtype=float)

    # Reached only when max_retries <= 0; keep the return type consistent.
    return pd.Series(dtype=float)

def prepare_risk_prediction_data(results):
    """
    Prepare the dataset for risk prediction.

    For every extracted report, fetches one year of stock prices starting at the
    filing date, computes the maximum drawdown (via calculate_mdd) and then
    assigns risk labels per filing year (via label_risk).

    Parameters:
    - results (list of dict): Extracted reports with 'cik_number', 'company_name',
      'filing_date' (YYYY-MM-DD) and optionally 'mda_content'.

    Returns:
    - list of dict: One record per report with usable price data, containing
      'cik', 'company_name', 'mda_content', 'mdd', 'filing_date', 'year' and
      'risk_label'. Empty list when nothing could be processed.
    """
    data = []
    print(f"Total results to process: {len(results)}")

    for result in results:
        cik = result['cik_number']
        company_name = result['company_name']
        filing_date = result.get('filing_date')

        if not filing_date:
            print(f"Missing filing date for {company_name} (CIK: {cik}). Skipping this report.")
            continue

        # extract_filing_date returns the placeholder "Unknown" when no date is
        # found; such values must be skipped rather than crash strptime.
        try:
            filing_dt = datetime.strptime(filing_date, '%Y-%m-%d')
        except (TypeError, ValueError):
            print(f"Invalid filing date {filing_date!r} for {company_name} (CIK: {cik}). Skipping this report.")
            continue

        end_date = (filing_dt + timedelta(days=365)).strftime('%Y-%m-%d')
        print(f"Fetching stock data for {company_name} (CIK: {cik}) from {filing_date} to {end_date}")

        stock_prices = get_stock_data(cik, filing_date, end_date)

        if not stock_prices.empty:
            mdd = calculate_mdd(stock_prices)
            print(f"Calculated MDD for {company_name} (CIK: {cik}): {mdd}")
            data.append({
                'cik': cik,
                'company_name': company_name,
                'mda_content': result.get('mda_content', ''),
                'mdd': mdd,
                'filing_date': filing_date
            })
        else:
            print(f"No valid stock price data found for {company_name} (CIK: {cik})")

    print(f"Total data points collected: {len(data)}")

    if not data:
        print("No data to process for risk prediction.")
        return []

    df = pd.DataFrame(data)
    df['year'] = pd.to_datetime(df['filing_date']).dt.year

    # Label each year's reports and write the labels back to the rows they were
    # computed from. (groupby().apply().reset_index(drop=True) yields one entry
    # per *year*, which misaligns with the per-report rows.)
    # NOTE(review): assumes label_risk returns one label per input MDD, in
    # order — confirm against its definition.
    df['risk_label'] = None
    for _, group in df.groupby('year'):
        df.loc[group.index, 'risk_label'] = label_risk(group['mdd'].values)

    print(f"Risk labels assigned. Final number of data points: {len(df)}")

    return df.to_dict('records')
    

def contains_other_items(mda_content):
    """
    Reports whether the MDA text references any "Item <number>" other than Item 7.

    Only the spellings "Item" and "ITEM" are recognised. A reference such as
    "Item 7A" or "Item 70" counts as another item because the digit 7 is not
    followed by a word boundary.

    Parameters:
    - mda_content (str): The content of the MDA section.

    Returns:
    - bool: True if other items are mentioned, False otherwise.
    """
    other_item_regex = re.compile(r'\b(?:Item|ITEM)\s+(?!7\b)\d+')
    return other_item_regex.search(mda_content) is not None

def extract_company_name(content):
    """
    Reads the company name from the "COMPANY CONFORMED NAME:" header line.

    Parameters:
    - content (str): The full text of the 10-K document.

    Returns:
    - str: The name with surrounding whitespace removed, or "Unknown" when the
      header line is absent.
    """
    found = re.search(r'COMPANY CONFORMED NAME:\s*(.+)$', content, re.MULTILINE)
    if found is None:
        return "Unknown"
    return found.group(1).strip()

def extract_filing_date(content):
    """
    Reads the filing date from the "FILED AS OF DATE:" header (YYYYMMDD).

    Parameters:
    - content (str): The full text of the 10-K document.

    Returns:
    - str: The date reformatted as YYYY-MM-DD, or "Unknown" when the header is
      absent.
    """
    found = re.search(r'FILED AS OF DATE:\s*(\d{8})', content)
    if not found:
        return "Unknown"

    stamp = found.group(1)
    year, month, day = stamp[:4], stamp[4:6], stamp[6:]
    return f"{year}-{month}-{day}"

def extract_company_id(content):
    """
    Finds the company's 10-digit CIK in the file content.

    The "CENTRAL INDEX KEY:" header is tried first over the whole text; only if
    it is absent is the alternative "CIK=<digits>" form searched.

    Parameters:
    - content (str): The content of the file.

    Returns:
    - str or None: Extracted CIK or None if not found.
    """
    candidate_patterns = (
        r'CENTRAL INDEX KEY:\s*(\d{10})',  # standard header location
        r'CIK=(\d{10})',                   # alternative form
    )
    for candidate in candidate_patterns:
        found = re.search(candidate, content)
        if found:
            return found.group(1)
    return None

def count_companies(content):
    """
    Estimates how many companies an SEC filing covers.

    The estimate is the largest of three signals: whether a CIK was found
    (0 or 1), whether a company name was found (0 or 1), and the number of
    "FILER: / COMPANY DATA:" header blocks in the text.

    Parameters:
    - content (str): The full text of the 10-K document.

    Returns:
    - int: Number of unique companies detected.
    """
    cik_signal = 1 if extract_company_id(content) else 0
    name_signal = 1 if extract_company_name(content) != "Unknown" else 0

    # Each filer in the header contributes one "FILER:" + "COMPANY DATA:" pair.
    filer_signal = sum(1 for _ in re.finditer(r'FILER:\s*\n\s*COMPANY DATA:', content))

    return max(cik_signal, name_signal, filer_signal)

# Filter through table of contents
def is_table_of_contents(text, strict=True):
    """
    Heuristically decides whether a piece of text is a table of contents.

    The checks run in order and the first one that fires returns True:
    fewer than 200 words; number of TOC-style regex hits; average spacing
    between "Item N" mentions; share of words that are item mentions; share of
    short (< 20 word) blank-line-separated paragraphs.

    Parameters:
    - text (str): The text to analyze.
    - strict (bool): Whether to use strict or lenient criteria.

    Returns:
    - bool: True if the text is likely a table of contents, False otherwise.
    """
    logging.info("Checking if the text is a table of contents...")

    tokens = text.split()
    if len(tokens) < 200:
        logging.info("Text is short, likely a table of contents.")
        return True

    # Patterns typical of a table of contents, combined into one alternation.
    toc_regex = re.compile(
        '|'.join((
            r'\bItem\s+\d+[A-Z]?\.\s+.*\d+\s*$',  # Item followed by number and page number
            r'\bTable\s+of\s+Contents\b',
            r'\b(Page|p\.)\s+\d+\b',
            r'\bItem\s+\d+[A-Z]?\..*\n.*\d+\s*$',  # Item title followed by page number on next line
            r'^\s*\d+\s*$',  # Standalone page numbers
            r'\bItem\s+\d+[A-Z]?.*\d+\s*\b',  # Item followed by any text and a number (page number)
            r'\bItem\s+\d+[A-Z]?\.?\s+[^.]+?(?=\s+Item|\s+\d+|$)',  # Item followed by title, ending at next Item or number
        )),
        re.IGNORECASE | re.MULTILINE,
    )

    hit_count = len(toc_regex.findall(text))
    if strict:
        if hit_count > 3:
            logging.info(f"Detected {hit_count} matches in strict mode, likely a table of contents.")
            return True
    elif hit_count > 5:
        logging.info(f"Detected {hit_count} matches in less strict mode, likely a table of contents.")
        return True

    # Start offsets of every "Item N" mention; closely spaced mentions suggest a listing.
    item_starts = [m.start() for m in re.finditer(r'\bItem\s+\d+[A-Z]?', text, re.IGNORECASE)]
    if len(item_starts) > 2:
        # The mean of consecutive gaps telescopes to (last - first) / (count - 1).
        mean_gap = (item_starts[-1] - item_starts[0]) / (len(item_starts) - 1)
        if strict:
            if mean_gap < 200:
                logging.info("High density of item listings detected, likely a table of contents.")
                return True
        elif mean_gap < 100:
            logging.info("High density of item listings detected in less strict mode, likely a table of contents.")
            return True

    # Share of words that are item mentions.
    item_share = len(item_starts) / len(tokens)
    if strict:
        if item_share > 0.01:
            logging.info("Item density exceeds 1%, likely a table of contents.")
            return True
    elif item_share > 0.02:
        logging.info("Item density exceeds 2% in less strict mode, likely a table of contents.")
        return True

    # A TOC is mostly made of short paragraphs.
    blocks = text.split('\n\n')
    short_block_count = sum(1 for block in blocks if len(block.split()) < 20)
    if short_block_count / len(blocks) > 0.5:
        logging.info("More than half of paragraphs are short, likely a table of contents.")
        return True

    logging.info("Text does not appear to be a table of contents.")
    return False


def pair_reports(results):
    """
    Pairs each company's consecutive reports when they were filed 270-450 days
    apart (roughly 9-15 months), allowing for delays and differences in
    publication dates.

    Reports whose 'filing_date' is missing or not in YYYY-MM-DD format (for
    example the "Unknown" placeholder produced by extract_filing_date) are
    skipped with a warning instead of aborting the whole pairing run.

    Parameters:
    - results (list of dict): Extracted MDA sections with 'filename',
      'company_name', 'cik_number', 'filing_date' and 'mda_content'.

    Returns:
    - list of dict: Paired reports.
    """
    # Group reports by company, parsing each filing date exactly once.
    company_reports = {}
    for report in results:
        try:
            filed_on = datetime.strptime(report['filing_date'], '%Y-%m-%d')
        except (KeyError, TypeError, ValueError):
            logging.warning(
                f"Skipping report {report.get('filename')}: unusable filing date {report.get('filing_date')!r}."
            )
            continue
        company_reports.setdefault(report['cik_number'], []).append((filed_on, report))

    min_gap = timedelta(days=270)
    max_gap = timedelta(days=450)

    paired_reports = []
    for cik, dated_reports in company_reports.items():
        # Sort by date only; the key avoids comparing report dicts on ties.
        dated_reports = sorted(dated_reports, key=lambda pair: pair[0])

        for (current_date, current_report), (next_date, next_report) in zip(dated_reports, dated_reports[1:]):
            gap = next_date - current_date
            if min_gap <= gap <= max_gap:
                paired_reports.append({
                    'current_filename': current_report['filename'],
                    'next_filename': next_report['filename'],
                    'company_name': current_report['company_name'],
                    'cik_number': cik,
                    'current_filing_date': current_report['filing_date'],
                    'next_filing_date': next_report['filing_date'],
                    'current_mda_content': current_report['mda_content'],
                    'next_mda_content': next_report['mda_content'],
                    'time_difference': gap.days
                })

    return paired_reports

def pair_and_save_reports(results, output_directory):
    """
    Builds report pairs with pair_reports and writes them to
    'paired_mda_reports.csv' inside the output directory.

    When no pairs are produced, nothing is written and a warning is logged.

    Parameters:
    - results (list of dict): Extracted MDA sections with filenames and filing dates.
    - output_directory (str): Directory to save the paired reports CSV.

    Returns:
    - None
    """
    paired_reports_csv = os.path.join(output_directory, 'paired_mda_reports.csv')

    pairs = pair_reports(results)
    if not pairs:
        logging.warning("WARNING: No paired reports found.")
        return

    columns = [
        'current_filename', 'next_filename', 'company_name', 'cik_number',
        'current_filing_date', 'next_filing_date', 'current_mda_content',
        'next_mda_content', 'time_difference',
    ]
    with open(paired_reports_csv, 'w', newline='', encoding='utf-8') as handle:
        writer = csv.DictWriter(handle, fieldnames=columns)
        writer.writeheader()
        writer.writerows(pairs)
    logging.info(f"INFO: Paired reports saved to {paired_reports_csv}")

# ================================
# 2. MDA Text Extraction
# ================================


# Heading used to relocate the MD&A start when an Item 7 match begins in
# table-of-contents-like text. It is applied to whitespace-normalized text, so
# it must not rely on line breaks.
_MDA_HEADING_REGEX = re.compile(r'Management\'s\s+Discussion\s+and\s+Analysis|MD&A', re.IGNORECASE)

# Characters skipped after an Item 7 match before searching for an end marker
# or a later MD&A heading (presumably so the start heading itself is not
# matched again).
_START_SKIP_CHARS = 100


def extract_mda_section(text):
    """
    Extracts the Management's Discussion and Analysis (MDA) section from the given text.

    All whitespace runs are first collapsed to single spaces. Each "Item 7"
    match is then tried in order: the section runs to the next Item 7A / Item 8
    heading (or the end of the document), candidates that look like a table of
    contents are skipped, and the first candidate with at least 500 words is
    returned.

    Parameters:
    - text (str): The full text of the 10-K document.

    Returns:
    - dict or None: A dictionary with 'mda_content', 'start_index', 'end_index'
      (offsets into the whitespace-normalized text), 'start_pattern',
      'end_pattern' and 'word_count', or None if no suitable section is found.
    """

    logging.info("Normalizing text for extraction...")
    normalized_text = re.sub(r'\s+', ' ', text)

    item7_matches = list(ITEM7_REGEX.finditer(normalized_text))
    logging.info(f"Found {len(item7_matches)} potential Item 7 matches.")

    if not item7_matches:
        logging.info("No Item 7 matches found.")
        return None

    for start_match in item7_matches:
        start_index = start_match.start()
        start_pattern = start_match.group()
        logging.info(f"Found potential MDA start at index {start_index}: '{start_pattern}'")

        # Safety net for start patterns that also accept "Item 6" headings.
        if 'item 6' in start_pattern.lower() and 'management' not in start_pattern.lower() and 'md&a' not in start_pattern.lower():
            logging.info("Skipping Item 6 as it is not explicitly an MDA.")
            continue

        # Find end of MDA
        end_match = ITEM7A_OR_8_REGEX.search(normalized_text[start_index + _START_SKIP_CHARS:])
        if end_match:
            end_index = start_index + _START_SKIP_CHARS + end_match.start()
            logging.info(f"Found potential MDA end at index {end_index}.")
        else:
            end_index = len(normalized_text)
            logging.info("No end pattern found, using end of document.")

        mda_text = normalized_text[start_index:end_index]

        # Check if the entire section is a table of contents
        if is_table_of_contents(mda_text):
            logging.info("This section appears to be a table of contents. Skipping.")
            continue

        # If the beginning looks like a table of contents, try to find the real start of the MDA
        first_1000_words = ' '.join(mda_text.split()[:1000])
        if is_table_of_contents(first_1000_words):
            logging.info("The beginning looks like a table of contents. Searching for the real MDA start.")
            # The text has no newlines after normalization, so a blank-line
            # lookahead can never match here. Instead, look for the next MD&A
            # heading after the one that belongs to the start match itself.
            real_start_match = _MDA_HEADING_REGEX.search(mda_text, _START_SKIP_CHARS)
            if real_start_match:
                start_index += real_start_match.start()
                mda_text = mda_text[real_start_match.start():]
                logging.info(f"Found real MDA start at index {start_index}.")
            else:
                logging.info("Couldn't find the real MDA start. Skipping this occurrence.")
                continue

        # Check word count
        word_count = len(mda_text.split())
        logging.info(f"Extracted MDA section with word count: {word_count}.")
        if 500 <= word_count:
            return {
                'mda_content': mda_text,
                'start_index': start_index,
                'end_index': end_index,
                'start_pattern': start_pattern,
                'end_pattern': end_match.group() if end_match else "End of document",
                'word_count': word_count
            }

    logging.info("No suitable MDA section found.")
    return None


# Method to process each file
def process_file(file_path):
    """
    Processes a single 10-K file to extract the MDA section and related metadata.

    A file is skipped (None is returned) when it does not describe exactly one
    company, when no CIK can be read from it, when the company has no ticker,
    or when no MDA section can be extracted. Any exception is logged and also
    results in None, so one bad file never aborts a batch.

    Parameters:
    - file_path (str): Path to the 10-K file.

    Returns:
    - dict or None: The extract_mda_section result extended with 'cik_number',
      'company_name', 'filing_date' and 'filename', or None if extraction fails.
    """

    try:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
            content = file.read()

        num_companies = count_companies(content)

        if num_companies != 1:
            logging.info(f"Skipping file {file_path}: Detected {num_companies} companies.")
            return None

        cik_number = extract_company_id(content)
        company_name = extract_company_name(content)
        filing_date = extract_filing_date(content)

        # count_companies can report one company from the name or filer header
        # alone, so the CIK may still be missing; never query the SEC with it.
        if cik_number is None:
            logging.info(f"Skipping file {file_path}: No CIK found for company {company_name}.")
            return None

        # cik_to_ticker consults cik_ticker_cache itself. A plain truthiness
        # test treats both None and an empty ticker list as "no ticker", on the
        # first file of a company as well as on every cached lookup after it.
        if not cik_to_ticker(cik_number):
            logging.info(f"Skipping file {file_path}: No ticker found for company {company_name} (CIK: {cik_number}).")
            return None

        result = extract_mda_section(content)
        if not result:
            return None

        result.update({
            'cik_number': cik_number,
            'company_name': company_name,
            'filing_date': filing_date,
            'filename': os.path.basename(file_path)
        })
        return result
    except Exception as e:
        # Top-level boundary for a worker task: log, print the traceback, move on.
        logging.error(f"Error processing file {file_path}: {e}")
        traceback.print_exc()
        return None

        
# ================================
# 3. Main Processing Function
# ================================

def _collect_10k_files(base_directory, periods, limit):
    """Return up to *limit* 10-K .txt paths found under <base>/<period>/<year>/QTRn."""
    collected = []
    for period in periods:
        period_path = os.path.join(base_directory, period)
        if not os.path.isdir(period_path):
            continue
        for year in os.listdir(period_path):
            year_path = os.path.join(period_path, year)
            if not (os.path.isdir(year_path) and year.isdigit()):
                continue
            for quarter in ('QTR1', 'QTR2', 'QTR3', 'QTR4'):
                quarter_path = os.path.join(year_path, quarter)
                if not os.path.isdir(quarter_path):
                    continue
                logging.info(f"Processing {period} - {year} {quarter}")
                collected.extend(
                    os.path.join(quarter_path, f)
                    for f in os.listdir(quarter_path)
                    # Keep 10-K text files, excluding amended filings.
                    if f.lower().endswith('.txt') and '10-k' in f.lower() and not any(x in f.lower() for x in ['10-k/a', '10-k-a'])
                )
                if len(collected) >= limit:
                    return collected[:limit]
    return collected


def main():
    """
    Main function to process 10-K files, extract MDA sections and prepare risk
    prediction data.

    Steps: collect up to 10,000 10-K files from the period/year/quarter folder
    tree, extract the MDA of each file in a thread pool, write all extracted
    MDAs to 'all_extracted_MDAs_2001-2020.csv', keep only MDAs that mention no
    item other than Item 7, build the risk prediction data from those and write
    it to 'risk_prediction_data_2001-2020.csv'.

    Parameters:
    - None

    Returns:
    - tuple: A tuple containing filtered results and risk prediction data.
    """

    base_directory = r'C:\Users\abbra\Documents\Research Code\Koval Paper\Data'
    output_directory = r'C:\Users\abbra\Documents\Research Code\Koval Paper\Data\Output'
    os.makedirs(output_directory, exist_ok=True)

    all_results = []

    # Process specified periods
    all_files = _collect_10k_files(
        base_directory,
        ['10-X_C_2001-2005', '10-X_C_2006-2010', '10-X_C_2011-2015', '10-X_C_2016-2020'],
        10000,
    )

    # Process files concurrently.
    # os.cpu_count() may return None when the count cannot be determined.
    max_workers = min(32, (os.cpu_count() or 1) + 4)
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_file = {executor.submit(process_file, file_path): file_path for file_path in all_files}

        for future in tqdm(as_completed(future_to_file), total=len(all_files), desc="Processing files"):
            file_path = future_to_file[future]
            try:
                # Futures yielded by as_completed are already finished.
                result = future.result()
                if result:
                    all_results.append(result)
            except Exception as exc:
                logging.error(f"ERROR: {file_path} generated an exception: {exc}")

    # Filter out MDAs with mentions of items other than Item 7
    filtered_results = [result for result in all_results if not contains_other_items(result['mda_content'])]

    logging.info(f"Total MDAs extracted: {len(all_results)}")
    logging.info(f"MDAs with only Item 7 mentioned: {len(filtered_results)}")

    # Save all results to a single CSV (including those with other items mentioned)
    output_csv_file = os.path.join(output_directory, 'all_extracted_MDAs_2001-2020.csv')
    fieldnames = ['filename', 'company_name', 'cik_number', 'filing_date', 'mda_content', 'start_index', 'end_index', 'end_pattern', 'start_pattern', 'word_count']
    with open(output_csv_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(all_results)
    logging.info(f"INFO: All extracted MDA sections and metadata saved to {output_csv_file}")

    # Prepare data for risk prediction
    risk_prediction_data = prepare_risk_prediction_data(filtered_results)

    # Save risk prediction data
    risk_prediction_csv = os.path.join(output_directory, 'risk_prediction_data_2001-2020.csv')
    risk_prediction_df = pd.DataFrame(risk_prediction_data)
    risk_prediction_df.to_csv(risk_prediction_csv, index=False)
    logging.info(f"INFO: Risk prediction data saved to {risk_prediction_csv}")

    print(f"Total files processed: {len(all_files)}")
    print(f"Number of files with MDA sections: {len(all_results)}")
    print(f"Number of MDAs with only Item 7 mentioned: {len(filtered_results)}")
    print(f"Number of risk prediction data points: {len(risk_prediction_data)}")

    return filtered_results, risk_prediction_data


if __name__ == "__main__":
    filtered_results, risk_prediction_data = main()
 

In [None]:
   
# MDA Extraction Test 1 

import os
import re
import csv
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import traceback
import logging
from datetime import datetime, timedelta
import yfinance as yf
import pandas as pd
import numpy as np
from yahooquery import Ticker
import time
from requests.exceptions import RequestException
import requests

# Define the path for the log file
log_directory = r'C:\Users\abbra\Documents\Research Code\Koval Paper\Data\Output'
log_file = os.path.join(log_directory, 'mda_extraction.log')

# Ensure the log directory exists (must happen before basicConfig opens the file)
os.makedirs(log_directory, exist_ok=True)

# Configure logging to write to a file
# NOTE(review): logging.basicConfig does nothing when the root logger already
# has handlers, so if another cell configured logging earlier in the same
# session this call is a no-op — confirm whether that is intended.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    filename=log_file,
    filemode='w'  # 'w' mode overwrites the file, use 'a' for append
)

# Add a StreamHandler for tqdm to work correctly:
# only ERROR-and-above records are echoed to the console.
# NOTE(review): every execution of this cell attaches one more console handler
# to the root logger, so re-running it produces duplicated console messages.
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.ERROR)  # Only show ERROR level logs in the console
logging.getLogger().addHandler(console_handler)


# ================================
# 1. Define Regular Expressions
# ================================


# MDA Start Patterns
# Earlier, stricter variants (they also required a "Management..." title after
# the item number), kept for reference:
# ITEM7_PATTERNS = [

#     r'\b(?:Item|ITEM)\s*7\b(?!A|\.A)\.?(?:\s*[-–—])?\s*(?:Management[\'\s]?s?|Managements?|Managment|Manag[ae]?ment[\'\s]?s?)\b'
# ]

# ITEM7_PATTERNS = [
#     r'(?:^|(?<=\n))(?:\s*(?:\r?\n|\s)*)\b(?:Item|ITEM)\s*(?:\r?\n|\s)*7\b(?!A|\.A)\.?(?:\s*[-–—])?\s*(?:Management[\'\s]?s?|Managements?|Managment|Manag[ae]?ment[\'\s]?s?)\b'
# ]

# Current pattern: "Item 7" (not "Item 7A" / "Item 7.A") at the start of the
# text or of a line, optionally preceded by whitespace.
# The leading whitespace is matched with a single \s*. The previous form,
# \s*(?:\r?\n|\s)*, accepts exactly the same strings but lets the engine split
# a run of newlines between the two quantifiers in exponentially many ways, so
# a block of blank lines not followed by "Item 7" could stall the search.
ITEM7_PATTERNS = [
    r'(?:^|(?<=\n))\s*\b(?:Item|ITEM)\s*7\b(?!A|\.A)'
]

ITEM7_REGEX = re.compile('|'.join(ITEM7_PATTERNS), re.IGNORECASE)

# MDA End Patterns
# The MD&A is considered to end at the next "Item 7A" or "Item 8" heading.
ITEM7A_OR_8_PATTERNS = [
    r'\bItem\s*7A\.?\s*',
    r'\bITEM\s*7A\.?\s*',
    r'\bItem\s*7A\.?\s*Quantitative\s+and\s+Qualitative\s+Disclosures\s+About\s+Market\s+Risk\b',
    r'\bITEM\s*7A\.?\s*Quantitative\s+and\s+Qualitative\s+Disclosures\s+About\s+Market\s+Risk\b',
    r'\bItem\s*8\.?\s*',
    r'\bITEM\s*8\.?\s*',
]
ITEM7A_OR_8_REGEX = re.compile('|'.join(ITEM7A_OR_8_PATTERNS), re.IGNORECASE)


# ================================
# 1. Helper Functions
# ================================

def contains_other_items(mda_content):
    """
    Tells whether the MDA text mentions an "Item <number>" that is not Item 7.

    Matching is limited to the spellings "Item" and "ITEM"; numbers that merely
    begin with 7 (such as "7A" or "70") are treated as other items.

    Parameters:
    - mda_content (str): The content of the MDA section.

    Returns:
    - bool: True if other items are mentioned, False otherwise.
    """
    found = re.search(r'\b(?:Item|ITEM)\s+(?!7\b)\d+', mda_content)
    if found is None:
        return False
    return True

def extract_company_name(content):
    """
    Pulls the company name out of the "COMPANY CONFORMED NAME:" header line.

    Parameters:
    - content (str): The full text of the 10-K document.

    Returns:
    - str: The stripped company name, or "Unknown" if the header is not present.
    """
    header = re.search(r'COMPANY CONFORMED NAME:\s*(.+)$', content, re.MULTILINE)
    if header:
        return header.group(1).strip()
    return "Unknown"

def extract_filing_date(content):
    """
    Pulls the filing date out of the "FILED AS OF DATE:" header (YYYYMMDD).

    Parameters:
    - content (str): The full text of the 10-K document.

    Returns:
    - str: The date as YYYY-MM-DD, or "Unknown" if the header is not present.
    """
    header = re.search(r'FILED AS OF DATE:\s*(\d{8})', content)
    if header is None:
        return "Unknown"

    digits = header.group(1)
    # Split the 8-digit stamp into year / month / day.
    return '-'.join((digits[:4], digits[4:6], digits[6:]))

def extract_company_id(content):
    """
    Locates the company's 10-digit CIK in the file content.

    The standard "CENTRAL INDEX KEY:" header takes precedence; the "CIK=<digits>"
    form is used only when the header is missing.

    Parameters:
    - content (str): The content of the file.

    Returns:
    - str or None: Extracted CIK or None if not found.
    """
    # Standard location first, then the alternative form.
    header = re.search(r'CENTRAL INDEX KEY:\s*(\d{10})', content)
    fallback = header or re.search(r'CIK=(\d{10})', content)
    return fallback.group(1) if fallback else None

def count_companies(content):
    """
    Estimates the number of companies covered by an SEC filing.

    Three signals are combined by taking their maximum: a found CIK (counts 1),
    a found company name (counts 1), and the number of "FILER:" headers that
    are directly followed by a "COMPANY DATA:" line.

    Parameters:
    - content (str): The full text of the 10-K document.

    Returns:
    - int: Number of unique companies detected.
    """
    signals = []

    # A CIK, when present, identifies exactly one company.
    signals.append(1 if extract_company_id(content) else 0)

    # Likewise for the conformed company name.
    signals.append(0 if extract_company_name(content) == "Unknown" else 1)

    # Multi-filer documents repeat the FILER / COMPANY DATA header block.
    filer_headers = re.findall(r'FILER:\s*\n\s*COMPANY DATA:', content)
    signals.append(len(filer_headers))

    return max(signals)

# If found Item 7 is less than 500 words, skip it and search for a new one
# Remove TOC Regex

# Filter through table of contents
def is_table_of_contents(text, strict=True):
    """
    Heuristically decides whether a span of text looks like a table of contents.

    The checks run in order and the first one that fires returns True:
    fewer than 200 words, number of TOC-style regex hits, average spacing
    between "Item N" mentions, share of words that are "Item N" mentions,
    and share of short (under 20 words) blank-line-separated paragraphs.

    Parameters:
    - text (str): The text to analyze.
    - strict (bool): Selects the strict thresholds (True) or the lenient ones (False).

    Returns:
    - bool: True if the text is likely a table of contents, False otherwise.
    """

    logging.info("Checking if the text is a table of contents...")

    # 1. Very short sections are treated as TOC entries outright
    tokens = text.split()
    if len(tokens) < 200:
        logging.info("Text is short, likely a table of contents.")
        return True

    # 2. Count hits of patterns typical for a table of contents
    toc_regex = re.compile(
        '|'.join([
            r'\bItem\s+\d+[A-Z]?\.\s+.*\d+\s*$',  # Item followed by number and page number
            r'\bTable\s+of\s+Contents\b',
            r'\b(Page|p\.)\s+\d+\b',
            r'\bItem\s+\d+[A-Z]?\..*\n.*\d+\s*$',  # Item title followed by page number on next line
            r'^\s*\d+\s*$',  # Standalone page numbers
            r'\bItem\s+\d+[A-Z]?.*\d+\s*\b',  # Item followed by any text and a number (page number)
            r'\bItem\s+\d+[A-Z]?\.?\s+[^.]+?(?=\s+Item|\s+\d+|$)',  # Item followed by title, ending at next Item or number
        ]),
        re.IGNORECASE | re.MULTILINE,
    )
    hit_count = len(toc_regex.findall(text))
    if strict:
        if hit_count > 3:
            logging.info(f"Detected {hit_count} matches in strict mode, likely a table of contents.")
            return True
    elif hit_count > 5:
        logging.info(f"Detected {hit_count} matches in less strict mode, likely a table of contents.")
        return True

    # 3. Average spacing between consecutive "Item N" mentions
    item_offsets = [hit.start() for hit in re.finditer(r'\bItem\s+\d+[A-Z]?', text, re.IGNORECASE)]
    item_total = len(item_offsets)
    if item_total > 2:
        # The consecutive gaps sum to (last offset - first offset)
        avg_distance = (item_offsets[-1] - item_offsets[0]) / (item_total - 1)
        if strict:
            if avg_distance < 200:
                logging.info("High density of item listings detected, likely a table of contents.")
                return True
        elif avg_distance < 100:
            logging.info("High density of item listings detected in less strict mode, likely a table of contents.")
            return True

    # 4. Share of words that are "Item N" mentions
    item_density = item_total / len(tokens)
    if strict:
        if item_density > 0.01:
            logging.info("Item density exceeds 1%, likely a table of contents.")
            return True
    elif item_density > 0.02:
        logging.info("Item density exceeds 2% in less strict mode, likely a table of contents.")
        return True

    # 5. Share of short paragraphs (blank-line separated, under 20 words)
    paragraphs = text.split('\n\n')
    short_total = sum(1 for paragraph in paragraphs if len(paragraph.split()) < 20)
    if short_total / len(paragraphs) > 0.5:
        logging.info("More than half of paragraphs are short, likely a table of contents.")
        return True

    logging.info("Text does not appear to be a table of contents.")
    return False


def pair_reports(results):
    """
    Pairs each company's consecutive reports based on their filing dates.

    Reports are grouped by CIK and sorted by filing date; each report is
    paired with the one filed immediately after it when the two filing dates
    are between 270 and 450 days apart (roughly 9-15 months).

    Reports that cannot be placed on a company timeline are skipped with a
    warning instead of aborting the whole run: those without a CIK (they
    would otherwise be grouped together as one "company") and those whose
    filing date is not a valid YYYY-MM-DD string (e.g. "Unknown").

    Parameters:
    - results (list of dict): Extracted MDA sections with filenames and filing dates.
      Each dict must provide 'cik_number', 'filing_date', 'filename',
      'company_name' and 'mda_content'.

    Returns:
    - list of dict: Paired reports.
    """
    # Group (parsed date, report) tuples by company so each date is parsed once
    company_reports = {}
    for report in results:
        cik = report['cik_number']
        if not cik:
            logging.warning("Skipping report %s: no CIK available for pairing.",
                            report.get('filename', '<unknown>'))
            continue
        try:
            filed_on = datetime.strptime(report['filing_date'], '%Y-%m-%d')
        except (TypeError, ValueError):
            logging.warning("Skipping report %s: unusable filing date %r.",
                            report.get('filename', '<unknown>'), report['filing_date'])
            continue
        company_reports.setdefault(cik, []).append((filed_on, report))

    # Accept pairs filed within a reasonable time frame (about 9-15 months apart)
    min_gap = timedelta(days=270)
    max_gap = timedelta(days=450)

    paired_reports = []
    for cik, dated_reports in company_reports.items():
        # Stable sort by filing date keeps the input order for same-day filings
        dated_reports.sort(key=lambda entry: entry[0])

        for (current_date, current_report), (next_date, next_report) in zip(dated_reports, dated_reports[1:]):
            gap = next_date - current_date
            if min_gap <= gap <= max_gap:
                paired_reports.append({
                    'current_filename': current_report['filename'],
                    'next_filename': next_report['filename'],
                    'company_name': current_report['company_name'],
                    'cik_number': cik,
                    'current_filing_date': current_report['filing_date'],
                    'next_filing_date': next_report['filing_date'],
                    'current_mda_content': current_report['mda_content'],
                    'next_mda_content': next_report['mda_content'],
                    'time_difference': gap.days
                })

    return paired_reports

def pair_and_save_reports(results, output_directory):
    """
    Builds the report pairs and writes them to 'paired_mda_reports.csv'.

    Nothing is written when no pairs are produced; a warning is logged instead.

    Parameters:
    - results (list of dict): Extracted MDA sections with filenames and filing dates.
    - output_directory (str): Directory to save the paired reports CSV.

    Returns:
    - None
    """

    destination = os.path.join(output_directory, 'paired_mda_reports.csv')

    # Pairing relies only on filing dates (no fiscal calendar needed)
    rows = pair_reports(results)

    if not rows:
        logging.warning("WARNING: No paired reports found.")
        return

    columns = ['current_filename', 'next_filename', 'company_name', 'cik_number',
               'current_filing_date', 'next_filing_date', 'current_mda_content',
               'next_mda_content', 'time_difference']

    with open(destination, 'w', newline='', encoding='utf-8') as handle:
        out = csv.DictWriter(handle, fieldnames=columns)
        out.writeheader()
        out.writerows(rows)
    logging.info(f"INFO: Paired reports saved to {destination}")

# ================================
# 2. MDA Text Extraction
# ================================


def extract_mda_section(text, min_word_count=500):
    """
    Extracts the Management's Discussion and Analysis (MDA) section from the given text.

    Whitespace is collapsed to single spaces, then every ITEM7_REGEX hit is
    tried in document order as a section start. The section ends at the
    first ITEM7A_OR_8_REGEX hit found at least 100 characters after the
    start (or at the end of the document). The first candidate with at
    least `min_word_count` words is returned.

    Parameters:
    - text (str): The full text of the 10-K document.
    - min_word_count (int): Minimum number of words a candidate must have to
      be accepted. Defaults to 500.

    Returns:
    - dict or None: A dictionary containing the extracted MDA section and metadata
      ('mda_content', 'start_index', 'end_index', 'start_pattern',
      'end_pattern', 'word_count'), or None if not found. The indexes refer
      to the whitespace-normalized text, not to the original `text`.
    """

    logging.info("Normalizing text for extraction...")
    normalized_text = re.sub(r'\s+', ' ', text)

    item7_matches = list(ITEM7_REGEX.finditer(normalized_text))
    # Logged rather than printed: this function runs in worker threads while
    # a tqdm progress bar owns the console.
    logging.info(f"Found {len(item7_matches)} potential Item 7 matches.")

    if not item7_matches:
        logging.info("No Item 7 matches found.")
        return None

    # The end-of-section search starts this many characters after the start
    # match (presumably to skip past the heading itself).
    end_search_offset = 100

    for start_match in item7_matches:
        start_index = start_match.start()
        start_pattern = start_match.group()
        logging.info(f"Found potential MDA start at index {start_index}: '{start_pattern}'")

        # Guard for start patterns that mention Item 6 without naming the MDA.
        # The ITEM7 pattern defined at the top of the file only matches
        # "Item 7" headings, so this only matters if that pattern is widened.
        lowered_pattern = start_pattern.lower()
        if 'item 6' in lowered_pattern and 'management' not in lowered_pattern and 'md&a' not in lowered_pattern:
            logging.info("Skipping Item 6 as it is not explicitly an MDA.")
            continue

        # Find end of MDA
        search_from = start_index + end_search_offset
        end_match = ITEM7A_OR_8_REGEX.search(normalized_text[search_from:])
        if end_match:
            end_index = search_from + end_match.start()
            logging.info(f"Found potential MDA end at index {end_index}.")
        else:
            end_index = len(normalized_text)
            logging.info("No end pattern found, using end of document.")

        mda_text = normalized_text[start_index:end_index]

        # Candidates that are too short (e.g. table-of-contents entries) are
        # rejected and the next Item 7 hit is tried.
        word_count = len(mda_text.split())
        logging.info(f"Extracted MDA section with word count: {word_count}.")
        if word_count >= min_word_count:
            return {
                'mda_content': mda_text,
                'start_index': start_index,
                'end_index': end_index,
                'start_pattern': start_pattern,
                'end_pattern': end_match.group() if end_match else "End of document",
                'word_count': word_count
            }

    logging.info("No suitable MDA section found.")
    return None


# Method to process each file 
def process_file(file_path):
    """
    Reads one 10-K file and returns its MDA section together with filing metadata.

    Files in which count_companies() does not detect exactly one company are
    skipped. Any exception is logged, its traceback printed, and None returned.

    Parameters:
    - file_path (str): Path to the 10-K file.

    Returns:
    - dict or None: The extract_mda_section() result extended with 'cik_number',
      'company_name', 'filing_date' and 'filename', or None if extraction fails.
    """

    try:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as handle:
            content = handle.read()

        company_total = count_companies(content)
        if company_total != 1:
            logging.info(f"Skipping file {file_path}: Detected {company_total} companies.")
            return None

        # Header metadata is gathered before the (more expensive) MDA search
        metadata = {
            'cik_number': extract_company_id(content),
            'company_name': extract_company_name(content),
            'filing_date': extract_filing_date(content),
            'filename': os.path.basename(file_path),
        }

        extraction = extract_mda_section(content)
        if not extraction:
            return None

        extraction.update(metadata)
        return extraction
    except Exception as e:
        logging.error(f"Error processing file {file_path}: {e}")
        traceback.print_exc()
        return None

        
# ================================
# 3. Main Processing Function
# ================================

def main():
    """
    Processes a capped sample of 10-K files, extracts MDA sections, and writes them to CSV.

    Walks the period/year/quarter folders under the data directory, collects
    up to 100 original (non-amended) 10-K text files, runs process_file() on
    each in a thread pool, and saves every extracted section to
    'all_extracted_MDAs_2001-2020.csv' in the output directory.

    Parameters:
    - None

    Returns:
    - list of dict: The extracted results for which contains_other_items()
      is False for the MDA text.
    """

    base_directory = r'C:\Users\abbra\Documents\Research Code\Koval Paper\Data'
    output_directory = r'C:\Users\abbra\Documents\Research Code\Koval Paper\Data\Output'
    os.makedirs(output_directory, exist_ok=True)

    # Upper bound on the number of files handled in one run
    max_files = 100

    all_results = []
    all_files = []

    # Process specified periods
    for period in ['10-X_C_2001-2005', '10-X_C_2006-2010', '10-X_C_2011-2015', '10-X_C_2016-2020']:
        period_path = os.path.join(base_directory, period)
        if os.path.isdir(period_path):
            for year in os.listdir(period_path):
                year_path = os.path.join(period_path, year)
                if os.path.isdir(year_path) and year.isdigit():
                    for quarter in ['QTR1', 'QTR2', 'QTR3', 'QTR4']:
                        quarter_path = os.path.join(year_path, quarter)
                        if os.path.isdir(quarter_path):
                            logging.info(f"Processing {period} - {year} {quarter}")
                            quarter_files = [
                                os.path.join(quarter_path, f)
                                for f in os.listdir(quarter_path)
                                if f.lower().endswith('.txt') and '10-k' in f.lower() and not any(x in f.lower() for x in ['10-k/a', '10-k-a'])
                            ]
                            all_files.extend(quarter_files)
                        # Stop scanning as soon as enough files were collected
                        if len(all_files) >= max_files:
                            break
                if len(all_files) >= max_files:
                    break
        if len(all_files) >= max_files:
            break

    all_files = all_files[:max_files]

    # Process files concurrently.
    # os.cpu_count() may return None when the count cannot be determined.
    max_workers = min(32, (os.cpu_count() or 1) + 4)
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_file = {executor.submit(process_file, file_path): file_path for file_path in all_files}

        for future in tqdm(as_completed(future_to_file), total=len(all_files), desc="Processing files"):
            file_path = future_to_file[future]
            try:
                result = future.result(timeout=60)
                if result:
                    all_results.append(result)
            except Exception as exc:
                logging.error(f"ERROR: {file_path} generated an exception: {exc}")

    # Filter out MDAs with mentions of items other than Item 7
    filtered_results = [result for result in all_results if not contains_other_items(result['mda_content'])]

    logging.info(f"Total MDAs extracted: {len(all_results)}")
    logging.info(f"MDAs with only Item 7 mentioned: {len(filtered_results)}")

    # Save all results to a single CSV (including those with other items mentioned)
    output_csv_file = os.path.join(output_directory, 'all_extracted_MDAs_2001-2020.csv')
    fieldnames = ['filename', 'company_name', 'cik_number', 'filing_date', 'mda_content', 'start_index', 'end_index', 'end_pattern', 'start_pattern', 'word_count']
    with open(output_csv_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(all_results)
    logging.info(f"INFO: All extracted MDA sections and metadata saved to {output_csv_file}")

    print(f"Total files processed: {len(all_files)}")
    print(f"Number of files with MDA sections: {len(all_results)}")
    print(f"Number of MDAs with only Item 7 mentioned: {len(filtered_results)}")

    return filtered_results

# Run the extraction only when executed as a script (not on import); the
# filtered results are bound to a module-level name.
if __name__ == "__main__":
    filtered_results = main()

In [None]:
import re
import os
import csv
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import logging

# Define the management pattern: captures the text between an
# "ITEM 7 ... MANAGEMENT'S" heading and the next "ITEM 7A", "ITEM 7(a)" or
# "ITEM 8" heading. The terminator has to be a non-capturing group; written
# with square brackets it is a character class that matches a single
# character, which ends the capture at any "Item 7..." cross-reference.
management_pattern = re.compile(r'ITEM\s+7\.?\s+MANAGEMENT\'?\s*S?(.*?)ITEM\s+(?:7A|7\(a\)|8)', re.IGNORECASE|re.DOTALL|re.MULTILINE)

# Define a pattern to check for other items
# (any "Item <number>" whose number is not exactly 7; "Item 7A" still counts)
other_items_pattern = re.compile(r'\b(?:Item|ITEM)\s+(?!7\b)\d+', re.IGNORECASE)

def contains_other_items(mda_content):
    """
    Reports whether the MDA text refers to items other than Item 7 at least three times.

    Parameters:
    - mda_content (str): The content of the MDA section.

    Returns:
    - bool: True if other items are mentioned more than twice, False otherwise.
    """
    other_item_mentions = 0
    for _ in other_items_pattern.finditer(mda_content):
        other_item_mentions += 1
    return other_item_mentions > 2

def check_management_pattern(file_path):
    """
    Finds the longest qualifying MDA candidate in one filing.

    Every management_pattern capture with at least 500 words that does not
    trip contains_other_items() is a candidate; the one with the most words
    wins (the first one on ties).

    Parameters:
    - file_path (str): Path to the 10-K text file.

    Returns:
    - dict or None: {'file_path', 'mda_content', 'word_count'} where
      'mda_content' is the 50 words preceding the capture followed by the
      capture itself, or None when the file has no qualifying candidate.
    """
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        content = file.read()

    # Keep (captured text, word count, character offset of the capture) so the
    # preceding context can be taken from the exact match location.
    valid_matches = []
    for found in management_pattern.finditer(content):
        candidate = found.group(1)
        word_count = len(candidate.split())
        # Filter out matches with less than 500 words and check for other items
        if word_count >= 500 and not contains_other_items(candidate):
            valid_matches.append((candidate, word_count, found.start(1)))

    if not valid_matches:
        return None

    # Get the longest match
    longest_text, longest_word_count, longest_start = max(valid_matches, key=lambda x: x[1])

    # Get 50 words before the longest match. The text is cut at the character
    # offset first and only then split into words: a character offset must
    # not be used as an index into the word list.
    prefix = ' '.join(content[:longest_start].split()[-50:])

    return {
        'file_path': file_path,
        'mda_content': prefix + ' ' + longest_text,
        'word_count': longest_word_count
    }

def main():
    """
    Scans every original 10-K text file under the data folder and writes the
    longest qualifying MDA candidate of each file to 'longest_mdas.csv'.

    Files are processed in a thread pool with check_management_pattern();
    files without a qualifying candidate are left out of the CSV.
    """
    base_directory = r'C:\Users\abbra\Documents\Research Code\Koval Paper\Data'
    output_file = r'C:\Users\abbra\Documents\Research Code\Koval Paper\Data\Output\longest_mdas.csv'

    all_files = []
    for root, dirs, files in os.walk(base_directory):
        for file in files:
            if file.lower().endswith('.txt') and ('10-k' in file.lower() or '10-k405' in file.lower()) and not any(x in file.lower() for x in ['10-k/a', '10-k-a', '10-k405-a']):
                all_files.append(os.path.join(root, file))

    # os.cpu_count() may return None when the count cannot be determined
    max_workers = min(32, (os.cpu_count() or 1) + 4)
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_file = {executor.submit(check_management_pattern, file_path): file_path for file_path in all_files}

        all_results = []
        for future in tqdm(as_completed(future_to_file), total=len(all_files), desc="Processing files"):
            file_path = future_to_file[future]
            try:
                result = future.result(timeout=60)
                if result:
                    all_results.append(result)
            except Exception as exc:
                logging.error(f"ERROR: {file_path} generated an exception: {exc}")

    # Make sure the output folder exists before opening the CSV for writing
    os.makedirs(os.path.dirname(output_file), exist_ok=True)

    # Write longest MDAs to CSV file
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['file_path', 'word_count', 'mda_content']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        writer.writeheader()
        writer.writerows(all_results)

    print(f"Total files processed: {len(all_files)}")
    print(f"Files with valid management pattern: {len(all_results)}")
    print(f"Longest MDAs saved to: {output_file}")

# Script entry point: run main() only when executed directly.
if __name__ == "__main__":
    main()

In [None]:
import os
import json
import requests
import time
import re
import logging
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

# Root folder that is walked for 10-K text files, and the JSON file the
# CIK -> ticker mapping is written to.
base_directory = r'C:\Users\abbra\Documents\Research Code\Koval Paper\Data'
output_file = r'C:\Users\abbra\Documents\Research Code\Koval Paper\Data\Output\cik_ticker_mapping.json'

# Initialize the caches as global variables
# (both are rebound further down once the SEC ticker data has been loaded)
cik_ticker_cache = {}
processed_ciks = set()

def get_sec_tickers():
    """
    Downloads the SEC's company_tickers_exchange.json and builds a CIK -> ticker mapping.

    Returns:
    - dict: Maps each CIK (as a zero-padded 10-character string) to a
      one-element list holding its ticker. Rows without a ticker are left
      out. An empty dict is returned when the download or parsing fails.
    """
    url = "https://www.sec.gov/files/company_tickers_exchange.json"
    headers = {'User-Agent': 'FinResearch/1.0 (Contact: abbraga04@gmail.com)'}

    try:
        # Bound the wait so a stalled connection cannot hang startup forever
        response = requests.get(url, headers=headers, timeout=30)
        if response.status_code == 200:
            data = response.json()
            mapping = {}
            for entry in data['data']:
                cik = str(entry[0]).zfill(10)  # CIK
                ticker = entry[2]               # Ticker
                if ticker:  # Only add if there's a ticker
                    # NOTE(review): a CIK that appears in several rows keeps
                    # only the last ticker seen - confirm that is intended.
                    mapping[cik] = [ticker]
            print(f"Loaded {len(mapping)} CIK-ticker mappings from SEC")
            return mapping
        else:
            print(f"Error loading SEC data: Status code {response.status_code}")
    except Exception as e:
        # Best effort: any failure (network, JSON shape) yields an empty mapping
        print(f"Error loading SEC data: {str(e)}")
    return {}

# Load the CIK-ticker mapping once at startup
# (get_sec_tickers() performs an HTTP request when this statement runs)
print("Loading SEC ticker data...")
cik_ticker_cache = get_sec_tickers()
# Start the processed set with every CIK present in the downloaded mapping
processed_ciks = set(cik_ticker_cache.keys())

def extract_company_name(content):
    """
    Reads the "COMPANY CONFORMED NAME:" field from the file content.

    Returns the name with surrounding whitespace removed, or "Unknown" when
    the field is not present.
    """
    name_field = re.search(r'COMPANY CONFORMED NAME:\s*(.+)$', content, re.MULTILINE)
    if name_field is None:
        return "Unknown"
    return name_field.group(1).strip()

def extract_company_id(content):
    """
    Returns the 10-digit CIK found in the file content, or None.

    The "CENTRAL INDEX KEY:" header field takes precedence; the "CIK=" form
    is only consulted when the header field is absent.
    """
    header_hit = re.search(r'CENTRAL INDEX KEY:\s*(\d{10})', content)
    if header_hit:
        return header_hit.group(1)

    fallback_hit = re.search(r'CIK=(\d{10})', content)
    return fallback_hit.group(1) if fallback_hit else None

def process_file_batch(file_path, cache):
    """
    Looks up the ticker entry for the CIK found in one file.

    Returns (cik, cache[cik]) when the file's CIK is a key of `cache`.
    Otherwise returns (None, None); a CIK that was found but is not in
    `cache` is added to the module-level `processed_ciks` set. Errors are
    logged and also yield (None, None).
    """
    try:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as handle:
            cik = extract_company_id(handle.read())

        if not cik:
            return None, None
        if cik in cache:
            # Known CIK: hand back the cached ticker entry
            return cik, cache[cik]
        processed_ciks.add(cik)
    except Exception as e:
        logging.error(f"Error processing {file_path}: {str(e)}")
    return None, None

def create_cik_ticker_mapping(base_directory, output_file='cik_ticker_mapping.json', batch_size=1000):
    """
    Creates a JSON file mapping CIK numbers to their ticker symbols in batches.

    Walks `base_directory` for original (non-amended) 10-K text files, looks
    up each file's CIK in the module-level `cik_ticker_cache`, and keeps the
    CIKs that have a ticker entry. The mapping is written to `output_file`
    after every batch and once more at the end.

    Parameters:
    - base_directory (str): Folder that is searched recursively.
    - output_file (str): Path of the JSON file to write.
    - batch_size (int): Number of files handled between intermediate saves.

    Returns:
    - dict: CIK -> ticker entry for the CIKs found in the files.
    """
    # Create a new dictionary for storing only matches from your files
    found_mappings = {}

    # Get list of all files
    files_list = []
    for root, dirs, files in os.walk(base_directory):
        for file in files:
            if file.lower().endswith('.txt') and ('10-k' in file.lower() or '10-k405' in file.lower()) and not any(x in file.lower() for x in ['10-k/a', '10-k-a', '10-k405-a']):
                files_list.append(os.path.join(root, file))

    print(f"Found {len(files_list)} 10-K files to process")

    # Process files in batches.
    # os.cpu_count() may return None when the count cannot be determined.
    max_workers = min(32, (os.cpu_count() or 1) + 4)
    for i in range(0, len(files_list), batch_size):
        batch = files_list[i:i + batch_size]
        print(f"\nProcessing batch {i//batch_size + 1} of {(len(files_list) + batch_size - 1)//batch_size}")

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_file = {executor.submit(process_file_batch, file_path, cik_ticker_cache): file_path
                            for file_path in batch}

            for future in tqdm(as_completed(future_to_file), total=len(batch), desc="Processing files"):
                cik, tickers = future.result()
                if cik and tickers:  # Only add to found_mappings if both cik and tickers exist
                    found_mappings[cik] = tickers

        # Save intermediate results after each batch
        with open(output_file, 'w') as f:
            json.dump(found_mappings, f, indent=4)  # Save only the found mappings
        print(f"Intermediate results saved. Current CIKs mapped: {len(found_mappings)}")

    # Final save: guarantees the file exists (and the message below is true)
    # even when no batch ran because no files were found.
    with open(output_file, 'w') as f:
        json.dump(found_mappings, f, indent=4)

    print(f"\nFinal CIK-Ticker mapping saved to {output_file}")
    print(f"Total CIKs mapped: {len(found_mappings)}")
    return found_mappings

# Script entry point: build the mapping only when executed directly.
if __name__ == "__main__":
    create_cik_ticker_mapping(base_directory, output_file)

In [None]:
import os
import json
import requests
import time
import re
import logging
from tqdm import tqdm  # Import tqdm for progress bars
from concurrent.futures import ThreadPoolExecutor, as_completed


# Root folder that is searched recursively for 10-K files
base_directory = r'C:\Users\abbra\Documents\Research Code\Koval Paper\Data'

def find_cik_in_files(target_cik, base_directory):
    """
    Lists the original 10-K text files whose filename carries the given CIK.

    The CIK is read from the "edgar_data_<digits>_" part of each filename and
    compared against `target_cik` with its leading zeros removed. The
    outcome is printed and the matching paths are returned as a list.
    """
    wanted_cik = str(int(target_cik))  # filenames carry the CIK without leading zeros
    amended_markers = ['10-k/a', '10-k-a', '10-k405-a']
    cik_in_name = re.compile(r'edgar_data_(\d+)_')

    files_found = []
    for root, _subdirs, names in os.walk(base_directory):
        for name in names:
            lowered = name.lower()
            if not lowered.endswith('.txt'):
                continue
            if '10-k' not in lowered and '10-k405' not in lowered:
                continue
            if any(marker in lowered for marker in amended_markers):
                continue

            # Compare the CIK embedded in the filename with the target
            hit = cik_in_name.search(name)
            if hit and hit.group(1) == wanted_cik:
                files_found.append(os.path.join(root, name))

    # Report what was found
    if not files_found:
        print(f"\nCIK {target_cik} not found in any 10-K files.")
    else:
        print(f"\nFound CIK {target_cik} in {len(files_found)} 10-K files:")
        for path in files_found:
            print(f"- {path}")

    return files_found

# Search for the specific CIK
# (runs when the cell/module is executed and prints the matching file paths)
find_cik_in_files("0000789019", base_directory)

In [None]:
import os
import json
import shutil
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

def check_file_cik(args):
    """
    Copies one file into the output tree when its CIK is a key of the mapping.

    `args` is a (file_path, cik_mapping, output_dir) tuple. The copy keeps
    the file's path relative to the module-level `base_directory`. Returns
    the new path, or None when the file has no CIK, the CIK is not in the
    mapping, or an error occurred (errors are logged).
    """
    file_path, cik_mapping, output_dir = args
    try:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as handle:
            content = handle.read()

        # Header field first, "CIK=" form as fallback
        found = re.search(r'CENTRAL INDEX KEY:\s*(\d{10})', content) or re.search(r'CIK=(\d{10})', content)
        if not found or found.group(1) not in cik_mapping:
            return None

        # Recreate the source folder structure under output_dir
        destination = os.path.join(output_dir, os.path.relpath(file_path, base_directory))
        os.makedirs(os.path.dirname(destination), exist_ok=True)
        shutil.copy2(file_path, destination)
        return destination
    except Exception as e:
        logging.error(f"Error processing {file_path}: {str(e)}")
    return None

def copy_mapped_files(base_directory, mapping_file, output_directory):
    """
    Copy files with matching CIKs to new directory.

    Loads the CIK mapping from `mapping_file` (JSON), walks `base_directory`
    for original (non-amended) 10-K text files, and runs check_file_cik() on
    each in a thread pool.

    Parameters:
    - base_directory (str): Folder that is searched recursively.
    - mapping_file (str): Path of the JSON file holding the CIK mapping.
    - output_directory (str): Folder that receives the copies (created if missing).

    Returns:
    - list of str: Paths of the copies that were made.
    """

    # Load CIK mapping
    with open(mapping_file, 'r') as f:
        cik_mapping = json.load(f)

    print(f"Loaded {len(cik_mapping)} CIK-ticker mappings")

    # Get list of all 10-K files
    all_files = []
    for root, dirs, files in os.walk(base_directory):
        for file in files:
            if file.lower().endswith('.txt') and ('10-k' in file.lower() or '10-k405' in file.lower()) and not any(x in file.lower() for x in ['10-k/a', '10-k-a', '10-k405-a']):
                all_files.append(os.path.join(root, file))

    print(f"Found {len(all_files)} 10-K files to process")

    # Create output directory if it doesn't exist
    os.makedirs(output_directory, exist_ok=True)

    # Process files in parallel.
    # os.cpu_count() may return None when the count cannot be determined.
    max_workers = min(32, (os.cpu_count() or 1) + 4)
    copied_files = []

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # check_file_cik takes a single tuple argument
        args = [(file_path, cik_mapping, output_directory) for file_path in all_files]
        future_to_file = {executor.submit(check_file_cik, arg): arg[0] for arg in args}

        for future in tqdm(as_completed(future_to_file), total=len(all_files), desc="Copying matched files"):
            file_path = future_to_file[future]
            try:
                result = future.result(timeout=60)
                if result:
                    copied_files.append(result)
            except Exception as exc:
                logging.error(f"ERROR: {file_path} generated an exception: {exc}")

    print(f"\nCopied {len(copied_files)} files to {output_directory}")
    return copied_files

# Usage
# NOTE(review): these paths use "Research\Koval Paper" while the earlier cells
# use "Research Code\Koval Paper" - confirm which location is intended.
base_directory = r'C:\Users\abbra\Documents\Research\Koval Paper\Data'
mapping_file = r'C:\Users\abbra\Documents\Research\Koval Paper\Data\Output\cik_ticker_mapping.json'
output_directory = r'C:\Users\abbra\Documents\Research\Koval Paper\Data\Mapped_Files'

# Runs the copy when the cell/module is executed
copied_files = copy_mapped_files(base_directory, mapping_file, output_directory)

In [None]:
import re
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import logging

# Define the management patterns.
# management_pattern captures the text between an "ITEM 7 ... MANAGEMENT'S"
# heading and the next "ITEM 7A", "ITEM 7(a)" or "ITEM 8" heading;
# fallback_pattern starts at "MANAGEMENT'S DISCUSSION" and also accepts the
# end of the text as terminator. The terminator has to be a non-capturing
# group; written with square brackets it is a character class that matches a
# single character, which ends the capture at any "Item 7..." cross-reference.
management_pattern = re.compile(r'ITEM\s+7\.?\s+MANAGEMENT\'?\s*S?(.*?)ITEM\s+(?:7A|7\(a\)|8)', re.IGNORECASE|re.DOTALL|re.MULTILINE)
fallback_pattern = re.compile(r'MANAGEMENT\'?\s*S?\s*DISCUSSION(.*?)(?:ITEM\s+(?:7A|7\(a\)|8)|\Z)', re.IGNORECASE|re.DOTALL|re.MULTILINE)

# Define a pattern to check for other items
# (any "Item <number>" whose number is not exactly 7; "Item 7A" still counts)
other_items_pattern = re.compile(r'\b(?:Item|ITEM)\s+(?!7\b)\d+', re.IGNORECASE)

def contains_other_items(mda_content):
    """
    Tells whether items other than Item 7 are referenced three or more times in the MDA text.

    Parameters:
    - mda_content (str): The content of the MDA section.

    Returns:
    - bool: True if other items are mentioned more than twice, False otherwise.
    """
    mention_total = sum(1 for _ in other_items_pattern.finditer(mda_content))
    return mention_total >= 3

def check_management_pattern(file_path):
    """
    Finds the longest qualifying MDA candidate in one filing.

    Every management_pattern capture with at least 500 words that does not
    trip contains_other_items() is a candidate; the one with the most words
    wins (the first one on ties).

    Parameters:
    - file_path (str): Path to the 10-K text file.

    Returns:
    - dict: Always has 'file_path', 'has_management_pattern', 'longest_match',
      'word_count' and 'has_other_items'. When a candidate exists,
      'longest_match' is the 50 words preceding the capture followed by the
      capture itself; otherwise it is "" and 'word_count' is 0.
    """
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        content = file.read()

    # Keep (captured text, word count, character offset of the capture) so the
    # preceding context can be taken from the exact match location.
    valid_matches = []
    for found in management_pattern.finditer(content):
        candidate = found.group(1)
        word_count = len(candidate.split())
        # Filter out matches with less than 500 words and check for other items
        if word_count >= 500 and not contains_other_items(candidate):
            valid_matches.append((candidate, word_count, found.start(1)))

    # If no valid matches found, try fallback pattern
    # if not valid_matches:
    #     fallback_matches = fallback_pattern.findall(content)
    #     for match in fallback_matches:
    #         word_count = len(match.split())
    #         if word_count >= 500 and not contains_other_items(match):
    #             valid_matches.append((match, word_count))

    if not valid_matches:
        return {
            'file_path': file_path,
            'has_management_pattern': False,
            'longest_match': "",
            'word_count': 0,
            'has_other_items': False
        }

    # Get the longest match
    longest_text, longest_word_count, longest_start = max(valid_matches, key=lambda x: x[1])

    # Get 50 words before the longest match. The text is cut at the character
    # offset first and only then split into words: a character offset must
    # not be used as an index into the word list.
    prefix = ' '.join(content[:longest_start].split()[-50:])

    return {
        'file_path': file_path,
        'has_management_pattern': True,
        'longest_match': prefix + ' ' + longest_text,
        'word_count': longest_word_count,
        'has_other_items': False
    }

def main():
    """
    Scans every original 10-K text file under the Mapped_Files folder and
    writes the longest qualifying MDA candidate of each file to 'longest_mdas.txt'.

    Files are processed in a thread pool with check_management_pattern();
    only results flagged 'has_management_pattern' are written.
    """
    base_directory = r'C:\Users\abbra\Documents\Research\Koval Paper\Data\Mapped_Files'
    output_file = r'C:\Users\abbra\Documents\Research\Koval Paper\Data\Output\longest_mdas.txt'

    all_files = []
    for root, dirs, files in os.walk(base_directory):
        for file in files:
            if file.lower().endswith('.txt') and ('10-k' in file.lower() or '10-k405' in file.lower()) and not any(x in file.lower() for x in ['10-k/a', '10-k-a', '10-k405-a']):
                all_files.append(os.path.join(root, file))

    # os.cpu_count() may return None when the count cannot be determined
    max_workers = min(32, (os.cpu_count() or 1) + 4)
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_file = {executor.submit(check_management_pattern, file_path): file_path for file_path in all_files}

        all_results = []
        for future in tqdm(as_completed(future_to_file), total=len(all_files), desc="Processing files"):
            file_path = future_to_file[future]
            try:
                result = future.result(timeout=60)
                if result['has_management_pattern']:
                    all_results.append(result)
            except Exception as exc:
                logging.error(f"ERROR: {file_path} generated an exception: {exc}")

    # Make sure the output folder exists before opening the file for writing
    os.makedirs(os.path.dirname(output_file), exist_ok=True)

    # Write longest MDAs to file
    with open(output_file, 'w', encoding='utf-8') as f:
        for result in all_results:
            f.write(f"File: {result['file_path']}\n")
            f.write(f"Word count: {result['word_count']}\n")
            # Candidates that mention other items more than twice were
            # already filtered out, so this line is constant.
            f.write("Contains other items more than twice: No\n\n")
            f.write(result['longest_match'].strip())
            f.write("\n\n---------------\n\n")

    print(f"Total files processed: {len(all_files)}")
    print(f"Files with valid management pattern: {len(all_results)}")
    print(f"Longest MDAs saved to: {output_file}")

# Script entry point: run main() only when executed directly.
if __name__ == "__main__":
    main()

In [None]:
from pprint import pprint

# Path of the single filing that is passed to check_management_pattern further down in this cell
file_path = r"C:\Users\abbra\Documents\Research\Koval Paper\Data\10-X_C_2001-2005\2001\QTR1\20010201_10-K_edgar_data_775473_0001012870-01-000343.txt"


def check_management_pattern(file_path):
    """
    Collect every match of the module-level `management_pattern` regex in a file.

    Parameters:
    - file_path (str): Path to the filing text file.

    Returns:
    - dict: Always contains 'file_path', 'has_management_pattern' (bool),
      'matches' (the list returned by management_pattern.findall) and
      'has_other_items' (always False). When nothing matches, the dict also
      carries 'longest_match' ("") and 'word_count' (0).
    """
    # Undecodable bytes are dropped rather than raising
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        content = file.read()

    # No length or "other item" filtering is applied: every match is kept
    matches = management_pattern.findall(content)

    if not matches:
        return {
            'file_path': file_path,
            'has_management_pattern': False,
            'longest_match': "",
            'word_count': 0,
            'matches': [],
            'has_other_items': False
        }

    return {
        'file_path': file_path,
        'has_management_pattern': True,
        'matches': matches,
        'has_other_items': False
    }

# Spot-check one filing; a missing file is reported instead of raising
if not os.path.exists(file_path):
    print(f"Error: File not found - {file_path}")
    print("Please check the file path and ensure it exists.")
else:
    # Run the pattern check on this single file
    result = check_management_pattern(file_path)

    if not result['has_management_pattern']:
        print(f"No valid management pattern found in {os.path.basename(file_path)}")
    else:
        # Show the file name, the other-items flag, and every extracted match
        print(f"File: {os.path.basename(result['file_path'])}")
        print(f"Contains other items more than twice: {'Yes' if result['has_other_items'] else 'No'}")
        print("\nExtracted MDA content:")
        pprint(result['matches'], width=120)

In [None]:
import os

# Filing to dump for manual inspection (the file name carries a 10-K405-A form tag)
file_path = r'C:\Users\abbra\Documents\Research\Koval Paper\Data\10-X_C_2001-2005\2001\QTR1\20010117_10-K405-A_edgar_data_785786_0000950124-01-000209.txt'

# errors='ignore' matches the other filing readers in this notebook, so a stray
# non-UTF-8 byte cannot abort the read with UnicodeDecodeError
with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
    content = file.read()

print(f"Successfully opened file: {os.path.basename(file_path)}")
print("File content: ", content)

In [None]:
# Load the table of extracted MDA sections
mda_csv_path = r'C:\Users\abbra\Documents\Research\Koval Paper\Data\Output\all_extracted_MDAs_2001-2020.csv'
df = pd.read_csv(mda_csv_path)

# Distinct values of the company_name column
unique_companies = df['company_name'].unique()

# One company name per line, under a heading
print("List of unique company names:")
for company in unique_companies:
    print(company)

# Closing summary line with the number of distinct names
print(f"\nTotal number of unique companies: {len(unique_companies)}")


In [None]:
import requests
import time
import pandas as pd



# Load the extracted MDA table and keep at most the first 1000 distinct CIKs
df = pd.read_csv(r'C:\Users\abbra\Documents\Research\Koval Paper\Data\Output\all_extracted_MDAs_2001-2020.csv')
cik_numbers = df['cik_number'].unique()[:1000]

# Tally how many of those CIKs resolve to a ticker.
# cik_to_ticker is defined outside this cell; a falsy return counts as "no ticker".
total_companies = len(cik_numbers)
companies_with_tickers = 0
for cik in cik_numbers:
    tickers = cik_to_ticker(str(cik))
    companies_with_tickers += 1 if tickers else 0

# Print summary
print(f"\nTotal companies: {total_companies}")
print(f"Companies with tickers: {companies_with_tickers}")
print(f"Companies without tickers: {total_companies - companies_with_tickers}")


In [None]:
# Export the paired MDA texts of one row of the paired-reports CSV to a txt file
import os

# Define the input CSV file path
input_csv_path = r'C:\Users\abbra\Documents\Research\Koval Paper\Data\Output\all_paired_mda_reports.csv'

# Zero-based position of the row to export (1 selects the second data row)
row_index = 1
required_columns = ('current_mda_content', 'next_mda_content', 'company_name')

# Check if the CSV file exists
if os.path.exists(input_csv_path):
    # Read only as many rows as are needed to reach row_index
    df = pd.read_csv(input_csv_path, nrows=row_index + 1)

    # The row must exist and every column read below must be present;
    # checking only "not empty" would let iloc raise on a short file
    if len(df) > row_index and all(column in df.columns for column in required_columns):
        selected_row = df.iloc[row_index]
        first_mda_content = selected_row['current_mda_content']
        second_mda_content = selected_row['next_mda_content']

        company_name = selected_row['company_name']

        # Define the output file path
        output_file_path = r'C:\Users\abbra\Documents\Research\Koval Paper\Data\Output\mda_contents.txt'

        # Write the content to a txt file
        with open(output_file_path, 'w', encoding='utf-8') as file:

            file.write(f"company_name: {company_name}\n")
            file.write('Current MDA Content:\n')
            # str() keeps a missing (NaN) cell from raising TypeError in write()
            file.write(str(first_mda_content))
            file.write('\n\n---------------------------\n\n')
            file.write('Next MDA Content:\n')
            file.write(str(second_mda_content))

        print(f"First MDA content has been saved to: {output_file_path}")
    else:
        print(f"Error: CSV file has fewer than {row_index + 1} rows or is missing one of the columns {required_columns}.")
else:
    print(f"Error: CSV file not found at {input_csv_path}")


Correlation Data

In [None]:
# MDA Extraction Test 1 

import os
import re
import csv
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import traceback
import logging
from datetime import datetime, timedelta
import yfinance as yf
import pandas as pd
import numpy as np


# Define the path for the log file
log_directory = r'C:\Users\abbra\Documents\Research\Koval Paper\Data\Output'
log_file = os.path.join(log_directory, 'mda_extraction.log')

# Ensure the log directory exists (must run before basicConfig opens the log file)
os.makedirs(log_directory, exist_ok=True)

# Configure the root logger to write INFO and above to the log file.
# NOTE(review): logging.basicConfig is documented to do nothing when the root logger
# already has handlers; if an earlier cell of this notebook configured logging first,
# this call may have no effect. Confirm which log file is actually written.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    filename=log_file,
    filemode='w'  # 'w' mode overwrites the file, use 'a' for append
)

# Add a StreamHandler for tqdm to work correctly
# NOTE(review): every execution of this cell attaches another console handler to the
# root logger, so re-running it presumably duplicates ERROR lines in the console. Confirm.
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.ERROR)  # Only show ERROR level logs in the console
logging.getLogger().addHandler(console_handler)


# ================================
# 1. Define Regular Expressions
# ================================


# MDA start pattern: an "Item 7" heading (not "7A" / "7.A"), an optional period and
# dash, then one of several spellings of "Management"
ITEM7_PATTERNS = [
    r'\b(?:Item|ITEM)\s*7\b(?!A|\.A)\.?(?:\s*[-–—])?\s*(?:Management[\'\s]?s?|Managements?|Managment|Manag[ae]?ment[\'\s]?s?)\b'
]

ITEM7_REGEX = re.compile('|'.join(ITEM7_PATTERNS), re.IGNORECASE)

# MDA end patterns: an "Item 7A" heading, an "Item 8" heading, or an
# "Item 7 ... Financial Statements" heading.
# The regex is compiled with IGNORECASE, so separate "Item"/"ITEM" spellings are not
# needed. The bare "Item 7A" alternative always wins before any longer
# "Item 7A. Quantitative ..." alternative could be tried, so that longer form is omitted.
ITEM7A_OR_8_PATTERNS = [
    r'\bItem\s*7A\.?\s*',
    r'\bItem\s*8\.?\s*',
    r'\bItem\s*7\.?\s*Financial\s+Statements\b',
]
ITEM7A_OR_8_REGEX = re.compile('|'.join(ITEM7A_OR_8_PATTERNS), re.IGNORECASE)


# ================================
# 1. Helper Functions
# ================================


def get_stock_data(ticker, start_date, end_date):
    """
    Return the 'Close' column of the yfinance price history for one ticker.

    Parameters:
    - ticker: Symbol handed to yf.Ticker.
    - start_date: Passed through as the `start` argument of history().
    - end_date: Passed through as the `end` argument of history().

    Returns:
    - The 'Close' column of the frame returned by history().
    """
    history = yf.Ticker(ticker).history(start=start_date, end=end_date)
    return history['Close']

def calculate_mdd(prices):
    """
    Calculate Maximum Drawdown (MDD) for a given price series.

    Parameters:
    - prices (pandas.Series): Prices in chronological order.

    Returns:
    - float: The largest decline from a running peak, expressed as a fraction of
      that peak. 0 for an empty series or one that never falls below its peak.
    """
    # An empty series has no first element to seed the running peak
    if len(prices) == 0:
        return 0

    peak = prices.iloc[0]
    mdd = 0
    for price in prices:
        if price > peak:
            peak = price
        # A non-positive peak would divide by zero (or give a non-positive ratio),
        # so it cannot contribute a drawdown
        if peak > 0:
            dd = (peak - price) / peak
            if dd > mdd:
                mdd = dd
    return mdd

def label_risk(mdds):
    """
    Flag each drawdown as High Risk (1) or Normal Risk (0).

    A value is High Risk when it is at or above the 80th percentile of `mdds`.

    Parameters:
    - mdds: Sequence of drawdown values.

    Returns:
    - list of int: One 0/1 label per input value, in input order.
    """
    cutoff = np.percentile(mdds, 80)
    at_or_above_cutoff = np.asarray(mdds) >= cutoff
    return at_or_above_cutoff.astype(int).tolist()

def prepare_risk_prediction_data(results):
    """
    Prepare the dataset for risk prediction.

    For each company, every report after its first is combined with the report
    before it and with the maximum drawdown of the stock over the 365 days that
    follow the filing date. Rows are then labelled per filing year with label_risk.

    Parameters:
    - results (list of dict): Reports with 'cik_number', 'filing_date'
      ('YYYY-MM-DD'), 'company_name' and 'mda_content'.

    Returns:
    - list of dict: One record per usable report with keys 'cik', 'company_name',
      'current_mda', 'previous_mda', 'mdd', 'filing_date', 'year' and
      'risk_label'. Empty when no report yields price data.
    """
    # Group once instead of rescanning `results` for every company
    reports_by_cik = {}
    for report in results:
        reports_by_cik.setdefault(report['cik_number'], []).append(report)

    data = []
    for cik, reports in reports_by_cik.items():
        company_reports = sorted(reports, key=lambda x: x['filing_date'])
        for previous_report, current_report in zip(company_reports, company_reports[1:]):
            start_date = current_report['filing_date']

            try:
                # Parsed inside the try so an unparseable date (e.g. "Unknown") only
                # skips this report instead of aborting the whole run
                end_date = (datetime.strptime(start_date, '%Y-%m-%d') + timedelta(days=365)).strftime('%Y-%m-%d')

                # NOTE(review): the CIK is passed where get_stock_data expects a
                # ticker; presumably it should be mapped to a symbol first. Confirm.
                stock_prices = get_stock_data(cik, start_date, end_date)

                if len(stock_prices) > 0:
                    mdd = calculate_mdd(stock_prices)
                    data.append({
                        'cik': cik,
                        'company_name': current_report['company_name'],
                        'current_mda': current_report['mda_content'],
                        'previous_mda': previous_report['mda_content'],
                        'mdd': mdd,
                        'filing_date': current_report['filing_date']
                    })
            except Exception as e:
                logging.error(f"Error fetching stock data for CIK {cik}: {e}")

    # With no rows the frame would have no 'filing_date' column to read
    if not data:
        return []

    # Label risks within each filing year
    df = pd.DataFrame(data)
    df['year'] = pd.to_datetime(df['filing_date']).dt.year

    # Write each year's labels back onto that year's own rows; assigning the
    # grouped result directly would align one list per year against the row index
    df['risk_label'] = 0
    for _, group in df.groupby('year'):
        df.loc[group.index, 'risk_label'] = label_risk(group['mdd'].values)

    return df.to_dict('records')
    
    
def contains_other_items(mda_content):
    """
    Report whether the MDA text references an item number other than "7".

    Parameters:
    - mda_content (str): The content of the MDA section.

    Returns:
    - bool: True if such a reference is present, False otherwise.
    """
    # "Item"/"ITEM", whitespace, then digits that are not exactly the standalone number 7
    other_item_regex = re.compile(r'\b(?:Item|ITEM)\s+(?!7\b)\d+')
    return other_item_regex.search(mda_content) is not None

def extract_company_name(content):
    """
    Pull the company name out of the "COMPANY CONFORMED NAME:" header line.

    Parameters:
    - content (str): The full text of the 10-K document.

    Returns:
    - str: The name with surrounding whitespace removed, or "Unknown" when the
      header line is absent.
    """
    name_match = re.search(r'COMPANY CONFORMED NAME:\s*(.+)$', content, re.MULTILINE)
    if name_match is None:
        return "Unknown"
    return name_match.group(1).strip()

def extract_filing_date(content):
    """
    Read the eight-digit "FILED AS OF DATE:" header value and format it.

    Parameters:
    - content (str): The full text of the 10-K document.

    Returns:
    - str: The date as YYYY-MM-DD, or "Unknown" when the header is absent.
    """
    date_match = re.search(r'FILED AS OF DATE:\s*(\d{8})', content)
    if not date_match:
        return "Unknown"

    # Split the digits YYYYMMDD into their three parts
    digits = date_match.group(1)
    return '-'.join((digits[:4], digits[4:6], digits[6:]))

def extract_company_id(content):
    """
    Find the ten-digit company identifier (CIK) in the file content.

    Parameters:
    - content (str): The content of the file.

    Returns:
    - str or None: The ten digits, or None when neither pattern matches.
    """
    # The header form is tried first; the "CIK=" form is the fallback
    cik_patterns = (
        r'CENTRAL INDEX KEY:\s*(\d{10})',
        r'CIK=(\d{10})',
    )
    for cik_pattern in cik_patterns:
        found = re.search(cik_pattern, content)
        if found:
            return found.group(1)

    return None

def count_companies(content):
    """
    Estimate how many companies a filing covers.

    The estimate is the largest of three signals: whether a CIK was extracted
    (0 or 1), whether a company name was extracted (0 or 1), and the number of
    "FILER: / COMPANY DATA:" header blocks.

    Parameters:
    - content (str): The full text of the 10-K document.

    Returns:
    - int: Number of companies detected.
    """
    # The two helpers return at most one value each, so these signals are 0 or 1
    cik_signal = 1 if extract_company_id(content) else 0
    name_signal = 1 if extract_company_name(content) != "Unknown" else 0

    # Every filer header block counts once
    filer_signal = sum(1 for _ in re.finditer(r'FILER:\s*\n\s*COMPANY DATA:', content))

    return max(cik_signal, name_signal, filer_signal)

# Detect table-of-contents text so it can be filtered out
def is_table_of_contents(text, strict=True):
    """
    Heuristically decide whether a block of text is a table of contents.

    The checks run in order and the first one that fires returns True: fewer
    than 200 words, number of layout-pattern matches, average spacing between
    "Item N" references, share of words that are item references, and share of
    short paragraphs.

    Parameters:
    - text (str): The text to analyze.
    - strict (bool): Selects which thresholds the match-count, spacing and
      density checks use.

    Returns:
    - bool: True if the text is likely a table of contents, False otherwise.
    """
    logging.info("Checking if the text is a table of contents...")

    # Anything under 200 words is treated as a table of contents outright
    tokens = text.split()
    if len(tokens) < 200:
        logging.info("Text is short, likely a table of contents.")
        return True

    # Layout cues typical of a table of contents
    toc_regex = re.compile(
        '|'.join((
            r'\bItem\s+\d+[A-Z]?\.\s+.*\d+\s*$',  # Item followed by number and page number
            r'\bTable\s+of\s+Contents\b',
            r'\b(Page|p\.)\s+\d+\b',
            r'\bItem\s+\d+[A-Z]?\..*\n.*\d+\s*$',  # Item title followed by page number on next line
            r'^\s*\d+\s*$',  # Standalone page numbers
            r'\bItem\s+\d+[A-Z]?.*\d+\s*\b',  # Item followed by any text and a number (page number)
            r'\bItem\s+\d+[A-Z]?\.?\s+[^.]+?(?=\s+Item|\s+\d+|$)',  # Item followed by title, ending at next Item or number
        )),
        re.IGNORECASE | re.MULTILINE,
    )

    hit_count = len(toc_regex.findall(text))
    if strict and hit_count > 3:
        logging.info(f"Detected {hit_count} matches in strict mode, likely a table of contents.")
        return True
    if not strict and hit_count > 5:
        logging.info(f"Detected {hit_count} matches in less strict mode, likely a table of contents.")
        return True

    # Start offset of every "Item N" reference
    item_starts = [m.start() for m in re.finditer(r'\bItem\s+\d+[A-Z]?', text, re.IGNORECASE)]
    item_total = len(item_starts)
    if item_total > 2:
        # Mean gap between consecutive references; the pairwise gaps add up to last minus first
        avg_distance = (item_starts[-1] - item_starts[0]) / (item_total - 1)
        if strict and avg_distance < 200:
            logging.info("High density of item listings detected, likely a table of contents.")
            return True
        if not strict and avg_distance < 100:
            logging.info("High density of item listings detected in less strict mode, likely a table of contents.")
            return True

    # Share of words that are item references
    item_density = item_total / len(tokens)
    if strict and item_density > 0.01:
        logging.info("Item density exceeds 1%, likely a table of contents.")
        return True
    if not strict and item_density > 0.02:
        logging.info("Item density exceeds 2% in less strict mode, likely a table of contents.")
        return True

    # Share of blank-line-separated paragraphs shorter than 20 words
    paragraphs = text.split('\n\n')
    short_total = sum(1 for paragraph in paragraphs if len(paragraph.split()) < 20)
    if short_total / len(paragraphs) > 0.5:
        logging.info("More than half of paragraphs are short, likely a table of contents.")
        return True

    logging.info("Text does not appear to be a table of contents.")
    return False


def pair_reports(results):
    """
    Pairs each company's consecutive reports when they were filed 270 to 450 days apart.

    Reports whose filing date cannot be parsed as YYYY-MM-DD (for example
    "Unknown") and reports without a CIK are skipped with a warning, so one bad
    record neither aborts pairing nor merges unrelated companies.

    Parameters:
    - results (list of dict): Extracted MDA sections with 'cik_number',
      'filing_date', 'filename', 'company_name' and 'mda_content'.

    Returns:
    - list of dict: Paired reports with keys 'current_filename', 'next_filename',
      'company_name', 'cik_number', 'current_filing_date', 'next_filing_date',
      'current_mda_content', 'next_mda_content' and 'time_difference' (days).
    """
    # Group reports by company, parsing each filing date exactly once
    dated_reports_by_cik = {}
    for report in results:
        cik = report['cik_number']
        if cik is None:
            logging.warning(f"Skipping report {report.get('filename')}: no CIK number.")
            continue
        try:
            filed_on = datetime.strptime(report['filing_date'], '%Y-%m-%d')
        except (TypeError, ValueError):
            logging.warning(
                f"Skipping report {report.get('filename')}: unparseable filing date {report['filing_date']!r}."
            )
            continue
        dated_reports_by_cik.setdefault(cik, []).append((filed_on, report))

    # Accept consecutive filings roughly 9-15 months apart
    min_gap = timedelta(days=270)
    max_gap = timedelta(days=450)

    paired_reports = []
    for cik, dated_reports in dated_reports_by_cik.items():
        # Sort by filing date; the sort is stable, so ties keep their input order
        dated_reports.sort(key=lambda dated: dated[0])

        for (current_date, current_report), (next_date, next_report) in zip(dated_reports, dated_reports[1:]):
            gap = next_date - current_date
            if min_gap <= gap <= max_gap:
                paired_reports.append({
                    'current_filename': current_report['filename'],
                    'next_filename': next_report['filename'],
                    'company_name': current_report['company_name'],
                    'cik_number': cik,
                    'current_filing_date': current_report['filing_date'],
                    'next_filing_date': next_report['filing_date'],
                    'current_mda_content': current_report['mda_content'],
                    'next_mda_content': next_report['mda_content'],
                    'time_difference': gap.days
                })

    return paired_reports

def pair_and_save_reports(results, output_directory):
    """
    Pair the reports with pair_reports and write the pairs to
    'paired_mda_reports.csv' inside the output directory.

    Nothing is written when no pairs are produced; a warning is logged instead.

    Parameters:
    - results (list of dict): Extracted MDA sections with filenames and filing dates.
    - output_directory (str): Directory to save the paired reports CSV.

    Returns:
    - None
    """
    paired_reports_csv = os.path.join(output_directory, 'paired_mda_reports.csv')
    paired_reports = pair_reports(results)

    if not paired_reports:
        logging.warning("WARNING: No paired reports found.")
        return

    # Column order of the output file
    columns = [
        'current_filename', 'next_filename', 'company_name', 'cik_number',
        'current_filing_date', 'next_filing_date', 'current_mda_content',
        'next_mda_content', 'time_difference',
    ]
    with open(paired_reports_csv, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=columns)
        writer.writeheader()
        writer.writerows(paired_reports)
    logging.info(f"INFO: Paired reports saved to {paired_reports_csv}")

# ================================
# 2. MDA Text Extraction
# ================================


def extract_mda_section(text):
    """
    Extracts the Management's Discussion and Analysis (MDA) section from the given text.

    Every ITEM7_REGEX match is tried in document order as a candidate start. A
    candidate ends at the first ITEM7A_OR_8_REGEX match found at least 100
    characters after its start, or at the end of the document. The first
    candidate that is not judged a table of contents and has at least 500 words
    is returned.

    Parameters:
    - text (str): The full text of the 10-K document.

    Returns:
    - dict or None: A dictionary with keys 'mda_content', 'start_index', 'end_index',
      'start_pattern', 'end_pattern' and 'word_count', or None if not found.
      The indices refer to the whitespace-normalized text, not to `text`.
    """

    logging.info("Normalizing text for extraction...")
    # Collapse every whitespace run (newlines included) into a single space
    normalized_text = re.sub(r'\s+', ' ', text)
    
    item7_matches = list(ITEM7_REGEX.finditer(normalized_text))
    logging.info(f"Found {len(item7_matches)} potential Item 7 matches.")
    
    if not item7_matches:
        logging.info("No Item 7 matches found.")
        return None
    
    for start_match in item7_matches:
        start_index = start_match.start()
        start_pattern = start_match.group()
        logging.info(f"Found potential MDA start at index {start_index}: '{start_pattern}'")
        
        # NOTE(review): ITEM7_PATTERNS as defined in this cell only matches "Item 7"
        # headings, so this Item 6 guard looks unreachable. Confirm before removing.
        if 'item 6' in start_pattern.lower() and 'management' not in start_pattern.lower() and 'md&a' not in start_pattern.lower():
            logging.info("Skipping Item 6 as it is not explicitly an MDA.")
            continue
        
        # Find end of MDA; the search begins 100 characters past the start match
        end_match = ITEM7A_OR_8_REGEX.search(normalized_text[start_index + 100:])
        if end_match:
            # Convert the offset within the slice back to an index into normalized_text
            end_index = start_index + 100 + end_match.start()
            logging.info(f"Found potential MDA end at index {end_index}.")
        else:
            end_index = len(normalized_text)
            logging.info("No end pattern found, using end of document.")
        
        mda_text = normalized_text[start_index:end_index]
        
        # Check if the entire section is a table of contents
        if is_table_of_contents(mda_text):
            logging.info("This section appears to be a table of contents. Skipping.")
            continue
        
        # If the beginning looks like a table of contents, try to find the real start of the MDA
        first_1000_words = ' '.join(mda_text.split()[:1000])
        if is_table_of_contents(first_1000_words):
            logging.info("The beginning looks like a table of contents. Searching for the real MDA start.")
            # NOTE(review): mda_text is a slice of normalized_text, which contains no
            # newlines, so the (?=\n\n) lookahead cannot succeed and this branch
            # always reaches the `continue` below. Confirm the intended behavior.
            real_start_match = re.search(r'(Management\'s\s+Discussion\s+and\s+Analysis|MD&A).{0,500}?(?=\n\n)', mda_text, re.IGNORECASE | re.DOTALL)
            if real_start_match:
                start_index += real_start_match.start()
                mda_text = mda_text[real_start_match.start():]
                logging.info(f"Found real MDA start at index {start_index}.")
            else:
                logging.info("Couldn't find the real MDA start. Skipping this occurrence.")
                continue
        
        # Check word count; only sections with at least 500 words are accepted
        word_count = len(mda_text.split())
        logging.info(f"Extracted MDA section with word count: {word_count}.")
        if 500 <= word_count:
            return {
                'mda_content': mda_text,
                'start_index': start_index,
                'end_index': end_index,
                'start_pattern': start_pattern,
                'end_pattern': end_match.group() if end_match else "End of document",
                'word_count': word_count
            }
    
    logging.info("No suitable MDA section found.")
    return None


# Process a single 10-K file: read it, check it covers one company, and extract its MDA section
def process_file(file_path):
    """
    Read one 10-K file and extract its MDA section together with filing metadata.

    Files for which count_companies does not report exactly one company are
    skipped. Any exception is logged, its traceback printed, and None returned.

    Parameters:
    - file_path (str): Path to the 10-K file.

    Returns:
    - dict or None: The extract_mda_section result extended with 'cik_number',
      'company_name', 'filing_date' and 'filename', or None if extraction fails.
    """
    try:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as handle:
            content = handle.read()

        # Only single-company filings are processed
        num_companies = count_companies(content)
        if num_companies != 1:
            logging.info(f"Skipping file {file_path}: Detected {num_companies} companies.")
            return None

        # Header metadata is read before the MDA search
        metadata = {
            'cik_number': extract_company_id(content),
            'company_name': extract_company_name(content),
            'filing_date': extract_filing_date(content),
        }

        result = extract_mda_section(content)
        if not result:
            return None

        result.update(metadata)
        result['filename'] = os.path.basename(file_path)
        return result
    except Exception as e:
        logging.error(f"Error processing file {file_path}: {e}")
        traceback.print_exc()
        return None

        
# ================================
# 3. Main Processing Function
# ================================

def _collect_10k_files(base_directory, periods):
    """Return paths of the non-amended 10-K text files under base_directory/<period>/<year>/<QTRn>."""
    # Same amendment markers as the other main() in this notebook, so that
    # 10-K405-A amendments are excluded here as well
    amendment_markers = ('10-k/a', '10-k-a', '10-k405-a')

    collected = []
    for period in periods:
        period_path = os.path.join(base_directory, period)
        if not os.path.isdir(period_path):
            continue
        for year in os.listdir(period_path):
            year_path = os.path.join(period_path, year)
            if not (os.path.isdir(year_path) and year.isdigit()):
                continue
            for quarter in ('QTR1', 'QTR2', 'QTR3', 'QTR4'):
                quarter_path = os.path.join(year_path, quarter)
                if not os.path.isdir(quarter_path):
                    continue
                logging.info(f"Processing {period} - {year} {quarter}")
                for f in os.listdir(quarter_path):
                    name = f.lower()
                    if (name.endswith('.txt') and '10-k' in name
                            and not any(marker in name for marker in amendment_markers)):
                        collected.append(os.path.join(quarter_path, f))
    return collected


def _write_csv(path, fieldnames, rows):
    """Write the row dicts to a UTF-8 CSV file with a header line."""
    with open(path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)


def main():
    """
    Main function to process 10-K files, extract MDA sections, and generate paired reports.

    All extracted sections are saved to one CSV. Sections that mention no item
    other than Item 7 are then paired per company and saved to a second CSV.

    Parameters:
    - None

    Returns:
    - tuple: A tuple containing filtered results and paired reports.
    """

    base_directory = r'C:\Users\abbra\Documents\Research\Koval Paper\Data'
    output_directory = r'C:\Users\abbra\Documents\Research\Koval Paper\Data\Output'
    os.makedirs(output_directory, exist_ok=True)

    # Process specified periods
    periods = ['10-X_C_2001-2005', '10-X_C_2006-2010', '10-X_C_2011-2015', '10-X_C_2016-2020']
    all_files = _collect_10k_files(base_directory, periods)

    # Process files concurrently.
    # os.cpu_count() is documented to return None when the count is unknown.
    all_results = []
    max_workers = min(32, (os.cpu_count() or 1) + 4)
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_file = {executor.submit(process_file, file_path): file_path for file_path in all_files}

        for future in tqdm(as_completed(future_to_file), total=len(all_files), desc="Processing files"):
            file_path = future_to_file[future]
            try:
                # as_completed only yields finished futures, so no timeout is needed
                result = future.result()
                if result:
                    all_results.append(result)
            except Exception as exc:
                logging.error(f"ERROR: {file_path} generated an exception: {exc}")

    # Filter out MDAs with mentions of items other than Item 7
    filtered_results = [result for result in all_results if not contains_other_items(result['mda_content'])]

    logging.info(f"Total MDAs extracted: {len(all_results)}")
    logging.info(f"MDAs with only Item 7 mentioned: {len(filtered_results)}")

    # Save all results to a single CSV (including those with other items mentioned)
    output_csv_file = os.path.join(output_directory, 'all_extracted_MDAs2006-2010.csv')
    fieldnames = ['filename', 'company_name', 'cik_number', 'filing_date', 'mda_content', 'start_index', 'end_index', 'end_pattern', 'start_pattern', 'word_count']
    _write_csv(output_csv_file, fieldnames, all_results)
    logging.info(f"INFO: All extracted MDA sections and metadata saved to {output_csv_file}")

    # Pair reports using filtered results
    paired_reports = pair_reports(filtered_results)

    # Save paired reports
    paired_reports_csv = os.path.join(output_directory, 'all_paired_mda_reports.csv')
    if paired_reports:
        paired_fieldnames = ['current_filename', 'next_filename', 'company_name', 'cik_number',
                             'current_filing_date', 'next_filing_date', 'current_mda_content',
                             'next_mda_content', 'time_difference']
        _write_csv(paired_reports_csv, paired_fieldnames, paired_reports)
        logging.info(f"INFO: All paired reports saved to {paired_reports_csv}")
    else:
        logging.warning("WARNING: No paired reports found.")

    print(f"Total files processed: {len(all_files)}")
    print(f"Number of files with MDA sections: {len(all_results)}")
    print(f"Number of MDAs with only Item 7 mentioned: {len(filtered_results)}")
    print(f"Number of paired reports: {len(paired_reports)}")

    return filtered_results, paired_reports

if __name__ == "__main__":
    filtered_results, paired_reports = main()

In [None]:
import pandas as pd
from pprint import pprint

# Load the paired reports CSV
paired_reports_df = pd.read_csv(r'C:\Users\abbra\Documents\Research\Koval Paper\Data\Output\all_paired_mda_reports.csv')
output_file = r'C:\Users\abbra\Documents\Research\Koval Paper\Data\Output\pairedMDA_row1_preview.txt'

# Zero-based position of the pair to preview
row_index = 1028

# Make sure the requested row exists; a non-empty frame can still be too short
if row_index < len(paired_reports_df):
    first_pair = paired_reports_df.iloc[row_index]

    with open(output_file, 'w', encoding='utf-8') as f:
        # Write the filenames and MDA contents; each label goes on its own line
        f.write(f"Current filename: {first_pair['current_filename']}\n")
        f.write(f"Length of MDA in words: {len(first_pair['current_mda_content'].split())}")
        f.write(f"\nCurrent MDA content: {first_pair['current_mda_content']}")
        f.write("\n\n" + "="*50 + "\n\n")  # Separator
        f.write(f"Next filename: {first_pair['next_filename']}\n")
        f.write(f"Length of MDA in words: {len(first_pair['next_mda_content'].split())}")
        f.write(f"\nNext MDA content: {first_pair['next_mda_content']}")

    print("File saved successfully.")

else:
    print(f"Row {row_index} not found: the CSV file has only {len(paired_reports_df)} paired reports.")


File Processing

In [None]:
import pandas as pd

# Flag extracted sections whose text mentions "Item 7A", which may mean the section
# belongs to Item 7A rather than Item 7.
data = pd.read_csv(r'C:\Users\abbra\Documents\Research\Koval Paper\Data\Output\all_extracted_MDAs2006-2010.csv')

# Boolean mask of rows mentioning Item 7A; missing content counts as no mention
mentions_item_7a = data['mda_content'].str.contains("ITEM 7A|Item 7A", case=False, na=False)
item_7a_instances = data[mentions_item_7a]

# Show the first flagged rows, then how many rows were flagged in total
print(item_7a_instances[['filename', 'mda_content']].head())
item_7a_count = len(item_7a_instances)

print(f"Number of rows with Item 7A: {item_7a_count}")

# Print the first 50 words of every flagged section
for index, row in item_7a_instances.iterrows():
    preview = ' '.join(row['mda_content'].split()[:50])

    print(f"\nContent from {row['filename']}:\n")
    print(preview)
    print("\n" + "-" * 50)  # 50 dashes as a separator

""" 
                                                  filename  \
39   20110112_10-K_edgar_data_805305_0000950123-11-...   
82   20110114_10-K_edgar_data_1156884_0001079974-11...   
109  20110121_10-K_edgar_data_20740_0001144204-11-0...   
110  20110120_10-K_edgar_data_39368_0001193125-11-0...   
115  20110125_10-K_edgar_data_1090061_0001193125-11...   

                                           mda_content  
39   Item 7A. QUANTITATIVE AND QUALITATIVE DISCLOSU...  
82   Item 7a. Quantitative And Qualitative Disclosu...  
109  Item 7A. Quantitative and Qualitative Disclosu...  
110  Item 7A. Quantitative and Qualitative Disclosu...  
115  Item 7A. Quantitative and Qualitative Disclosu...  
Number of rows with Item 7A: 878
"""

""" 
                                             filename  \
513  20110223_10-K_edgar_data_1289790_0001188112-11...   
688  20110225_10-K_edgar_data_1038357_0001193125-11...   
706  20110225_10-K_edgar_data_1090012_0000950123-11...   
716  20110225_10-K_edgar_data_1115836_0001104659-11...   
763  20110225_10-K_edgar_data_1407463_0001407463-11...   

                                           mda_content  
513  Item 7, and Quantitative and Qualitative Discl...  
688  Item 7. Management's Discussion and Analysis o...  
706  Item 7. Management s Discussion and Analysis o...  
716  Item 7 Management s Discussion and Analysis of...  
763  Management's Discussion and Analysis of Financ...  
Number of rows with Item 7A: 30
"""

In [None]:
import pandas as pd
import os

# Load the extracted MDA sections (columns used here: 'filename', 'mda_content')
data = pd.read_csv(r'C:\Users\abbra\Documents\Research\Koval Paper\Data\Output\all_extracted_MDAs2006-2010.csv')

# Text file that receives one 100-word preview per row
output_file = 'C:/Users/abbra/Documents/Research/Koval Paper/Data/Output/mda_previews.txt'

with open(output_file, 'w', encoding='utf-8') as f:
    # Walk the two needed columns in lockstep; this avoids building a Series
    # for every row the way iterrows() does.
    for filename, content in zip(data['filename'], data['mda_content']):
        # read_csv turns empty cells into NaN (a float), which has no .split().
        # Treat any non-text cell as an empty MDA instead of aborting the export.
        words = content.split()[:100] if isinstance(content, str) else []

        # Re-join the leading words with single spaces
        preview = ' '.join(words)

        f.write(f"\nContent from {filename}:\n\n")
        f.write(preview)
        f.write("\n\n" + "-" * 50 + "\n")  # 50-dash separator between filings

print(f"Content previews have been saved to {output_file}")

In [None]:
import csv
import statistics

import sys

# Path to the paired-MDA CSV whose two text columns are measured below
item_7_only_csv_path = r'C:\Users\abbra\Documents\Research\Koval Paper\Data\Output\all_paired_mda_reports.csv'

# The csv module rejects any field longer than its size limit (131072
# characters by default) with "field larger than field limit".  Full MDA
# sections can exceed that, so raise the limit as far as the platform allows:
# sys.maxsize may overflow the underlying C long, hence the back-off loop.
_field_limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(_field_limit)
        break
    except OverflowError:
        _field_limit //= 10

# Word counts of every current and next MDA, in file order
mda_word_counts = []

try:
    with open(item_7_only_csv_path, 'r', newline='', encoding='utf-8') as file:
        reader = csv.DictReader(file)

        for row in reader:
            # Each row contributes two documents: the current and the next MDA
            mda_word_counts.extend((
                len(row['current_mda_content'].split()),
                len(row['next_mda_content'].split()),
            ))

    if mda_word_counts:
        average_word_count = statistics.mean(mda_word_counts)
        print(f"Average word count of mda_content in Item 7 only MDAs: {average_word_count:.2f} words")
    else:
        print("No Item 7 only MDAs found in the file.")

except Exception as e:
    # Notebook-level boundary: report the problem instead of raising
    print(f"An error occurred: {e}")


In [None]:
import csv
import pandas as pd

# Location of the paired-MDA report CSV
csv_path = r'C:\Users\abbra\Documents\Research\Koval Paper\Data\Output\all_paired_mda_reports.csv'

df = pd.read_csv(csv_path)

# .loc slices by label and includes both end points, so 114:124 picks the
# 115th through 125th rows (1-based) under read_csv's default integer index
selected_mdas = df.loc[114:124, 'current_mda_content']
word_counts = selected_mdas.apply(lambda text: len(text.split()))

print("Word counts of current_mda_content for rows 115-125:")
for row_label, n_words in word_counts.items():
    # Labels are zero-based; report them as 1-based row numbers
    print(f"Row {row_label + 1}: {n_words} words")

# Mean over the selected rows only
average_word_count = word_counts.mean()
print(f"\nAverage word count: {average_word_count:.2f} words")


In [None]:
import pandas as pd
import os

# Location of the paired-MDA report CSV
csv_path = r'C:\Users\abbra\Documents\Research\Koval Paper\Data\Output\all_paired_mda_reports.csv'

df = pd.read_csv(csv_path)

# Label 119 is the 120th row under read_csv's default zero-based index
mda_content = df.loc[119, 'current_mda_content']

# Folder that receives the text dump (the project's Output directory)
destination_directory = 'C:/Users/abbra/Documents/Research/Koval Paper/Data/Output/'
destination_file = os.path.join(destination_directory, 'row_120_mda_content.txt')

# Save the single MDA as UTF-8 text; report failures instead of raising
try:
    with open(destination_file, 'w', encoding='utf-8') as out_fh:
        out_fh.write(mda_content)
    print(f"MDA content from row 120 has been saved to {destination_file}")
except Exception as write_error:
    print(f"An error occurred while writing the file: {write_error}")


In [None]:
import shutil
import os

# Filing to pull out of the corpus for manual inspection
source_file = r'C:\Users\abbra\Documents\Research\Koval Paper\Data\10-X_C_2001-2005\2001\QTR3\20010918_10-K_edgar_data_789019_0001032210-01-501099.txt'

# Target folder (user-specific; adjust the username as needed)
destination_directory = r'C:\Users\abbra\Downloads'

# Keep the original file name at the destination
original_name = os.path.basename(source_file)
destination_file = os.path.join(destination_directory, original_name)

# Copy the file and report the outcome either way
try:
    shutil.copy(source_file, destination_file)
    print(f"File copied successfully to {destination_file}")
except Exception as copy_error:
    print(f"Error occurred while copying the file: {copy_error}")

In [None]:
import os
import csv
from math import ceil
import sys

# Raise the csv module's per-field size limit as high as this platform accepts.
# csv.field_size_limit raises OverflowError when the value does not fit, so
# start at sys.maxsize and keep dividing by 10 until a value is accepted.
max_int = sys.maxsize
while True:
    try:
        csv.field_size_limit(max_int)
        break
    except OverflowError:
        max_int = int(max_int/10)

# CSV to sample from
source_file = r'C:\Users\abbra\Documents\Research\Koval Paper\Data\Output\paired_mda_reports.csv'

# Folder that receives the sample
destination_directory = r'C:\Users\abbra\Downloads'

# NOTE: the file name still says "eighth" although one twelfth of the rows is
# written; the name is kept so anything that already reads this file still finds it.
destination_file = os.path.join(destination_directory, 'testing_eighth.csv')

# Denominator of the leading fraction of rows to copy.  The computation has
# used 12 while the messages claimed "eighth"; the amount copied is unchanged
# and the messages below now report the fraction that is really used.
sample_denominator = 12

try:
    # Load every row (the first row, typically the header, counts as a row)
    with open(source_file, 'r', newline='', encoding='utf-8') as csvfile:
        all_rows = list(csv.reader(csvfile))

    # Size of the leading sample, rounded up
    sample_row_count = ceil(len(all_rows) / sample_denominator)

    # Write only the leading rows to the destination file
    with open(destination_file, 'w', newline='', encoding='utf-8') as csvfile:
        csv.writer(csvfile).writerows(all_rows[:sample_row_count])

    print(f"First 1/{sample_denominator} of the file copied successfully to {destination_file}")
    print(f"Number of rows in original file: {len(all_rows)}")
    print(f"Number of rows in new file: {sample_row_count}")

except Exception as e:
    # Notebook-level boundary: report the problem instead of raising
    print(f"Error occurred while processing the file: {e}")

In [1]:
import os
import re
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import pandas as pd

def extract_company_id(content):
    """
    Returns the first CIK number that appears in the file content.

    Parameters:
    - content (str): The full text of the 10-K document.

    Returns:
    - str: The digits following the first "CENTRAL INDEX KEY:" label,
      or None when the label is not present.
    """
    found = re.search(r'CENTRAL INDEX KEY:\s*(\d+)', content, re.MULTILINE)
    if not found:
        return None
    return found.group(1).strip()

def extract_company_name(content):
    """
    Returns the first conformed company name that appears in the file content.

    Parameters:
    - content (str): The full text of the 10-K document.

    Returns:
    - str: The text following the first "COMPANY CONFORMED NAME:" label up to
      the end of that line (surrounding whitespace removed), or "Unknown"
      when the label is not present.
    """
    found = re.search(r'COMPANY CONFORMED NAME:\s*(.+)$', content, re.MULTILINE)
    if not found:
        return "Unknown"
    return found.group(1).strip()

def check_multiple_companies(file_path):
    """
    Checks if a file contains multiple companies.

    The company count is the largest of three header-based counts:
    distinct "CENTRAL INDEX KEY:" values, distinct "COMPANY CONFORMED NAME:"
    values, and the number of "FILER:" / "COMPANY DATA:" section openers.

    Parameters:
    - file_path (str): Path to the 10-K file

    Returns:
    - dict: 'file_path' (str), 'has_multiple_companies' (bool) and
      'num_companies' (int). If the file cannot be read or processed, the
      error is logged and the file is reported with 0 companies.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Collect EVERY distinct CIK in the text. A first-match search can
        # contribute at most one value, which would reduce the max() below to
        # the FILER count alone. Comparing as int ignores zero padding.
        cik_numbers = {
            int(cik) for cik in re.findall(r'CENTRAL INDEX KEY:\s*(\d+)', content)
        }

        # Likewise collect every distinct conformed company name
        company_names = {
            name.strip()
            for name in re.findall(r'COMPANY CONFORMED NAME:\s*(.+)$', content, re.MULTILINE)
        }

        # Count "FILER:" labels that are followed, on a later line, by "COMPANY DATA:"
        filer_pattern = r'FILER:\s*\n\s*COMPANY DATA:'
        filer_count = len(re.findall(filer_pattern, content))

        # Any one of the three signals is enough to flag the file
        num_companies = max(len(cik_numbers), len(company_names), filer_count)

        return {
            'file_path': file_path,
            'has_multiple_companies': num_companies > 1,
            'num_companies': num_companies
        }

    except Exception as e:
        # Best effort: one unreadable file must not abort a whole batch run
        logging.error(f"Error processing {file_path}: {e}")
        return {
            'file_path': file_path,
            'has_multiple_companies': False,
            'num_companies': 0
        }

def main(base_directory=r'C:\Users\abbra\Documents\Research Code\Koval Paper\Data\Mapped_Files'):
    """
    Scans a directory tree for 10-K filings and reports those with multiple companies.

    Every '.txt' file whose name contains '10-k' (which also covers '10-k405')
    and none of the amendment markers is checked with check_multiple_companies
    on a thread pool. Summary lines and up to 10 examples are printed.

    Parameters:
    - base_directory (str): Root folder to walk. Defaults to the project's
      Mapped_Files directory.

    Returns:
    - list: The result dicts from check_multiple_companies for every file
      flagged as containing multiple companies, in completion order.
    """
    # File names carrying any of these markers are amendments and are skipped
    amendment_markers = ('10-k/a', '10-k-a', '10-k405-a')

    print("Counting files...")
    all_files = []
    for root, _dirs, files in os.walk(base_directory):
        for file in files:
            name = file.lower()
            # '10-k' is a substring of '10-k405', so one test covers both form types
            if (name.endswith('.txt')
                    and '10-k' in name
                    and not any(marker in name for marker in amendment_markers)):
                all_files.append(os.path.join(root, file))

    print(f"Total number of files to process: {len(all_files)}")

    # os.cpu_count() returns None when the CPU count cannot be determined;
    # fall back to 1 so the arithmetic cannot raise TypeError.
    max_workers = min(32, (os.cpu_count() or 1) + 4)

    multiple_company_files = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_file = {executor.submit(check_multiple_companies, file_path): file_path for file_path in all_files}

        for future in tqdm(as_completed(future_to_file), total=len(all_files), desc="Processing files"):
            file_path = future_to_file[future]
            try:
                # Futures yielded by as_completed have already finished, so
                # result() returns (or re-raises) immediately; no timeout needed.
                result = future.result()
                if result['has_multiple_companies']:
                    multiple_company_files.append(result)
            except Exception as exc:
                logging.error(f"ERROR: {file_path} generated an exception: {exc}")

    print(f"\nTotal files with multiple companies: {len(multiple_company_files)}")
    print("\n10 example files with multiple companies:")
    for i, file_info in enumerate(multiple_company_files[:10], 1):
        print(f"{i}. {file_info['file_path']} (Number of companies: {file_info['num_companies']})")

    return multiple_company_files  # Return the list

multiple_company_files = main()

# Paired MDA reports produced by the cleaning step
paired_mda_df = pd.read_csv(r'C:\Users\abbra\Documents\Research Code\Koval Paper\Data\Output\paired_mda_reports_CLEANED.csv')

# Distinct CIKs present in the paired dataset
paired_ciks = set(paired_mda_df['cik_number'].unique())


print("\nChecking if companies from multiple-company files are in paired_mda_reports.csv...")

# Multiple-company files whose first CIK is / is not in the paired dataset
found_companies = []
missing_companies = []

for file_info in tqdm(multiple_company_files, desc="Checking paired reports"):
    file_path = file_info['file_path']

    # Re-read the file to obtain its first CIK
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
            cik = extract_company_id(content)

            # Files without a CIK are counted in neither list
            if not cik:
                continue

            cik = int(cik)  # paired_ciks holds integers, so compare as int
            entry = {
                'file_path': file_path,
                'cik': cik,
                'num_companies': file_info['num_companies']
            }
            target_list = found_companies if cik in paired_ciks else missing_companies
            target_list.append(entry)

    except Exception as e:
        print(f"Error processing {file_path}: {e}")

# NOTE(review): the numerator counts files while the denominator counts pairs,
# so this is not strictly a share of pairs — confirm the intended metric.
total_paired_reports = len(paired_mda_df)
percent_affected = (len(found_companies) / total_paired_reports) * 100

# Summary statistics
print("\nSummary Statistics:")
print(f"Total paired reports in CSV: {total_paired_reports}")
print(f"Number of multiple-company files found in paired reports: {len(found_companies)}")
print(f"Percentage of paired reports affected: {percent_affected:.2f}%")

# Up to five examples from each list, found first and then missing
for heading, examples in (
    ("\nExample files found in paired_mda_reports:", found_companies),
    ("\nExample files not found in paired_mda_reports:", missing_companies),
):
    print(heading)
    for i, file_info in enumerate(examples[:5], 1):
        print(f"\n{i}. {file_info['file_path']}")
        print(f"   CIK: {file_info['cik']}")
        print(f"   Number of companies in file: {file_info['num_companies']}")

Counting files...
Total number of files to process: 54551


Processing files: 100%|██████████| 54551/54551 [03:14<00:00, 280.03it/s]



Total files with multiple companies: 2020

10 example files with multiple companies:
1. C:\Users\abbra\Documents\Research Code\Koval Paper\Data\Mapped_Files\10-X_C_2001-2005\2001\QTR1\20010323_10-K405_edgar_data_46207_0000898430-01-500022.txt (Number of companies: 2)
2. C:\Users\abbra\Documents\Research Code\Koval Paper\Data\Mapped_Files\10-X_C_2001-2005\2001\QTR1\20010323_10-K405_edgar_data_354707_0000898430-01-500022.txt (Number of companies: 2)
3. C:\Users\abbra\Documents\Research Code\Koval Paper\Data\Mapped_Files\10-X_C_2001-2005\2001\QTR1\20010322_10-K_edgar_data_1067701_0000950130-01-500286.txt (Number of companies: 2)
4. C:\Users\abbra\Documents\Research Code\Koval Paper\Data\Mapped_Files\10-X_C_2001-2005\2001\QTR1\20010322_10-K_edgar_data_1047166_0000950130-01-500286.txt (Number of companies: 2)
5. C:\Users\abbra\Documents\Research Code\Koval Paper\Data\Mapped_Files\10-X_C_2001-2005\2001\QTR1\20010323_10-K_edgar_data_899045_0000950134-01-002534.txt (Number of companies: 2)
6.

In [14]:
# CIKs of multiple-company files found in the paired dataset, and the company
# count recorded for each (a later file for the same CIK overwrites an earlier one)
multiple_company_ciks = {file_info['cik'] for file_info in found_companies}
cik_to_num_companies = {file_info['cik']: file_info['num_companies'] for file_info in found_companies}

# Add columns to DataFrame; CIKs without a recorded count default to 1 company
paired_mda_df['has_multiple_companies'] = paired_mda_df['cik_number'].isin(multiple_company_ciks)
paired_mda_df['num_companies'] = paired_mda_df['cik_number'].map(lambda x: cik_to_num_companies.get(x, 1))

# Values reused several times below
is_multi = paired_mda_df['has_multiple_companies']
multi_pair_count = is_multi.sum()

# Parse the filing dates ONCE. Re-running pd.to_datetime over the whole column
# inside the per-year loop repeats the same full-column parse for every year.
filing_years = pd.to_datetime(paired_mda_df['current_filing_date']).dt.year
pairs_per_year = filing_years.value_counts()

# Print comprehensive analysis
print("\nComprehensive Multiple Companies Analysis:")
print("----------------------------------------")
print(f"Total paired rows: {len(paired_mda_df)}")
print(f"Pairs with multiple companies: {multi_pair_count}")
print(f"Percentage of pairs with multiple companies: {(multi_pair_count / len(paired_mda_df)) * 100:.2f}%")

# Distribution by year (year of the current filing)
print("\nDistribution by Year:")
print("--------------------")
yearly_counts = paired_mda_df[is_multi].groupby(filing_years).size()

for year, count in yearly_counts.items():
    # Every year in yearly_counts also occurs in the full frame
    total_in_year = pairs_per_year[year]
    percentage = (count / total_in_year) * 100
    print(f"Year {year}: {count} pairs ({percentage:.2f}%)")

# Cross-reference with original analysis
print("\nCross-Reference with Original Analysis:")
print("--------------------------------------")
print(f"Original individual files count: 1,798")  # hard-coded figure, not computed here
print(f"Current analysis:")
print(f"- Unique CIKs with multiple companies: {len(multiple_company_ciks)}")
print(f"- Total pairs with multiple companies: {multi_pair_count}")
print(f"- Estimated individual files: {multi_pair_count * 2}")  # multiply by 2 since each pair represents 2 files

# Save the updated DataFrame
output_path = r'C:\Users\abbra\Documents\Research Code\Koval Paper\Data\Output\paired_mda_reports_CLEANEDV4.csv'
paired_mda_df.to_csv(output_path, index=False)
print(f"\nUpdated CSV saved to: {output_path}")

# Random sample (at most 5) of pairs with multiple companies
print("\nSample of pairs with multiple companies:")
multi_pairs = paired_mda_df[is_multi]
sample_pairs = multi_pairs.sample(min(5, len(multi_pairs)))
for _, row in sample_pairs.iterrows():
    print(f"\nCIK: {row['cik_number']}")
    print(f"Number of Companies: {row['num_companies']}")
    print(f"Current Filing Date: {row['current_filing_date']}")
    print(f"Next Filing Date: {row['next_filing_date']}")


Comprehensive Multiple Companies Analysis:
----------------------------------------
Total paired rows: 27508
Pairs with multiple companies: 654
Percentage of pairs with multiple companies: 2.38%

Distribution by Year:
--------------------
Year 2001: 25 pairs (3.41%)
Year 2002: 22 pairs (2.81%)
Year 2003: 23 pairs (2.81%)
Year 2004: 22 pairs (2.55%)
Year 2005: 25 pairs (2.75%)
Year 2006: 27 pairs (2.75%)
Year 2007: 35 pairs (3.28%)
Year 2008: 30 pairs (2.51%)
Year 2009: 33 pairs (2.25%)
Year 2010: 33 pairs (2.16%)
Year 2011: 37 pairs (2.26%)
Year 2012: 36 pairs (2.15%)
Year 2013: 30 pairs (1.71%)
Year 2014: 41 pairs (2.26%)
Year 2015: 44 pairs (2.32%)
Year 2016: 45 pairs (2.24%)
Year 2017: 49 pairs (2.36%)
Year 2018: 51 pairs (2.36%)
Year 2019: 46 pairs (2.15%)

Cross-Reference with Original Analysis:
--------------------------------------
Original individual files count: 1,798
Current analysis:
- Unique CIKs with multiple companies: 85
- Total pairs with multiple companies: 654
- Esti

In [15]:
# Parse both filing-date columns in place so the .dt accessor can be used
for date_col in ('current_filing_date', 'next_filing_date'):
    paired_mda_df[date_col] = pd.to_datetime(paired_mda_df[date_col])

# Pairs whose current / next filing falls in calendar year 2020
pairs_2020_current = paired_mda_df[paired_mda_df['current_filing_date'].dt.year == 2020]
pairs_2020_next = paired_mda_df[paired_mda_df['next_filing_date'].dt.year == 2020]

print("\n2020 Pairs Analysis:")
print("-------------------")
print(f"Pairs with 2020 as current year: {len(pairs_2020_current)}")
print(f"Pairs with 2020 as next year: {len(pairs_2020_next)}")

# (label used in the messages, subset, date column the subset was selected on)
views_2020 = (
    ("current", pairs_2020_current, 'current_filing_date'),
    ("next", pairs_2020_next, 'next_filing_date'),
)
sample_columns = ['cik_number', 'current_filing_date', 'next_filing_date', 'has_multiple_companies']

# First the head of each non-empty subset ...
for label, subset, _ in views_2020:
    if len(subset) > 0:
        print(f"\nSample of pairs with 2020 as {label} year:")
        print(subset[sample_columns].head())

# ... then the month-of-filing distribution of each non-empty subset
for label, subset, date_col in views_2020:
    if len(subset) > 0:
        print(f"\nDistribution of filing months for 2020 {label} year:")
        print(subset[date_col].dt.month.value_counts().sort_index())


2020 Pairs Analysis:
-------------------
Pairs with 2020 as current year: 8
Pairs with 2020 as next year: 2145

Sample of pairs with 2020 as current year:
       cik_number current_filing_date next_filing_date  has_multiple_companies
22689     1045942          2020-01-10       2020-12-23                   False
23262     1394638          2020-04-01       2020-12-30                   False
24261     1022505          2020-01-10       2020-12-29                   False
25013     1536089          2020-01-15       2020-12-29                   False
25650     1508348          2020-01-14       2020-12-21                   False

Sample of pairs with 2020 as next year:
    cik_number current_filing_date next_filing_date  has_multiple_companies
21       46619          2019-12-19       2020-12-23                   False
30      318300          2019-03-01       2020-03-03                   False
46       72162          2019-03-11       2020-03-11                   False
62     1038074          2

In [3]:
import pandas as pd
import os

# Paired MDA reports with the has_multiple_companies / num_companies columns
input_path = r'C:\Users\abbra\Documents\Research Code\Koval Paper\Data\Output\paired_mda_reports_CLEANEDV4.csv'
df = pd.read_csv(input_path)

print("\nInitial Data Check:")
print(f"Total rows: {len(df)}")
print(f"Total rows with multiple companies: {df['has_multiple_companies'].sum()}")
print(f"Unique CIKs with multiple companies: {df.loc[df['has_multiple_companies'], 'cik_number'].nunique()}")

# Parse the filing dates
for date_col in ('current_filing_date', 'next_filing_date'):
    df[date_col] = pd.to_datetime(df[date_col])

# First-of-month versions of both dates. They are not read again in this
# cell, but they are part of df and therefore end up in the saved split CSVs.
df['current_ym'] = pd.to_datetime(df['current_filing_date'].dt.strftime('%Y-%m-01'))
df['next_ym'] = pd.to_datetime(df['next_filing_date'].dt.strftime('%Y-%m-01'))

# Period boundaries (both inclusive on the earlier side)
TRAIN_END = pd.Timestamp('2015-06-30')    # last day of the training period
VAL_END = pd.Timestamp('2017-06-30')      # last day of the validation period

# Every CIK that has at least one multiple-company pair
multiple_company_ciks = set(df[df['has_multiple_companies']]['cik_number'])

# Row masks shared by the DAPT selection and its complement
filed_through_2010 = df['current_filing_date'].dt.year <= 2010
is_multi_company_cik = df['cik_number'].isin(multiple_company_ciks)

# DAPT dataset: current filing in 2010 or earlier, or a multiple-company CIK
dapt_data = df[filed_through_2010 | is_multi_company_cik]

# Everything that is not DAPT data
remaining_data = df[~filed_through_2010 & ~is_multi_company_cik]

# Split the remainder on the current filing date
remaining_dates = remaining_data['current_filing_date']
train_data = remaining_data[remaining_dates <= TRAIN_END]
val_data = remaining_data[(remaining_dates > TRAIN_END) & (remaining_dates <= VAL_END)]
test_data = remaining_data[remaining_dates > VAL_END]

# Size of every split
print("\nDataset Statistics:")
print(f"Total pairs in dataset: {len(df)}")
print(f"Total CIKs with multiple companies: {len(multiple_company_ciks)}")
print(f"DAPT data (2000-2010 + all multiple companies): {len(dapt_data)} pairs")
print(f"Training data (2011-2015.06, single company only): {len(train_data)} pairs")
print(f"Validation data (2015.07-2017.06, single company only): {len(val_data)} pairs")
print(f"Test data (2017.07+, single company only): {len(test_data)} pairs")

def check_date_ranges(dataset, name):
    """
    Prints date coverage and composition statistics for one data split.

    Parameters:
    - dataset (DataFrame): Split to describe. Reads the columns
      'current_filing_date', 'next_filing_date' (both datetime),
      'company_name' and 'has_multiple_companies'.
    - name (str): Label used in the printed headings.

    Returns:
    - None. Everything is written to standard output.
    """
    # Nothing to describe for an empty split
    if not len(dataset) > 0:
        print(f"\n{name} is empty")
        return

    print(f"\n{name} date range:")
    current_dates = dataset['current_filing_date']
    print(f"Current MDAs: {current_dates.min()} to {current_dates.max()}")
    next_dates = dataset['next_filing_date']
    print(f"Next MDAs: {next_dates.min()} to {next_dates.max()}")
    print(f"Number of unique companies: {dataset['company_name'].nunique()}")
    print(f"Number of pairs with multiple companies: {dataset['has_multiple_companies'].sum()}")

    # Row counts per (year, month) of the current filing date
    print("\nYear-Month distribution for current filings:")
    current_by_month = dataset.groupby([dataset['current_filing_date'].dt.year,
                                        dataset['current_filing_date'].dt.month])
    print(current_by_month.size().sort_index())

    # Row counts per (year, month) of the next filing date
    print("\nYear-Month distribution for next filings:")
    next_by_month = dataset.groupby([dataset['next_filing_date'].dt.year,
                                     dataset['next_filing_date'].dt.month])
    print(next_by_month.size().sort_index())

# Print the same statistics for each of the four splits, in a fixed order
for split_frame, split_label in (
    (dapt_data, "DAPT data"),
    (train_data, "Training data"),
    (val_data, "Validation data"),
    (test_data, "Test data"),
):
    check_date_ranges(split_frame, split_label)

# Heading for the overlap checks that follow
print("\nVerifying no temporal overlap between splits...")
def verify_no_overlap(df1, df1_name, df2, df2_name):
    """
    Reports whether next-filing dates of one split reappear as current-filing dates of another.

    The "next" side must come from df1's 'next_filing_date' column. Reading
    'current_filing_date' on both sides compares current dates with current
    dates, which can never overlap for splits cut on the current filing date.

    The comparison is on dates only, not on (company, date) pairs, so a
    warning means "some filing dates coincide", not necessarily that the same
    document is in both splits.

    Parameters:
    - df1 (DataFrame): Earlier split; its 'next_filing_date' column is read.
    - df1_name (str): Label for df1 in the printed message.
    - df2 (DataFrame): Later split; its 'current_filing_date' column is read.
    - df2_name (str): Label for df2 in the printed message.

    Returns:
    - None. The outcome is printed.
    """
    # Missing dates are dropped so they can neither count as an overlap nor break sorting
    next_mdas_1 = set(df1['next_filing_date'].dropna())
    current_mdas_2 = set(df2['current_filing_date'].dropna())
    overlap = next_mdas_1.intersection(current_mdas_2)
    if overlap:
        print(f"Warning: Found {len(overlap)} overlapping dates between {df1_name} and {df2_name}")
        print("Sample overlapping dates:", sorted(overlap)[:5], "...")
    else:
        print(f"No overlap between {df1_name} and {df2_name}")

# Check each pair of consecutive splits
verify_no_overlap(train_data, "Training", val_data, "Validation")
verify_no_overlap(val_data, "Validation", test_data, "Test")

# Folder for the split files (created on demand)
output_base = r'C:\Users\abbra\Documents\Research Code\Koval Paper\Data\Output\Model Data'
os.makedirs(output_base, exist_ok=True)

# Write every split to its own CSV, without the index column
for split_frame, csv_name in (
    (dapt_data, 'dapt_data.csv'),
    (train_data, 'train_data.csv'),
    (val_data, 'val_data.csv'),
    (test_data, 'test_data.csv'),
):
    split_frame.to_csv(os.path.join(output_base, csv_name), index=False)

# Every multiple-company pair must have ended up in the DAPT split
multiple_company_pairs = df['has_multiple_companies'].sum()
multiple_company_pairs_in_dapt = dapt_data['has_multiple_companies'].sum()
assert multiple_company_pairs == multiple_company_pairs_in_dapt, "Not all multiple-company pairs are in DAPT dataset"
print(f"\nVerification: All {multiple_company_pairs} multiple-company pairs are in DAPT dataset")

# The four splits together should account for every row of df
total_assigned = sum(len(part) for part in (dapt_data, train_data, val_data, test_data))
print(f"\nTotal rows: {len(df)}")
print(f"Total assigned: {total_assigned}")
if total_assigned == len(df):
    print("All rows assigned successfully!")
else:
    print(f"Warning: {len(df) - total_assigned} rows unassigned!")


Initial Data Check:
Total rows: 27508
Total rows with multiple companies: 654
Unique CIKs with multiple companies: 85

Dataset Statistics:
Total pairs in dataset: 27508
Total CIKs with multiple companies: 85
DAPT data (2000-2010 + all multiple companies): 10719 pairs
Training data (2011-2015.06, single company only): 8257 pairs
Validation data (2015.07-2017.06, single company only): 3975 pairs
Test data (2017.07+, single company only): 4557 pairs

DAPT data date range:
Current MDAs: 2001-01-10 00:00:00 to 2019-12-05 00:00:00
Next MDAs: 2001-12-14 00:00:00 to 2020-12-07 00:00:00
Number of unique companies: 2427
Number of pairs with multiple companies: 654

Year-Month distribution for current filings:
current_filing_date  current_filing_date
2001                 1                       19
                     2                       25
                     3                      373
                     4                      132
                     5                       17
         

In [3]:
# Year distribution of the multiple-company files found in the paired reports.
# The year is the first four-digit run starting with 1 or 2 in the file's
# base name; files without such a run are not counted.
years_analysis = {}
for company in found_companies:
    year_match = re.search(r'[12]\d{3}', os.path.basename(company['file_path']))
    if not year_match:
        continue
    year = year_match.group()
    years_analysis[year] = years_analysis.get(year, 0) + 1

# Per-year counts and their share of all found files, in ascending year order
total_found = len(found_companies)
print("\nDistribution of multiple-company files by year:")
for year, count in sorted(years_analysis.items()):
    percentage = (count / total_found) * 100
    print(f"Year {year}: {count} files ({percentage:.2f}%)")

# Range covered by the tally (years are compared as strings)
print("\nYear Range Summary:")
print(f"Earliest year: {min(years_analysis)}")
print(f"Latest year: {max(years_analysis)}")
print(f"Number of years covered: {len(years_analysis)}")

# Same figures as a DataFrame, ordered by year
year_rows = [
    {'year': year, 'count': count, 'percentage': (count / total_found) * 100}
    for year, count in years_analysis.items()
]
year_analysis_df = pd.DataFrame(year_rows).sort_values('year')

# Persist the table
year_analysis_path = r'C:\Users\abbra\Documents\Research Code\Koval Paper\Data\Output\multiple_companies_year_analysis.csv'
year_analysis_df.to_csv(year_analysis_path, index=False)
print(f"\nDetailed year analysis saved to: {year_analysis_path}")


Distribution of multiple-company files by year:
Year 2001: 58 files (3.96%)
Year 2002: 19 files (1.30%)
Year 2003: 20 files (1.36%)
Year 2004: 28 files (1.91%)
Year 2005: 41 files (2.80%)
Year 2006: 39 files (2.66%)
Year 2007: 35 files (2.39%)
Year 2008: 49 files (3.34%)
Year 2009: 41 files (2.80%)
Year 2010: 30 files (2.05%)
Year 2011: 37 files (2.52%)
Year 2012: 48 files (3.27%)
Year 2013: 41 files (2.80%)
Year 2014: 128 files (8.73%)
Year 2015: 112 files (7.64%)
Year 2016: 127 files (8.66%)
Year 2017: 140 files (9.55%)
Year 2018: 159 files (10.85%)
Year 2019: 157 files (10.71%)
Year 2020: 157 files (10.71%)

Year Range Summary:
Earliest year: 2001
Latest year: 2020
Number of years covered: 20

Detailed year analysis saved to: C:\Users\abbra\Documents\Research Code\Koval Paper\Data\Output\multiple_companies_year_analysis.csv


In [10]:
# Filings on or after this date are left out of both sides of the comparison
csv_cutoff_date = pd.to_datetime('2020-04-01')  # The latest date in your CSV

# Pairs per year of the current filing date, restricted to dates before the cutoff
csv_dates = pd.to_datetime(paired_mda_df['current_filing_date'])
csv_years = csv_dates[csv_dates < csv_cutoff_date].dt.year.value_counts().sort_index()

# Filing dates of the multiple-company files, restricted to dates before the cutoff
multiple_company_dates = []
for file_info in multiple_company_files:
    try:
        with open(file_info['file_path'], 'r', encoding='utf-8') as f:
            filing_date = extract_filing_date(f.read())

        # Files without a header date are skipped
        if filing_date == "Unknown":
            continue

        file_date = pd.to_datetime(filing_date)
        if not (file_date < csv_cutoff_date):
            continue

        # First four-digit run starting with 1 or 2 in the base name
        path_year = re.search(r'[12]\d{3}', os.path.basename(file_info['file_path'])).group()
        multiple_company_dates.append({
            'file_path': file_info['file_path'],
            'filing_date': filing_date,
            'year_from_path': path_year
        })
    except Exception as e:
        print(f"Error processing {file_info['file_path']}: {e}")

# Tabulate and derive the filing year
multiple_dates_df = pd.DataFrame(multiple_company_dates)
multiple_dates_df['filing_year'] = pd.to_datetime(multiple_dates_df['filing_date']).dt.year

# Multiple-company files per filing year
updated_years_analysis = multiple_dates_df['filing_year'].value_counts().to_dict()

# One comparison row per year present in the CSV; 2020 is excluded entirely
comparison_data = []
for year in sorted(set(csv_years.index)):
    if not (year < 2020):
        continue
    total_files = csv_years.get(year, 0)
    multiple_files = updated_years_analysis.get(year, 0)
    share = (multiple_files / total_files * 100) if total_files > 0 else 0
    comparison_data.append({
        'year': year,
        'total_files': total_files,
        'multiple_company_files': multiple_files,
        'percentage': share
    })

comparison_df = pd.DataFrame(comparison_data)

# Print the comparison, one block per year
print("\nYear-by-Year Comparison (2001-2019):")
print("------------------------------------------------")
for entry in comparison_df.itertuples(index=False):
    print(f"Year {int(entry.year)}:")
    print(f"  Total files in CSV: {int(entry.total_files)}")
    print(f"  Files with multiple companies: {int(entry.multiple_company_files)}")
    print(f"  Percentage: {entry.percentage:.2f}%")
    print("------------------------------------------------")

# Persist the table
analysis_path = r'C:\Users\abbra\Documents\Research Code\Koval Paper\Data\Output\filing_dates_analysis_adjusted.csv'
comparison_df.to_csv(analysis_path, index=False)
print(f"\nUpdated analysis saved to: {analysis_path}")

# Totals and spread across the compared years
print("\nSummary Statistics (2001-2019):")
print(f"Total files in CSV: {comparison_df['total_files'].sum()}")
print(f"Total multiple-company files: {comparison_df['multiple_company_files'].sum()}")
print(f"Average percentage across years: {comparison_df['percentage'].mean():.2f}%")
print(f"Maximum percentage in a single year: {comparison_df['percentage'].max():.2f}%")
print(f"Minimum percentage in a single year: {comparison_df['percentage'].min():.2f}%")


Year-by-Year Comparison (2001-2019):
------------------------------------------------
Year 2001:
  Total files in CSV: 733
  Files with multiple companies: 74
  Percentage: 10.10%
------------------------------------------------
Year 2002:
  Total files in CSV: 783
  Files with multiple companies: 23
  Percentage: 2.94%
------------------------------------------------
Year 2003:
  Total files in CSV: 818
  Files with multiple companies: 22
  Percentage: 2.69%
------------------------------------------------
Year 2004:
  Total files in CSV: 864
  Files with multiple companies: 41
  Percentage: 4.75%
------------------------------------------------
Year 2005:
  Total files in CSV: 908
  Files with multiple companies: 61
  Percentage: 6.72%
------------------------------------------------
Year 2006:
  Total files in CSV: 981
  Files with multiple companies: 60
  Percentage: 6.12%
------------------------------------------------
Year 2007:
  Total files in CSV: 1066
  Files with multiple 

In [7]:
def extract_filing_date(content):
    """
    Extracts the filing date from the file content.

    Searches for the first "FILED AS OF DATE:" marker followed by eight
    digits (YYYYMMDD) and reformats those digits as YYYY-MM-DD.

    Parameters:
    - content (str): The full text of the 10-K document.

    Returns:
    - str: The extracted filing date in YYYY-MM-DD format, or "Unknown" if
      the content is empty/None, the marker is not found, or the eight
      digits do not form a real calendar date.
    """
    # Empty or missing content cannot contain the marker; report it as not found
    # instead of letting re.search raise on None.
    if not content:
        return "Unknown"

    match = re.search(r'FILED AS OF DATE:\s*(\d{8})', content)
    if not match:
        return "Unknown"

    date_str = match.group(1)
    year, month, day = date_str[:4], date_str[4:6], date_str[6:]

    # Any eight digits satisfy the regex, so confirm they form a real date
    # (rejects e.g. month 13 or day 00); otherwise callers that parse the
    # returned string as a date would fail on it.
    try:
        datetime(int(year), int(month), int(day))
    except ValueError:
        return "Unknown"

    return f"{year}-{month}-{day}"

# Gather the header filing date of each multiple-company file, together with
# the first four-digit year (starting with 1 or 2) found in its file name
multiple_company_dates = []
for file_info in multiple_company_files:
    try:
        with open(file_info['file_path'], 'r', encoding='utf-8') as f:
            content = f.read()
            filing_date = extract_filing_date(content)
            # Files without a recognizable filing date are left out entirely
            if filing_date == "Unknown":
                continue
            record = {
                'file_path': file_info['file_path'],
                'filing_date': filing_date,
            }
            base_name = os.path.basename(file_info['file_path'])
            # .group() raises if the name holds no year; the handler below reports it
            record['year_from_path'] = re.search(r'[12]\d{3}', base_name).group()
            multiple_company_dates.append(record)
    except Exception as e:
        print(f"Error processing {file_info['file_path']}: {e}")

# Tabular form of the collected records, plus the calendar year of each filing date
multiple_dates_df = pd.DataFrame(multiple_company_dates)
parsed_filing_dates = pd.to_datetime(multiple_dates_df['filing_date'])
multiple_dates_df['filing_year'] = parsed_filing_dates.dt.year

# Cross-tabulate the year taken from the file name against the actual filing year
print("\nAnalysis of Multiple-Company Files:")
print("-----------------------------------")
print("\nComparison of file path years vs filing dates:")
pair_counts = multiple_dates_df.groupby(['year_from_path', 'filing_year']).size()
year_comparison = pair_counts.unstack(fill_value=0)
print(year_comparison)

# Analyze CSV filing dates
print("\nAnalysis of CSV Filing Dates:")
print("-----------------------------")
print("\nDistribution of current_filing_date by year:")
# errors='coerce' turns unparseable entries (such as the "Unknown" placeholder
# checked for below) into NaT instead of raising, so the rest of this analysis
# still runs and the "Unknown" report can actually be reached.
csv_dates = pd.to_datetime(paired_mda_df['current_filing_date'], errors='coerce')
# Drop NaT before taking the year so the year labels stay integers
print(csv_dates.dropna().dt.year.value_counts().sort_index())

print("\nDate range in CSV:")
# min()/max() skip NaT, so the range reflects parseable dates only
print(f"Earliest date: {csv_dates.min()}")
print(f"Latest date: {csv_dates.max()}")

# Check for any "Unknown" dates
unknown_dates = paired_mda_df[paired_mda_df['current_filing_date'] == "Unknown"]
if not unknown_dates.empty:
    print(f"\nFound {len(unknown_dates)} entries with 'Unknown' filing dates in CSV")

# Check for potential date format issues
print("\nSample of filing dates from CSV:")
print(paired_mda_df['current_filing_date'].head())

# Compare with next_filing_dates
print("\nDate range for next_filing_date:")
# Same coercion as above so one bad value does not abort the comparison
next_dates = pd.to_datetime(paired_mda_df['next_filing_date'], errors='coerce')
print(f"Earliest next date: {next_dates.min()}")
print(f"Latest next date: {next_dates.max()}")
unparsed_next_count = int(next_dates.isna().sum())
if unparsed_next_count > 0:
    print(f"\nFound {unparsed_next_count} next_filing_date entries that could not be parsed")

# Save detailed analysis to CSV
# value_counts() returns Series indexed by the years that actually occur in the
# data. Reindex both onto the explicit 2001-2020 range so that every count
# lands on its own year row and the positional columns ('year',
# 'multiple_company_files') always match the frame length, even when a year is
# missing from the data. Years outside the range are dropped by the reindex.
analysis_years = list(range(2001, 2021))
current_year_counts = csv_dates.dt.year.value_counts().reindex(analysis_years)
next_year_counts = next_dates.dt.year.value_counts().reindex(analysis_years)

analysis_df = pd.DataFrame({
    'year': analysis_years,
    'csv_current_dates': current_year_counts,
    'csv_next_dates': next_year_counts,
    # presumably years_analysis is keyed by year strings; verify against where it is built
    'multiple_company_files': [years_analysis.get(str(year), 0) for year in analysis_years]
})
# Years with no filings come back as NaN from the reindex; report them as 0
analysis_df = analysis_df.fillna(0)

analysis_path = r'C:\Users\abbra\Documents\Research Code\Koval Paper\Data\Output\filing_dates_analysis.csv'
analysis_df.to_csv(analysis_path, index=False)
print(f"\nDetailed analysis saved to: {analysis_path}")


Analysis of Multiple-Company Files:
-----------------------------------

Comparison of file path years vs filing dates:
filing_year     2001  2002  2003  2004  2005  2006  2007  2008  2009  2010  \
year_from_path                                                               
2001              74     0     0     0     0     0     0     0     0     0   
2002               0    23     0     0     0     0     0     0     0     0   
2003               0     0    22     0     0     0     0     0     0     0   
2004               0     0     0    41     0     0     0     0     0     0   
2005               0     0     0     0    61     0     0     0     0     0   
2006               0     0     0     0     0    60     0     0     0     0   
2007               0     0     0     0     0     0    48     0     0     0   
2008               0     0     0     0     0     0     0    74     0     0   
2009               0     0     0     0     0     0     0     0    63     0   
2010               0 

In [2]:
import pandas as pd

# Load the cleaned paired-MD&A report table from disk
cleaned_csv_path = r'C:\Users\abbra\Documents\Research Code\Koval Paper\Data\Output\paired_mda_reports_CLEANED.csv'
df = pd.read_csv(cleaned_csv_path)

# List the columns the file provides
print("Column names in the CSV:")
column_names = df.columns.tolist()
print(column_names)

# Print the frame summary produced by DataFrame.info()
print("\nDataFrame Info:")
df.info()

Column names in the CSV:
['current_filename', 'next_filename', 'company_name', 'cik_number', 'current_filing_date', 'next_filing_date', 'current_mda_content', 'next_mda_content', 'time_difference']

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27508 entries, 0 to 27507
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   current_filename     27508 non-null  object
 1   next_filename        27508 non-null  object
 2   company_name         27508 non-null  object
 3   cik_number           27508 non-null  int64 
 4   current_filing_date  27508 non-null  object
 5   next_filing_date     27508 non-null  object
 6   current_mda_content  27508 non-null  object
 7   next_mda_content     27508 non-null  object
 8   time_difference      27508 non-null  int64 
dtypes: int64(2), object(7)
memory usage: 1.9+ MB
