In [22]:
import pandas as pd
import numpy as np
import logging
from datetime import datetime, date
from io import StringIO

# Root logger: show INFO and above, each record prefixed with timestamp and level
logging.basicConfig(
    format='%(asctime)s %(levelname)s:%(message)s',
    level=logging.INFO,
)


In [23]:
# Make the project packages importable from this notebook.
# PROJECT_DIR is a placeholder: point it at the directory that contains the
# `data_processing`, `utils` and `config` packages imported below.
import sys

PROJECT_DIR = 'path_to_your_project_directory'
if PROJECT_DIR not in sys.path:  # re-running the cell must not stack duplicate entries
    sys.path.append(PROJECT_DIR)

# Import your classes and functions
from data_processing.data_processor import DataProcessor
from data_processing.report_generator import ReportGenerator
from utils.helpers import determine_period
from config.settings import HARD_CODED_DATA

# Fallback: if the modules above cannot be imported, the required classes and functions can be defined directly in the notebook instead.


In [25]:
# Helper functions: determine the period identifier for a given date and clean ISIN codes.

def determine_period(input_date):
    """
    Determines the period identifier for a given date.

    Periods follow a pattern that repeats every two calendar years, starting
    on 1 January 2020 (the start of P1). Each two-year cycle holds seven
    periods:

        1    January-June of the first year
        2    July-September of the first year
        3    October-December of the first year
        4-7  the four calendar quarters of the second year

    Period numbers keep counting across cycles (P1-P7 cover 2020-2021,
    P8-P14 cover 2022-2023, and so on). Boundaries are derived from the
    calendar year and month rather than from fixed day counts, so leap years
    cannot shift a boundary by a day.

    Args:
        input_date (datetime.date or datetime.datetime or str): The input date.
            Strings must be in day/month/year form, e.g. "31/12/2021".

    Returns:
        str or None: The period identifier (e.g., 'P1', 'P2', etc.), or None
        if the value is missing (None/NaN/NaT), is a string that does not
        parse, is of an unsupported type, or falls before 1 January 2020.
    """
    # Handle missing values (None, NaN, NaT) before any type checks
    if pd.isna(input_date):
        return None

    # Normalise the input to a plain date object
    if isinstance(input_date, str):
        try:
            input_date = datetime.strptime(input_date, "%d/%m/%Y").date()
        except ValueError:
            return None  # Return None if the date string is invalid
    elif isinstance(input_date, datetime):
        # datetime is a subclass of date, so it has to be tested first
        input_date = input_date.date()
    elif not isinstance(input_date, date):
        return None  # Return None for any other unexpected type

    # Year in which P1 starts
    base_year = 2020

    years_since_base = input_date.year - base_year

    # Handle dates before the base date
    if years_since_base < 0:
        return None

    # Completed two-year cycles, and position (0 or 1) inside the current cycle
    two_year_cycles, year_in_cycle = divmod(years_since_base, 2)

    month = input_date.month
    if year_in_cycle == 0:
        # First year of the cycle: one half-year period, then two quarters
        if month <= 6:
            period_in_cycle = 1
        elif month <= 9:
            period_in_cycle = 2
        else:
            period_in_cycle = 3
    else:
        # Second year of the cycle: four calendar quarters -> periods 4..7
        period_in_cycle = 4 + (month - 1) // 3

    # Seven periods per completed cycle plus the position in the current one
    period_number = two_year_cycles * 7 + period_in_cycle

    return f"P{period_number}"

def clean_isin(isin):
    """
    Normalise an ISIN code to a canonical text form.

    The value is converted with ``str()``, surrounding whitespace is removed
    and all letters are upper-cased.

    Args:
        isin (str): The raw ISIN code.

    Returns:
        str: The trimmed, upper-case form of the code.
    """
    raw_text = str(isin)
    trimmed = raw_text.strip()
    return trimmed.upper()


In [26]:
# Instantiate DataProcessor with DataFrames
# NOTE(review): the recorded output of this cell is
#   TypeError: DataProcessor.__init__() got an unexpected keyword argument 'trade_source_df'
# so at least `trade_source_df` is not a parameter name accepted by the imported
# DataProcessor. Check the constructor signature and rename the keyword(s) to match.
# NOTE(review): esma_si_df, trade_source_df, trade_source_scope_df and
# esma_threshold_df are not assigned in any cell visible above; they have to be
# loaded before this cell runs - TODO confirm where they come from.
data_processor = DataProcessor(
    esma_si_df=esma_si_df,
    trade_source_df=trade_source_df,
    trade_source_scope_df=trade_source_scope_df,
    esma_threshold_df=esma_threshold_df
)

# Run data processing
data_processor.process_data()


TypeError: DataProcessor.__init__() got an unexpected keyword argument 'trade_source_df'

In [None]:
# Access the processed data
# Binds the processor's result attributes to notebook-level names that the
# inspection, report and plotting cells below read. Requires `data_processor`
# from the instantiation cell, after process_data() has been called.
processed_trade_source = data_processor.trade_source
processed_trade_source_scope = data_processor.trade_source_scope
result_df = data_processor.result_df
issuer_review = data_processor.issuer_review


In [None]:
# Preview the leading rows of every processed frame
processed_frames = (
    processed_trade_source,
    processed_trade_source_scope,
    result_df,
    issuer_review,
)
for frame in processed_frames:
    print(frame.head())

# Then print the DataFrame info summary for each frame, in the same order
for frame in processed_frames:
    frame.info()


In [None]:
# Count missing values in the critical 'ISSUER' column of both trade frames
for frame in (processed_trade_source, processed_trade_source_scope):
    print(frame['ISSUER'].isna().sum())

# List the distinct 'Period' values to verify they were assigned correctly
for frame in (processed_trade_source, esma_si_df):
    print(frame['Period'].unique())


Generate the Report:



In [None]:
# Destination for the generated report (a temporary directory works as well)
output_dir = 'path_to_output_directory'  # Replace with your desired path

# Build the generator for that directory
report_generator = ReportGenerator(output_dir)

# Collect the processed inputs, then produce the report from them
report_inputs = dict(
    esma_si_df=esma_si_df,
    trade_source=processed_trade_source,
    trade_source_scope=processed_trade_source_scope,
    result_df=result_df,
    issuer_review=issuer_review,
    all_periods=data_processor.all_periods,
)
report = report_generator.generate_report(**report_inputs)

# Show the result
print(report)


Example: Total Trades per Period



In [None]:
import matplotlib.pyplot as plt


def _period_sort_key(label):
    """Order 'P<number>' labels by their number; any other label sorts after them."""
    digits = str(label).lstrip('P')
    if digits.isdecimal():
        return (0, int(digits), '')
    return (1, 0, str(label))


# Total trades per period in Trade_Source.
# A plain sort_index() would order the labels as text ('P1', 'P10', 'P2', ...),
# which scrambles the bars once a tenth period exists, so order by the numeric part.
# NOTE(review): assumes 'Period' holds 'P<number>' labels such as those returned by
# determine_period - confirm against DataProcessor.
period_counts = processed_trade_source['Period'].value_counts()
trade_counts = period_counts.reindex(sorted(period_counts.index, key=_period_sort_key))

fig, ax = plt.subplots(figsize=(10, 6))
trade_counts.plot(kind='bar', ax=ax)
ax.set_title('Total Trades per Period')
ax.set_xlabel('Period')
ax.set_ylabel('Number of Trades')
plt.show()


Example: SI Scores per Issuer



In [None]:
# Ensure 'Total SI Score' is calculated.
# The total column's own name matches the 'SI Score' pattern, so it is excluded
# from the components. Otherwise re-running this cell (or starting from a frame
# that already carries a total) would add the previous total into the new one.
si_score_columns = [
    column
    for column in issuer_review.filter(regex='SI Score').columns
    if column != 'Total SI Score'
]
# If only a pre-existing total is present (no component columns), keep it as is
if si_score_columns or 'Total SI Score' not in issuer_review.columns:
    issuer_review['Total SI Score'] = issuer_review[si_score_columns].sum(axis=1)

# Top 10 issuers by SI Score
top_issuers = issuer_review.nlargest(10, 'Total SI Score')

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_issuers['ISSUER'], top_issuers['Total SI Score'])
ax.set_title('Top 10 Issuers by Total SI Score')
ax.set_xlabel('Total SI Score')
ax.set_ylabel('Issuer')
ax.invert_yaxis()  # Invert y-axis to have the highest score on top
plt.show()
