In [2]:
# customers_processing.py

import pandas as pd
import numpy as np
from pymongo import MongoClient
from datetime import datetime, timezone
import sys
import os

# Define the absolute path to the config directory
CONFIG_DIR = r'C:\Users\ak012\Documents\Projects\config'

# Add the config directory to sys.path if it's not already included
if CONFIG_DIR not in sys.path:
    sys.path.append(CONFIG_DIR)

import config

print("Loaded config module:", config)
print("Config attributes:", dir(config))

# Specifically check for the required attributes
print("STATUS_SCORE_COMPLETED:", getattr(config, 'STATUS_SCORE_COMPLETED', 'Not Found'))
print("STATUS_SCORE_NOT_COMPLETED:", getattr(config, 'STATUS_SCORE_NOT_COMPLETED', 'Not Found'))

# MongoDB Connection Details
MONGO_URI = "Insert-here-DATABASE_NAME = "legitt-prod"
INVOICES_COLLECTION = "company_invoices"
CONTRACTS_COLLECTION = "company_contracts"

# Function to calculate interference score
def calculate_interference(row):
    overdue_days = row['overdue_days']
    status = row['status'].strip().lower()
    if status == "completed":
        status_score = config.STATUS_SCORE_COMPLETED
    else:
        status_score = config.STATUS_SCORE_NOT_COMPLETED
    return overdue_days * config.OVERDUE_MULTIPLIER + status_score

# Function to assign rating based on interference score
def assign_rating(score):
    if score < config.SCORE_THRESHOLD_GREEN:
        return "Green"
    elif config.SCORE_THRESHOLD_GREEN <= score < config.SCORE_THRESHOLD_YELLOW:
        return "Yellow"
    else:
        return "Red"

def process_invoices_and_contracts():
    # Connect to MongoDB
    client = MongoClient(MONGO_URI)
    db = client[DATABASE_NAME]
    invoices_collection = db[INVOICES_COLLECTION]
    contracts_collection = db[CONTRACTS_COLLECTION]
    customers_collection = db["company_customers"]  # new collection for storing all rows

    # Fetch data from MongoDB
    invoices = list(invoices_collection.find())
    contracts = list(contracts_collection.find())
    print(f"Number of invoices fetched: {len(invoices)}")
    print(f"Number of contracts fetched: {len(contracts)}")

    # Convert data to DataFrames
    df_invoices = pd.DataFrame(invoices)
    df_contracts = pd.DataFrame(contracts)

    # Ensure date columns are in datetime format
    for col in ['due_date', 'from_date', 'to_date']:
        df_invoices[col] = pd.to_datetime(df_invoices[col], errors='coerce')

    # Handle timezone-aware dates
    df_invoices['due_date'] = df_invoices['due_date'].apply(
        lambda x: x.replace(tzinfo=timezone.utc) if pd.notnull(x) else x
    )

    # Calculate overdue days
    today = datetime.now(timezone.utc)
    df_invoices['overdue_days'] = df_invoices['due_date'].apply(
        lambda x: max((today - x).days, 0) if pd.notnull(x) and x < today else 0
    )

    # Calculate interference score and rating
    df_invoices['interference_score'] = df_invoices.apply(calculate_interference, axis=1)
    df_invoices['rating'] = df_invoices['interference_score'].apply(assign_rating)

    # Filter and aggregate invoices by contract
    df_invoices_filtered = df_invoices[
        ['contract_name', 'from_date', 'to_date', 'total_amount_in_usd', 'interference_score', 'rating']
    ].copy()

    # Avoid division errors
    df_invoices_filtered['total_amount_in_usd'] = df_invoices_filtered['total_amount_in_usd'].replace(0, np.nan)

    # Group data and calculate aggregates
    aggregated_invoices = df_invoices_filtered.groupby("contract_name").agg(
        from_date=("from_date", "min"),
        to_date=("to_date", "max"),
        total_amount_in_usd=("total_amount_in_usd", "sum"),
        invoice_count=("contract_name", "size")
    ).reset_index()

    # Weighted interference score
    def calculate_weighted_interference(row):
        contract_name = row['contract_name']
        relevant_invoices = df_invoices_filtered[df_invoices_filtered['contract_name'] == contract_name]
        total_amount = row['total_amount_in_usd']
        if pd.notnull(total_amount) and total_amount > 0:
            weighted_sum = (relevant_invoices['interference_score'] * relevant_invoices['total_amount_in_usd']).sum()
            return weighted_sum / total_amount
        else:
            return 0

    aggregated_invoices['weighted_interference_score'] = aggregated_invoices.apply(calculate_weighted_interference, axis=1)
    aggregated_invoices['rating'] = aggregated_invoices['weighted_interference_score'].apply(assign_rating)

    # Merge aggregated invoices with contracts
    df_contracts_filtered = df_contracts[
        ['contract_name', 'parent_account_name', 'contract_value', 'start_date', 'end_date']
    ].copy()
    df_contracts_filtered['start_date'] = pd.to_datetime(df_contracts_filtered['start_date'], errors='coerce')
    df_contracts_filtered['end_date'] = pd.to_datetime(df_contracts_filtered['end_date'], errors='coerce')

    merged_data = pd.merge(
        aggregated_invoices,
        df_contracts_filtered,
        on="contract_name",
        how="left"
    )

    # Group by parent account name and calculate weighted scores
    grouped_parent = merged_data.groupby("parent_account_name")
    parent_aggregated = grouped_parent.agg(
        total_contract_count=("contract_name", "nunique"),
        total_contract_value=("contract_value", "sum"),
        total_invoice_count=("invoice_count", "sum"),
        total_amount_in_usd=("total_amount_in_usd", "sum")
    ).reset_index()

    # Weighted interference score for parent companies
    def compute_parent_weighted_score(group):
        if group["total_amount_in_usd"].sum() > 0:
            return (group["weighted_interference_score"] * group["total_amount_in_usd"]).sum() / group["total_amount_in_usd"].sum()
        else:
            return 0

    parent_aggregated['weighted_interference_score'] = grouped_parent.apply(compute_parent_weighted_score).values
    parent_aggregated['rating'] = parent_aggregated['weighted_interference_score'].apply(assign_rating)

    # Drop 'total_contract_value' if present
    parent_aggregated = parent_aggregated.drop(columns=['total_contract_value'], errors='ignore')

    # ====================== Save Aggregated Data (unchanged logic) ======================
    output_excel = r"C:\Users\ak012\Documents\Projects\Customers\aggregated_parent_companies.xlsx"
    try:
        parent_aggregated.to_excel(output_excel, index=False)
        print(f"Aggregated parent company data saved to {output_excel}")
    except PermissionError:
        print(f"PermissionError: Could not write to '{output_excel}'. "
              "Please close the file or use a different path.")
    except Exception as e:
        print(f"Error saving Excel file: {e}")

    output_json = r"C:\Users\ak012\Documents\Projects\Customers\aggregated_parent_companies.json"
    try:
        parent_aggregated.to_json(output_json, orient="records", date_format="iso")
        print(f"Aggregated parent company data saved to {output_json}")
    except Exception as e:
        print(f"Error saving JSON file: {e}")

    # =============================================================================
    #  NEW CODE: Insert ALL rows from merged_data (ensuring datetime fix for NaT).
    #  We include ALL columns, do not drop anything.
    # =============================================================================

    df_for_db = merged_data.copy()

    # If 'total_amount_in_usd' exists, rename to 'total_invoice_amount'
    if "total_amount_in_usd" in df_for_db.columns:
        df_for_db.rename(columns={"total_amount_in_usd": "total_invoice_amount"}, inplace=True)

    # 1) Convert any datetime columns (including possibly NaT) to safe ISO strings or None
    def date_to_string(dt):
        if pd.isnull(dt):
            return None
        if isinstance(dt, datetime):
            return dt.isoformat()  # or str(dt)
        return str(dt)

    # Apply date_to_string to all datetime columns
    datetime_cols = df_for_db.select_dtypes(include=["datetimetz", "datetime64[ns]"]).columns
    for col in datetime_cols:
        df_for_db[col] = df_for_db[col].apply(date_to_string)

    # 2) Insert all rows into 'company_customers' without excluding any columns
    try:
        results = customers_collection.insert_many(df_for_db.to_dict("records"))
        print(
            f"Inserted {len(results.inserted_ids)} documents into 'company_customers' collection."
        )
    except Exception as e:
        print(f"Error inserting into 'company_customers': {e}")


# Run the script
if __name__ == "__main__":
    process_invoices_and_contracts()

Loaded config module: <module 'config' from 'C:\\Users\\ak012\\Documents\\Projects\\config\\config.py'>
Config attributes: ['COVERAGE_PENALTY_WEIGHT', 'MILESTONE_DAYS_THRESHOLD_3', 'MILESTONE_DAYS_THRESHOLD_5', 'MILESTONE_DAYS_THRESHOLD_7', 'MILESTONE_PENALTY_LESS_THAN_3_DAYS', 'MILESTONE_PENALTY_LESS_THAN_5_DAYS', 'MILESTONE_PENALTY_LESS_THAN_7_DAYS', 'MILESTONE_PENALTY_MORE_THAN_7_DAYS', 'OVERDUE_MULTIPLIER', 'RAISED_NOT_SENT_WEIGHT_FIRST_ELIF', 'RAISED_NOT_SENT_WEIGHT_FIRST_TOLERANCE_LEVEL', 'RAISED_NOT_SENT_WEIGHT_SECOND_ELIF', 'RAISED_NOT_SENT_WEIGHT_SECOND_TOLERANCE_LEVEL', 'RAISED_NOT_SENT_WEIGHT_ZERO', 'SCORE_THRESHOLD_GREEN', 'SCORE_THRESHOLD_YELLOW', 'STATUS_SCORE_COMPLETED', 'STATUS_SCORE_NOT_COMPLETED', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__', 'validate_config']
STATUS_SCORE_COMPLETED: 5
STATUS_SCORE_NOT_COMPLETED: 20
Number of invoices fetched: 42395
Number of contracts fetched: 7652


  parent_aggregated['weighted_interference_score'] = grouped_parent.apply(compute_parent_weighted_score).values


Aggregated parent company data saved to C:\Users\ak012\Documents\Projects\Customers\aggregated_parent_companies.xlsx
Aggregated parent company data saved to C:\Users\ak012\Documents\Projects\Customers\aggregated_parent_companies.json
Inserted 5353 documents into 'company_customers' collection.
