# Task6_FinTech_Vendor_Scorecard.ipynb

## Import dependencies

In [None]:
# Objective: Develop a FinTech Vendor Scorecard by combining NER extractions with Telegram metadata.

# --- Mount Google Drive ---
# The project paths configured further down in this notebook all start with
# /content/drive, so the mount has to happen before any data or model is read.
from google.colab import drive
drive.mount('/content/drive')

# --- Step 1: Install Necessary Libraries ---
# NOTE: the leading "!" is IPython/Colab shell syntax; this line only works
# inside a notebook cell, not when the file is run as a plain Python script.
!pip install transformers pandas numpy

# --- Step 2: Import Libraries ---
import os
import json
import numpy as np
import pandas as pd
from datetime import datetime, timedelta, timezone # Import timezone for robust datetime handling
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import re # For price extraction

## --- Configuration ---

In [None]:
# Root folder of the project on Google Drive.
# IMPORTANT: change 'colab_projects/EthioMart_NER' if your project lives elsewhere.
DRIVE_PROJECT_BASE_PATH = "/content/drive/MyDrive/colab_projects/EthioMart_NER"

# Preprocessed messages (JSON) that the scorecard is built from.
PREPROCESSED_DATA_PATH = os.path.join(
    DRIVE_PROJECT_BASE_PATH,
    "data/preprocessed_data/preprocessed_amharic_ecommerce_messages.json",
)

# Best-performing NER model saved to Google Drive by Task 4.
# Hardcoded here; ideally Task 4 would write this path to a file that is read back.
BEST_MODEL_PATH = os.path.join(
    DRIVE_PROJECT_BASE_PATH,
    "XLM-R-Amharic-NER_ner_output/final_model",
)
# Optional alternative: read the path from a file when it exists.
# try:
#     with open(os.path.join(DRIVE_PROJECT_BASE_PATH, "best_model_path.txt"), "r") as f:
#         BEST_MODEL_PATH = f.read().strip()
#     print(f"Loaded best model path from file: {BEST_MODEL_PATH}")
# except FileNotFoundError:
#     print("best_model_path.txt not found. Using default BEST_MODEL_PATH.")


# Entity label set; must match the labels used when the model was trained.
LABEL_NAMES = [
    "O",
    "B-PRODUCT", "I-PRODUCT",
    "B-LOC", "I-LOC",
    "B-PRICE", "I-PRICE",
]

# --- Step 3: Load Preprocessed Data ---
# Best-effort load: on any handled failure the notebook keeps running with an
# empty list, so later steps never hit a NameError on all_messages_data.
print(f"Loading preprocessed data from: {PREPROCESSED_DATA_PATH}")
all_messages_data = []
try:
    with open(PREPROCESSED_DATA_PATH, 'r', encoding='utf-8') as f:
        loaded_messages = json.load(f)
except FileNotFoundError:
    print(f"Error: Data file not found at {PREPROCESSED_DATA_PATH}. Please ensure it's uploaded or path is correct.")
except json.JSONDecodeError as e:
    print(f"Error decoding JSON from {PREPROCESSED_DATA_PATH}: {e}")
else:
    # The scorecard loop iterates message dicts; any other top-level JSON
    # shape (object, string, number) would crash it, so reject it here.
    if isinstance(loaded_messages, list):
        all_messages_data = loaded_messages
        print(f"Successfully loaded {len(all_messages_data)} messages.")
    else:
        print(f"Error: Expected a JSON list of messages in {PREPROCESSED_DATA_PATH}, got {type(loaded_messages).__name__}. Continuing with no messages.")

# --- Step 4: Load the Best Fine-Tuned NER Model ---
print(f"Loading best NER model from: {BEST_MODEL_PATH}")

# Pessimistic default: only flipped once model, tokenizer and pipeline all exist.
model_loaded_for_scorecard = False
try:
    model = AutoModelForTokenClassification.from_pretrained(BEST_MODEL_PATH)
    tokenizer = AutoTokenizer.from_pretrained(BEST_MODEL_PATH)
    ner_pipeline = pipeline(
        "ner",
        model=model,
        tokenizer=tokenizer,
        aggregation_strategy="simple",
    )
    print("NER model loaded successfully.")
    model_loaded_for_scorecard = True
except Exception as load_error:
    # Any load failure disables the scorecard instead of aborting the notebook.
    print(f"Error loading NER model: {load_error}. Skipping scorecard calculation.")


if model_loaded_for_scorecard:
    # --- Step 5: NER Inference and Data Augmentation ---
    print("\nRunning NER inference on all preprocessed messages...")

    # Function to extract numerical price from NER output
    def extract_numerical_price(ner_output):
        """Return the mean numeric price found in the PRICE entities, or None.

        For every entity whose 'entity_group' is 'PRICE', commas are stripped
        from its 'word', the text is lower-cased, and the first number in it
        is taken as the price. The value is multiplied by 1000 when the text
        contains 'ሺህ' or 'thousand'. Entities without any digits are ignored.
        """
        parsed_values = []
        price_spans = (item for item in ner_output if item['entity_group'] == 'PRICE')
        for span in price_spans:
            normalized = span['word'].replace(',', '').lower()
            first_number = re.search(r'\d+\.?\d*', normalized)
            if first_number is None:
                continue
            try:
                amount = float(first_number.group(0))
            except ValueError:
                continue
            # Scale prices written in thousands (Amharic or English wording).
            if 'ሺህ' in normalized or 'thousand' in normalized:
                amount *= 1000
            parsed_values.append(amount)
        return np.mean(parsed_values) if parsed_values else None

    # Per-channel accumulator keyed by channel_id. Each entry keeps the
    # channel's posts plus the running totals read by the metric step:
    # 'name', 'posts', 'first_post_date', 'last_post_date', 'total_views',
    # 'total_prices', 'price_count', 'top_post_views', 'top_post_details'.
    channel_data = {}
    total_messages = len(all_messages_data)

    for i, message in enumerate(all_messages_data):
        if (i + 1) % 5000 == 0:  # Periodic progress report for large datasets
            print(f"Processed {i+1}/{total_messages} messages for scorecard NER.")

        # A non-dict record has no fields to read; skip it rather than crash.
        if not isinstance(message, dict):
            continue

        channel_id = message.get('channel_id')
        cleaned_text = message.get('cleaned_text', '')
        # Posts without a channel or without text cannot be attributed or tagged.
        if not channel_id or not cleaned_text:
            continue

        # Read the ID once with .get() so a record lacking 'id' is still
        # scored (its ID is recorded as None) instead of raising KeyError.
        message_id = message.get('id')
        channel_name = message.get('channel_name', f"Channel_{channel_id}")
        timestamp_str = message.get('date')

        # A present-but-null (or otherwise non-numeric) 'views' value would
        # break the sum and the top-post comparison below; count it as 0.
        raw_views = message.get('views', 0)
        views = raw_views if isinstance(raw_views, (int, float)) else 0

        if channel_id not in channel_data:
            channel_data[channel_id] = {
                'name': channel_name,
                'posts': [],
                # Timezone-aware UTC sentinels meaning "no dated post seen yet".
                'first_post_date': datetime.max.replace(tzinfo=timezone.utc),
                'last_post_date': datetime.min.replace(tzinfo=timezone.utc),
                'total_views': 0,
                'total_prices': 0,
                'price_count': 0,
                'top_post_views': -1,  # -1 so that even a 0-view post can become the top post
                'top_post_details': {}
            }
        channel_entry = channel_data[channel_id]

        # Run NER; a failure on one message must not stop the whole run.
        try:
            ner_results = ner_pipeline(cleaned_text)
        except Exception as e:
            print(f"Warning: NER inference failed for message ID {message_id} for scorecard: {e}")
            ner_results = []

        numerical_price = extract_numerical_price(ner_results)

        # Parse the post date into a timezone-aware UTC datetime (or None).
        post_timestamp = None
        if timestamp_str:
            dt_object = None
            if isinstance(timestamp_str, str):
                iso_text = timestamp_str
                # datetime.fromisoformat() only accepts a trailing "Z" from
                # Python 3.11 on; rewrite it as an explicit UTC offset.
                if iso_text.endswith('Z'):
                    iso_text = iso_text[:-1] + '+00:00'
                try:
                    dt_object = datetime.fromisoformat(iso_text)
                except ValueError:
                    dt_object = None

            if dt_object is None:
                print(f"Warning: Could not parse timestamp {timestamp_str} for message ID {message_id}. Skipping timestamp for this post.")
            elif dt_object.tzinfo is None:
                # Naive timestamps are assumed to already be in UTC.
                post_timestamp = dt_object.replace(tzinfo=timezone.utc)
            else:
                post_timestamp = dt_object.astimezone(timezone.utc)

        post_details = {
            'id': message_id,
            'text': cleaned_text,
            'views': views,
            'timestamp': post_timestamp,
            'ner_entities': ner_results,
            'numerical_price': numerical_price
        }
        channel_entry['posts'].append(post_details)

        channel_entry['total_views'] += views
        if numerical_price is not None:
            channel_entry['total_prices'] += numerical_price
            channel_entry['price_count'] += 1

        # Track the most-viewed post; the first post with the highest count wins ties.
        if views > channel_entry['top_post_views']:
            channel_entry['top_post_views'] = views
            top_product = next((e['word'] for e in ner_results if e['entity_group'] == 'PRODUCT'), 'N/A')
            top_price = numerical_price if numerical_price is not None else 'N/A'
            channel_entry['top_post_details'] = {
                'message_id': message_id,
                'views': views,
                'product': top_product,
                'price': top_price
            }

        # Widen the channel's activity window with this post's date.
        if post_timestamp is not None:
            if post_timestamp < channel_entry['first_post_date']:
                channel_entry['first_post_date'] = post_timestamp
            if post_timestamp > channel_entry['last_post_date']:
                channel_entry['last_post_date'] = post_timestamp

    print("NER inference for scorecard complete.")

    # --- Step 6: Calculate Key Vendor Metrics and Lending Score ---
    # Builds one scorecard row (a dict) per channel in vendor_scores.
    vendor_scores = []
    print("\nCalculating vendor metrics and lending scores...")

    # Heuristic ceilings used to scale each metric into [0, 1]; values at or
    # above the ceiling are capped at 1.0. Defined once, not per channel.
    max_posting_freq = 50.0
    max_avg_views = 10000.0
    max_avg_price = 50000.0

    # Sentinels the aggregation step leaves in place when a channel has no
    # post with a parseable date.
    no_first_post = datetime.max.replace(tzinfo=timezone.utc)
    no_last_post = datetime.min.replace(tzinfo=timezone.utc)

    if not channel_data:
        print("No channel data available to calculate metrics.")
    else:
        for data in channel_data.values():
            num_posts = len(data['posts'])

            # Whole days between the first and last dated post (0 if undated).
            activity_duration_days = 0
            if data['first_post_date'] != no_first_post and data['last_post_date'] != no_last_post:
                activity_duration_days = (data['last_post_date'] - data['first_post_date']).days

            if activity_duration_days > 0:
                posting_frequency_per_week = (num_posts / activity_duration_days) * 7
            else:
                # Less than one full day of activity: count all posts as one week's worth.
                posting_frequency_per_week = num_posts

            average_views_per_post = data['total_views'] / num_posts if num_posts > 0 else 0

            # None (not 0) when no price was extracted, so the table can show
            # 'N/A' instead of a misleading 0 ETB average.
            if data['price_count'] > 0:
                average_price_point = data['total_prices'] / data['price_count']
            else:
                average_price_point = None

            normalized_posting_freq = min(posting_frequency_per_week / max_posting_freq, 1.0)
            normalized_avg_views = min(average_views_per_post / max_avg_views, 1.0)
            # A vendor without any extracted price contributes 0 to the price component.
            normalized_avg_price = min((average_price_point or 0) / max_avg_price, 1.0)

            # Weighted average: 40% activity, 40% reach, 20% price segment.
            lending_score = (normalized_posting_freq * 0.4) + \
                            (normalized_avg_views * 0.4) + \
                            (normalized_avg_price * 0.2)

            vendor_scores.append({
                'Vendor Channel': data['name'],
                'Posts/Week': round(posting_frequency_per_week, 2),
                'Avg. Views/Post': round(average_views_per_post, 2),
                'Avg. Price (ETB)': round(average_price_point, 2) if average_price_point is not None else 'N/A',
                'Top Post Product': data['top_post_details'].get('product', 'N/A'),
                'Top Post Price (ETB)': data['top_post_details'].get('price', 'N/A'),
                'Lending Score': round(lending_score, 4)
            })

        # --- Step 7: Present Final "Vendor Scorecard" Table ---
        # Rank vendors from highest to lowest lending score.
        vendor_scorecard_df = pd.DataFrame(vendor_scores)
        vendor_scorecard_df = vendor_scorecard_df.sort_values(by='Lending Score', ascending=False)

        print("\n--- FinTech Vendor Scorecard for Micro-Lending ---")
        print("\n**Note on NER Accuracy:**")
        print("The NER model's F1-score was approximately 15% during fine-tuning due to the small labeled dataset (30-50 messages).")
        print("Therefore, the 'Top Post Product' and 'Avg. Price (ETB)' metrics derived from NER extractions may contain significant inaccuracies.")
        print("To improve the reliability of this scorecard, a substantially larger labeled dataset for NER is CRITICAL.")
        try:
            scorecard_table = vendor_scorecard_df.to_markdown(index=False)
        except ImportError:
            # to_markdown() depends on the optional 'tabulate' package, which
            # the install step of this notebook does not request; fall back
            # to a plain-text table rather than losing the result.
            scorecard_table = vendor_scorecard_df.to_string(index=False)
        print(scorecard_table)

        print("\n--- Lending Score Design Notes ---")
        print("The 'Lending Score' is a simple weighted average designed for demonstration purposes. Its components are:")
        print(" - Posting Frequency: Measures vendor activity.")
        print(" - Average Views per Post: Indicates market reach and customer interest.")
        print(" - Average Price Point: Provides insight into the vendor's product segment.")
        print("Normalization factors (e.g., max_posting_freq, max_avg_views) are heuristic and should be refined with real business data distributions.")
        print("Weights (e.g., 0.4, 0.4, 0.2) are illustrative and should be determined by FinTech experts based on lending criteria.")
else:
    print("Skipping Vendor Scorecard calculation as NER model could not be loaded or no preprocessed messages were available.")
