# Vendor Analytics Engine
This notebook combines NER-extracted entities with Telegram post metadata to create a vendor scorecard with business insights.

In [ ]:
import os
import json
import pandas as pd
from datetime import datetime
from collections import defaultdict
import numpy as np
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch
from src.utils.ner_data_utils import build_label_maps


In [ ]:
# Read the scraped Telegram posts (one JSON object per line) into a DataFrame
jsonl_path = '../data/telegram_messages.jsonl'
with open(jsonl_path, 'r', encoding='utf-8') as f:
    # Every line of the file is parsed as an independent JSON document.
    messages = [json.loads(line) for line in f]
df = pd.DataFrame(messages)
print(df.head())


In [ ]:
# Load the token-classification checkpoint and its tokenizer from the same directory
MODEL_DIR = 'notebooks/amharicnermodel'  # Adjust as needed
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForTokenClassification.from_pretrained(MODEL_DIR)
# Switch to inference mode (kept as the last expression of the cell).
model.eval()


In [ ]:
# Helper: Run NER model to extract entities from text
def extract_entities(text):
    """Tag ``text`` with the NER model and group the predictions into entities.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        text: Message text. Input beyond the tokenizer's limit is truncated.
            Non-string or blank values yield an empty list.

    Returns:
        A list of dicts, one per entity, with keys ``label`` (entity type
        without its ``B-``/``I-`` prefix), ``tokens`` (the tokenizer tokens
        in the span) and ``text`` (the tokens joined back into a string).
    """
    # Guard against missing or blank text, which the tokenizer call below
    # cannot handle.
    if not isinstance(text, str) or not text.strip():
        return []

    inputs = tokenizer(text, return_tensors='pt', truncation=True)
    with torch.no_grad():
        logits = model(**inputs).logits
    preds = torch.argmax(logits, dim=-1)[0].cpu().numpy()
    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])

    id2label = getattr(model.config, 'id2label', None) or {}
    special_tokens = set(getattr(tokenizer, 'all_special_tokens', ()))

    def label_for(pred_id):
        # Transformers configs key id2label by int; a dict read straight from
        # JSON would use str keys. Accept both, and fall back to the raw id.
        idx = int(pred_id)
        if idx in id2label:
            return id2label[idx]
        return id2label.get(str(idx), str(idx))

    entities = []
    current = None
    for token, pred in zip(tokens, preds):
        label = label_for(pred)

        # Special tokens and 'O' predictions never belong to an entity and
        # close whatever span is open.
        if token in special_tokens or label == 'O':
            if current is not None:
                entities.append(current)
                current = None
            continue

        prefix, sep, kind = label.partition('-')
        if sep and prefix in ('B', 'I'):
            starts_new = prefix == 'B'
        else:
            # Unprefixed label: treat the whole label as the entity type.
            kind = label
            starts_new = False

        if current is not None and not starts_new and current['label'] == kind:
            # Continuation of the open span (compare on the stripped type).
            current['tokens'].append(token)
        else:
            # 'B-' tag, a type change, or an 'I-' tag with no open span.
            if current is not None:
                entities.append(current)
            current = {'label': kind, 'tokens': [token]}

    if current is not None:
        entities.append(current)

    # Join tokens for each entity
    for ent in entities:
        ent['text'] = tokenizer.convert_tokens_to_string(ent['tokens'])
    return entities


In [ ]:
# Extract entities and enrich dataframe
def extract_price(entities):
    """Return the numeric value of the first parseable price entity.

    Args:
        entities: Iterable of entity dicts carrying ``label`` and ``text``.

    Returns:
        The price as a ``float``, or ``None`` when no entity labelled
        ``price`` (case-insensitive) contains a parseable number.
    """
    for ent in entities:
        if ent['label'].lower() != 'price':
            continue
        # Keep decimal digits and dots only. ``isdecimal`` (unlike ``isdigit``)
        # excludes characters such as superscript digits that ``float`` cannot
        # parse and that would otherwise spoil a valid number.
        numeric = ''.join(c for c in ent['text'] if c.isdecimal() or c == '.')
        try:
            return float(numeric)
        except ValueError:
            # Nothing usable in this entity (e.g. '' or '1.2.3'); try the next.
            continue
    return None
# Tag every message with its NER entities, then derive a numeric price column from them.
entity_column = df['text'].apply(extract_entities)
df['entities'] = entity_column
df['price'] = df['entities'].apply(extract_price)
preview_columns = ['channel', 'text', 'price']
print(df[preview_columns].head())


In [ ]:
# Calculate metrics per vendor/channel
def posts_per_week(timestamps):
    """Estimate the posting rate, in posts per week, from post timestamps.

    Args:
        timestamps: Values accepted by ``pd.to_datetime``.

    Returns:
        ``0`` when fewer than two timestamps are given. Otherwise the number
        of timestamps divided by the span between the earliest and latest one,
        measured in whole days and expressed in weeks. When that span is
        shorter than one day, the plain count is returned.
    """
    parsed = pd.to_datetime(timestamps)
    n_posts = len(parsed)
    if n_posts < 2:
        return 0
    # ``.days`` keeps only the whole-day part of the elapsed time.
    span_weeks = (parsed.max() - parsed.min()).days / 7.0
    if span_weeks > 0:
        return n_posts / span_weeks
    return n_posts
# Build one scorecard row per channel: engagement, posting frequency,
# average extracted price, a simple lending score, and the most-viewed post.
scorecard = []
for channel, group in df.groupby('channel'):
    avg_views = group['views'].mean() if 'views' in group else np.nan
    freq = posts_per_week(group['timestamp'])
    avg_price = group['price'].mean() if group['price'].notnull().any() else np.nan
    top_post = group.loc[group['views'].idxmax()] if 'views' in group and group['views'].notnull().any() else None
    # NaN is truthy, so `avg_views or 0` would leave NaN in place and turn the
    # whole score into NaN; replace a missing average explicitly.
    views_for_score = 0 if pd.isna(avg_views) else avg_views
    # Equal-weight blend of average views and posting frequency.
    lending_score = views_for_score * 0.5 + freq * 0.5
    scorecard.append({
        'Vendor': channel,
        'Avg. Views/Post': avg_views,
        'Posts/Week': freq,
        'Avg. Price (ETB)': avg_price,
        'Lending Score': lending_score,
        'Top Post': top_post['text'] if top_post is not None else '',
        'Top Post Views': top_post['views'] if top_post is not None else '',
        'Top Post Price': top_post['price'] if top_post is not None else ''
    })
scorecard_df = pd.DataFrame(scorecard)
print(scorecard_df[['Vendor', 'Avg. Views/Post', 'Posts/Week', 'Avg. Price (ETB)', 'Lending Score']])
