In [None]:
# FinTech Vendor Scorecard

import pandas as pd
import numpy as np
from transformers import pipeline
import os
import re

In [None]:
# --- 1. Load Data and NER Model ---

# Define paths.
# The project root defaults to the original Colab Google Drive location, but it
# can be overridden with the ETHIO_MART_PROJECT_PATH environment variable so the
# notebook can run elsewhere without editing this cell. An unset or empty
# variable falls back to the default.
GDRIVE_PROJECT_PATH = os.environ.get('ETHIO_MART_PROJECT_PATH') or '/content/drive/MyDrive/Ethio_mart'
PREPROCESSED_CSV = os.path.join(GDRIVE_PROJECT_PATH, 'preprocessed_telegram_data.csv')
SAVED_MODEL_PATH = os.path.join(GDRIVE_PROJECT_PATH, 'models', 'mbert-cased-ner-finetuned')

print("--- Loading data and model ---")
# Preprocessed Telegram posts; later cells read the 'Views', 'Date',
# 'Channel Username' and 'cleaned_text' columns from this frame.
df = pd.read_csv(PREPROCESSED_CSV)

# Load the NER pipeline from Task 5 (model and tokenizer share one directory).
ner_pipeline = pipeline(
    "token-classification",
    model=SAVED_MODEL_PATH,
    tokenizer=SAVED_MODEL_PATH,
    # "simple" merges adjacent tokens carrying the same entity into one span,
    # reported under the 'entity_group' key that extract_price reads.
    aggregation_strategy="simple"
)



In [None]:
# --- 2. Feature Engineering: Price Extraction Function ---
# This function uses the NER model to attempt to extract a price.
def extract_price(text):
    """Return the first price the NER model finds in ``text``, or NaN.

    The text is run through the module-level ``ner_pipeline``. Entities are
    scanned in the order the pipeline returns them; the first one whose
    ``entity_group`` is ``'PRICE'`` and whose ``word`` contains a number
    decides the result.

    Number parsing understands comma thousands separators and a decimal
    part, so ``"1,500"`` yields ``1500.0`` and ``"2500.50"`` yields
    ``2500.5`` (rather than stopping at the first separator).

    Parameters
    ----------
    text : object
        Message text. Anything that is not a non-blank ``str`` is treated as
        "no price" without calling the model.

    Returns
    -------
    float
        The parsed price, or ``np.nan`` when the input is unusable, no PRICE
        entity contains a number, or inference fails.

    Warns
    -----
    RuntimeWarning
        If the pipeline (or reading its output) raises. The failure is
        reported instead of being silently swallowed; NaN is still returned
        so a single bad row does not abort a whole ``Series.apply``.
    """
    # Local import keeps this cell self-contained (no change to the import cell).
    import warnings

    # Missing values arrive as float NaN from pandas; blank strings carry no
    # price either, so skip the (expensive) model call for both.
    if not isinstance(text, str) or not text.strip():
        return np.nan

    # Either a comma-grouped number ("1,500", "12,000.50") or a plain one
    # ("1500", "99.9"). The lookahead stops "1,5000" being read as "1,500".
    number_pattern = r'\d{1,3}(?:,\d{3})+(?!\d)(?:\.\d+)?|\d+(?:\.\d+)?'

    try:
        predictions = ner_pipeline(text)
        for entity in predictions:
            # Only PRICE entities are of interest.
            if entity['entity_group'] != 'PRICE':
                continue
            match = re.search(number_pattern, entity['word'])
            if match:
                # Drop thousands separators before converting.
                return float(match.group(0).replace(',', ''))
    except Exception as e:
        # Best effort: report the failure but keep processing other rows.
        # The message only varies by exception type, so Python's default
        # warning filter shows it once per type instead of once per row.
        warnings.warn(
            f"NER price extraction failed ({type(e).__name__}); returning NaN.",
            RuntimeWarning,
        )
    return np.nan


In [None]:
# --- 3. Data Preparation ---
# Derive the cleaned frame with a non-mutating chain instead of `inplace=True`
# plus column assignment on the loaded frame:
#   1. drop rows missing any field the scorecard needs (views, date, vendor);
#   2. parse 'Date' into timezone-aware (UTC) datetimes.
# The original row index labels are kept (no reset).
df = (
    df
    .dropna(subset=['Views', 'Date', 'Channel Username'])
    .assign(Date=lambda frame: pd.to_datetime(frame['Date'], utc=True))
)


In [None]:
# --- 4. Apply NER to Extract Prices ---
# Integrates the NER model into the business logic: every cleaned message is
# passed through `extract_price`, one model call per row, and the result is
# stored in a new 'extracted_price' column.
# NOTE: the model's F1-score was 0.0, so this column is expected to be mostly
# empty (NaN). That outcome is itself a key finding for the report.
print("\n--- Applying NER model to extract prices (this may take a moment)... ---")
df['extracted_price'] = df['cleaned_text'].map(extract_price)
print("--- Price extraction complete. ---")


In [None]:
# --- 5. Calculate Vendor Metrics ---
print("\n--- Calculating vendor metrics... ---")

# Group by vendor (channel)
vendor_groups = df.groupby('Channel Username')

# Dictionary to hold our final results, keyed by channel username
vendor_analytics = {}

for name, group in vendor_groups:
    # Whole days between this vendor's first and last post.
    time_span_days = (group['Date'].max() - group['Date'].min()).days
    # Use an observation window of at least one week. This covers the
    # same-day case (span of 0, which would divide by zero) and also spans of
    # 1-6 days, which would otherwise divide by a fraction of a week and
    # inflate the rate (e.g. two posts one day apart -> 14 posts/week).
    time_span_weeks = max(time_span_days / 7, 1)

    # Calculate metrics
    post_count = len(group)
    posting_frequency = post_count / time_span_weeks
    avg_views = group['Views'].mean()
    avg_price = group['extracted_price'].mean()  # NaN if no prices were found

    # Store results
    vendor_analytics[name] = {
        'Posts/Week': posting_frequency,
        'Avg. Views/Post': avg_views,
        'Avg. Price (ETB)': avg_price
    }

# Convert the dictionary to a DataFrame (one row per vendor). The columns are
# given explicitly so the frame still has them when there are no vendors;
# otherwise the scoring cell would fail on a missing column.
scorecard_df = pd.DataFrame.from_dict(
    vendor_analytics,
    orient='index',
    columns=['Posts/Week', 'Avg. Views/Post', 'Avg. Price (ETB)'],
)


In [None]:
# --- 6. Create the Final "Lending Score" ---
# A simple weighted score as defined in the task description:
#     Lending Score = 0.5 * Avg. Views/Post + 0.5 * Posts/Week
# A missing (NaN) metric counts as 0. 'Avg. Price (ETB)' is reported in the
# table but is not part of the score.
# NOTE(review): both metrics enter on their raw scales - confirm that this is
# what the task description intends.
reach_component = scorecard_df['Avg. Views/Post'].fillna(0) * 0.5
activity_component = scorecard_df['Posts/Week'].fillna(0) * 0.5
scorecard_df['Lending Score'] = reach_component + activity_component

# Highest score first, so the top candidates lead the table.
scorecard_df = scorecard_df.sort_values(by='Lending Score', ascending=False)

# --- 7. Present the Final Vendor Scorecard ---
banner = "=" * 80
print("\n" + banner)
print("--- FinTech Vendor Scorecard for Micro-Lending ---")
print(banner)
# Rounded only for display; the stored frame keeps full precision.
display(scorecard_df.round(2))

# Closing commentary printed beneath the table.
analysis_lines = (
    "\n--- Analysis of Scorecard ---",
    "The scorecard successfully ranks vendors based on their activity (Posting Frequency) and reach (Average Views).",
    "The 'Avg. Price (ETB)' column is likely empty (NaN) because our NER model's performance was poor (F1=0.0).",
    "This result powerfully demonstrates the project's potential: with a better-performing NER model (trained on more data), this scorecard would become a highly effective tool for identifying valuable vendors by automatically extracting and averaging their product price points.",
)
for analysis_line in analysis_lines:
    print(analysis_line)