In [41]:
import pandas as pd
import json
import re
from collections import defaultdict
from sklearn.preprocessing import MinMaxScaler
from IPython.display import display, HTML

def categorize_tag(tag):
    """Categorizes a tag string based on regex patterns."""
    if pd.isna(tag): return 'unknown'
    tag_str = str(tag).lower().strip()
    if re.match(r'^\d{2}-[a-z]{3}-\d{2}-(99acres|magicbricks|olx|housing)', tag_str): return 'date_source_combo'
    if re.match(r'^\d{2}-\d{2}-\d{4}$', tag_str): return 'date_only'
    if tag_str in ['sell-leads', 'cleardeals-lead', 'lead', 'recalling']: return 'generic_status'
    if re.search(r'\d', tag_str) and re.search(r'[a-zA-Z]', tag_str) and '-' in tag_str: return 'property_identifier'
    return 'other'

def create_property_ranking(df_property_leads, all_expired_tags):
    """
    Performs feature engineering to create a sophisticated ranking score for properties.
    """
    print("\n📈 Engineering features for property ranking...")
    grouped = df_property_leads.groupby('Tags')
    features = []
    for tag, group in grouped:
        if tag not in all_expired_tags: continue
        site_visit_done_count = (group['To Lead Type'] == 'Site visit done').sum()
        site_visit_scheduled_count = (group['To Lead Type'] == 'Site visit scheduled').sum()
        schedule_to_done_conversion = site_visit_done_count / site_visit_scheduled_count if site_visit_scheduled_count > 0 else 0
        group = group.sort_values('At')
        time_diffs = group['At'].diff().dt.total_seconds() / (3600 * 24)
        avg_days_between_interactions = time_diffs.mean()
        features.append({
            'Tag': tag, 'total_interactions': len(group), 'site_visit_done_count': site_visit_done_count,
            'site_visit_scheduled_count': site_visit_scheduled_count, 'schedule_to_done_conversion': schedule_to_done_conversion,
            'avg_days_between_interactions': avg_days_between_interactions
        })

    if not features:
        print("⚠️ No expired properties found in the interaction data to rank."); return pd.DataFrame()
        
    ranking_df = pd.DataFrame(features).fillna(0)
    original_features_df = ranking_df.copy()

    scaler = MinMaxScaler()
    cols_to_scale = ['total_interactions', 'site_visit_done_count', 'site_visit_scheduled_count', 'schedule_to_done_conversion']
    ranking_df[cols_to_scale] = scaler.fit_transform(ranking_df[cols_to_scale])
    ranking_df['velocity_score'] = 1 - scaler.fit_transform(ranking_df[['avg_days_between_interactions']])
    
    weights = {'site_visit_done_count': 0.40, 'site_visit_scheduled_count': 0.25, 'schedule_to_done_conversion': 0.15, 'velocity_score': 0.10, 'total_interactions': 0.10}
    ranking_df['Rank_Score'] = (
        ranking_df['site_visit_done_count'] * weights['site_visit_done_count'] +
        ranking_df['site_visit_scheduled_count'] * weights['site_visit_scheduled_count'] +
        ranking_df['schedule_to_done_conversion'] * weights['schedule_to_done_conversion'] +
        ranking_df['velocity_score'] * weights['velocity_score'] +
        ranking_df['total_interactions'] * weights['total_interactions']
    )
    ranking_df['Weight'] = (ranking_df['Rank_Score'] * 10).astype(int) + 1
    
    final_ranking_df = pd.merge(original_features_df, ranking_df[['Tag', 'Rank_Score', 'Weight']], on='Tag')
    final_ranking_df.sort_values(by='Rank_Score', ascending=False, inplace=True)
    final_ranking_df.reset_index(drop=True, inplace=True)
    final_ranking_df['Overall_Rank'] = final_ranking_df.index + 1
    
    print("✅ Ranking complete.")
    return final_ranking_df

def generate_full_lead_report(ranking_df, routed_leads_df):
    """
    Generates a detailed, visually appealing HTML report for all routed leads.
    """
    print("\n\n--- 📄 Generating Full HTML Lead Report ---")
    
    if routed_leads_df.empty:
        print("No leads were routed, so no report can be generated.")
        return

    # --- 1. Calculate Summary Statistics ---
    total_assignments = len(routed_leads_df)
    lead_counts_per_property = routed_leads_df.groupby('routed_to_expired_property_tag')['lead_contact'].nunique()
    total_properties_receiving_leads = len(lead_counts_per_property)
    lead_distribution = lead_counts_per_property.value_counts().sort_index().reset_index()
    lead_distribution.columns = ['Number of Leads Received', 'Number of Properties']

    # --- 2. Prepare data for the detailed log ---
    rank_map = ranking_df.set_index('Tag')['Overall_Rank'].to_dict()
    grouped = routed_leads_df.groupby('routed_to_expired_property_tag')
    html_parts = []
    sorted_groups = sorted(grouped, key=lambda x: len(x[1].drop_duplicates(subset=['lead_contact'])), reverse=True)

    for expired_tag, group_df in sorted_groups:
        num_unique_leads = len(group_df.drop_duplicates(subset=['lead_contact']))
        rank = rank_map.get(expired_tag, 'N/A')
        details_header = f"""<details><summary><strong>Rank #{rank}: {expired_tag}</strong> &mdash; Received {num_unique_leads} Unique New Leads</summary>"""
        leads_table = group_df[['lead_contact', 'original_active_property_tag', 'timestamp']].to_html(index=False)
        details_footer = "</details>"
        html_parts.append(details_header + leads_table + details_footer)

    # --- 3. Build the final HTML file ---
    final_html = f"""
    <html><head><title>Complete Daily Lead Routing Report</title>
    <style>
        body {{ font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif; margin: 40px; color: #333; }}
        h1, h2, h3 {{ color: #2c3e50; }}
        .summary-box {{ background-color: #eaf2f8; border-left: 5px solid #3498db; padding: 15px; margin-bottom: 30px; }}
        details {{ border: 1px solid #ddd; border-radius: 8px; margin-bottom: 10px; overflow: hidden; }}
        summary {{ padding: 15px; font-size: 1.1em; font-weight: bold; background-color: #f7f7f7; cursor: pointer; outline: none; }}
        table {{ width: 100%; border-collapse: collapse; }}
        th, td {{ padding: 12px 15px; border-top: 1px solid #ddd; text-align: left; }}
        th {{ background-color: #f2f2f2; }}
        tr:nth-child(even) {{ background-color: #fafafa; }}
    </style>
    </head><body>
    <h1>Complete Lead Routing Report</h1>
    <h2>Summary Statistics</h2>
    <div class="summary-box">
        <p><strong>Total Unique Leads Routed Today:</strong> {total_assignments}</p>
        <p><strong>Total Expired Properties Receiving Leads:</strong> {total_properties_receiving_leads}</p>
    </div>
    <h3>Lead Distribution Breakdown</h3>
    {lead_distribution.to_html(index=False)}
    <h2>Detailed Lead Log</h2>
    <p>Click on each property to see the full list of leads it received.</p>
    {''.join(html_parts)}
    </body></html>
    """
    
    report_path = 'full_lead_report.html'
    with open(report_path, 'w', encoding='utf-8') as f: f.write(final_html)
    print(f"✅ Complete analysis report has been saved to '{report_path}'.")

def route_daily_leads_hybrid():
    """
    Routes new leads using a hybrid model: broadcast for top-tier,
    weighted round-robin for standard-tier properties.
    """
    print("🚀 Starting daily lead routing with HYBRID logic...")
    
    try:
        with open('matches.json', 'r') as f: expired_to_active_matches = json.load(f)
        df_changes = pd.read_csv('ContactTypeChange.csv', low_memory=False)
    except FileNotFoundError as e:
        print(f"❌ Error: Missing a required file. {e}"); return

    print("\n🔍 Cleaning and filtering lead interaction tags...")
    df_changes['tag_type'] = df_changes['Tags'].apply(categorize_tag)
    df_property_leads = df_changes[df_changes['tag_type'] == 'property_identifier'].copy()
    df_property_leads['At'] = pd.to_datetime(df_property_leads['At'], errors='coerce')
    print(f"✅ Found {len(df_property_leads)} interactions linked to specific properties.")

    all_expired_tags = list(expired_to_active_matches.keys())
    ranking_df = create_property_ranking(df_property_leads, all_expired_tags)
    if ranking_df.empty: return

    # --- HYBRID MODEL LOGIC ---
    top_tier_threshold = ranking_df['Overall_Rank'].quantile(0.20)
    top_tier_tags = set(ranking_df[ranking_df['Overall_Rank'] <= top_tier_threshold]['Tag'])
    print(f"\n✨ Identified {len(top_tier_tags)} properties as 'Top Tier' (Top 20%).")
    
    work_scores_dict = ranking_df.set_index('Tag')['Weight'].to_dict()
    
    #most_recent_date = df_property_leads['At'].max().date()
    #print(type(most_recent_date))
    most_recent_date = pd.to_datetime('2025-07-31').date()
    print(type(most_recent_date))
    print(most_recent_date)
    daily_leads = df_property_leads[df_property_leads['At'].dt.date == most_recent_date]
    print(f"\n🗓️ Simulating daily run for date: {most_recent_date}")

    active_to_expired_map = defaultdict(list)
    for expired_tag, active_tags in expired_to_active_matches.items():
        for active_tag in active_tags:
            active_to_expired_map[active_tag].append(expired_tag)

    routed_leads = []; daily_assignments = defaultdict(set); last_assigned_index = defaultdict(int)
    daily_lead_caps = defaultdict(int); MAX_LEADS_PER_DAY = 7

    for _, lead in daily_leads.iterrows():
        active_prop_tag = lead['Tags']
        lead_contact = lead['Contact']
        
        if active_prop_tag in active_to_expired_map:
            candidate_expired_props = active_to_expired_map[active_prop_tag]
            
            # Filter out properties with no rank
            ranked_candidates = [tag for tag in candidate_expired_props if tag in work_scores_dict]
            
            top_tier_candidates = [tag for tag in ranked_candidates if tag in top_tier_tags]
            standard_tier_candidates = [tag for tag in ranked_candidates if tag not in top_tier_tags]

            # 1. Broadcast to all Top Tier candidates
            for expired_tag in top_tier_candidates:
                if lead_contact not in daily_assignments[expired_tag] and daily_lead_caps[expired_tag] < MAX_LEADS_PER_DAY:
                    routed_leads.append({'lead_contact': lead_contact, 'original_active_property_tag': active_prop_tag, 'routed_to_expired_property_tag': expired_tag, 'timestamp': lead['At']})
                    daily_assignments[expired_tag].add(lead_contact)
                    daily_lead_caps[expired_tag] += 1
            
            # 2. Weighted Round-Robin for Standard Tier candidates
            if standard_tier_candidates:
                weighted_list = []
                for expired_tag in standard_tier_candidates:
                    weighted_list.extend([expired_tag] * work_scores_dict.get(expired_tag, 1))
                if weighted_list:
                    idx = last_assigned_index[active_prop_tag]
                    assigned_tag = weighted_list[idx % len(weighted_list)]
                    last_assigned_index[active_prop_tag] += 1
                    
                    if lead_contact not in daily_assignments[assigned_tag] and daily_lead_caps[assigned_tag] < MAX_LEADS_PER_DAY:
                        routed_leads.append({'lead_contact': lead_contact, 'original_active_property_tag': active_prop_tag, 'routed_to_expired_property_tag': assigned_tag, 'timestamp': lead['At']})
                        daily_assignments[assigned_tag].add(lead_contact)
                        daily_lead_caps[assigned_tag] += 1
                        
    if routed_leads:
        routing_results_df = pd.DataFrame(routed_leads)
        print(f"\n✅ Lead routing complete. Created {len(routing_results_df)} new lead assignments.")
        generate_full_lead_report(ranking_df, routing_results_df)
    else:
        print("\nℹ️ No new leads were routed today for the matched properties.")

# --- How to run this ---
# route_daily_leads_hybrid()


In [42]:
route_daily_leads_hybrid()

🚀 Starting daily lead routing with HYBRID logic...

🔍 Cleaning and filtering lead interaction tags...
✅ Found 74902 interactions linked to specific properties.

📈 Engineering features for property ranking...
✅ Ranking complete.

✨ Identified 255 properties as 'Top Tier' (Top 20%).
<class 'datetime.date'>
2025-07-31

🗓️ Simulating daily run for date: 2025-07-31

✅ Lead routing complete. Created 819 new lead assignments.


--- 📄 Generating Full HTML Lead Report ---
✅ Complete analysis report has been saved to 'full_lead_report.html'.
