In [35]:
import pandas as pd
import json
import re
from collections import defaultdict
from sklearn.preprocessing import MinMaxScaler

def categorize_tag(tag):
    """Categorizes a tag string based on regex patterns."""
    if pd.isna(tag):
        return 'unknown'
    tag_str = str(tag).lower().strip()
    if re.match(r'^\d{2}-[a-z]{3}-\d{2}-(99acres|magicbricks|olx|housing)', tag_str):
        return 'date_source_combo'
    if re.match(r'^\d{2}-\d{2}-\d{4}$', tag_str):
        return 'date_only'
    if tag_str in ['sell-leads', 'cleardeals-lead', 'lead', 'recalling']:
        return 'generic_status'
    if re.search(r'\d', tag_str) and re.search(r'[a-zA-Z]', tag_str) and '-' in tag_str:
        return 'property_identifier'
    return 'other'

def create_property_ranking(df_property_leads, all_expired_tags):
    """
    Performs feature engineering to create a sophisticated ranking score for properties.
    """
    print("\n📈 Engineering features for property ranking...")
    
    # Group by property tag to calculate features for each property
    grouped = df_property_leads.groupby('Tags')
    
    features = []
    for tag, group in grouped:
        # We only care about expired properties for the ranking
        if tag not in all_expired_tags:
            continue
            
        # 1. High-Value Action Counts
        site_visit_done_count = (group['To Lead Type'] == 'Site visit done').sum()
        site_visit_scheduled_count = (group['To Lead Type'] == 'Site visit scheduled').sum()
        
        # 2. Lead Quality (Conversion Rate)
        if site_visit_scheduled_count > 0:
            schedule_to_done_conversion = site_visit_done_count / site_visit_scheduled_count
        else:
            schedule_to_done_conversion = 0
            
        # 3. Engagement Velocity
        group = group.sort_values('At')
        time_diffs = group['At'].diff().dt.total_seconds() / (3600 * 24) # Difference in days
        avg_days_between_interactions = time_diffs.mean()
        
        features.append({
            'Tag': tag,
            'total_interactions': len(group),
            'site_visit_done_count': site_visit_done_count,
            'site_visit_scheduled_count': site_visit_scheduled_count,
            'schedule_to_done_conversion': schedule_to_done_conversion,
            'avg_days_between_interactions': avg_days_between_interactions
        })

    if not features:
        print("⚠️ No expired properties found in the interaction data to rank.")
        return pd.DataFrame(), {}
        
    ranking_df = pd.DataFrame(features).fillna(0)
    
    # --- Calculate the final Rank Score ---
    scaler = MinMaxScaler()
    
    # Normalize features so they can be combined in a weighted formula
    cols_to_scale = ['total_interactions', 'site_visit_done_count', 'site_visit_scheduled_count', 'schedule_to_done_conversion']
    ranking_df[cols_to_scale] = scaler.fit_transform(ranking_df[cols_to_scale])
    
    # Invert the time difference so that shorter time = higher score
    ranking_df['velocity_score'] = 1 - scaler.fit_transform(ranking_df[['avg_days_between_interactions']])
    
    # Define the weights for each feature. Site visits are most important.
    weights = {
        'site_visit_done_count': 0.40,
        'site_visit_scheduled_count': 0.25,
        'schedule_to_done_conversion': 0.15,
        'velocity_score': 0.10,
        'total_interactions': 0.10
    }
    
    # Calculate the final weighted score
    ranking_df['Rank_Score'] = (
        ranking_df['site_visit_done_count'] * weights['site_visit_done_count'] +
        ranking_df['site_visit_scheduled_count'] * weights['site_visit_scheduled_count'] +
        ranking_df['schedule_to_done_conversion'] * weights['schedule_to_done_conversion'] +
        ranking_df['velocity_score'] * weights['velocity_score'] +
        ranking_df['total_interactions'] * weights['total_interactions']
    )
    
    # Convert the score to an integer weight for the round-robin
    ranking_df['Weight'] = (ranking_df['Rank_Score'] * 10).astype(int) + 1 # Min weight of 1
    
    ranking_df.sort_values(by='Rank_Score', ascending=False, inplace=True)
    
    print("✅ Ranking complete. Top 5 most active/high-quality expired properties:")
    print(ranking_df[['Tag', 'Rank_Score', 'Weight']].head())
    
    work_scores_dict = ranking_df.set_index('Tag')['Weight'].to_dict()
    
    return ranking_df, work_scores_dict


def route_daily_leads_with_ranking():
    """
    Ranks expired properties with advanced features and routes new leads.
    """
    print("🚀 Starting daily lead routing with advanced property ranking...")
    
    try:
        with open('matches.json', 'r') as f:
            expired_to_active_matches = json.load(f)
        df_changes = pd.read_csv('ContactTypeChange.csv', low_memory=False)
    except FileNotFoundError as e:
        print(f"❌ Error: Missing a required file. {e}"); return

    print("\n🔍 Cleaning and filtering lead interaction tags...")
    df_changes['tag_type'] = df_changes['Tags'].apply(categorize_tag)
    df_property_leads = df_changes[df_changes['tag_type'] == 'property_identifier'].copy()
    df_property_leads['At'] = pd.to_datetime(df_property_leads['At'], errors='coerce')
    print(f"✅ Found {len(df_property_leads)} interactions linked to specific properties.")

    # --- Create the Property Ranking ---
    all_expired_tags = list(expired_to_active_matches.keys())
    ranking_df, work_scores_dict = create_property_ranking(df_property_leads, all_expired_tags)
    
    if ranking_df.empty: return

    # --- Prepare for Routing ---
    most_recent_date = df_property_leads['At'].max().date()
    daily_leads = df_property_leads[df_property_leads['At'].dt.date == most_recent_date]
    print(f"\n🗓️ Simulating daily run for date: {most_recent_date}")

    active_to_expired_map = defaultdict(list)
    for expired_tag, active_tags in expired_to_active_matches.items():
        for active_tag in active_tags:
            active_to_expired_map[active_tag].append(expired_tag)

    # --- Weighted Round-Robin Routing based on Rank ---
    routed_leads = []
    last_assigned_index = defaultdict(int)

    for _, lead in daily_leads.iterrows():
        active_prop_tag = lead['Tags']
        
        if active_prop_tag in active_to_expired_map:
            candidate_expired_props = active_to_expired_map[active_prop_tag]
            if not candidate_expired_props: continue

            weighted_candidate_list = []
            for expired_tag in candidate_expired_props:
                score = work_scores_dict.get(expired_tag, 1)
                weighted_candidate_list.extend([expired_tag] * score)
            
            if not weighted_candidate_list: continue

            pool_key = active_prop_tag
            current_index = last_assigned_index[pool_key]
            assigned_expired_tag = weighted_candidate_list[current_index % len(weighted_candidate_list)]
            last_assigned_index[pool_key] += 1
            
            routed_leads.append({
                'lead_contact': lead['Contact'],
                'original_active_property_tag': active_prop_tag,
                'routed_to_expired_property_tag': assigned_expired_tag,
                'timestamp': lead['At']
            })

    # --- Save and Display Results ---
    if routed_leads:
        routing_results_df = pd.DataFrame(routed_leads)
        routing_results_df.to_csv('todays_routed_leads.csv', index=False)
        print(f"\n✅ Lead routing complete. Routed {len(routed_leads)} new leads.")
        print("Results saved to 'todays_routed_leads.csv'.")
        print("\n--- Sample of Routed Leads ---")
        print(routing_results_df.head())
    else:
        print("\nℹ️ No new leads were routed today for the matched properties.")

# --- How to run this ---
# route_daily_leads_with_ranking()

In [36]:
route_daily_leads_with_ranking()

🚀 Starting daily lead routing with advanced property ranking...

🔍 Cleaning and filtering lead interaction tags...
✅ Found 74902 interactions linked to specific properties.

📈 Engineering features for property ranking...
✅ Ranking complete. Top 5 most active/high-quality expired properties:
                                             Tag  Rank_Score  Weight
522             a-304-vrajdham-1-ghatlodia-jan25    0.768037       8
848                c-203-ganesh-gold-gota-sept24    0.672587       7
86           155-aarti-apartment-ghatlodia-feb25    0.654443       7
466      a-1306-abhishek-heights-naranpura-oct24    0.624352       7
89   16-shreenath-society-part-2-ghatlodia-may24    0.621482       7

🗓️ Simulating daily run for date: 2025-07-31

✅ Lead routing complete. Routed 235 new leads.
Results saved to 'todays_routed_leads.csv'.

--- Sample of Routed Leads ---
         lead_contact                  original_active_property_tag  \
0  Abdulhannan Shaikh  c-302-signature-2-business-par

In [37]:
import json
import pandas as pd
import re
from sklearn.preprocessing import MinMaxScaler

def categorize_tag(tag):
    """Categorizes a tag string based on regex patterns."""
    if pd.isna(tag):
        return 'unknown'
    tag_str = str(tag).lower().strip()
    if re.match(r'^\d{2}-[a-z]{3}-\d{2}-(99acres|magicbricks|olx|housing)', tag_str):
        return 'date_source_combo'
    if re.match(r'^\d{2}-\d{2}-\d{4}$', tag_str):
        return 'date_only'
    if tag_str in ['sell-leads', 'cleardeals-lead', 'lead', 'recalling']:
        return 'generic_status'
    if re.search(r'\d', tag_str) and re.search(r'[a-zA-Z]', tag_str) and '-' in tag_str:
        return 'property_identifier'
    return 'other'

def analyze_zero_match_rankings(matches_path='matches.json', interactions_path='ContactTypeChange.csv'):
    """
    Analyzes the quality ranking of expired properties that had zero matches.
    """
    try:
        with open(matches_path, 'r') as f:
            matches_data = json.load(f)
        df_changes = pd.read_csv(interactions_path, low_memory=False)
        print("✅ Successfully loaded required files.")
    except FileNotFoundError as e:
        print(f"❌ Error: A required file was not found. {e}")
        return

    # --- 1. Identify the zero-match properties ---
    zero_match_tags = {tag for tag, matches in matches_data.items() if len(matches) == 0}
    all_expired_tags = list(matches_data.keys())
    
    print(f"\nFound {len(zero_match_tags)} expired properties with 0 matches to analyze.")

    # --- 2. Clean interaction data and create the full ranking ---
    df_changes['tag_type'] = df_changes['Tags'].apply(categorize_tag)
    df_property_leads = df_changes[df_changes['tag_type'] == 'property_identifier'].copy()
    df_property_leads['At'] = pd.to_datetime(df_property_leads['At'], errors='coerce')

    # --- Re-create the full ranking DataFrame ---
    grouped = df_property_leads.groupby('Tags')
    features = []
    for tag, group in grouped:
        if tag not in all_expired_tags:
            continue
        
        site_visit_done_count = (group['To Lead Type'] == 'Site visit done').sum()
        site_visit_scheduled_count = (group['To Lead Type'] == 'Site visit scheduled').sum()
        
        if site_visit_scheduled_count > 0:
            schedule_to_done_conversion = site_visit_done_count / site_visit_scheduled_count
        else:
            schedule_to_done_conversion = 0
            
        group = group.sort_values('At')
        time_diffs = group['At'].diff().dt.total_seconds() / (3600 * 24)
        avg_days_between_interactions = time_diffs.mean()
        
        features.append({
            'Tag': tag,
            'total_interactions': len(group),
            'site_visit_done_count': site_visit_done_count,
            'site_visit_scheduled_count': site_visit_scheduled_count,
            'schedule_to_done_conversion': schedule_to_done_conversion,
            'avg_days_between_interactions': avg_days_between_interactions
        })

    if not features:
        print("⚠️ No expired properties found in the interaction data to rank.")
        return
        
    ranking_df = pd.DataFrame(features).fillna(0)
    
    scaler = MinMaxScaler()
    cols_to_scale = ['total_interactions', 'site_visit_done_count', 'site_visit_scheduled_count', 'schedule_to_done_conversion']
    ranking_df[cols_to_scale] = scaler.fit_transform(ranking_df[cols_to_scale])
    
    ranking_df['velocity_score'] = 1 - scaler.fit_transform(ranking_df[['avg_days_between_interactions']])
    
    weights = {
        'site_visit_done_count': 0.40,
        'site_visit_scheduled_count': 0.25,
        'schedule_to_done_conversion': 0.15,
        'velocity_score': 0.10,
        'total_interactions': 0.10
    }
    
    ranking_df['Rank_Score'] = (
        ranking_df['site_visit_done_count'] * weights['site_visit_done_count'] +
        ranking_df['site_visit_scheduled_count'] * weights['site_visit_scheduled_count'] +
        ranking_df['schedule_to_done_conversion'] * weights['schedule_to_done_conversion'] +
        ranking_df['velocity_score'] * weights['velocity_score'] +
        ranking_df['total_interactions'] * weights['total_interactions']
    )
    
    ranking_df.sort_values(by='Rank_Score', ascending=False, inplace=True)
    ranking_df.reset_index(inplace=True)
    ranking_df.rename(columns={'index': 'Overall_Rank'}, inplace=True)
    ranking_df['Overall_Rank'] = ranking_df.index + 1


    # --- 3. Filter the ranking to show only the zero-match properties ---
    zero_match_ranking_df = ranking_df[ranking_df['Tag'].isin(zero_match_tags)]

    print("\n\n--- 📊 Ranking Analysis for Zero-Match Properties ---")
    print("This table shows the final rank and quality score for the properties that had no matches.")
    print(zero_match_ranking_df[['Tag', 'Overall_Rank', 'Rank_Score']].head(20))

    # --- 4. Analyze the distribution of their ranks ---
    print("\n--- 📈 Distribution of Ranks ---")
    rank_distribution = pd.cut(
        zero_match_ranking_df['Overall_Rank'],
        bins=[0, 100, 500, 1000, 1500, 2000],
        labels=['Top 100', 'Rank 101-500', 'Rank 501-1000', 'Rank 1001-1500', 'Rank 1501+']
    ).value_counts().sort_index()

    print("This shows where the zero-match properties fall in the overall quality ranking:")
    print(rank_distribution)

# --- How to run this in your notebook ---
# analyze_zero_match_rankings()

In [38]:
analyze_zero_match_rankings()

✅ Successfully loaded required files.

Found 73 expired properties with 0 matches to analyze.


--- 📊 Ranking Analysis for Zero-Match Properties ---
This table shows the final rank and quality score for the properties that had no matches.
                                                  Tag  Overall_Rank  \
39                       c-302-suvas-oram-odhav-aug24            40   
130                    c-103-aditya-oriana-gota-jan24           131   
171                     23-mahavir-villa-mahudi-aug24           172   
196                 69-mayur-park-society-nikol-oct24           197   
206          3-samta-society-subhashbridge-rent-oct24           207   
238              23-24-sabandh-bunglows-thaltej-jun24           239   
254                  q-504-paradise-park-vinzol-feb25           255   
336                          b-3-ratnadweepflat-vasna           337   
366                 25-sharnam-sky-vastral-rent-aug24           367   
390      j-201-parshwanath-atlantis-park-sughad-feb

In [39]:
import pandas as pd
import json
import re
from collections import defaultdict
from sklearn.preprocessing import MinMaxScaler

def categorize_tag(tag):
    """Categorizes a tag string based on regex patterns."""
    if pd.isna(tag):
        return 'unknown'
    tag_str = str(tag).lower().strip()
    if re.match(r'^\d{2}-[a-z]{3}-\d{2}-(99acres|magicbricks|olx|housing)', tag_str):
        return 'date_source_combo'
    if re.match(r'^\d{2}-\d{2}-\d{4}$', tag_str):
        return 'date_only'
    if tag_str in ['sell-leads', 'cleardeals-lead', 'lead', 'recalling']:
        return 'generic_status'
    if re.search(r'\d', tag_str) and re.search(r'[a-zA-Z]', tag_str) and '-' in tag_str:
        return 'property_identifier'
    return 'other'

def create_property_ranking(df_property_leads, all_expired_tags):
    """
    Performs feature engineering to create a sophisticated ranking score for properties.
    """
    print("\n📈 Engineering features for property ranking...")
    
    grouped = df_property_leads.groupby('Tags')
    
    features = []
    for tag, group in grouped:
        if tag not in all_expired_tags:
            continue
            
        site_visit_done_count = (group['To Lead Type'] == 'Site visit done').sum()
        site_visit_scheduled_count = (group['To Lead Type'] == 'Site visit scheduled').sum()
        
        if site_visit_scheduled_count > 0:
            schedule_to_done_conversion = site_visit_done_count / site_visit_scheduled_count
        else:
            schedule_to_done_conversion = 0
            
        group = group.sort_values('At')
        time_diffs = group['At'].diff().dt.total_seconds() / (3600 * 24)
        avg_days_between_interactions = time_diffs.mean()
        
        features.append({
            'Tag': tag,
            'total_interactions': len(group),
            'site_visit_done_count': site_visit_done_count,
            'site_visit_scheduled_count': site_visit_scheduled_count,
            'schedule_to_done_conversion': schedule_to_done_conversion,
            'avg_days_between_interactions': avg_days_between_interactions
        })

    if not features:
        print("⚠️ No expired properties found in the interaction data to rank.")
        return pd.DataFrame(), {}
        
    ranking_df = pd.DataFrame(features).fillna(0)
    
    scaler = MinMaxScaler()
    cols_to_scale = ['total_interactions', 'site_visit_done_count', 'site_visit_scheduled_count', 'schedule_to_done_conversion']
    ranking_df[cols_to_scale] = scaler.fit_transform(ranking_df[cols_to_scale])
    
    ranking_df['velocity_score'] = 1 - scaler.fit_transform(ranking_df[['avg_days_between_interactions']])
    
    weights = {
        'site_visit_done_count': 0.40,
        'site_visit_scheduled_count': 0.25,
        'schedule_to_done_conversion': 0.15,
        'velocity_score': 0.10,
        'total_interactions': 0.10
    }
    
    ranking_df['Rank_Score'] = (
        ranking_df['site_visit_done_count'] * weights['site_visit_done_count'] +
        ranking_df['site_visit_scheduled_count'] * weights['site_visit_scheduled_count'] +
        ranking_df['schedule_to_done_conversion'] * weights['schedule_to_done_conversion'] +
        ranking_df['velocity_score'] * weights['velocity_score'] +
        ranking_df['total_interactions'] * weights['total_interactions']
    )
    
    ranking_df['Weight'] = (ranking_df['Rank_Score'] * 10).astype(int) + 1
    
    ranking_df.sort_values(by='Rank_Score', ascending=False, inplace=True)
    
    print("✅ Ranking complete. Top 5 most active/high-quality expired properties:")
    print(ranking_df[['Tag', 'Rank_Score', 'Weight']].head())
    
    work_scores_dict = ranking_df.set_index('Tag')['Weight'].to_dict()
    
    return ranking_df, work_scores_dict


def route_daily_leads_with_ranking():
    """
    Ranks expired properties and routes new leads with deduplication.
    """
    print("🚀 Starting daily lead routing with advanced property ranking...")
    
    try:
        with open('matches.json', 'r') as f:
            expired_to_active_matches = json.load(f)
        df_changes = pd.read_csv('ContactTypeChange.csv', low_memory=False)
    except FileNotFoundError as e:
        print(f"❌ Error: Missing a required file. {e}"); return

    print("\n🔍 Cleaning and filtering lead interaction tags...")
    df_changes['tag_type'] = df_changes['Tags'].apply(categorize_tag)
    df_property_leads = df_changes[df_changes['tag_type'] == 'property_identifier'].copy()
    df_property_leads['At'] = pd.to_datetime(df_property_leads['At'], errors='coerce')
    print(f"✅ Found {len(df_property_leads)} interactions linked to specific properties.")

    all_expired_tags = list(expired_to_active_matches.keys())
    ranking_df, work_scores_dict = create_property_ranking(df_property_leads, all_expired_tags)
    
    if ranking_df.empty: return

    most_recent_date = df_property_leads['At'].max().date()
    daily_leads = df_property_leads[df_property_leads['At'].dt.date == most_recent_date]
    print(f"\n🗓️ Simulating daily run for date: {most_recent_date}")

    active_to_expired_map = defaultdict(list)
    for expired_tag, active_tags in expired_to_active_matches.items():
        for active_tag in active_tags:
            active_to_expired_map[active_tag].append(expired_tag)

    # --- UPDATED: Deduplication Logic ---
    routed_leads = []
    last_assigned_index = defaultdict(int)
    # This log will track assignments for this specific run to prevent duplicates
    daily_assignments = defaultdict(set) 

    for _, lead in daily_leads.iterrows():
        active_prop_tag = lead['Tags']
        lead_contact = lead['Contact']
        
        if active_prop_tag in active_to_expired_map:
            candidate_expired_props = active_to_expired_map[active_prop_tag]
            if not candidate_expired_props: continue

            weighted_candidate_list = []
            for expired_tag in candidate_expired_props:
                score = work_scores_dict.get(expired_tag, 1)
                weighted_candidate_list.extend([expired_tag] * score)
            
            if not weighted_candidate_list: continue

            pool_key = active_prop_tag
            current_index = last_assigned_index[pool_key]
            assigned_expired_tag = weighted_candidate_list[current_index % len(weighted_candidate_list)]
            last_assigned_index[pool_key] += 1
            
            # --- Check if this lead has already been sent to this expired property today ---
            if lead_contact not in daily_assignments[assigned_expired_tag]:
                routed_leads.append({
                    'lead_contact': lead_contact,
                    'original_active_property_tag': active_prop_tag,
                    'routed_to_expired_property_tag': assigned_expired_tag,
                    'timestamp': lead['At']
                })
                # --- Log the assignment to prevent future duplicates in this run ---
                daily_assignments[assigned_expired_tag].add(lead_contact)

    # --- Save and Display Results ---
    if routed_leads:
        routing_results_df = pd.DataFrame(routed_leads)
        routing_results_df.to_csv('todays_routed_leads.csv', index=False)
        print(f"\n✅ Lead routing complete. Routed {len(routed_leads)} new, unique leads.")
        print("Results saved to 'todays_routed_leads.csv'.")
        print("\n--- Sample of Routed Leads ---")
        print(routing_results_df.head())
    else:
        print("\nℹ️ No new leads were routed today for the matched properties.")

# --- How to run this ---
# route_daily_leads_with_ranking()

In [40]:
route_daily_leads_with_ranking()

🚀 Starting daily lead routing with advanced property ranking...

🔍 Cleaning and filtering lead interaction tags...
✅ Found 74902 interactions linked to specific properties.

📈 Engineering features for property ranking...
✅ Ranking complete. Top 5 most active/high-quality expired properties:
                                             Tag  Rank_Score  Weight
522             a-304-vrajdham-1-ghatlodia-jan25    0.768037       8
848                c-203-ganesh-gold-gota-sept24    0.672587       7
86           155-aarti-apartment-ghatlodia-feb25    0.654443       7
466      a-1306-abhishek-heights-naranpura-oct24    0.624352       7
89   16-shreenath-society-part-2-ghatlodia-may24    0.621482       7

🗓️ Simulating daily run for date: 2025-07-31

✅ Lead routing complete. Routed 220 new, unique leads.
Results saved to 'todays_routed_leads.csv'.

--- Sample of Routed Leads ---
         lead_contact                  original_active_property_tag  \
0  Abdulhannan Shaikh  c-302-signature-2-busi

In [41]:
import pandas as pd
import json
import re
from collections import defaultdict
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns

def categorize_tag(tag):
    """Categorizes a tag string based on regex patterns."""
    if pd.isna(tag):
        return 'unknown'
    tag_str = str(tag).lower().strip()
    if re.match(r'^\d{2}-[a-z]{3}-\d{2}-(99acres|magicbricks|olx|housing)', tag_str):
        return 'date_source_combo'
    if re.match(r'^\d{2}-\d{2}-\d{4}$', tag_str):
        return 'date_only'
    if tag_str in ['sell-leads', 'cleardeals-lead', 'lead', 'recalling']:
        return 'generic_status'
    if re.search(r'\d', tag_str) and re.search(r'[a-zA-Z]', tag_str) and '-' in tag_str:
        return 'property_identifier'
    return 'other'

def create_property_ranking(df_property_leads, all_expired_tags):
    """
    Performs feature engineering to create a sophisticated ranking score for properties.
    """
    print("\n📈 Engineering features for property ranking...")
    
    grouped = df_property_leads.groupby('Tags')
    
    features = []
    for tag, group in grouped:
        if tag not in all_expired_tags:
            continue
            
        site_visit_done_count = (group['To Lead Type'] == 'Site visit done').sum()
        site_visit_scheduled_count = (group['To Lead Type'] == 'Site visit scheduled').sum()
        
        if site_visit_scheduled_count > 0:
            schedule_to_done_conversion = site_visit_done_count / site_visit_scheduled_count
        else:
            schedule_to_done_conversion = 0
            
        group = group.sort_values('At')
        time_diffs = group['At'].diff().dt.total_seconds() / (3600 * 24)
        avg_days_between_interactions = time_diffs.mean()
        
        features.append({
            'Tag': tag,
            'total_interactions': len(group),
            'site_visit_done_count': site_visit_done_count,
            'site_visit_scheduled_count': site_visit_scheduled_count,
            'schedule_to_done_conversion': schedule_to_done_conversion,
            'avg_days_between_interactions': avg_days_between_interactions
        })

    if not features:
        print("⚠️ No expired properties found in the interaction data to rank.")
        return pd.DataFrame(), {}
        
    ranking_df = pd.DataFrame(features).fillna(0)
    
    # Store original features before scaling for the final report
    original_features_df = ranking_df.copy()

    scaler = MinMaxScaler()
    cols_to_scale = ['total_interactions', 'site_visit_done_count', 'site_visit_scheduled_count', 'schedule_to_done_conversion']
    ranking_df[cols_to_scale] = scaler.fit_transform(ranking_df[cols_to_scale])
    
    ranking_df['velocity_score'] = 1 - scaler.fit_transform(ranking_df[['avg_days_between_interactions']])
    
    weights = {
        'site_visit_done_count': 0.40,
        'site_visit_scheduled_count': 0.25,
        'schedule_to_done_conversion': 0.15,
        'velocity_score': 0.10,
        'total_interactions': 0.10
    }
    
    ranking_df['Rank_Score'] = (
        ranking_df['site_visit_done_count'] * weights['site_visit_done_count'] +
        ranking_df['site_visit_scheduled_count'] * weights['site_visit_scheduled_count'] +
        ranking_df['schedule_to_done_conversion'] * weights['schedule_to_done_conversion'] +
        ranking_df['velocity_score'] * weights['velocity_score'] +
        ranking_df['total_interactions'] * weights['total_interactions']
    )
    
    ranking_df['Weight'] = (ranking_df['Rank_Score'] * 10).astype(int) + 1
    
    # Merge back original features for the final report
    final_ranking_df = pd.merge(original_features_df, ranking_df[['Tag', 'Rank_Score', 'Weight']], on='Tag')
    final_ranking_df.sort_values(by='Rank_Score', ascending=False, inplace=True)
    
    # --- THIS IS THE FIX ---
    final_ranking_df.reset_index(drop=True, inplace=True)
    final_ranking_df['Overall_Rank'] = final_ranking_df.index + 1
    
    print("✅ Ranking complete.")
    
    work_scores_dict = final_ranking_df.set_index('Tag')['Weight'].to_dict()
    
    return final_ranking_df, work_scores_dict

    
def analyze_daily_routing(ranking_df, routed_leads_df):
    """
    Analyzes and displays the results of the lead routing simulation,
    including the overall rank of each property.
    """
    print("\n\n--- 📊 Detailed Lead Routing Analysis ---")
    
    if routed_leads_df.empty:
        print("No leads were routed, so no analysis can be performed.")
        return

    # Count how many leads were assigned to each expired property
    lead_counts = routed_leads_df['routed_to_expired_property_tag'].value_counts().reset_index()
    lead_counts.columns = ['Tag', 'Leads_Received_Today']
    
    # Merge the lead counts with the original ranking dataframe
    # The ranking_df already has the 'Overall_Rank' from the create_property_ranking function
    analysis_df = pd.merge(ranking_df, lead_counts, on='Tag', how='left').fillna(0)
    analysis_df['Leads_Received_Today'] = analysis_df['Leads_Received_Today'].astype(int)
    
    # Filter to show only properties that received leads today
    analysis_df = analysis_df[analysis_df['Leads_Received_Today'] > 0]
    analysis_df.sort_values(by='Leads_Received_Today', ascending=False, inplace=True)
    
    print("The table below shows the ranking features for each expired property and the number of leads it received in today's run.")
    
    # UPDATED: Added 'Overall_Rank' to the display columns
    display_cols = [
        'Tag', 'Overall_Rank', 'Rank_Score', 'Weight', 'Leads_Received_Today',
        'total_interactions', 'site_visit_done_count', 'site_visit_scheduled_count',
        'schedule_to_done_conversion', 'avg_days_between_interactions'
    ]
    
    print(analysis_df[display_cols].head(20))

    # Visualize the distribution
    plt.figure(figsize=(12, 8))
    top_20_analysis = analysis_df.head(20)
    sns.barplot(x='Leads_Received_Today', y='Tag', data=top_20_analysis, palette='viridis')
    plt.title('Number of Leads Routed to Expired Properties (Top 20)')
    plt.xlabel('Number of Leads Received')
    plt.ylabel('Expired Property Tag')
    plt.grid(axis='x', linestyle='--', alpha=0.7)
    plt.show()
def route_daily_leads_with_ranking():
    """
    Ranks expired properties and routes new leads with deduplication.
    """
    print("🚀 Starting daily lead routing with advanced property ranking...")
    
    try:
        with open('matches.json', 'r') as f:
            expired_to_active_matches = json.load(f)
        df_changes = pd.read_csv('ContactTypeChange.csv', low_memory=False)
    except FileNotFoundError as e:
        print(f"❌ Error: Missing a required file. {e}"); return

    print("\n🔍 Cleaning and filtering lead interaction tags...")
    df_changes['tag_type'] = df_changes['Tags'].apply(categorize_tag)
    df_property_leads = df_changes[df_changes['tag_type'] == 'property_identifier'].copy()
    df_property_leads['At'] = pd.to_datetime(df_property_leads['At'], errors='coerce')
    print(f"✅ Found {len(df_property_leads)} interactions linked to specific properties.")

    all_expired_tags = list(expired_to_active_matches.keys())
    ranking_df, work_scores_dict = create_property_ranking(df_property_leads, all_expired_tags)
    
    if ranking_df.empty: return

    most_recent_date = df_property_leads['At'].max().date()
    daily_leads = df_property_leads[df_property_leads['At'].dt.date == most_recent_date]
    print(f"\n🗓️ Simulating daily run for date: {most_recent_date}")

    active_to_expired_map = defaultdict(list)
    for expired_tag, active_tags in expired_to_active_matches.items():
        for active_tag in active_tags:
            active_to_expired_map[active_tag].append(expired_tag)

    routed_leads = []
    last_assigned_index = defaultdict(int)
    daily_assignments = defaultdict(set) 

    for _, lead in daily_leads.iterrows():
        active_prop_tag = lead['Tags']
        lead_contact = lead['Contact']
        
        if active_prop_tag in active_to_expired_map:
            candidate_expired_props = active_to_expired_map[active_prop_tag]
            if not candidate_expired_props: continue

            weighted_candidate_list = []
            for expired_tag in candidate_expired_props:
                score = work_scores_dict.get(expired_tag, 1)
                weighted_candidate_list.extend([expired_tag] * score)
            
            if not weighted_candidate_list: continue

            pool_key = active_prop_tag
            current_index = last_assigned_index[pool_key]
            assigned_expired_tag = weighted_candidate_list[current_index % len(weighted_candidate_list)]
            last_assigned_index[pool_key] += 1
            
            if lead_contact not in daily_assignments[assigned_expired_tag]:
                routed_leads.append({
                    'lead_contact': lead_contact,
                    'original_active_property_tag': active_prop_tag,
                    'routed_to_expired_property_tag': assigned_expired_tag,
                    'timestamp': lead['At']
                })
                daily_assignments[assigned_expired_tag].add(lead_contact)

    if routed_leads:
        routing_results_df = pd.DataFrame(routed_leads)
        print(f"\n✅ Lead routing complete. Routed {len(routed_leads)} new, unique leads.")
        
        # --- THIS IS THE KEY CHANGE ---
        return ranking_df, routing_results_df
    else:
        print("\nℹ️ No new leads were routed today for the matched properties.")
        return pd.DataFrame(), pd.DataFrame() # Return empty dataframes if no leads

# --- How to run this ---
# route_daily_leads_with_ranking()

In [12]:
# This will run the routing and capture the two dataframes you need
ranking_df, routing_results_df = route_daily_leads_with_ranking()

🚀 Starting daily lead routing with advanced property ranking...

🔍 Cleaning and filtering lead interaction tags...
✅ Found 74902 interactions linked to specific properties.

📈 Engineering features for property ranking...
✅ Ranking complete.

🗓️ Simulating daily run for date: 2025-07-31

✅ Lead routing complete. Routed 225 new, unique leads.


In [22]:
import pandas as pd
import json
from IPython.display import HTML

def generate_full_lead_report(ranking_df, routed_leads_df):
    """
    Generates a detailed, visually appealing HTML report for all routed leads,
    including the rank of each expired property.
    """
    print("\n\n--- 📄 Generating Full HTML Lead Report ---")
    
    if routed_leads_df.empty:
        print("No leads were routed, so no report can be generated.")
        return

    # Create a dictionary to map Tag to its Overall_Rank for easy lookup
    rank_map = ranking_df.set_index('Tag')['Overall_Rank'].to_dict()

    # Group leads by the expired property they were routed to
    grouped = routed_leads_df.groupby('routed_to_expired_property_tag')

    html_parts = []
    total_leads_processed = 0

    # Sort groups by the number of leads received
    sorted_groups = sorted(grouped, key=lambda x: len(x[1]), reverse=True)

    for expired_tag, group_df in sorted_groups:
        num_leads = len(group_df)
        total_leads_processed += num_leads
        
        # Get the rank for the current property
        rank = rank_map.get(expired_tag, 'N/A')
        
        # Create a collapsible section for each expired property, now with rank
        details_header = f"""
        <details>
            <summary>
                <strong>Rank #{rank}: {expired_tag}</strong> &mdash; Received {num_leads} New Leads
            </summary>
        """
        
        # Create a table of the leads for this property
        leads_table = group_df[['lead_contact', 'original_active_property_tag', 'timestamp']].to_html(index=False)
        
        details_footer = "</details>"
        
        html_parts.append(details_header + leads_table + details_footer)

    # --- Build the final HTML file ---
    final_html = f"""
    <html>
    <head>
        <title>Complete Daily Lead Routing Report</title>
        <style>
            body {{ font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif; margin: 40px; color: #333; }}
            h1 {{ color: #2c3e50; }}
            details {{ 
                border: 1px solid #ddd; 
                border-radius: 8px; 
                margin-bottom: 10px; 
                overflow: hidden;
            }}
            summary {{ 
                padding: 15px; 
                font-size: 1.1em;
                font-weight: bold;
                background-color: #f7f7f7; 
                cursor: pointer;
                outline: none;
            }}
            table {{ 
                width: 100%; 
                border-collapse: collapse; 
            }}
            th, td {{ 
                padding: 12px 15px; 
                border-top: 1px solid #ddd;
                text-align: left;
            }}
            th {{ 
                background-color: #f2f2f2; 
            }}
            tr:nth-child(even) {{ 
                background-color: #fafafa; 
            }}
        </style>
    </head>
    <body>
        <h1>Complete Lead Routing Report</h1>
        <p><strong>Total Unique Leads Routed Today: {total_leads_processed}</strong></p>
        <p>Click on each property to see the full list of leads it received. The properties are listed in order of most leads received.</p>
        {''.join(html_parts)}
    </body>
    </html>
    """
    
    report_path = 'full_lead_report.html'
    with open(report_path, 'w') as f:
        f.write(final_html)
        
    print(f"✅ Complete analysis report has been saved to '{report_path}'.")
    print("You can now open this file in your browser to view the interactive report.")

In [47]:
generate_full_lead_report(ranking_df, routing_results_df)



--- 📄 Generating Full HTML Lead Report ---
✅ Complete analysis report has been saved to 'full_lead_report.html'.
You can now open this file in your browser to view the interactive report.


In [50]:
import pandas as pd
import json
import re
from collections import defaultdict
from sklearn.preprocessing import MinMaxScaler
from IPython.display import display, HTML

def categorize_tag(tag):
    """Categorizes a tag string based on regex patterns."""
    if pd.isna(tag): return 'unknown'
    tag_str = str(tag).lower().strip()
    if re.match(r'^\d{2}-[a-z]{3}-\d{2}-(99acres|magicbricks|olx|housing)', tag_str): return 'date_source_combo'
    if re.match(r'^\d{2}-\d{2}-\d{4}$', tag_str): return 'date_only'
    if tag_str in ['sell-leads', 'cleardeals-lead', 'lead', 'recalling']: return 'generic_status'
    if re.search(r'\d', tag_str) and re.search(r'[a-zA-Z]', tag_str) and '-' in tag_str: return 'property_identifier'
    return 'other'

def create_property_ranking(df_property_leads, all_expired_tags):
    """
    Performs feature engineering to create a sophisticated ranking score for properties.
    """
    print("\n📈 Engineering features for property ranking...")
    grouped = df_property_leads.groupby('Tags')
    features = []
    for tag, group in grouped:
        if tag not in all_expired_tags: continue
        site_visit_done_count = (group['To Lead Type'] == 'Site visit done').sum()
        site_visit_scheduled_count = (group['To Lead Type'] == 'Site visit scheduled').sum()
        schedule_to_done_conversion = site_visit_done_count / site_visit_scheduled_count if site_visit_scheduled_count > 0 else 0
        group = group.sort_values('At')
        time_diffs = group['At'].diff().dt.total_seconds() / (3600 * 24)
        avg_days_between_interactions = time_diffs.mean()
        features.append({
            'Tag': tag, 'total_interactions': len(group), 'site_visit_done_count': site_visit_done_count,
            'site_visit_scheduled_count': site_visit_scheduled_count, 'schedule_to_done_conversion': schedule_to_done_conversion,
            'avg_days_between_interactions': avg_days_between_interactions
        })

    if not features:
        print("⚠️ No expired properties found in the interaction data to rank."); return pd.DataFrame()
        
    ranking_df = pd.DataFrame(features).fillna(0)
    original_features_df = ranking_df.copy()

    scaler = MinMaxScaler()
    cols_to_scale = ['total_interactions', 'site_visit_done_count', 'site_visit_scheduled_count', 'schedule_to_done_conversion']
    ranking_df[cols_to_scale] = scaler.fit_transform(ranking_df[cols_to_scale])
    ranking_df['velocity_score'] = 1 - scaler.fit_transform(ranking_df[['avg_days_between_interactions']])
    
    weights = {'site_visit_done_count': 0.40, 'site_visit_scheduled_count': 0.25, 'schedule_to_done_conversion': 0.15, 'velocity_score': 0.10, 'total_interactions': 0.10}
    ranking_df['Rank_Score'] = (
        ranking_df['site_visit_done_count'] * weights['site_visit_done_count'] +
        ranking_df['site_visit_scheduled_count'] * weights['site_visit_scheduled_count'] +
        ranking_df['schedule_to_done_conversion'] * weights['schedule_to_done_conversion'] +
        ranking_df['velocity_score'] * weights['velocity_score'] +
        ranking_df['total_interactions'] * weights['total_interactions']
    )
    
    final_ranking_df = pd.merge(original_features_df, ranking_df[['Tag', 'Rank_Score']], on='Tag')
    final_ranking_df.sort_values(by='Rank_Score', ascending=False, inplace=True)
    final_ranking_df.reset_index(drop=True, inplace=True)
    final_ranking_df['Overall_Rank'] = final_ranking_df.index + 1
    
    print("✅ Ranking complete.")
    return final_ranking_df

def generate_full_lead_report(ranking_df, routed_leads_df):
    """
    Generates a detailed, visually appealing HTML report for all routed leads.
    """
    print("\n\n--- 📄 Generating Full HTML Lead Report ---")
    
    if routed_leads_df.empty:
        print("No leads were routed, so no report can be generated.")
        return

    # --- 1. Calculate Summary Statistics ---
    total_assignments = len(routed_leads_df)
    lead_counts_per_property = routed_leads_df.groupby('routed_to_expired_property_tag')['lead_contact'].nunique()
    total_properties_receiving_leads = len(lead_counts_per_property)
    lead_distribution = lead_counts_per_property.value_counts().sort_index().reset_index()
    lead_distribution.columns = ['Number of Leads Received', 'Number of Properties']

    # --- 2. Prepare data for the detailed log ---
    rank_map = ranking_df.set_index('Tag')['Overall_Rank'].to_dict()
    grouped = routed_leads_df.groupby('routed_to_expired_property_tag')
    html_parts = []
    sorted_groups = sorted(grouped, key=lambda x: len(x[1].drop_duplicates(subset=['lead_contact'])), reverse=True)

    for expired_tag, group_df in sorted_groups:
        num_unique_leads = len(group_df.drop_duplicates(subset=['lead_contact']))
        rank = rank_map.get(expired_tag, 'N/A')
        details_header = f"""<details><summary><strong>Rank #{rank}: {expired_tag}</strong> &mdash; Received {num_unique_leads} Unique New Leads</summary>"""
        leads_table = group_df[['lead_contact', 'original_active_property_tag', 'timestamp']].to_html(index=False)
        details_footer = "</details>"
        html_parts.append(details_header + leads_table + details_footer)

    # --- 3. Build the final HTML file ---
    final_html = f"""
    <html><head><title>Complete Daily Lead Routing Report</title>
    <style>
        body {{ font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif; margin: 40px; color: #333; }}
        h1, h2, h3 {{ color: #2c3e50; }}
        .summary-box {{ background-color: #eaf2f8; border-left: 5px solid #3498db; padding: 15px; margin-bottom: 30px; }}
        details {{ border: 1px solid #ddd; border-radius: 8px; margin-bottom: 10px; overflow: hidden; }}
        summary {{ padding: 15px; font-size: 1.1em; font-weight: bold; background-color: #f7f7f7; cursor: pointer; outline: none; }}
        table {{ width: 100%; border-collapse: collapse; }}
        th, td {{ padding: 12px 15px; border-top: 1px solid #ddd; text-align: left; }}
        th {{ background-color: #f2f2f2; }}
        tr:nth-child(even) {{ background-color: #fafafa; }}
    </style>
    </head><body>
    <h1>Complete Lead Routing Report</h1>
    <h2>Summary Statistics</h2>
    <div class="summary-box">
        <p><strong>Total Unique Leads Routed Today:</strong> {total_assignments}</p>
        <p><strong>Total Expired Properties Receiving Leads:</strong> {total_properties_receiving_leads}</p>
    </div>
    <h3>Lead Distribution Breakdown</h3>
    {lead_distribution.to_html(index=False)}
    <h2>Detailed Lead Log</h2>
    <p>Click on each property to see the full list of leads it received.</p>
    {''.join(html_parts)}
    </body></html>
    """
    
    report_path = 'full_lead_report.html'
    with open(report_path, 'w', encoding='utf-8') as f: f.write(final_html)
    print(f"✅ Complete analysis report has been saved to '{report_path}'.")

def route_daily_leads_prioritized():
    """
    Routes new leads using a Priority Queue model to ensure the best-ranked
    properties get the first opportunity.
    """
    print("🚀 Starting daily lead routing with PRIORITY QUEUE logic...")
    
    try:
        with open('matches.json', 'r') as f: expired_to_active_matches = json.load(f)
        df_changes = pd.read_csv('ContactTypeChange.csv', low_memory=False)
    except FileNotFoundError as e:
        print(f"❌ Error: Missing a required file. {e}"); return

    print("\n🔍 Cleaning and filtering lead interaction tags...")
    df_changes['tag_type'] = df_changes['Tags'].apply(categorize_tag)
    df_property_leads = df_changes[df_changes['tag_type'] == 'property_identifier'].copy()
    df_property_leads['At'] = pd.to_datetime(df_property_leads['At'], errors='coerce')
    print(f"✅ Found {len(df_property_leads)} interactions linked to specific properties.")

    all_expired_tags = list(expired_to_active_matches.keys())
    ranking_df = create_property_ranking(df_property_leads, all_expired_tags)
    if ranking_df.empty: return
    
    most_recent_date = df_property_leads['At'].max().date()
    daily_leads = df_property_leads[df_property_leads['At'].dt.date == most_recent_date]
    print(f"\n🗓️ Simulating daily run for date: {most_recent_date}")

    active_to_expired_map = defaultdict(list)
    for expired_tag, active_tags in expired_to_active_matches.items():
        for active_tag in active_tags:
            active_to_expired_map[active_tag].append(expired_tag)

    routed_leads = []; daily_assignments = defaultdict(set); daily_lead_caps = defaultdict(int)
    MAX_LEADS_PER_DAY = 7

    for _, lead in daily_leads.iterrows():
        active_prop_tag = lead['Tags']
        lead_contact = lead['Contact']
        
        if active_prop_tag in active_to_expired_map:
            candidate_expired_props = active_to_expired_map[active_prop_tag]
            
            # Filter out properties with no rank
            ranked_candidates = [tag for tag in candidate_expired_props if tag in ranking_df['Tag'].values]
            
            # Sort candidates by their overall rank (lower is better)
            ranked_candidates.sort(key=lambda tag: ranking_df.loc[ranking_df['Tag'] == tag, 'Overall_Rank'].iloc[0])

            # Iterate through the sorted candidates and assign the lead to the first one available
            for assigned_tag in ranked_candidates:
                if lead_contact not in daily_assignments[assigned_tag] and daily_lead_caps[assigned_tag] < MAX_LEADS_PER_DAY:
                    # Assign the lead to this property
                    routed_leads.append({'lead_contact': lead_contact, 'original_active_property_tag': active_prop_tag, 'routed_to_expired_property_tag': assigned_tag, 'timestamp': lead['At']})
                    daily_assignments[assigned_tag].add(lead_contact)
                    daily_lead_caps[assigned_tag] += 1
                    # IMPORTANT: Break the loop once the lead is assigned to prevent broadcasting
                    break
                        
    if routed_leads:
        routing_results_df = pd.DataFrame(routed_leads)
        print(f"\n✅ Lead routing complete. Created {len(routing_results_df)} new lead assignments.")
        generate_full_lead_report(ranking_df, routing_results_df)
    else:
        print("\nℹ️ No new leads were routed today for the matched properties.")

# --- How to run this ---
# route_daily_leads_prioritized()


In [52]:
route_daily_leads_prioritized()

🚀 Starting daily lead routing with PRIORITY QUEUE logic...

🔍 Cleaning and filtering lead interaction tags...
✅ Found 74902 interactions linked to specific properties.

📈 Engineering features for property ranking...
✅ Ranking complete.

🗓️ Simulating daily run for date: 2025-07-31

✅ Lead routing complete. Created 240 new lead assignments.


--- 📄 Generating Full HTML Lead Report ---
✅ Complete analysis report has been saved to 'full_lead_report.html'.
