In [1]:
# Import libraries

import pandas as pd
import numpy as np
import cvxpy as cp
from sentence_transformers import SentenceTransformer, util
from scipy.optimize import linprog
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from pulp import LpProblem, LpMaximize, LpVariable, lpSum, LpInteger, LpBinary
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Data
# Load every CSV input used by the notebook. All paths are relative to the
# notebook's working directory.

# Influencer master table.
df_instagram_influencers = pd.read_csv("input/instagram_influencers_final.csv")
# Campaign briefs.
# NOTE(review): this is the only file read from outside `input/` — confirm the path is intentional.
# NOTE(review): the preview in the next cell shows an `Unnamed: 0` column, which suggests the
# CSV was written with its index; consider `index_col=0` if that column is not needed.
df_brief = pd.read_csv("dummy_brief_data.csv")
# Captions and comments that carry a label column (file names say "labeled").
df_labeled_caption = pd.read_csv("input/labeled_caption.csv")
df_labeled_comment = pd.read_csv("input/labeled_comment.csv")
# Account bios and post captions.
df_bio = pd.read_csv("input/bio.csv")
df_captions = pd.read_csv("input/captions.csv")

In [3]:
# Preview the loaded campaign briefs (the notebook renders the frame as a table).
df_brief

Unnamed: 0,brief_id,brand_name,industry,product_name,overview,usp,marketing_objective,target_goals,timing_campaign,audience_preference,influencer_persona,total_influencer,niche,location_prior,esg_allignment,budget,output,risk_tolerance
0,BRIEF_001,Avoskin,Skincare & Beauty,GlowSkin Vitamin C Serum,Premium vitamin C serum untuk mencerahkan dan ...,Formula 20% Vitamin C dengan teknologi nano-en...,"['Brand Awareness', 'Product Launch']","['Increase Brand Recognition', 'Drive Sales']",2025-02-15,"{""top_locations"": {""countries"": [""Indonesia"", ...","Beauty enthusiast, skincare expert, authentic ...",3,"['Beauty', 'Skincare']","Indonesia, Malaysia","Cruelty-free, sustainable packaging",50000000.0,"{""content_types"": [""Reels"", ""Feeds""], ""deliver...",Medium
1,BRIEF_002,Specs,Sport,TechFlow Wireless Earbuds,Premium wireless earbuds dengan active noise c...,Hi-Res audio certification dengan teknologi ad...,"['Product Launch', 'Sales Conversion']","['Generate Leads', 'Drive Sales']",2025-03-01,"{""top_locations"": {""countries"": [""Indonesia"", ...","Tech reviewer, gadget enthusiast, early adopter",5,"['Technology', 'Audio', 'Gadgets']","Indonesia, Singapore","Recyclable packaging, carbon neutral shipping",75000000.0,"{""content_types"": [""Reels"", ""Feeds"", ""Stories""...",Low
2,BRIEF_003,Kopi Jago,F&B,Kopi Nusantara Premium,Kopi premium single origin dari petani lokal I...,"Direct trade dengan petani, roasting profile u...","['Brand Awareness', 'Community Building']","['Increase Brand Recognition', 'Build Community']",2025-02-20,"{""top_locations"": {""countries"": [""Indonesia""],...","Coffee enthusiast, local culture advocate, sus...",4,"['Food & Beverage', 'Coffee', 'Local Culture']",Indonesia,"Fair trade, support local farmers, biodegradab...",35000000.0,"{""content_types"": [""Feeds"", ""Stories""], ""deliv...",Medium
3,BRIEF_004,Cottonink,Fashion,UrbanStyle Sustainable Fashion,Fashion brand berkelanjutan dengan material ec...,"100% organic cotton, ethical production, timel...","['Brand Awareness', 'Sales Conversion']","['Increase Brand Recognition', 'Drive Sales']",2025-03-10,"{""top_locations"": {""countries"": [""Indonesia"", ...","Fashion influencer, sustainability advocate, s...",3,"['Fashion', 'Sustainability', 'Lifestyle']","Indonesia, Malaysia","Sustainable materials, ethical labor, zero was...",60000000.0,"{""content_types"": [""Reels"", ""Feeds""], ""deliver...",Medium
4,BRIEF_005,Bigetron Esports,Sport,GameZone Pro Controller,Professional gaming controller dengan customiz...,"Tournament-grade precision, wireless connectiv...","['Product Launch', 'Sales Conversion']","['Generate Leads', 'Drive Sales']",2025-02-25,"{""top_locations"": {""countries"": [""Indonesia"", ...","Pro gamer, gaming content creator, esports ent...",4,"['Gaming', 'Esports', 'Technology']","Indonesia, Malaysia, Singapore","Recyclable materials, energy efficient",45000000.0,"{""content_types"": [""Reels"", ""Stories""], ""deliv...",High
5,BRIEF_006,Betadine,FMCG,WellnessPlus Multivitamin,Suplemen multivitamin premium dengan formula k...,"Natural ingredients, clinically tested, no art...","['Brand Awareness', 'Education']","['Increase Brand Recognition', 'Educate Audien...",2025-03-05,"{""top_locations"": {""countries"": [""Indonesia"", ...","Health & wellness expert, fitness enthusiast, ...",3,"['Health', 'Wellness', 'Nutrition']","Indonesia, Malaysia","Natural ingredients, sustainable sourcing",40000000.0,"{""content_types"": [""Feeds"", ""Stories""], ""deliv...",Low
6,BRIEF_007,Specs,Sport,AutoTech Smart Dashcam,Dashcam pintar dengan AI detection dan cloud s...,"AI-powered incident detection, 4K recording, a...","['Product Launch', 'Education']","['Generate Leads', 'Educate Audience']",2025-03-15,"{""top_locations"": {""countries"": [""Indonesia""],...","Automotive enthusiast, tech reviewer, safety a...",2,"['Automotive', 'Technology', 'Safety']",Indonesia,"Reduce accidents, promote safe driving",55000000.0,"{""content_types"": [""Reels"", ""Feeds""], ""deliver...",Medium
7,BRIEF_008,Pahamify,Learning,LearnHub Online Course Platform,Platform pembelajaran online dengan mentor ahl...,"Expert mentors, interactive learning, industry...","['Brand Awareness', 'Lead Generation']","['Increase Brand Recognition', 'Generate Leads']",2025-02-28,"{""top_locations"": {""countries"": [""Indonesia"", ...","Education expert, career coach, lifelong learner",4,"['Education', 'Career Development', 'Technology']","Indonesia, Malaysia","Accessible education, skill development",65000000.0,"{""content_types"": [""Feeds"", ""Stories"", ""Reels""...",Low


In [4]:
# Selects influencers that fit a campaign budget and estimates the best content mix for each
class BudgetOptimizer:
    """Budget filter plus per-influencer content-mix optimisation.

    For every influencer that can afford at least one content format within the
    brief budget, a small integer program chooses how many stories, feed posts
    and reels to buy so that estimated impact (plus a bonus for using several
    formats) is maximised.
    """

    # (format name, maximum number of posts of that format in one mix)
    _FORMAT_CAPS = (("story", 5), ("feeds", 4), ("reels", 3))

    # Bonus added to the objective for every distinct format that is used.
    _DIVERSITY_BONUS = 5

    # Tier multipliers shared by the story and feeds estimates.
    _STANDARD_TIER_MULTIPLIERS = {
        'Nano': 1.2, 'Micro': 1.1, 'Mid': 1.0, 'Macro': 0.9, 'Mega': 0.8
    }
    # Reels use their own, slightly higher, tier multipliers.
    _REELS_TIER_MULTIPLIERS = {
        'Nano': 1.3, 'Micro': 1.2, 'Mid': 1.1, 'Macro': 1.0, 'Mega': 0.9
    }

    def __init__(self):
        pass

    @staticmethod
    def _is_number(value):
        """Return True when ``value`` is a finite real number (not None/NaN/inf/text)."""
        if isinstance(value, (str, bytes)):
            return False
        try:
            number = float(value)
        except (TypeError, ValueError):
            return False
        return number == number and number not in (float('inf'), float('-inf'))

    @staticmethod
    def _solved_count(var_value):
        """Convert a solver variable value into a non-negative integer count.

        Solvers report integer variables as floats that may be off by a tiny
        epsilon (e.g. 2.9999999), so the value is rounded rather than
        truncated. ``None`` (variable never solved) and NaN both map to 0.
        """
        if var_value is None or var_value != var_value:
            return 0
        return max(int(round(var_value)), 0)

    def filter_and_optimize(self, brief_budget, influencers_df):
        """Keep influencers with at least one affordable format and attach their optimal mix.

        Args:
            brief_budget: total budget available for a single influencer's mix.
            influencers_df: DataFrame with ``rate_card_story``, ``rate_card_feeds``
                and ``rate_card_reels`` columns (plus whatever the impact
                estimators read).

        Returns:
            DataFrame with one row per affordable influencer: the original
            columns plus ``optimal_content_mix`` (dict) and ``budget_efficiency``
            (total impact divided by total cost, 0 when nothing is bought).
            The frame has no columns when no influencer qualifies.
        """
        rate_columns = ('rate_card_story', 'rate_card_feeds', 'rate_card_reels')
        filtered_influencers = []

        for _, infl in influencers_df.iterrows():
            rates = [infl[column] for column in rate_columns]
            # Missing / non-numeric rate cards can never make a format affordable.
            if not any(self._is_number(rate) and brief_budget >= rate for rate in rates):
                continue

            optimal_mix = self.optimize_content_mix(
                budget=brief_budget,
                story_rate=rates[0],
                feeds_rate=rates[1],
                reels_rate=rates[2],
                story_impact=self.estimate_story_impact(infl),
                feeds_impact=self.estimate_feeds_impact(infl),
                reels_impact=self.estimate_reels_impact(infl)
            )

            total_cost = optimal_mix['total_cost']
            infl_dict = infl.to_dict()
            infl_dict.update({
                'optimal_content_mix': optimal_mix,
                'budget_efficiency': optimal_mix['total_impact'] / total_cost if total_cost > 0 else 0
            })
            filtered_influencers.append(infl_dict)

        return pd.DataFrame(filtered_influencers)

    def optimize_content_mix(self, budget, story_rate, feeds_rate, reels_rate, story_impact, feeds_impact, reels_impact):
        """Choose post counts per format that maximise impact within ``budget``.

        A format whose rate or impact is missing/NaN is treated as not offered
        (count fixed to 0) instead of feeding NaN coefficients to the solver.

        Returns:
            dict with ``story_count``, ``feeds_count``, ``reels_count``,
            ``total_cost``, ``total_impact`` and ``remaining_budget``. All
            counts are 0 when the solver does not report an optimal solution.
        """
        rates = {"story": story_rate, "feeds": feeds_rate, "reels": reels_rate}
        impacts = {"story": story_impact, "feeds": feeds_impact, "reels": reels_impact}
        caps = dict(self._FORMAT_CAPS)

        active = [
            name for name, _ in self._FORMAT_CAPS
            if self._is_number(rates[name]) and self._is_number(impacts[name])
        ]
        solved = {name: 0 for name, _ in self._FORMAT_CAPS}

        if active:
            prob = LpProblem("OptimalContentMix", LpMaximize)
            counts = {}
            flags = {}
            for name in active:
                counts[name] = LpVariable(f"{name}_count", 0, caps[name], cat=LpInteger)
                flags[name] = LpVariable(f"use_{name}", 0, 1, LpBinary)
                # Link the "format is used" flag to its count: flag = 1 <=> count >= 1.
                prob += counts[name] >= flags[name]
                prob += counts[name] <= caps[name] * flags[name]

            prob += lpSum(rates[name] * counts[name] for name in active) <= budget

            # Objective: estimated impact plus a bonus per distinct format used.
            prob += (
                lpSum(impacts[name] * counts[name] for name in active)
                + self._DIVERSITY_BONUS * lpSum(flags[name] for name in active)
            )

            # solve() returns the status code; 1 is pulp's "Optimal".
            if prob.solve() == 1:
                for name in active:
                    solved[name] = min(self._solved_count(counts[name].varValue), caps[name])

        story_count = solved["story"]
        feeds_count = solved["feeds"]
        reels_count = solved["reels"]

        def _term(name, table):
            # Inactive formats contribute nothing (their rate/impact may be NaN).
            return solved[name] * table[name] if name in active else 0

        total_cost = _term("story", rates) + _term("feeds", rates) + _term("reels", rates)
        total_impact = _term("story", impacts) + _term("feeds", impacts) + _term("reels", impacts)

        return {
            'story_count': story_count,
            'feeds_count': feeds_count,
            'reels_count': reels_count,
            'total_cost': total_cost,
            'total_impact': total_impact,
            'remaining_budget': budget - total_cost
        }

    def _base_impact(self, influencer):
        """Engagement rate scaled by 100 (0.02 is used when the field is absent)."""
        # NOTE(review): the default of 0.02 suggests the field holds a fraction, not a percentage — confirm.
        return influencer.get('engagement_rate_pct', 0.02) * 100

    def _tier_multiplier(self, influencer, table):
        """Look up the multiplier for the influencer's tier ('Micro' when absent, 1.0 when unknown)."""
        return table.get(influencer.get('tier_followers', 'Micro'), 1.0)

    def estimate_story_impact(self, influencer):
        """Estimate impact score for one Instagram Story (base * tier multiplier * 0.7)."""
        tier_multiplier = self._tier_multiplier(influencer, self._STANDARD_TIER_MULTIPLIERS)
        return self._base_impact(influencer) * tier_multiplier * 0.7

    def estimate_feeds_impact(self, influencer):
        """Estimate impact score for one Instagram Feeds post (base * tier multiplier)."""
        tier_multiplier = self._tier_multiplier(influencer, self._STANDARD_TIER_MULTIPLIERS)
        return self._base_impact(influencer) * tier_multiplier * 1.0

    def estimate_reels_impact(self, influencer):
        """Estimate impact score for one Instagram Reel.

        Uses the reels tier table and a bonus of 1.5 when ``trending_status``
        is truthy, 1.2 otherwise.
        """
        viral_bonus = 1.5 if influencer.get('trending_status', False) else 1.2
        tier_multiplier = self._tier_multiplier(influencer, self._REELS_TIER_MULTIPLIERS)
        return self._base_impact(influencer) * tier_multiplier * viral_bonus

# Scores how well each influencer's text matches the brief's persona using sentence embeddings
class PersonaSemanticMatcher:
    """Semantic persona matcher built on a SentenceTransformer model."""

    # Column layout of the frame returned by get_scored_df (also used when it is empty).
    _SCORE_COLUMNS = ['instagram_account', 'persona_fit_score']

    def __init__(self, model_name='paraphrase-multilingual-MiniLM-L12-v2'):
        """Load the sentence-embedding model named ``model_name``."""
        self.model = SentenceTransformer(model_name)

    def prepare_influencer_texts(self, bio_df, caption_df, max_posts=5):
        """Build one text per account from its bio and up to ``max_posts`` captions.

        Args:
            bio_df: DataFrame with ``instagram_account`` and ``bio`` columns.
            caption_df: DataFrame with ``instagram_account`` and ``post_caption`` columns.
            max_posts: maximum number of non-missing captions appended per account.

        Returns:
            dict mapping account -> "bio caption1 caption2 ..." (stripped).
            Accounts are taken from ``bio_df``; a missing bio contributes "".
        """
        # Drop missing captions *before* limiting, so an account with gaps still
        # gets up to max_posts usable captions.
        captions_by_account = {
            account: [str(text) for text in group['post_caption'].dropna().head(max_posts)]
            for account, group in caption_df.groupby('instagram_account')
        }

        influencer_texts = {}
        for _, row in bio_df.iterrows():
            account = row['instagram_account']
            bio = str(row['bio']) if pd.notna(row['bio']) else ""
            captions = captions_by_account.get(account, [])
            influencer_texts[account] = (bio + ' ' + ' '.join(captions)).strip()

        return influencer_texts

    def calculate_similarity_scores(self, brief_persona_text, influencer_texts):
        """Cosine similarity between the brief persona and each influencer text.

        Returns:
            dict mapping account -> similarity score; empty when
            ``influencer_texts`` is empty (the model is not invoked then).
        """
        if not influencer_texts:
            # Encoding an empty batch and comparing against it would raise.
            return {}

        accounts = list(influencer_texts.keys())
        texts = list(influencer_texts.values())

        brief_vec = self.model.encode(brief_persona_text)
        infl_vecs = self.model.encode(texts, show_progress_bar=False)
        cosine_scores = cosine_similarity([brief_vec], infl_vecs)[0]

        return dict(zip(accounts, cosine_scores))

    def get_scored_df(self, brief_persona_text, bio_df, caption_df, max_posts=5):
        """Return a DataFrame of accounts and their persona-fit scores.

        Args:
            brief_persona_text: persona description from the brief.
            bio_df, caption_df: see ``prepare_influencer_texts``.
            max_posts: forwarded to ``prepare_influencer_texts``.

        Returns:
            DataFrame with columns ``instagram_account`` and
            ``persona_fit_score``; the columns are present even when there are
            no rows, so callers can merge on them safely.
        """
        influencer_texts = self.prepare_influencer_texts(bio_df, caption_df, max_posts=max_posts)
        scores = self.calculate_similarity_scores(brief_persona_text, influencer_texts)
        records = [
            {'instagram_account': account, 'persona_fit_score': score}
            for account, score in scores.items()
        ]
        return pd.DataFrame(records, columns=self._SCORE_COLUMNS)

# Audience matching on JSON-style dicts with a flexible weighting scheme
class AudienceMatchingGNN:
    """Weighted overlap between a brief's target audience and an influencer's audience.

    NOTE(review): despite the name, nothing in this class builds or runs a
    graph neural network — it is a weighted sum of three overlap scores.
    """

    # Score components, in the order they are combined.
    _COMPONENTS = ("location", "age", "gender")

    def __init__(self, default_weights=None):
        """Set the default weights and the table of named weight strategies.

        Args:
            default_weights: optional dict with ``location``/``age``/``gender``
                weights; 0.4/0.3/0.3 is used when omitted.
        """
        self.default_weights = default_weights or {
            "location": 0.4,
            "age": 0.3,
            "gender": 0.3
        }

        # Predefined weight strategies for different business scenarios
        self.weight_strategies = {
            "balanced": {"location": 0.33, "age": 0.33, "gender": 0.34},
            "location_focused": {"location": 0.6, "age": 0.25, "gender": 0.15},
            "demographic_focused": {"location": 0.2, "age": 0.4, "gender": 0.4},
            "age_priority": {"location": 0.2, "age": 0.6, "gender": 0.2},
            "gender_priority": {"location": 0.15, "age": 0.25, "gender": 0.6},
            "geo_only": {"location": 1.0, "age": 0.0, "gender": 0.0},
            "demo_only": {"location": 0.0, "age": 0.5, "gender": 0.5}
        }

    def get_weight_strategy(self, strategy_name):
        """Return the named strategy's weights, or the default weights if the name is unknown."""
        return self.weight_strategies.get(strategy_name, self.default_weights)

    def _resolve_weights(self, weights, strategy):
        """Pick and normalise the weights to use.

        Priority: explicit ``weights`` > named ``strategy`` > defaults.

        Returns:
            (weights_dict, label) where label is "custom", the strategy name,
            or "default" (also used when the strategy name is unknown).

        Raises:
            ValueError: when the selected weights do not sum to a positive value.
        """
        if weights is not None:
            chosen, label = weights, "custom"
        elif strategy is not None and strategy in self.weight_strategies:
            chosen, label = self.weight_strategies[strategy], strategy
        else:
            chosen, label = self.default_weights, "default"

        # Only the three known components count; missing ones weigh nothing.
        cleaned = {name: float(chosen.get(name) or 0.0) for name in self._COMPONENTS}
        total_weight = sum(cleaned.values())
        if total_weight <= 0:
            raise ValueError("Audience weights must sum to a positive value")
        # Small deviations from 1.0 (e.g. 0.33 + 0.33 + 0.34 rounding) are left untouched.
        if abs(total_weight - 1.0) > 0.01:
            cleaned = {name: value / total_weight for name, value in cleaned.items()}
        return cleaned, label

    def calculate_audience_match_score(self, brief_audience, influencer_audience, weights=None, strategy=None):
        """Compute location, age, gender and weighted total audience-match scores.

        Args:
            brief_audience: dict with optional ``top_locations``
                (``countries``/``cities`` lists), ``age_range`` and ``gender`` lists.
            influencer_audience: dict with optional ``top_locations`` (lists of
                ``{"country"|"city", "percent"}``), ``age_range_overall`` and
                ``gender_overall`` (lists of dicts with ``percent``).
            weights: optional custom weights; takes priority over ``strategy``.
            strategy: optional name of a predefined weight strategy.

        Returns:
            dict with the three component scores and ``total_score`` (rounded to
            3 decimals), ``weights_used`` and ``strategy_used``.

        Raises:
            ValueError: when the selected weights do not sum to a positive value.
        """
        used_weights, label = self._resolve_weights(weights, strategy)

        location_score = self._calculate_location_score(brief_audience, influencer_audience)
        age_score = self._calculate_age_score(brief_audience, influencer_audience)
        gender_score = self._calculate_gender_score(brief_audience, influencer_audience)

        total_score = (
            location_score * used_weights["location"] +
            age_score * used_weights["age"] +
            gender_score * used_weights["gender"]
        )

        return {
            "location_score": round(location_score, 3),
            "age_score": round(age_score, 3),
            "gender_score": round(gender_score, 3),
            "total_score": round(total_score, 3),
            "weights_used": used_weights,
            "strategy_used": label
        }

    @staticmethod
    def _matched_percent(entries, key, wanted):
        """Sum ``percent`` over the entries whose ``key`` value is in ``wanted``."""
        total = 0
        for entry in entries or []:
            if isinstance(entry, dict) and entry.get(key) in wanted:
                total += entry.get("percent", 0) or 0
        return total

    def _calculate_location_score(self, brief, influencer):
        """Share of the influencer's audience in the brief's countries and cities.

        Country and city percentages are summed and divided by 200, so a brief
        that lists no cities can reach at most 0.5.
        """
        if "top_locations" not in brief or "top_locations" not in influencer:
            return 0.0

        brief_locations = brief["top_locations"] or {}
        infl_locations = influencer["top_locations"] or {}

        country_score = self._matched_percent(
            infl_locations.get("countries", []), "country",
            set(brief_locations.get("countries", []) or [])
        )
        city_score = self._matched_percent(
            infl_locations.get("cities", []), "city",
            set(brief_locations.get("cities", []) or [])
        )

        return (country_score + city_score) / 200

    def _calculate_age_score(self, brief, influencer):
        """Share (0-1) of the influencer's audience inside the brief's age ranges."""
        if "age_range" not in brief or "age_range_overall" not in influencer:
            return 0.0

        brief_age = set(brief["age_range"] or [])
        return self._matched_percent(influencer["age_range_overall"], "range", brief_age) / 100

    def _calculate_gender_score(self, brief, influencer):
        """Share (0-1) of the influencer's audience matching the brief's genders."""
        if "gender" not in brief or "gender_overall" not in influencer:
            return 0.0

        brief_gender = set(brief["gender"] or [])
        return self._matched_percent(influencer["gender_overall"], "gender", brief_gender) / 100

    @staticmethod
    def _as_mapping(raw):
        """Return ``raw`` if it is already a dict, otherwise decode it as JSON text."""
        if isinstance(raw, dict):
            return raw
        return json.loads(raw)

    def calculate_demographic_similarity(self, brief_data, influencer_data):
        """Backward-compatible entry point that returns only the balanced total score.

        Reads ``audience_preference`` from ``brief_data`` and
        ``audience_analytics`` from ``influencer_data`` (JSON text or an
        already-decoded dict); hard-coded fallbacks are used when a key is
        absent. On any error the message is printed and 0.5 is returned.
        """
        try:
            if 'audience_preference' in brief_data:
                brief_audience = self._as_mapping(brief_data['audience_preference'])
            else:
                # Fallback for old format
                brief_audience = {
                    "top_locations": {"countries": ["Indonesia"], "cities": ["Jakarta"]},
                    "age_range": ["18-24", "25-34"],
                    "gender": ["Female", "Male"]
                }

            if 'audience_analytics' in influencer_data:
                infl_audience = self._as_mapping(influencer_data['audience_analytics'])
            else:
                # Fallback for old format
                infl_audience = {
                    "top_locations": {"countries": [{"country": "Indonesia", "percent": 80}], "cities": []},
                    "age_range_overall": [{"range": "18-24", "percent": 50}],
                    "gender_overall": [{"gender": "Female", "percent": 60}]
                }

            result = self.calculate_audience_match_score(brief_audience, infl_audience, strategy="balanced")
            return result["total_score"]

        except Exception as e:
            print(f"Error in demographic similarity: {e}")
            return 0.5  # Default score if error

# Heuristic predictor of an influencer's campaign performance
class SocialMediaPerformancePredictor:
    """Rule-based performance estimate built from engagement, authenticity, reach and brief fit."""

    def __init__(self):
        # Per-tier reference values used to normalise engagement rate and reel views.
        self.tier_benchmarks = {
            'Nano': {'engagement': 0.04, 'views': 5000},
            'Micro': {'engagement': 0.02, 'views': 25000},
            'Mid': {'engagement': 0.015, 'views': 75000},
            'Macro': {'engagement': 0.012, 'views': 200000},
            'Mega': {'engagement': 0.01, 'views': 500000}
        }

    @staticmethod
    def _number(value, default):
        """Return ``value`` as a float, or ``default`` when it is missing, NaN or non-numeric."""
        try:
            number = float(value)
        except (TypeError, ValueError):
            return default
        return default if number != number else number

    @staticmethod
    def _flag(value):
        """Interpret ``value`` as a boolean, treating None/NaN/NA as False."""
        if value is None:
            return False
        try:
            if value != value:  # NaN is the only value not equal to itself
                return False
            return bool(value)
        except (TypeError, ValueError):
            # e.g. pandas.NA, whose truth value is ambiguous
            return False

    def predict_campaign_performance(self, influencer_data, brief_data):
        """Combine the four component scores into one performance score.

        Weights: engagement 0.30, authenticity 0.25, reach 0.25, brief fit 0.20.

        Returns:
            dict with ``performance_score`` (capped at 1.0) and the four
            component scores.
        """
        engagement_score = self.calculate_engagement_score(influencer_data)
        authenticity_score = self.calculate_authenticity_score(influencer_data)
        reach_potential = self.calculate_reach_potential(influencer_data)
        brief_fit = self.calculate_brief_fit(influencer_data, brief_data)

        performance_score = (
            engagement_score * 0.3 +
            authenticity_score * 0.25 +
            reach_potential * 0.25 +
            brief_fit * 0.2
        )

        return {
            'performance_score': min(performance_score, 1.0),
            'engagement_score': engagement_score,
            'authenticity_score': authenticity_score,
            'reach_potential': reach_potential,
            'brief_fit': brief_fit
        }

    def calculate_engagement_score(self, influencer_data):
        """Engagement rate relative to the tier benchmark, scaled to 0-1 (1.0 at 2x benchmark)."""
        # Missing / NaN rates count as 0 so they cannot poison the final score.
        er = max(self._number(influencer_data.get('engagement_rate_pct', 0), 0.0), 0.0)
        tier = influencer_data.get('tier_followers', 'Micro')
        benchmark = self.tier_benchmarks.get(tier, {'engagement': 0.02})['engagement']

        normalized_er = min(er / benchmark, 2.0) if benchmark > 0 else 0
        return normalized_er / 2.0

    def calculate_authenticity_score(self, influencer_data):
        """Authenticity from endorse rate (70%) and behaviour consistency (30%), clamped to 0-1."""
        endorse_rate = self._number(influencer_data.get('random_endorse_rate', 0.5), 0.5)
        consistency = self._flag(influencer_data.get('behavior_consistency', False))

        authenticity = (1 - endorse_rate) * 0.7
        if consistency:
            authenticity += 0.3

        # An endorse rate above 1 would otherwise produce a negative score.
        return max(0.0, min(authenticity, 1.0))

    def calculate_reach_potential(self, influencer_data):
        """Average reel views relative to the tier benchmark, boosted 1.2x when trending, capped at 1."""
        avg_views = max(self._number(influencer_data.get('avg_reels_views', 0), 0.0), 0.0)
        trending = self._flag(influencer_data.get('trending_status', False))
        tier = influencer_data.get('tier_followers', 'Micro')

        benchmark = self.tier_benchmarks.get(tier, {'views': 25000})['views']
        view_score = min(avg_views / benchmark, 2.0) / 2.0 if benchmark > 0 else 0

        if trending:
            view_score *= 1.2

        return min(view_score, 1.0)

    def calculate_brief_fit(self, influencer_data, brief_data):
        """Average of an industry-match score and a tier-vs-budget score.

        Industry match is 0.8 when the influencer's (non-empty) expertise field
        is contained in the brief's industry, case-insensitively; 0.5 otherwise.
        """
        expertise = influencer_data.get('expertise_field', '')
        industry = brief_data.get('industry', '')
        # Non-string values (e.g. NaN from a CSV) are treated as "unknown".
        expertise = expertise.strip().lower() if isinstance(expertise, str) else ''
        industry = industry.lower() if isinstance(industry, str) else ''

        # An empty string is a substring of everything, so it must not count as a match.
        industry_match = 0.8 if expertise and expertise in industry else 0.5

        tier = influencer_data.get('tier_followers', 'Micro')
        budget = self._number(brief_data.get('budget', 0), 0.0)

        # Budget appropriateness
        if tier == 'Nano' and budget < 50000000:
            budget_fit = 0.9
        elif tier == 'Micro' and 20000000 <= budget <= 200000000:
            budget_fit = 0.8
        elif tier == 'Mid' and 100000000 <= budget <= 500000000:
            budget_fit = 0.8
        else:
            budget_fit = 0.6

        return (industry_match + budget_fit) / 2

# Final ranking of influencers across several weighted objectives
class MultiObjectiveRanker:
    """Ranks scored influencers by a weighted sum of their criterion scores."""

    def __init__(self):
        pass

    def rank_influencers(self, scored_influencers, priorities):
        """Return copies of the influencers sorted by weighted score, best first.

        Args:
            scored_influencers: iterable of dict-like records holding one score
                per criterion; a missing criterion counts as 0.
            priorities: mapping of criterion name -> weight.

        Returns:
            list of copies of the input records, each with an added
            ``final_score`` key, ordered by ``final_score`` descending. Records
            with equal scores keep their input order.
        """
        def weighted_total(record):
            # Same accumulation order as iterating over priorities one by one.
            return sum(record.get(name, 0) * weight for name, weight in priorities.items())

        ranked_results = []
        for record in scored_influencers:
            entry = record.copy()
            entry['final_score'] = weighted_total(record)
            ranked_results.append(entry)

        # sorted() is stable, so ties preserve the original relative order.
        return sorted(ranked_results, key=lambda entry: entry['final_score'], reverse=True)

# Main SOTAInfluencerMatcher Class
class SOTAInfluencerMatcher:
    """Bundles the budget, persona, audience, performance and ranking components."""

    def __init__(self):
        """Create one instance of every pipeline component."""
        self.budget_optimizer = BudgetOptimizer()
        self.persona_matcher = PersonaSemanticMatcher()
        self.audience_matcher = AudienceMatchingGNN()
        self.performance_predictor = SocialMediaPerformancePredictor()
        self.final_ranker = MultiObjectiveRanker()

    @staticmethod
    def _read_json_object(row, key):
        """Return ``row[key]`` as a dict, or None when it is absent or not a JSON object.

        Accepts either JSON text or an already-decoded dict, so data that was
        parsed upstream is not discarded.
        """
        try:
            raw = row[key]
        except (KeyError, IndexError, TypeError):
            return None
        if isinstance(raw, dict):
            return raw
        try:
            parsed = json.loads(raw)
        except (TypeError, ValueError):
            # TypeError: NaN/None cell; ValueError: malformed JSON text.
            return None
        # Valid JSON that is not an object (e.g. "null", a list) is unusable downstream.
        return parsed if isinstance(parsed, dict) else None

    def extract_brief_audience_preference(self, brief_row):
        """Return the brief's ``audience_preference`` as a dict.

        Falls back to a generic Indonesia/Jakarta, 18-34, all-gender target
        when the field is missing or cannot be decoded.
        """
        audience_pref = self._read_json_object(brief_row, 'audience_preference')
        if audience_pref is not None:
            return audience_pref
        return {
            "top_locations": {"countries": ["Indonesia"], "cities": ["Jakarta"]},
            "age_range": ["18-24", "25-34"],
            "gender": ["Female", "Male"]
        }

    def extract_influencer_audience_analytics(self, influencer_row):
        """Return the influencer's ``audience_analytics`` as a dict.

        Falls back to an empty audience breakdown when the field is missing or
        cannot be decoded.
        """
        audience_analytics = self._read_json_object(influencer_row, 'audience_analytics')
        if audience_analytics is not None:
            return audience_analytics
        return {
            "top_locations": {"countries": [], "cities": []},
            "age_range_overall": [],
            "gender_overall": []
        }

import json

In [12]:
# MAIN FUNCTIONS

def get_top_influencers_for_brief(brief_id, briefs_df, influencers_df, bio_df, caption_df, top_n=5, brief_priorities=None):
    """Return the ``top_n`` best-matching influencers for one brief.

    Pipeline: budget filter -> persona similarity -> audience match and
    performance prediction -> weighted ranking.

    Args:
        brief_id: value looked up in ``briefs_df['brief_id']``.
        briefs_df: briefs table (needs ``brief_id``, ``budget``,
            ``influencer_persona`` and the fields read by the components).
        influencers_df: influencer table (needs ``username_instagram``,
            ``tier_followers``, ``expertise_field`` and the rate-card columns).
        bio_df: table with ``instagram_account`` and ``bio``.
        caption_df: table with ``instagram_account`` and ``post_caption``.
        top_n: number of results to return.
        brief_priorities: mapping of criterion -> weight; equal weights of 0.25
            for ``persona_fit``, ``audience_fit``, ``performance_pred`` and
            ``budget_efficiency`` are used when omitted.

    Returns:
        list of dicts, best first. ``budget_efficiency`` is expressed relative
        to the most efficient candidate (0-1) so that it is comparable with
        the other criteria; the unscaled value is kept under
        ``budget_efficiency_raw``. An empty list is returned when the brief is
        unknown, nobody is affordable, or an error occurs (the traceback is
        printed).
    """
    if brief_priorities is None:
        brief_priorities = {
            'persona_fit': 0.25,
            'audience_fit': 0.25,
            'performance_pred': 0.25,
            'budget_efficiency': 0.25
        }

    # Look the brief up first so an unknown id does not pay for loading the models.
    brief_data = briefs_df[briefs_df['brief_id'] == brief_id]
    if brief_data.empty:
        print(f"Brief '{brief_id}' tidak ditemukan")
        return []

    brief_row = brief_data.iloc[0]

    matcher = SOTAInfluencerMatcher()

    try:
        # Step 1: Budget filtering
        affordable_influencers = matcher.budget_optimizer.filter_and_optimize(
            brief_row['budget'], influencers_df
        )

        if affordable_influencers.empty:
            print(f"Tidak ada influencer yang affordable untuk budget Rp {brief_row['budget']:,}")
            return []

        print(f"✅ Budget filtering: {len(affordable_influencers)} influencers affordable")

        # Step 2: Persona matching — only embed the bios of affordable candidates.
        candidate_accounts = set(affordable_influencers['username_instagram'])
        candidate_bios = bio_df[bio_df['instagram_account'].isin(candidate_accounts)]

        if candidate_bios.empty:
            # No text for any candidate: everyone gets the neutral score.
            persona_fit = pd.Series(0.5, index=affordable_influencers.index)
        else:
            persona_scores_df = matcher.persona_matcher.get_scored_df(
                brief_persona_text=brief_row['influencer_persona'],
                bio_df=candidate_bios,
                caption_df=caption_df
            )
            # A lookup (instead of a merge) cannot duplicate influencer rows.
            persona_lookup = (
                persona_scores_df
                .drop_duplicates(subset='instagram_account')
                .set_index('instagram_account')['persona_fit_score']
            )
            persona_fit = (
                affordable_influencers['username_instagram']
                .map(persona_lookup)
                .fillna(0.5)
            )

        affordable_influencers = affordable_influencers.assign(persona_fit=persona_fit)

        print(f"✅ Persona matching: scores calculated")

        # Step 3: Audience matching and performance prediction
        brief_audience = matcher.extract_brief_audience_preference(brief_row)
        brief_dict = brief_row.to_dict()

        scored_influencers = []
        for _, infl in affordable_influencers.iterrows():
            infl_audience = matcher.extract_influencer_audience_analytics(infl)

            audience_result = matcher.audience_matcher.calculate_audience_match_score(
                brief_audience, infl_audience, strategy="balanced"
            )

            infl_dict = infl.to_dict()
            performance_result = matcher.performance_predictor.predict_campaign_performance(
                infl_dict, brief_dict
            )

            scored_influencers.append({
                'influencer': infl['username_instagram'],
                'persona_fit': infl.get('persona_fit', 0.5),
                'audience_fit': audience_result['total_score'],
                'performance_pred': performance_result['performance_score'],
                'budget_efficiency': infl['budget_efficiency'],
                'tier': infl['tier_followers'],
                'expertise': infl['expertise_field'],
                'raw_influencer_data': infl_dict
            })

        print(f"✅ Audience & Performance scoring: {len(scored_influencers)} influencers scored")

        # Budget efficiency is impact per currency unit, i.e. on a completely
        # different scale from the other 0-1 criteria. Express it relative to
        # the best candidate so its priority weight actually has an effect.
        raw_efficiency = [item['budget_efficiency'] for item in scored_influencers]
        efficiency = pd.Series(raw_efficiency, dtype='float64').fillna(0.0).clip(lower=0.0)
        best_efficiency = efficiency.max() if len(efficiency) else 0.0
        for item, raw_value, value in zip(scored_influencers, raw_efficiency, efficiency):
            item['budget_efficiency_raw'] = raw_value
            item['budget_efficiency'] = float(value / best_efficiency) if best_efficiency > 0 else 0.0

        # Step 4: Final ranking
        ranked_results = matcher.final_ranker.rank_influencers(scored_influencers, brief_priorities)

        print(f"✅ Final ranking completed")
        return ranked_results[:top_n]

    except Exception as e:
        print(f"❌ Error in get_top_influencers_for_brief: {e}")
        import traceback
        traceback.print_exc()
        return []

def generate_brief_summary(briefs_df, brief_id):
    """Build a human-readable summary of one brief.

    Args:
        briefs_df: briefs table with ``brief_id``, ``product_name``,
            ``industry``, ``budget``, ``audience_preference`` (JSON text or
            dict) and ``influencer_persona``.
        brief_id: id of the brief to summarise.

    Returns:
        Multi-line summary string, or a "not found" message when the id is
        unknown. Each audience field that is missing or malformed falls back
        to its own default (Indonesia / Jakarta / 18-34 / All) without
        discarding the fields that are valid.
    """
    brief_data = briefs_df[briefs_df['brief_id'] == brief_id]
    if brief_data.empty:
        return f"Brief '{brief_id}' tidak ditemukan"

    brief = brief_data.iloc[0]

    # Format budget
    budget_str = f"Rp {brief['budget']:,}"

    # Decode the audience preference; anything that is not a JSON object is ignored.
    raw_pref = brief.get('audience_preference')
    if isinstance(raw_pref, dict):
        audience_pref = raw_pref
    else:
        try:
            parsed = json.loads(raw_pref)
        except (TypeError, ValueError):
            # TypeError: NaN/None cell; ValueError: malformed JSON text.
            parsed = None
        audience_pref = parsed if isinstance(parsed, dict) else {}

    locations = audience_pref.get('top_locations')
    if not isinstance(locations, dict):
        locations = {}

    def _joined(values, fallback, limit=None):
        # Use the fallback only when the field is absent or not a list.
        if not isinstance(values, (list, tuple)):
            return fallback
        return ", ".join(str(value) for value in values[:limit])

    target_countries = _joined(locations.get('countries'), "Indonesia", limit=3)
    target_cities = _joined(locations.get('cities'), "Jakarta", limit=3)
    target_age = _joined(audience_pref.get('age_range'), "18-34")
    target_gender = _joined(audience_pref.get('gender'), "All")

    # Show at most 150 characters of the persona; add an ellipsis only when truncated.
    persona = brief.get('influencer_persona')
    persona = persona if isinstance(persona, str) else ""
    persona_preview = persona[:150] + ("..." if len(persona) > 150 else "")

    summary = f"""
{brief['product_name']}

🏢 Industry: {brief['industry']}
💰 Budget: {budget_str}
🎯 Target Audience:
   📍 Countries: {target_countries}
   🏙️ Cities: {target_cities}
   👥 Age Range: {target_age}
   ⚧ Gender: {target_gender}
📝 Influencer Persona: {persona_preview}
"""
    return summary

def generate_influencer_insight(username, caption_df, comment_df, show_plot=False):
    """Summarise an influencer's comment quality and caption style as text.

    Args:
        username: account to look up in ``instagram_account``.
        caption_df: captions with ``instagram_account`` and ``predicted_label``.
        comment_df: comments with ``instagram_account`` and ``predicted_label``.
        show_plot: accepted for interface compatibility; this function does not
            read it.

    Returns:
        Multi-line insight string (stripped). Rows with a missing
        ``predicted_label`` count towards totals but never match a label group.
    """
    parts = []

    # Restrict both tables to this influencer
    captions = caption_df[caption_df['instagram_account'] == username]
    comments = comment_df[comment_df['instagram_account'] == username]

    def _count_matching(labels, keywords):
        # Missing labels become "" so the match result is a plain boolean Series
        # and the count is always an integer.
        normalized = labels.fillna("").astype(str).str.lower()
        return int(normalized.str.contains('|'.join(keywords)).sum())

    # --- Comment Insight ---
    if not comments.empty:
        comment_share = comments["predicted_label"].value_counts(normalize=True).mul(100).round(1)
        parts.append(f"\n💬 Comment Quality Analysis (Total: {len(comments)} comments)\n")

        # Share of comments whose label belongs to a high-value category
        high_value_labels = ["relatable engagement", "product-focused response", "social virality"]
        high_value_pct = sum(
            share for label, share in comment_share.items()
            if any(keyword in str(label).lower() for keyword in high_value_labels)
        )
        parts.append(f"🎯 High-Value Comment Rate: {high_value_pct:.1f}%\n")

        if high_value_pct > 20:
            parts.append("✅ Good audience engagement quality\n")
        else:
            parts.append("⚠️ Low engagement quality - mostly passive comments\n")

    # --- Caption Insight ---
    if not captions.empty:
        total_captions = len(captions)
        caption_labels = captions["predicted_label"]

        # Call-to-action habit
        cta_count = _count_matching(caption_labels, ["call-to-action", "engagement-inviting"])
        parts.append(f"\n📢 Content Style:\n")
        parts.append(f"   🔁 CTA Frequency: {cta_count}/{total_captions} posts have call-to-action\n")

        # Product mention
        prod_count = _count_matching(caption_labels, ["product-focused", "brand awareness"])
        if prod_count > 0:
            parts.append(f"   🛍️ Brand Mentions: {prod_count}/{total_captions} posts mention products/brands\n")

        # Tone analysis: most frequent label
        caption_counts = caption_labels.value_counts()
        if not caption_counts.empty:
            parts.append(f"   🎭 Dominant Tone: {caption_counts.idxmax()}\n")

        if cta_count > total_captions * 0.3:
            parts.append("   ✅ Strong engagement-driving content\n")

    if not captions.empty or not comments.empty:
        parts.append(f"\n📊 Content Insights Summary:")
        if not captions.empty:
            parts.append(f"\n   📝 Posted {len(captions)} analyzed content pieces")
        if not comments.empty:
            parts.append(f"\n   💬 Generated {len(comments)} audience interactions")
    else:
        parts.append("\n⚠️ Limited content data available for analysis")

    return "".join(parts).strip()

In [None]:
# 🎯 UPDATED MAIN DISPLAY FUNCTION - Using Real Data Integration

def display_brief_recommendation(brief_id, briefs_df, influencers_df, caption_df, comment_df, bio_df, top_n=3, brief_priorities=None):
    """
    Print a brief summary followed by the ranked influencer recommendations.

    Args:
        brief_id: Identifier of the brief; shown upper-cased in the summary header.
        briefs_df: Brief table, forwarded to the ranking and summary helpers.
        influencers_df: Influencer table, forwarded to the ranking helper.
        caption_df: Labeled captions, forwarded to the ranking helper and to the
            per-influencer insight helper.
        comment_df: Labeled comments, forwarded to the insight helper.
        bio_df: Influencer bios, forwarded to the ranking helper.
        top_n: Number of recommendations requested from the ranking helper.
        brief_priorities: Optional mapping of component name -> weight. It is
            forwarded to the ranking helper and echoed at the end when truthy.

    Returns:
        None. All results are written to stdout.
    """
    print(f"\n{'='*80}")
    print("🎯 UPDATED INFLUENCER RECOMMENDATION SYSTEM")
    print("Using Real Data Integration with AudienceMatchingGNN")
    print(f"{'='*80}")

    # Rank influencers for this brief (helper defined in another cell).
    recommendations = get_top_influencers_for_brief(
        brief_id, briefs_df, influencers_df, bio_df, caption_df, top_n, brief_priorities
    )

    if not recommendations:
        print("❌ No recommendations found.")
        return

    # Brief summary
    print(f"\n📊 BRIEF SUMMARY: {brief_id.upper()}")
    print(f"{'='*60}")
    print(generate_brief_summary(briefs_df, brief_id))

    # Report the number actually returned: the ranking helper may yield fewer
    # than top_n items, and the header should not overstate the list.
    print(f"\n🏆 TOP {len(recommendations)} INFLUENCER RECOMMENDATIONS:")
    print(f"{'='*60}")

    # Loop-invariant lookup tables, built once instead of per recommendation.
    tier_emojis = {
        "Nano": "🔥", "Micro": "⭐", "Mid": "🚀",
        "Macro": "💎", "Mega": "👑"
    }
    # (score key, strict lower bound, message printed when the score exceeds it)
    reason_rules = (
        ('audience_fit', 0.4, "Strong audience alignment with brief target"),
        ('persona_fit', 0.6, "Excellent persona match with brief values"),
        ('performance_pred', 0.6, "High predicted campaign performance"),
        ('budget_efficiency', 0.5, "Good budget efficiency ratio"),
    )

    for i, rec in enumerate(recommendations, 1):
        tier_emoji = tier_emojis.get(rec['tier'], "📱")  # generic icon for unknown tiers

        print(f"\n{i}. {tier_emoji} @{rec['influencer']} ({rec['tier']} Influencer)")
        print(f"   🎯 Overall Match Score: {rec['final_score']:.3f}")
        print(f"   🏭 Expertise: {rec['expertise']}")

        # Per-component score breakdown
        print("\n   📊 Component Scores:")
        print(f"      • Persona Fit:           {rec['persona_fit']:.1%}")
        print(f"      • Audience Match:        {rec['audience_fit']:.1%}")
        print(f"      • Performance Potential: {rec['performance_pred']:.1%}")
        print(f"      • Budget Efficiency:     {rec['budget_efficiency']:.4f}")

        # Content insights derived from labeled captions/comments
        insight = generate_influencer_insight(
            username=rec['influencer'],
            caption_df=caption_df,
            comment_df=comment_df,
            show_plot=False
        )

        if insight.strip():
            print("\n   📝 Content Insights:")
            for line in insight.split("\n"):
                if line.strip():
                    print(f"      {line}")

        print("\n   💡 Recommendation Reason:")
        reasons = [message for key, threshold, message in reason_rules if rec[key] > threshold]
        for message in reasons:
            print(f"      ✅ {message}")
        if not reasons:
            # Avoid an empty section when no single component clears its threshold.
            print("      ℹ️ No single component stands out; ranked by overall match score")

    # Echo the weighting strategy, if one was supplied
    if brief_priorities:
        print("\n🎛️ Ranking Strategy Used:")
        for component, weight in brief_priorities.items():
            print(f"   • {component.replace('_', ' ').title()}: {weight:.1%}")

    print("\n✅ RECOMMENDATION COMPLETE")
    print("💡 Using Real Instagram Insights for Audience Matching")
    print("🚀 Powered by AudienceMatchingGNN + Multi-Component Analysis")

# Confirmation banner: emitted once when this cell runs, listing the features
# advertised for the updated recommendation system.
print(
    "✅ UPDATED SYSTEM LOADED!",
    "🎯 New Features:",
    "   • AudienceMatchingGNN with real Instagram insights",
    "   • JSON-based audience matching",
    "   • Flexible weight strategies",
    "   • Real data integration",
    "   • Enhanced persona semantic matching",
    "   • Comprehensive performance prediction",
    "   • Multi-objective ranking system",
    "\n🚀 Ready for production use!",
    sep="\n",  # one line per entry, same text as separate print() calls
)

✅ UPDATED SYSTEM LOADED!
🎯 New Features:
   • AudienceMatchingGNN with real Instagram insights
   • JSON-based audience matching
   • Flexible weight strategies
   • Real data integration
   • Enhanced persona semantic matching
   • Comprehensive performance prediction
   • Multi-objective ranking system

🚀 Ready for production use!


In [13]:
# Adjusted component weights for this run; the four weights sum to 1.
brief_priorities = dict(
    persona_fit=0.1,
    audience_fit=0.45,  # this key was renamed from demo_fit to audience_fit
    performance_pred=0.35,
    budget_efficiency=0.1,
)

# Show the top 3 recommendations for BRIEF_002 using the weights above.
display_brief_recommendation(
    brief_id="BRIEF_002",
    top_n=3,
    brief_priorities=brief_priorities,
    briefs_df=df_brief,
    influencers_df=df_instagram_influencers,
    bio_df=df_bio,
    caption_df=df_labeled_caption,
    comment_df=df_labeled_comment,
)


🎯 UPDATED INFLUENCER RECOMMENDATION SYSTEM
Using Real Data Integration with AudienceMatchingGNN
✅ Budget filtering: 153 influencers affordable
✅ Persona matching: scores calculated
✅ Audience & Performance scoring: 153 influencers scored
✅ Final ranking completed

📊 BRIEF SUMMARY: BRIEF_002

TechFlow Wireless Earbuds

🏢 Industry: Sport
💰 Budget: Rp 75,000,000.0
🎯 Target Audience:
   📍 Countries: Indonesia, Singapore, Malaysia
   🏙️ Cities: Jakarta, Surabaya, Singapore
   👥 Age Range: 25-34, 35-44
   ⚧ Gender: Male, Female
📝 Influencer Persona: Tech reviewer, gadget enthusiast, early adopter...


🏆 TOP 3 INFLUENCER RECOMMENDATIONS:

1. ⭐ @dellaajocelyn (Micro Influencer)
   🎯 Overall Match Score: 0.665
   🏭 Expertise: beauty

   📊 Component Scores:
      • Persona Fit:           31.6%
      • Audience Match:        86.1% (Real Instagram Data)
      • Performance Potential: 70.2%
      • Budget Efficiency:     0.0000

   📝 Content Insights:
      💬 Comment Quality Analysis (Total: 15 co