In [2]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Load the influencer data
influencers_df = pd.read_csv('influencers_data.csv')

# Display the data
print("Dataset Shape:", influencers_df.shape)
print("\nFirst 5 rows:")
print(influencers_df.head())
print("\nColumn Info:")
print(influencers_df.info())


Dataset Shape: (30, 3)

First 5 rows:
   influencer_id                                             bio  \
0              1          Fashion blogger and lifestyle creator.   
1              2       Tech reviewer specializing in AI gadgets.   
2              3         Fitness coach focused on home workouts.   
3              4  Beauty influencer reviewing skincare products.   
4              5          Food vlogger exploring global cuisine.   

                 recent_captions  
0       Trying new recipes today  
1  Testing the latest smartphone  
2      Skincare routine updated!  
3       Trying new recipes today  
4      Skincare routine updated!  

Column Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   influencer_id    30 non-null     int64 
 1   bio              30 non-null     object
 2   recent_captions  30 non-null     object
dt

In [3]:
# Combine bio and recent_captions for richer context
influencers_df['combined_text'] = influencers_df['bio'] + ' ' + influencers_df['recent_captions']

# Clean the text
influencers_df['combined_text_clean'] = (
    influencers_df['combined_text']
    .str.lower()  # Convert to lowercase
    .str.replace('[^a-zA-Z\s]', '', regex=True)  # Remove special characters, keep only letters and spaces
    .str.strip()  # Remove leading/trailing spaces
)

# Show examples
print("Original combined text:")
print(influencers_df[['influencer_id', 'combined_text']].head(3))
print("\n" + "="*80 + "\n")
print("Cleaned text:")
print(influencers_df[['influencer_id', 'combined_text_clean']].head(3))


Original combined text:
   influencer_id                                      combined_text
0              1  Fashion blogger and lifestyle creator. Trying ...
1              2  Tech reviewer specializing in AI gadgets. Test...
2              3  Fitness coach focused on home workouts. Skinca...


Cleaned text:
   influencer_id                                combined_text_clean
0              1  fashion blogger and lifestyle creator trying n...
1              2  tech reviewer specializing in ai gadgets testi...
2              3  fitness coach focused on home workouts skincar...


In [4]:
from sentence_transformers import SentenceTransformer

# Load pre-trained model
print("Loading Sentence Transformer model...")
model = SentenceTransformer('all-MiniLM-L6-v2')
print("Model loaded successfully!")

# Generate embeddings for all influencers
print("\nGenerating embeddings for 30 influencers...")
embeddings = model.encode(
    influencers_df['combined_text_clean'].tolist(), 
    show_progress_bar=True,
    convert_to_numpy=True
)

print(f"\nEmbeddings generated!")
print(f"Shape: {embeddings.shape}")
print(f"Each influencer is represented by a {embeddings.shape[1]}-dimensional vector")
print(f"\nExample - First influencer's embedding (first 10 values):")
print(embeddings[0][:10])


Loading Sentence Transformer model...
Model loaded successfully!

Generating embeddings for 30 influencers...


Batches: 100%|██████████| 1/1 [00:00<00:00,  7.57it/s]


Embeddings generated!
Shape: (30, 384)
Each influencer is represented by a 384-dimensional vector

Example - First influencer's embedding (first 10 values):
[-0.13189654 -0.02120027  0.03573604  0.1149717   0.01638605 -0.02800681
 -0.02462328 -0.06047839 -0.04990065 -0.03304819]





In [5]:
from sklearn.cluster import KMeans

# Define number of niches (clusters)
n_clusters = 8  # You can experiment with 5-10

# Apply K-Means clustering
print(f"Clustering influencers into {n_clusters} niches...")
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
influencers_df['niche_cluster'] = kmeans.fit_predict(embeddings)

print("Clustering complete!")
print("\nNiche Distribution:")
print(influencers_df['niche_cluster'].value_counts().sort_index())

# Show some examples
print("\nSample influencers from each niche:")
for cluster in range(n_clusters):
    print(f"\n--- Niche {cluster} ---")
    sample = influencers_df[influencers_df['niche_cluster'] == cluster][['influencer_id', 'bio']].head(3)
    for idx, row in sample.iterrows():
        print(f"  ID {row['influencer_id']}: {row['bio']}")


Clustering influencers into 8 niches...
Clustering complete!

Niche Distribution:
niche_cluster
0    9
1    4
2    4
3    4
4    3
5    1
6    3
7    2
Name: count, dtype: int64

Sample influencers from each niche:

--- Niche 0 ---
  ID 4: Beauty influencer reviewing skincare products.
  ID 8: Beauty influencer reviewing skincare products.
  ID 16: Beauty influencer reviewing skincare products.

--- Niche 1 ---
  ID 2: Tech reviewer specializing in AI gadgets.
  ID 15: Tech reviewer specializing in AI gadgets.
  ID 19: Tech reviewer specializing in AI gadgets.

--- Niche 2 ---
  ID 6: Motivational speaker & mental wellness advocate.
  ID 7: Motivational speaker & mental wellness advocate.
  ID 12: Motivational speaker & mental wellness advocate.

--- Niche 3 ---
  ID 1: Fashion blogger and lifestyle creator.
  ID 5: Food vlogger exploring global cuisine.
  ID 10: Fashion blogger and lifestyle creator.

--- Niche 4 ---
  ID 9: Travel photographer capturing landscapes.
  ID 13: Travel ph

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

def extract_keywords_per_cluster(df, cluster_col='niche_cluster', text_col='combined_text_clean', n_keywords=3):
    """Extract top keywords for each cluster using TF-IDF"""
    cluster_keywords = {}
    
    for cluster in sorted(df[cluster_col].unique()):
        # Get all texts from this cluster
        cluster_texts = df[df[cluster_col] == cluster][text_col].tolist()
        
        if len(cluster_texts) > 0:
            # Apply TF-IDF to find important words
            vectorizer = TfidfVectorizer(
                max_features=n_keywords, 
                stop_words='english',  # Remove common words like 'the', 'is', etc.
                ngram_range=(1, 2)  # Consider single words and 2-word phrases
            )
            tfidf_matrix = vectorizer.fit_transform(cluster_texts)
            keywords = vectorizer.get_feature_names_out()
            cluster_keywords[cluster] = ', '.join(keywords)
    
    return cluster_keywords

# Extract keywords
print("Extracting keywords for each niche...")
keywords_dict = extract_keywords_per_cluster(influencers_df, n_keywords=3)

# Map keywords back to dataframe
influencers_df['niche_keywords'] = influencers_df['niche_cluster'].map(keywords_dict)

# Display results
print("\n" + "="*80)
print("NICHE CLASSIFICATION RESULTS")
print("="*80)
for cluster in sorted(influencers_df['niche_cluster'].unique()):
    print(f"\nNiche {cluster}: {keywords_dict[cluster]}")
    members = influencers_df[influencers_df['niche_cluster'] == cluster]['influencer_id'].tolist()
    print(f"  Members: {members}")
    print(f"  Count: {len(members)}")


Extracting keywords for each niche...

NICHE CLASSIFICATION RESULTS

Niche 0: beauty, beauty influencer, influencer
  Members: [4, 8, 16, 18, 20, 21, 23, 24, 26]
  Count: 9

Niche 1: ai, ai gadgets, gadgets
  Members: [2, 15, 19, 22]
  Count: 4

Niche 2: advocate, mental wellness, motivational
  Members: [6, 7, 12, 30]
  Count: 4

Niche 3: blogger, blogger lifestyle, creator
  Members: [1, 5, 10, 25]
  Count: 4

Niche 4: capturing, capturing landscapes, landscapes
  Members: [9, 13, 27]
  Count: 3

Niche 5: ai, ai gadgets, gadgets
  Members: [28]
  Count: 1

Niche 6: coach, coach focused, fitness
  Members: [3, 14, 29]
  Count: 3

Niche 7: blogger, blogger sharing, family tips
  Members: [11, 17]
  Count: 2


In [7]:
import os

# Create outputs folder
os.makedirs('outputs', exist_ok=True)

# Prepare output data
niche_output = influencers_df[[
    'influencer_id', 
    'bio', 
    'recent_captions',
    'combined_text_clean',
    'niche_cluster', 
    'niche_keywords'
]]

# Save to CSV
niche_output.to_csv('outputs/niche_classification.csv', index=False)

print("✓ Part 1 results saved to: outputs/niche_classification.csv")
print(f"\nFinal output shape: {niche_output.shape}")
print("\nPreview:")
print(niche_output.head(10))


✓ Part 1 results saved to: outputs/niche_classification.csv

Final output shape: (30, 6)

Preview:
   influencer_id                                               bio  \
0              1            Fashion blogger and lifestyle creator.   
1              2         Tech reviewer specializing in AI gadgets.   
2              3           Fitness coach focused on home workouts.   
3              4    Beauty influencer reviewing skincare products.   
4              5            Food vlogger exploring global cuisine.   
5              6  Motivational speaker & mental wellness advocate.   
6              7  Motivational speaker & mental wellness advocate.   
7              8    Beauty influencer reviewing skincare products.   
8              9         Travel photographer capturing landscapes.   
9             10            Fashion blogger and lifestyle creator.   

                 recent_captions  \
0       Trying new recipes today   
1  Testing the latest smartphone   
2      Skincare routin

In [8]:
# Load growth data
growth_df = pd.read_csv('growth_data.csv')

print("Growth Dataset Shape:", growth_df.shape)
print("\nColumns:", growth_df.columns.tolist())
print("\nFirst 3 rows:")
display(growth_df.head(3))


Growth Dataset Shape: (30, 5)

Columns: ['influencer_id', 'followers', 'avg_likes', 'avg_comments', 'follower_growth_last_30_days']

First 3 rows:


Unnamed: 0,influencer_id,followers,avg_likes,avg_comments,follower_growth_last_30_days
0,1,88625,2664,558,"[56645, 57484, 58163, 59051, 60462, 62351, 637..."
1,2,144172,2149,98,"[112377, 113837, 114728, 116710, 118062, 11970..."
2,3,121905,5870,104,"[92528, 92638, 94179, 95074, 95796, 96483, 974..."


In [9]:
# Calculate engagement rate: (likes + comments) / followers * 100
growth_df['engagement_rate'] = (
    (growth_df['avg_likes'] + growth_df['avg_comments']) / growth_df['followers']
) * 100

# Round to 2 decimal places
growth_df['engagement_rate'] = growth_df['engagement_rate'].round(2)

print("Engagement Rate Statistics:")
print(growth_df['engagement_rate'].describe())
print("\nSample engagement rates:")
print(growth_df[['influencer_id', 'followers', 'avg_likes', 'avg_comments', 'engagement_rate']].head(10))


Engagement Rate Statistics:
count    30.000000
mean      6.361333
std       4.787053
min       0.960000
25%       3.507500
50%       5.045000
75%       8.227500
max      18.140000
Name: engagement_rate, dtype: float64

Sample engagement rates:
   influencer_id  followers  avg_likes  avg_comments  engagement_rate
0              1      88625       2664           558             3.64
1              2     144172       2149            98             1.56
2              3     121905       5870           104             4.90
3              4     150939       5623           126             3.81
4              5      67968      10373           677            16.26
5              6      72480      10267           330            14.62
6              7      76480      13680           193            18.14
7              8     172094       1472           339             1.05
8              9      91504      14558           628            16.60
9             10     105138       7294            69    

In [10]:
import ast

# The follower_growth_last_30_days column is stored as a string representation of a list
# We need to convert it to actual Python lists

print("Before parsing:")
print(type(growth_df['follower_growth_last_30_days'].iloc[0]))
print(growth_df['follower_growth_last_30_days'].iloc[0][:100], "...")

# Parse string to list
growth_df['follower_growth_last_30_days'] = growth_df['follower_growth_last_30_days'].apply(ast.literal_eval)

print("\nAfter parsing:")
print(type(growth_df['follower_growth_last_30_days'].iloc[0]))
print(f"Length of array: {len(growth_df['follower_growth_last_30_days'].iloc[0])}")
print(f"Example array (first 10 days): {growth_df['follower_growth_last_30_days'].iloc[0][:10]}")


Before parsing:
<class 'str'>
[56645, 57484, 58163, 59051, 60462, 62351, 63798, 64992, 65240, 65678, 66564, 66891, 68122, 69478, 7 ...

After parsing:
<class 'list'>
Length of array: 30
Example array (first 10 days): [56645, 57484, 58163, 59051, 60462, 62351, 63798, 64992, 65240, 65678]


In [11]:
def analyze_follower_growth(follower_history):
    """
    Analyze follower growth pattern and detect spikes
    Returns: daily_growth, avg_growth, std_growth, spike_count, total_growth_pct
    """
    # Calculate daily growth (difference between consecutive days)
    daily_growth = np.diff(follower_history)
    
    # Statistical measures
    avg_growth = np.mean(daily_growth)
    std_growth = np.std(daily_growth)
    
    # Spike detection: growth > mean + 2*std (statistical outlier)
    spike_threshold = avg_growth + 2 * std_growth
    spikes = daily_growth > spike_threshold
    spike_count = np.sum(spikes)
    
    # Total growth percentage over 30 days
    total_growth_pct = ((follower_history[-1] - follower_history[0]) / follower_history[0]) * 100
    
    return daily_growth, avg_growth, std_growth, spike_count, total_growth_pct, spike_threshold

# Test on first influencer
test_history = growth_df['follower_growth_last_30_days'].iloc[0]
daily, avg, std, spikes, total_pct, threshold = analyze_follower_growth(test_history)

print(f"Example Analysis - Influencer ID {growth_df['influencer_id'].iloc[0]}:")
print(f"  Starting followers: {test_history[0]:,}")
print(f"  Ending followers: {test_history[-1]:,}")
print(f"  Average daily growth: {avg:.0f}")
print(f"  Std deviation: {std:.0f}")
print(f"  Spike threshold: {threshold:.0f}")
print(f"  Number of spikes detected: {spikes}")
print(f"  Total 30-day growth: {total_pct:.1f}%")


Example Analysis - Influencer ID 1:
  Starting followers: 56,645
  Ending followers: 88,625
  Average daily growth: 1103
  Std deviation: 507
  Spike threshold: 2117
  Number of spikes detected: 0
  Total 30-day growth: 56.5%


In [12]:
def calculate_fake_score(row):
    """
    Calculate fake follower score (0-100) based on multiple factors
    """
    follower_history = row['follower_growth_last_30_days']
    engagement_rate = row['engagement_rate']
    
    # Analyze growth pattern
    daily_growth, avg_growth, std_growth, spike_count, total_growth_pct, _ = analyze_follower_growth(follower_history)
    
    # SCORING COMPONENTS (max 100 points)
    
    # 1. Spike Score (0-50 points): More spikes = more suspicious
    spike_score = min(spike_count * 10, 50)
    
    # 2. Low Engagement Score (0-40 points): Lower engagement = more suspicious
    # Normal engagement is 5-10%, below 2% is very suspicious
    if engagement_rate < 2:
        engagement_score = 40
    elif engagement_rate < 5:
        engagement_score = 30
    elif engagement_rate < 8:
        engagement_score = 15
    else:
        engagement_score = 0
    
    # 3. Rapid Growth Score (0-30 points): Growth > 50% in 30 days is suspicious
    if total_growth_pct > 70:
        growth_score = 30
    elif total_growth_pct > 50:
        growth_score = 20
    elif total_growth_pct > 30:
        growth_score = 10
    else:
        growth_score = 0
    
    # Total fake score (capped at 100)
    fake_score = min(spike_score + engagement_score + growth_score, 100)
    
    return pd.Series({
        'spike_count': spike_count,
        'total_growth_pct': round(total_growth_pct, 2),
        'fake_score': round(fake_score, 2),
        'spike_component': spike_score,
        'engagement_component': engagement_score,
        'growth_component': growth_score
    })

# Apply to all influencers
print("Calculating fake scores for all influencers...")
fake_scores = growth_df.apply(calculate_fake_score, axis=1)
growth_df = pd.concat([growth_df, fake_scores], axis=1)

print("\nFake Score Distribution:")
print(growth_df['fake_score'].describe())


Calculating fake scores for all influencers...

Fake Score Distribution:
count    30.000000
mean     27.833333
std      13.175430
min       0.000000
25%      16.250000
50%      30.000000
75%      37.500000
max      50.000000
Name: fake_score, dtype: float64


In [13]:
def generate_reason(row):
    """
    Generate human-readable explanation for the fake score
    """
    reasons = []
    
    # Check each red flag
    if row['spike_count'] >= 3:
        reasons.append(f"{int(row['spike_count'])} follower spikes detected")
    elif row['spike_count'] >= 1:
        reasons.append(f"{int(row['spike_count'])} follower spike detected")
    
    if row['engagement_rate'] < 2:
        reasons.append(f"Very low engagement rate ({row['engagement_rate']:.2f}%)")
    elif row['engagement_rate'] < 5:
        reasons.append(f"Low engagement rate ({row['engagement_rate']:.2f}%)")
    
    if row['total_growth_pct'] > 70:
        reasons.append(f"Extremely rapid growth ({row['total_growth_pct']:.1f}%)")
    elif row['total_growth_pct'] > 50:
        reasons.append(f"Unusually rapid growth ({row['total_growth_pct']:.1f}%)")
    
    # If no red flags, it's normal
    if len(reasons) == 0:
        return "Normal activity pattern"
    
    return "; ".join(reasons)

# Generate reasons for all influencers
growth_df['reason'] = growth_df.apply(generate_reason, axis=1)

# Display top suspicious influencers
print("="*80)
print("TOP 10 MOST SUSPICIOUS INFLUENCERS")
print("="*80)
top_suspicious = growth_df.nlargest(10, 'fake_score')[
    ['influencer_id', 'followers', 'engagement_rate', 'spike_count', 'total_growth_pct', 'fake_score', 'reason']
]
display(top_suspicious)


TOP 10 MOST SUSPICIOUS INFLUENCERS


Unnamed: 0,influencer_id,followers,engagement_rate,spike_count,total_growth_pct,fake_score,reason
0,1,88625,3.64,0.0,56.46,50.0,Low engagement rate (3.64%); Unusually rapid g...
23,24,113174,1.61,0.0,37.87,50.0,Very low engagement rate (1.61%)
26,27,85155,4.17,0.0,55.65,50.0,Low engagement rate (4.17%); Unusually rapid g...
1,2,144172,1.56,0.0,28.29,40.0,Very low engagement rate (1.56%)
2,3,121905,4.9,0.0,31.75,40.0,Low engagement rate (4.90%)
7,8,172094,1.05,0.0,24.23,40.0,Very low engagement rate (1.05%)
18,19,219009,1.21,0.0,18.82,40.0,Very low engagement rate (1.21%)
22,23,208193,0.96,0.0,17.01,40.0,Very low engagement rate (0.96%)
3,4,150939,3.81,0.0,22.9,30.0,Low engagement rate (3.81%)
4,5,67968,16.26,0.0,73.79,30.0,Extremely rapid growth (73.8%)


In [14]:
# Prepare output data
fake_follower_output = growth_df[[
    'influencer_id',
    'followers',
    'avg_likes',
    'avg_comments',
    'engagement_rate',
    'spike_count',
    'total_growth_pct',
    'fake_score',
    'reason'
]].sort_values('fake_score', ascending=False)

# Save to CSV
fake_follower_output.to_csv('outputs/fake_follower_scores.csv', index=False)

print("✓ Part 2 results saved to: outputs/fake_follower_scores.csv")
print(f"\nOutput shape: {fake_follower_output.shape}")
print("\nScore Summary:")
print(f"  High Risk (score >= 60): {len(fake_follower_output[fake_follower_output['fake_score'] >= 60])}")
print(f"  Medium Risk (40-59): {len(fake_follower_output[(fake_follower_output['fake_score'] >= 40) & (fake_follower_output['fake_score'] < 60)])}")
print(f"  Low Risk (score < 40): {len(fake_follower_output[fake_follower_output['fake_score'] < 40])}")


✓ Part 2 results saved to: outputs/fake_follower_scores.csv

Output shape: (30, 9)

Score Summary:
  High Risk (score >= 60): 0
  Medium Risk (40-59): 8
  Low Risk (score < 40): 22


In [15]:
brands_df = pd.read_csv('brands_data.csv')

print("Brands Dataset Shape:", brands_df.shape)
print("\nBrand Information:")
display(brands_df)

Brands Dataset Shape: (10, 3)

Brand Information:


Unnamed: 0,brand_id,brand_description,keywords
0,1,Luxury fashion brand for young adults.,"fashion,lifestyle"
1,2,Tech company building AI-powered devices.,"technology,gadgets"
2,3,Fitness supplement brand.,"fitness,health"
3,4,Organic food products brand.,"food,organic"
4,5,Global travel accessories company.,"travel,adventure"
5,6,Premium skincare brand.,"beauty,skincare"
6,7,Gaming peripherals manufacturer.,"gaming,esports"
7,8,Baby products and parenting essentials.,"parenting,babies"
8,9,Investment and financial tools company.,"finance,investing"
9,10,Mental health & wellness organization.,"mental health,wellness"


In [16]:
# Combine brand description and keywords for richer context
brands_df['brand_text'] = brands_df['brand_description'] + ' ' + brands_df['keywords']

# Clean the text (same preprocessing as influencers)
brands_df['brand_text_clean'] = (
    brands_df['brand_text']
    .str.lower()
    .str.replace('[^a-zA-Z\s]', '', regex=True)
    .str.strip()
)

print("Brand Text Examples:")
for idx, row in brands_df.iterrows():
    print(f"\nBrand {row['brand_id']}: {row['brand_description']}")
    print(f"  Cleaned text: {row['brand_text_clean']}")


Brand Text Examples:

Brand 1: Luxury fashion brand for young adults.
  Cleaned text: luxury fashion brand for young adults fashionlifestyle

Brand 2: Tech company building AI-powered devices.
  Cleaned text: tech company building aipowered devices technologygadgets

Brand 3: Fitness supplement brand.
  Cleaned text: fitness supplement brand fitnesshealth

Brand 4: Organic food products brand.
  Cleaned text: organic food products brand foodorganic

Brand 5: Global travel accessories company.
  Cleaned text: global travel accessories company traveladventure

Brand 6: Premium skincare brand.
  Cleaned text: premium skincare brand beautyskincare

Brand 7: Gaming peripherals manufacturer.
  Cleaned text: gaming peripherals manufacturer gamingesports

Brand 8: Baby products and parenting essentials.
  Cleaned text: baby products and parenting essentials parentingbabies

Brand 9: Investment and financial tools company.
  Cleaned text: investment and financial tools company financeinvesting


In [17]:
from sentence_transformers import SentenceTransformer

# Use the same model as Part 1
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate brand embeddings
print("Generating embeddings for brands...")
brand_embeddings = model.encode(
    brands_df['brand_text_clean'].tolist(),
    show_progress_bar=True,
    convert_to_numpy=True
)

print(f"\nBrand embeddings shape: {brand_embeddings.shape}")
print(f"Each brand is represented by a {brand_embeddings.shape[1]}-dimensional vector")


Generating embeddings for brands...


Batches: 100%|██████████| 1/1 [00:00<00:00, 11.31it/s]


Brand embeddings shape: (10, 384)
Each brand is represented by a 384-dimensional vector





In [18]:
# We need the influencer embeddings and data from Part 1
# If you haven't saved them as variables, reload them

# Load influencer data
influencers_df = pd.read_csv('outputs/niche_classification.csv')

# Regenerate influencer embeddings (or use saved ones)
print("Loading influencer embeddings...")
influencer_embeddings = model.encode(
    influencers_df['combined_text_clean'].tolist(),
    show_progress_bar=True,
    convert_to_numpy=True
)

print(f"Influencer embeddings shape: {influencer_embeddings.shape}")
print(f"Ready to match {len(brands_df)} brands with {len(influencers_df)} influencers")


Loading influencer embeddings...


Batches: 100%|██████████| 1/1 [00:00<00:00,  9.50it/s]

Influencer embeddings shape: (30, 384)
Ready to match 10 brands with 30 influencers





In [19]:
from sklearn.metrics.pairwise import cosine_similarity

def match_influencers_to_brand(brand_idx, top_n=10):
    """
    Find top N influencers for a given brand based on semantic similarity
    """
    # Get brand embedding (reshape to 2D array for cosine_similarity)
    brand_emb = brand_embeddings[brand_idx].reshape(1, -1)
    
    # Calculate cosine similarity with all influencers
    similarities = cosine_similarity(brand_emb, influencer_embeddings)[0]
    
    # Get indices of top N influencers (sorted by similarity)
    top_indices = np.argsort(similarities)[::-1][:top_n]
    
    # Build results
    results = []
    for rank, idx in enumerate(top_indices, 1):
        results.append({
            'rank': rank,
            'brand_id': brands_df.iloc[brand_idx]['brand_id'],
            'brand_name': brands_df.iloc[brand_idx]['brand_description'],
            'brand_keywords': brands_df.iloc[brand_idx]['keywords'],
            'influencer_id': influencers_df.iloc[idx]['influencer_id'],
            'influencer_bio': influencers_df.iloc[idx]['bio'],
            'influencer_niche': influencers_df.iloc[idx]['niche_keywords'],
            'similarity_score': round(similarities[idx] * 100, 2)  # Convert to percentage
        })
    
    return results

# Test on first brand
print("Testing on Brand 1 (Luxury Fashion)...")
test_matches = match_influencers_to_brand(0, top_n=10)
test_df = pd.DataFrame(test_matches)

print(f"\nTop 10 Influencers for: {brands_df.iloc[0]['brand_description']}")
display(test_df[['rank', 'influencer_id', 'influencer_bio', 'similarity_score']])


Testing on Brand 1 (Luxury Fashion)...

Top 10 Influencers for: Luxury fashion brand for young adults.


Unnamed: 0,rank,influencer_id,influencer_bio,similarity_score
0,1,10,Fashion blogger and lifestyle creator.,49.310001
1,2,18,Beauty influencer reviewing skincare products.,46.990002
2,3,1,Fashion blogger and lifestyle creator.,35.91
3,4,25,Fashion blogger and lifestyle creator.,35.91
4,5,26,Beauty influencer reviewing skincare products.,31.860001
5,6,21,Beauty influencer reviewing skincare products.,31.68
6,7,23,Beauty influencer reviewing skincare products.,31.370001
7,8,20,Beauty influencer reviewing skincare products.,31.370001
8,9,30,Motivational speaker & mental wellness advocate.,31.280001
9,10,4,Beauty influencer reviewing skincare products.,31.209999


In [20]:
# Match all brands with influencers
print("Matching all brands with influencers...")
print("="*80)

all_matches = []

for brand_idx in range(len(brands_df)):
    brand_name = brands_df.iloc[brand_idx]['brand_description']
    print(f"\nProcessing Brand {brand_idx + 1}: {brand_name}")
    
    # Get top 10 matches for this brand
    matches = match_influencers_to_brand(brand_idx, top_n=10)
    all_matches.extend(matches)
    
    # Show preview
    print(f"  Top match: Influencer {matches[0]['influencer_id']} - {matches[0]['influencer_bio'][:50]}... (Score: {matches[0]['similarity_score']})")

# Convert to DataFrame
matches_df = pd.DataFrame(all_matches)

print(f"\n✓ Matching complete!")
print(f"Total matches: {len(matches_df)}")
print(f"Brands processed: {len(brands_df)}")
print(f"Matches per brand: {len(matches_df) // len(brands_df)}")


Matching all brands with influencers...

Processing Brand 1: Luxury fashion brand for young adults.
  Top match: Influencer 10 - Fashion blogger and lifestyle creator.... (Score: 49.310001373291016)

Processing Brand 2: Tech company building AI-powered devices.
  Top match: Influencer 2 - Tech reviewer specializing in AI gadgets.... (Score: 52.52000045776367)

Processing Brand 3: Fitness supplement brand.
  Top match: Influencer 29 - Fitness coach focused on home workouts.... (Score: 44.27000045776367)

Processing Brand 4: Organic food products brand.
  Top match: Influencer 4 - Beauty influencer reviewing skincare products.... (Score: 37.09000015258789)

Processing Brand 5: Global travel accessories company.
  Top match: Influencer 13 - Travel photographer capturing landscapes.... (Score: 28.729999542236328)

Processing Brand 6: Premium skincare brand.
  Top match: Influencer 18 - Beauty influencer reviewing skincare products.... (Score: 68.87999725341797)

Processing Brand 7: Gaming 

In [21]:
# Analyze similarity score distribution
print("Match Quality Analysis:")
print("="*80)
print("\nSimilarity Score Statistics:")
print(matches_df['similarity_score'].describe())

print("\nScore Distribution:")
print(f"  Excellent Match (>70): {len(matches_df[matches_df['similarity_score'] > 70])}")
print(f"  Good Match (60-70): {len(matches_df[(matches_df['similarity_score'] >= 60) & (matches_df['similarity_score'] <= 70)])}")
print(f"  Moderate Match (50-60): {len(matches_df[(matches_df['similarity_score'] >= 50) & (matches_df['similarity_score'] < 60)])}")
print(f"  Weak Match (<50): {len(matches_df[matches_df['similarity_score'] < 50])}")

# Show best matches overall
print("\n" + "="*80)
print("TOP 10 BEST MATCHES ACROSS ALL BRANDS")
print("="*80)
top_matches = matches_df.nlargest(10, 'similarity_score')[
    ['brand_name', 'influencer_id', 'influencer_bio', 'similarity_score']
]
display(top_matches)


Match Quality Analysis:

Similarity Score Statistics:
count    100.000000
mean      32.758499
std       14.582878
min        4.550000
25%       23.994999
50%       29.195000
75%       42.799999
max       68.879997
Name: similarity_score, dtype: float64

Score Distribution:
  Excellent Match (>70): 0
  Good Match (60-70): 7
  Moderate Match (50-60): 7
  Weak Match (<50): 86

TOP 10 BEST MATCHES ACROSS ALL BRANDS


Unnamed: 0,brand_name,influencer_id,influencer_bio,similarity_score
50,Premium skincare brand.,18,Beauty influencer reviewing skincare products.,68.879997
51,Premium skincare brand.,21,Beauty influencer reviewing skincare products.,66.059998
52,Premium skincare brand.,4,Beauty influencer reviewing skincare products.,65.699997
53,Premium skincare brand.,24,Beauty influencer reviewing skincare products.,62.220001
54,Premium skincare brand.,8,Beauty influencer reviewing skincare products.,62.220001
55,Premium skincare brand.,23,Beauty influencer reviewing skincare products.,61.880001
56,Premium skincare brand.,20,Beauty influencer reviewing skincare products.,61.880001
57,Premium skincare brand.,26,Beauty influencer reviewing skincare products.,56.630001
58,Premium skincare brand.,16,Beauty influencer reviewing skincare products.,55.849998
90,Mental health & wellness organization.,30,Motivational speaker & mental wellness advocate.,53.189999


In [22]:
# Show detailed results for each brand
print("="*80)
print("DETAILED BRAND-INFLUENCER MATCHES")
print("="*80)

for brand_id in brands_df['brand_id'].unique():
    brand_matches = matches_df[matches_df['brand_id'] == brand_id]
    brand_name = brand_matches.iloc[0]['brand_name']
    
    print(f"\n{'='*80}")
    print(f"Brand {brand_id}: {brand_name}")
    print(f"Keywords: {brand_matches.iloc[0]['brand_keywords']}")
    print(f"{'='*80}")
    
    display(brand_matches[['rank', 'influencer_id', 'influencer_bio', 'influencer_niche', 'similarity_score']])


DETAILED BRAND-INFLUENCER MATCHES

Brand 1: Luxury fashion brand for young adults.
Keywords: fashion,lifestyle


Unnamed: 0,rank,influencer_id,influencer_bio,influencer_niche,similarity_score
0,1,10,Fashion blogger and lifestyle creator.,"blogger, blogger lifestyle, creator",49.310001
1,2,18,Beauty influencer reviewing skincare products.,"beauty, beauty influencer, influencer",46.990002
2,3,1,Fashion blogger and lifestyle creator.,"blogger, blogger lifestyle, creator",35.91
3,4,25,Fashion blogger and lifestyle creator.,"blogger, blogger lifestyle, creator",35.91
4,5,26,Beauty influencer reviewing skincare products.,"beauty, beauty influencer, influencer",31.860001
5,6,21,Beauty influencer reviewing skincare products.,"beauty, beauty influencer, influencer",31.68
6,7,23,Beauty influencer reviewing skincare products.,"beauty, beauty influencer, influencer",31.370001
7,8,20,Beauty influencer reviewing skincare products.,"beauty, beauty influencer, influencer",31.370001
8,9,30,Motivational speaker & mental wellness advocate.,"advocate, mental wellness, motivational",31.280001
9,10,4,Beauty influencer reviewing skincare products.,"beauty, beauty influencer, influencer",31.209999



Brand 2: Tech company building AI-powered devices.
Keywords: technology,gadgets


Unnamed: 0,rank,influencer_id,influencer_bio,influencer_niche,similarity_score
10,1,2,Tech reviewer specializing in AI gadgets.,"ai, ai gadgets, gadgets",52.52
11,2,22,Tech reviewer specializing in AI gadgets.,"ai, ai gadgets, gadgets",52.299999
12,3,15,Tech reviewer specializing in AI gadgets.,"ai, ai gadgets, gadgets",48.150002
13,4,19,Tech reviewer specializing in AI gadgets.,"ai, ai gadgets, gadgets",45.959999
14,5,28,Tech reviewer specializing in AI gadgets.,"ai, ai gadgets, gadgets",42.779999
15,6,7,Motivational speaker & mental wellness advocate.,"advocate, mental wellness, motivational",31.01
16,7,8,Beauty influencer reviewing skincare products.,"beauty, beauty influencer, influencer",24.139999
17,8,24,Beauty influencer reviewing skincare products.,"beauty, beauty influencer, influencer",24.139999
18,9,30,Motivational speaker & mental wellness advocate.,"advocate, mental wellness, motivational",17.35
19,10,6,Motivational speaker & mental wellness advocate.,"advocate, mental wellness, motivational",17.26



Brand 3: Fitness supplement brand.
Keywords: fitness,health


Unnamed: 0,rank,influencer_id,influencer_bio,influencer_niche,similarity_score
20,1,29,Fitness coach focused on home workouts.,"coach, coach focused, fitness",44.27
21,2,14,Fitness coach focused on home workouts.,"coach, coach focused, fitness",44.27
22,3,16,Beauty influencer reviewing skincare products.,"beauty, beauty influencer, influencer",41.240002
23,4,1,Fashion blogger and lifestyle creator.,"blogger, blogger lifestyle, creator",38.82
24,5,25,Fashion blogger and lifestyle creator.,"blogger, blogger lifestyle, creator",38.82
25,6,4,Beauty influencer reviewing skincare products.,"beauty, beauty influencer, influencer",37.98
26,7,18,Beauty influencer reviewing skincare products.,"beauty, beauty influencer, influencer",37.310001
27,8,30,Motivational speaker & mental wellness advocate.,"advocate, mental wellness, motivational",37.09
28,9,3,Fitness coach focused on home workouts.,"coach, coach focused, fitness",36.060001
29,10,21,Beauty influencer reviewing skincare products.,"beauty, beauty influencer, influencer",34.799999



Brand 4: Organic food products brand.
Keywords: food,organic


Unnamed: 0,rank,influencer_id,influencer_bio,influencer_niche,similarity_score
30,1,4,Beauty influencer reviewing skincare products.,"beauty, beauty influencer, influencer",37.09
31,2,25,Fashion blogger and lifestyle creator.,"blogger, blogger lifestyle, creator",35.380001
32,3,1,Fashion blogger and lifestyle creator.,"blogger, blogger lifestyle, creator",35.380001
33,4,5,Food vlogger exploring global cuisine.,"blogger, blogger lifestyle, creator",34.119999
34,5,18,Beauty influencer reviewing skincare products.,"beauty, beauty influencer, influencer",29.23
35,6,21,Beauty influencer reviewing skincare products.,"beauty, beauty influencer, influencer",26.16
36,7,23,Beauty influencer reviewing skincare products.,"beauty, beauty influencer, influencer",26.110001
37,8,20,Beauty influencer reviewing skincare products.,"beauty, beauty influencer, influencer",26.110001
38,9,26,Beauty influencer reviewing skincare products.,"beauty, beauty influencer, influencer",25.049999
39,10,10,Fashion blogger and lifestyle creator.,"blogger, blogger lifestyle, creator",24.51



Brand 5: Global travel accessories company.
Keywords: travel,adventure


Unnamed: 0,rank,influencer_id,influencer_bio,influencer_niche,similarity_score
40,1,13,Travel photographer capturing landscapes.,"capturing, capturing landscapes, landscapes",28.73
41,2,9,Travel photographer capturing landscapes.,"capturing, capturing landscapes, landscapes",28.4
42,3,26,Beauty influencer reviewing skincare products.,"beauty, beauty influencer, influencer",27.68
43,4,18,Beauty influencer reviewing skincare products.,"beauty, beauty influencer, influencer",25.719999
44,5,27,Travel photographer capturing landscapes.,"capturing, capturing landscapes, landscapes",25.58
45,6,23,Beauty influencer reviewing skincare products.,"beauty, beauty influencer, influencer",22.469999
46,7,20,Beauty influencer reviewing skincare products.,"beauty, beauty influencer, influencer",22.469999
47,8,10,Fashion blogger and lifestyle creator.,"blogger, blogger lifestyle, creator",21.65
48,9,21,Beauty influencer reviewing skincare products.,"beauty, beauty influencer, influencer",21.450001
49,10,24,Beauty influencer reviewing skincare products.,"beauty, beauty influencer, influencer",21.129999



Brand 6: Premium skincare brand.
Keywords: beauty,skincare


Unnamed: 0,rank,influencer_id,influencer_bio,influencer_niche,similarity_score
50,1,18,Beauty influencer reviewing skincare products.,"beauty, beauty influencer, influencer",68.879997
51,2,21,Beauty influencer reviewing skincare products.,"beauty, beauty influencer, influencer",66.059998
52,3,4,Beauty influencer reviewing skincare products.,"beauty, beauty influencer, influencer",65.699997
53,4,24,Beauty influencer reviewing skincare products.,"beauty, beauty influencer, influencer",62.220001
54,5,8,Beauty influencer reviewing skincare products.,"beauty, beauty influencer, influencer",62.220001
55,6,23,Beauty influencer reviewing skincare products.,"beauty, beauty influencer, influencer",61.880001
56,7,20,Beauty influencer reviewing skincare products.,"beauty, beauty influencer, influencer",61.880001
57,8,26,Beauty influencer reviewing skincare products.,"beauty, beauty influencer, influencer",56.630001
58,9,16,Beauty influencer reviewing skincare products.,"beauty, beauty influencer, influencer",55.849998
59,10,3,Fitness coach focused on home workouts.,"coach, coach focused, fitness",51.91



Brand 7: Gaming peripherals manufacturer.
Keywords: gaming,esports


Unnamed: 0,rank,influencer_id,influencer_bio,influencer_niche,similarity_score
60,1,15,Tech reviewer specializing in AI gadgets.,"ai, ai gadgets, gadgets",42.860001
61,2,22,Tech reviewer specializing in AI gadgets.,"ai, ai gadgets, gadgets",28.18
62,3,2,Tech reviewer specializing in AI gadgets.,"ai, ai gadgets, gadgets",27.4
63,4,28,Tech reviewer specializing in AI gadgets.,"ai, ai gadgets, gadgets",25.09
64,5,21,Beauty influencer reviewing skincare products.,"beauty, beauty influencer, influencer",22.610001
65,6,19,Tech reviewer specializing in AI gadgets.,"ai, ai gadgets, gadgets",19.32
66,7,5,Food vlogger exploring global cuisine.,"blogger, blogger lifestyle, creator",18.280001
67,8,17,Parenting blogger sharing family tips.,"blogger, blogger sharing, family tips",16.370001
68,9,9,Travel photographer capturing landscapes.,"capturing, capturing landscapes, landscapes",15.99
69,10,7,Motivational speaker & mental wellness advocate.,"advocate, mental wellness, motivational",14.99



Brand 8: Baby products and parenting essentials.
Keywords: parenting,babies


Unnamed: 0,rank,influencer_id,influencer_bio,influencer_niche,similarity_score
70,1,11,Parenting blogger sharing family tips.,"blogger, blogger sharing, family tips",45.240002
71,2,17,Parenting blogger sharing family tips.,"blogger, blogger sharing, family tips",44.560001
72,3,20,Beauty influencer reviewing skincare products.,"beauty, beauty influencer, influencer",35.580002
73,4,23,Beauty influencer reviewing skincare products.,"beauty, beauty influencer, influencer",35.580002
74,5,4,Beauty influencer reviewing skincare products.,"beauty, beauty influencer, influencer",29.16
75,6,21,Beauty influencer reviewing skincare products.,"beauty, beauty influencer, influencer",28.709999
76,7,22,Tech reviewer specializing in AI gadgets.,"ai, ai gadgets, gadgets",26.059999
77,8,18,Beauty influencer reviewing skincare products.,"beauty, beauty influencer, influencer",25.620001
78,9,24,Beauty influencer reviewing skincare products.,"beauty, beauty influencer, influencer",25.32
79,10,8,Beauty influencer reviewing skincare products.,"beauty, beauty influencer, influencer",25.32



Brand 9: Investment and financial tools company.
Keywords: finance,investing


Unnamed: 0,rank,influencer_id,influencer_bio,influencer_niche,similarity_score
80,1,12,Motivational speaker & mental wellness advocate.,"advocate, mental wellness, motivational",28.110001
81,2,11,Parenting blogger sharing family tips.,"blogger, blogger sharing, family tips",19.1
82,3,22,Tech reviewer specializing in AI gadgets.,"ai, ai gadgets, gadgets",15.19
83,4,15,Tech reviewer specializing in AI gadgets.,"ai, ai gadgets, gadgets",13.43
84,5,19,Tech reviewer specializing in AI gadgets.,"ai, ai gadgets, gadgets",11.6
85,6,2,Tech reviewer specializing in AI gadgets.,"ai, ai gadgets, gadgets",10.57
86,7,16,Beauty influencer reviewing skincare products.,"beauty, beauty influencer, influencer",7.58
87,8,21,Beauty influencer reviewing skincare products.,"beauty, beauty influencer, influencer",4.81
88,9,28,Tech reviewer specializing in AI gadgets.,"ai, ai gadgets, gadgets",4.78
89,10,8,Beauty influencer reviewing skincare products.,"beauty, beauty influencer, influencer",4.55



Brand 10: Mental health & wellness organization.
Keywords: mental health,wellness


Unnamed: 0,rank,influencer_id,influencer_bio,influencer_niche,similarity_score
90,1,30,Motivational speaker & mental wellness advocate.,"advocate, mental wellness, motivational",53.189999
91,2,12,Motivational speaker & mental wellness advocate.,"advocate, mental wellness, motivational",52.939999
92,3,6,Motivational speaker & mental wellness advocate.,"advocate, mental wellness, motivational",49.84
93,4,7,Motivational speaker & mental wellness advocate.,"advocate, mental wellness, motivational",43.209999
94,5,16,Beauty influencer reviewing skincare products.,"beauty, beauty influencer, influencer",27.549999
95,6,23,Beauty influencer reviewing skincare products.,"beauty, beauty influencer, influencer",25.940001
96,7,20,Beauty influencer reviewing skincare products.,"beauty, beauty influencer, influencer",25.940001
97,8,27,Travel photographer capturing landscapes.,"capturing, capturing landscapes, landscapes",23.559999
98,9,21,Beauty influencer reviewing skincare products.,"beauty, beauty influencer, influencer",20.959999
99,10,29,Fitness coach focused on home workouts.,"coach, coach focused, fitness",19.950001


In [23]:
# Save complete matching results
matches_output = matches_df[[
    'brand_id',
    'brand_name',
    'brand_keywords',
    'rank',
    'influencer_id',
    'influencer_bio',
    'influencer_niche',
    'similarity_score'
]].sort_values(['brand_id', 'rank'])

matches_output.to_csv('outputs/brand_influencer_matches.csv', index=False)

print("✓ Part 3 results saved to: outputs/brand_influencer_matches.csv")
print(f"\nOutput shape: {matches_output.shape}")
print(f"Total brands: {matches_output['brand_id'].nunique()}")
print(f"Total unique influencers matched: {matches_output['influencer_id'].nunique()}")
print(f"Average similarity score: {matches_output['similarity_score'].mean():.2f}")


✓ Part 3 results saved to: outputs/brand_influencer_matches.csv

Output shape: (100, 8)
Total brands: 10
Total unique influencers matched: 30
Average similarity score: 32.76
