# Test Notebook

In [1]:
import json
import random
import pandas as pd
import os
import numpy as np

import tensorflow as tf
import tensorflow_hub as hub

from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans


  from pkg_resources import parse_version





In [2]:
data = [json.loads(line) for line in open("data/influencer_data.jsonl", "r", encoding="utf-8")]

In [3]:
genres = ["rock", "pop", "hip hop", "jazz", "country", "electronic", "metal", "classical"]
hashtags = [
    "#fyp", "#foryou", "#viral", "#tiktokmusic", "#music", "#musiclover", "#newmusic", "#musician",
    "#song", "#lyrics", "#musica", "#musically", "#musicvideo", "#musical", "#musictok", "#songcover",
    "#remix", "#hiphop", "#rnb", "#popmusic", "#rap", "#dancechallenge", "#edm", "#musicislife",
    "#goodmusic", "#feelthemusic", "#ilovemusic", "#musiclife", "#bestmusic", "#musicismylife",
    "#acoustic", "#cover", "#indie", "#band", "#liveperformance", "#musicproduction", "#unsignedartist",
    "#songwriter", "#singer", "#beats", "#guitarcover", "#pianocover", "#drums", "#vocalist",
    "#singingchallenge", "#musicchallenge", "#musicianlife", "#newartist", "#rapchallenge",
    "#trapmusic", "#popcover", "#dancemusic", "#musictutorial", "#indieartist", "#songwriting",
    "#hiphopmusic", "#remixchallenge", "#musiccollab", "#livemusic", "#musiclove"
]


input_folder = "data"
output_folder = "data"

In [4]:
for filename in os.listdir(input_folder):
    if filename.endswith(".jsonl"):
        input_path = os.path.join(input_folder, filename)
        output_path = os.path.join(output_folder, filename)

        # Load JSONL
        with open(input_path, "r", encoding="utf-8") as f:
            data = [json.loads(line) for line in f]

        # Update each item
        for item in data:
            # Rename MBTI Personality -> genre
            if "MBTI Personality" in item:
                item["genre"] = random.choice(genres)
                del item["MBTI Personality"]

            # Replace Backstory with random follower count
            item["followers"] = random.randint(10_000, 10_000_000)

            # Add hashtags
            item["hashtags"] = random.sample(hashtags, 10)

        # Write back updated JSONL
        with open(output_path, "w", encoding="utf-8") as f:
            for item in data:
                f.write(json.dumps(item, ensure_ascii=False) + "\n")

In [5]:
for filename in os.listdir(input_folder):
    if filename.endswith(".jsonl"):
        input_path = os.path.join(input_folder, filename)
        output_path = os.path.join(output_folder, filename)

        # Load JSONL
        with open(input_path, "r", encoding="utf-8") as f:
            data = [json.loads(line) for line in f]

        # Update each item
        for item in data:
            # Rename MBTI Personality -> genre
            if "MBTI Personality" in item:
                item["genre"] = random.choice(genres)
                del item["MBTI Personality"]

            # Replace Backstory with random follower count
            item["followers"] = random.randint(10_000, 10_000_000)

            # Add hashtags
            item["hashtags"] = random.sample(hashtags, 10)

        # Write back updated JSONL
        with open(output_path, "w", encoding="utf-8") as f:
            for item in data:
                f.write(json.dumps(item, ensure_ascii=False) + "\n")

In [6]:
for filename in os.listdir(input_folder):
    if filename.endswith(".jsonl"):
        input_path = os.path.join(input_folder, filename)

        # Load just the first line to see the keys
        with open(input_path, "r", encoding="utf-8") as f:
            first_item = json.loads(f.readline())

        print(f"✅ Keys in {filename}:")
        print(list(first_item.keys()))
        print("-" * 40)

✅ Keys in influencer_data.jsonl:
['Name', 'Age', 'Sex', 'Country of Origin', 'State or Province', 'Education Level', 'Lifestyle', 'Backstory', 'genre', 'followers', 'hashtags']
----------------------------------------


In [7]:
embed_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
influencer_texts = []













In [8]:
influencer_texts = []
numeric_features = []
influencer_raw_data = []

with open(input_path, "r", encoding="utf-8") as f:
    for line in f:
        item = json.loads(line)
        influencer_raw_data.append(item)
        
        # Numeric features
        age = item.get("Age", 0)
        followers = item.get("followers", 0)
        numeric_features.append([age, followers])
        
        # Text fields to embed (can include categorical ones)
        text_fields = [
            item.get("genre", ""),
            item.get("Lifestyle", ""),
            item.get("Education Level", ""),
            item.get("Sex", ""),
            item.get("Country of Origin", ""),
            item.get("State or Province", "")
        ]
        influencer_texts.append(" ".join(text_fields))

numeric_features = np.array(numeric_features, dtype=np.float32)
# Standardize numeric features
scaler = StandardScaler()
numeric_features_scaled = scaler.fit_transform(numeric_features)

# Compute embeddings in batches
batch_size = 64
all_embeddings = []
for i in range(0, len(influencer_texts), batch_size):
    batch_texts = influencer_texts[i:i+batch_size]
    batch_embeddings = embed_model(batch_texts)
    all_embeddings.append(batch_embeddings)
text_embeddings = tf.concat(all_embeddings, axis=0)

# Combine embeddings + numeric features
numeric_features_tf = tf.constant(numeric_features_scaled, dtype=tf.float32)
combined_features = tf.concat([text_embeddings, numeric_features_tf], axis=1)
combined_features_np = combined_features.numpy()

In [9]:
n_clusters = 50  # choose based on dataset size
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
cluster_labels = kmeans.fit_predict(combined_features_np)

In [10]:
def recommend_by_criteria(input_criteria, top_k=20, text_weight=0.6, numeric_weight=0.4):
    """
    Recommends influencers based on criteria.
    Prioritizes genre/hashtags/age/location clustering.
    
    input_criteria example:
    {
        "Age": 25,
        "followers": 5_000_000,
        "genre": "Electronic",
        "Lifestyle": "Fitness",
        "Country of Origin": "USA",
        "State or Province": "CA",
        "hashtags": ["DJ", "EDM"]
    }
    """
    # 1. Filter by strong categorical matches (genre, country/state)
    filtered_indices = []
    for i, infl in enumerate(influencer_raw_data):
        genre_match = infl.get("genre","").lower() == input_criteria.get("genre","").lower()
        country_match = infl.get("Country of Origin","").lower() == input_criteria.get("Country of Origin","").lower() if "Country of Origin" in input_criteria else True
        state_match = infl.get("State or Province","").lower() == input_criteria.get("State or Province","").lower() if "State or Province" in input_criteria else True
        if genre_match and country_match and state_match:
            filtered_indices.append(i)
    
    # If no exact matches, fallback to full dataset
    if len(filtered_indices) == 0:
        filtered_indices = list(range(len(influencer_raw_data)))
    
    # 2. Build query embedding
    text_fields = [
        input_criteria.get("genre",""),
        input_criteria.get("Lifestyle",""),
        " ".join(input_criteria.get("hashtags",[]))
    ]
    query_text_emb = embed_model([" ".join(text_fields)])
    query_text_emb_norm = tf.math.l2_normalize(query_text_emb, axis=1)
    
    # 3. Compute similarity for filtered influencers
    text_embeddings_filtered = tf.gather(text_embeddings, filtered_indices)
    text_embeddings_norm = tf.math.l2_normalize(text_embeddings_filtered, axis=1)
    text_sims = cosine_similarity(query_text_emb_norm, text_embeddings_norm)[0]
    
    # 4. Numeric similarity (age + followers)
    query_numeric = np.array([[input_criteria.get("Age",0), input_criteria.get("followers",0)]])
    query_numeric_scaled = scaler.transform(query_numeric)
    numeric_filtered = tf.gather(numeric_features_tf, filtered_indices)
    query_numeric_tf = tf.constant(query_numeric_scaled, dtype=tf.float32)
    numeric_sims = cosine_similarity(query_numeric_tf, numeric_filtered)[0]
    
    # 5. Combine weighted similarities
    sims = text_weight * text_sims + numeric_weight * numeric_sims
    
    # 6. Select top_k
    top_idx_rel = sims.argsort()[-top_k:][::-1]
    top_indices = [filtered_indices[i] for i in top_idx_rel]
    
    # 7. Return influencers with similarity scores
    return [(influencer_raw_data[i], sims[j]) for j,i in enumerate(top_indices)]


In [11]:
criteria = {
    "Age": 25,
    "followers": 5_000_000,
    "genre": "Electronic",
    "Lifestyle": "Fitness",
    "Country of Origin": "USA",
    "State or Province": "CA",
    "hashtags": ["DJ","EDM"]
}

recommended = recommend_by_criteria(criteria, top_k=20)
for infl, score in recommended:
    print(infl["Name"], infl["Age"], infl["genre"], infl["followers"], f"score={score:.3f}")


Robert Evans 38 electronic 4201296 score=-0.023
Jennifer Shepard 38 electronic 4163306 score=-0.252
Cindy Lara 32 electronic 6086648 score=-0.057
Darren Ortega 32 electronic 5914975 score=0.040
Jennifer James 38 electronic 4491555 score=0.506
Martin Macias 27 electronic 5485202 score=-0.097
Rodney Miller 37 electronic 2629209 score=-0.268
Ariel Mason 30 electronic 5944005 score=0.004
Timothy Garrett 25 electronic 5004720 score=0.484
Douglas Campbell 32 electronic 7213958 score=0.459
Laurie Merritt 26 electronic 4605666 score=-0.086
Katelyn Andrews 30 electronic 4273150 score=-0.285
Gary Williams 33 electronic 7510401 score=-0.039
Connie Owens MD 28 electronic 5253544 score=-0.315
Ann Richards 30 electronic 5110218 score=0.021
Allison Leon 31 electronic 4324182 score=0.221
Carolyn Tate 34 electronic 5436151 score=-0.076
Melinda Jones 34 electronic 4712234 score=0.088
Christopher Mccormick 40 electronic 4357616 score=-0.205
Steven Bell 41 electronic 2054862 score=-0.112
