In [93]:
# Import Statements

import pandas as pd
import time
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import jaccard_score
import numpy as np

In [94]:
# Import Data

influencers = pd.read_csv("Cleaned_Influencer_Tag_Dataset.csv")
resteraunts = pd.read_csv("Restaurants_with_Individual_Tags.csv")

# Dealing with Missing Values

influencers = influencers[["Influencer Instagram", "Tags", "IG Followers", "IG Engagement", "IG Exposure Score", "Individual_Tags"]].dropna(subset = ["Influencer Instagram"])
influencers["IG Followers"] = influencers["IG Followers"].fillna(0)
influencers["IG Engagement"] = influencers["IG Engagement"].fillna(0)
influencers["IG Exposure Score"] = influencers["IG Exposure Score"].fillna(0)

In [95]:
#Cleaning Functions

def convert_followers(val):
    if isinstance(val, str):
        val = val.strip().lower().replace(",", "")
        if "k" in val:
            return float(val.replace("k","")) * 1000
        elif "m" in val:
            return float(val.replace("m", "")) * 1000000
            
def clean_values(val):
    if isinstance(val, str):
        val = val.strip().lower().replace("%", "")
        val = val.replace("(est.)", "")
        return val

# Run cleaning functions

influencers['IG Followers'] = influencers['IG Followers'].apply(convert_followers)
influencers["IG Engagement"] = influencers["IG Engagement"].apply(clean_values)
influencers["IG Exposure Score"] = influencers["IG Exposure Score"].apply(clean_values)

In [96]:
restaurant_tags = resteraunts["Individual_Tags"].dropna().str.lower().str.strip() # Create a list of resteraunt Tags
influencer_tags = influencers["Individual_Tags"].dropna().str.lower().str.strip() # Create a list of influencer Tags

valid_tags = set(restaurant_tags).union(set(influencer_tags)) # Create a list of the tags that the two datasets share

print(valid_tags) # all posible tags

{'date night', 'fine dining', 'halal', 'coffee', 'asian', 'fast-food', 'chicken', 'vegan', 'family', 'brunch', 'healthy', 'black-owned', 'indian', 'hispanic', 'spa/self-care', 'plano/frisco', 'bakery', 'dallas/richardson', 'entertainment', 'buffet', 'spa / self-care', 'casual', 'aesthetic', 'wine/cocktails/drinks', 'mexican', 'soul/southern food', 'party', 'desserts', 'carrollton', 'fort worth'}


In [97]:
# Strip Resteraunt Tags

resteraunt_df = resteraunts.copy()
resteraunt_df = resteraunt_df[resteraunt_df["Individual_Tags"].notna()]
resteraunt_df["Individual_Tags"] = resteraunt_df["Individual_Tags"].str.lower().str.strip()

# Strip Influencers Tags

influencer_df = influencers.copy()
influencer_df = influencer_df[influencer_df["Individual_Tags"].notna()]
influencer_df["Individual_Tags"] = influencer_df["Individual_Tags"].str.lower().str.strip()

In [98]:
# Create a combined dataframe to encode

combined_tags = pd.concat([
    resteraunt_df[["Individual_Tags"]],
    influencer_df[["Individual_Tags"]]
])

# Encode the join dataframe
encoder = OneHotEncoder(sparse_output=False)
encoder.fit(combined_tags)

In [103]:
# One-hot encode restaurants

rest_encoded = encoder.transform(resteraunt_df[["Individual_Tags"]])
rest_encoded_df = pd.DataFrame(rest_encoded, columns=encoder.get_feature_names_out(["Individual_Tags"]))
resteraunt_updated = pd.concat([resteraunt_df.reset_index(drop=True), rest_encoded_df], axis=1)

# One-hot encode influencers

infl_encoded = encoder.transform(influencer_df[["Individual_Tags"]])
infl_encoded_df = pd.DataFrame(infl_encoded, columns=encoder.get_feature_names_out(["Individual_Tags"]))
influencer_updated = pd.concat([influencer_df.reset_index(drop=True), infl_encoded_df], axis=1)

# View both completed datasets
display(resteraunt_updated, influencer_updated)

['Italian Garden McKinny',
 'Italian Garden McKinny',
 'Stop N Go Gyros McKinny',
 'Stop N Go Gyros McKinny',
 'Trinity Groves',
 'Trinity Groves',
 'San Marzano',
 'San Marzano',
 "Frida's Tacolandia",
 "Frida's Tacolandia",
 'Mami Coco',
 'Mami Coco',
 "Nuno's",
 "Nuno's",
 "Nuno's",
 "Nuno's",
 'Zizikis',
 'Zizikis',
 'Empire Baking Company',
 'Empire Baking Company',
 'Empire Baking Company',
 'S&Js Hot Chick',
 'S&Js Hot Chick',
 'S&Js Hot Chick',
 'MiYa Chinese',
 'MiYa Chinese',
 'Sura BBQ',
 'Sura BBQ',
 "Uncle Julio's",
 "Uncle Julio's",
 'Freebirds',
 'Freebirds',
 'Freebirds',
 'Taco Bueno',
 'Taco Bueno',
 'Big Guys Chicken and Rice',
 'Big Guys Chicken and Rice',
 'Big Guys Chicken and Rice',
 'One and Hana',
 'One and Hana',
 'Original Pancake House',
 'Original Pancake House',
 'Original Pancake House',
 'Original Pancake House',
 'Okaeri Cafe',
 'Okaeri Cafe',
 'Okaeri Cafe',
 'Eat N dash',
 'Eat N dash',
 'Eat N dash',
 'Bougie Boozy Bears',
 'Bougie Boozy Bears',
 'Bo

In [104]:
# Create a list of unique tags

category_cols = [col.strip() for col in influencer_updated.columns if col.startswith("Individual_Tags_")]

# Define Resteraunt Chosen

restaurant_name = "Eat N dash"

# Create a list of the tags that specfic resteraunt has

input_tags = resteraunt_updated[
    resteraunt_updated["Name"] == restaurant_name
]["Individual_Tags"].tolist()

# Create a dictionary for each of the resteraunt tag

restaurant_tags = {
    tag: 1 if tag.replace("Individual_Tags_", "") in input_tags else 0
    for tag in category_cols
}

# Create a numpy array from this

rest_vector = np.array([restaurant_tags[tag] for tag in category_cols])
rest_vector

array([0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0])

In [101]:
# Clean Performance Indicators

influencer_updated["IG Exposure Score"] = influencer_updated["IG Exposure Score"].replace(["#value!", "N/A", "", "na"], np.nan)
influencer_updated["IG Followers"] = influencer_updated["IG Followers"].replace(["#value!", "N/A", "", "na"], np.nan)

influencer_updated["Performance"] = (
    influencer_updated["IG Exposure Score"].astype(float) * 
    influencer_updated["IG Followers"].astype(float)
)
influencer_updated["IG Exposure Score"] = pd.to_numeric(influencer_updated["IG Exposure Score"], errors="coerce")

# Create Performance Column

influencer_updated["Performance"] = (
    influencer_updated["IG Exposure Score"] * influencer_updated["IG Followers"]
)

In [110]:
matches = []

# Calculate Jaccard and Final Score

for i, row in influencer_updated.iterrows():
    infl_vector = np.array([row[tag] for tag in category_cols]).astype(int) # Create an array for the influencers tags

    intersection = np.sum(np.logical_and(rest_vector, infl_vector)) # intersction
    union = np.sum(np.logical_or(rest_vector, infl_vector)) # Union
    jaccard_score = intersection / union if union != 0 else 0

    performance_score = row.get("Performance", 1)
    final_score = jaccard_score * performance_score

    if jaccard_score > 0:
        shared_tags = [
            tag.replace("Individual_Tags_", "").strip()
            for tag, r, i in zip(category_cols, rest_vector, infl_vector)
            if r == 1 and i == 1
        ]
        print(f"{row['Influencer Instagram']} → shared tags: {shared_tags}")

# Create a dictionary of top n influenceres
    
    matches.append({
        "Influencer_Name": row["Influencer Instagram"],
        "Jaccard": jaccard_score,
        "Performance": performance_score,
        "Final_Score": final_score
    })

# Sort by final score, get top 3 matches

matches_df = pd.DataFrame(matches)

top_matches = (
    matches_df
    .sort_values("Final_Score", ascending=False)
    .drop_duplicates(subset="Influencer_Name")
    .head(3)
)

# Print top matches

for _, match in top_matches.iterrows():
    print(f"{match['Influencer_Name']} → Jaccard: {match['Jaccard']:.2f}, Performance: {match['Performance']:.0f}, Final: {match['Final_Score']:.2f}")

eatindallas → shared tags: ['black-owned']
eatindallas → shared tags: ['soul/southern food']
eatindallas → shared tags: ['casual']
stephaniecamillee → shared tags: ['black-owned']
stephaniecamillee → shared tags: ['soul/southern food']
dallas_foodie_fix → shared tags: ['black-owned']
bigdawg.tv → shared tags: ['black-owned']
bigdawg.tv → shared tags: ['casual']
dubbsgrubs → shared tags: ['black-owned']
dallassocialbutterfly → shared tags: ['black-owned']
tasting.tales → shared tags: ['casual']
driaeatsdtx → shared tags: ['black-owned']
arricandcarissa → shared tags: ['black-owned']
wherebebe → shared tags: ['black-owned']
gimme_comidita → shared tags: ['casual']
hungryasskid → shared tags: ['casual']
monaeporter_explores → shared tags: ['black-owned']
unapologeticallyseasoned → shared tags: ['black-owned']
dakorean_qt6 → shared tags: ['black-owned']
driaeatsdtx → Jaccard: 0.33, Performance: 1, Final: 0.33
dakorean_qt6 → Jaccard: 0.33, Performance: 1, Final: 0.33
stephaniecamillee → Jac