In [1]:

import streamlit as st
import pandas as pd
import sqlite3

def load_user_profiles():
    conn = sqlite3.connect("../test_profiles.db")
    df = pd.read_sql_query("SELECT * FROM user_profiles", conn)
    print(df.head())
    conn.close()
    return df

# Load profiles dataframe
user_df = load_user_profiles()


   id first_name last_name  age  gender marital_status  \
0   1       John       Doe   28    Male         Single   
1   2        Aya    Shaker   22  Female         Single   
2   3        Aya    Shaker   22  Female         Single   
3   4      Nadia     Fawzy   34  Female       Divorced   
4   5    Hussein      Adel   20    Male         Single   

                                education                      job  \
0          Bachelor's in Computer Science        Software Engineer   
1                             High School  Intern Graphic Designer   
2                             High School  Intern Graphic Designer   
3                   BEd Arabic Literature      High School Teacher   
4  Undergraduate BSc Computer Engineering                  Student   

         location first_interest  ...  third_interest_percentage  \
0    Cairo, Egypt     Technology  ...                         10   
1    Cairo, Egypt         Design  ...                         15   
2    Cairo, Egypt         

In [2]:
user_df.head().T

Unnamed: 0,0,1,2,3,4
id,1,2,3,4,5
first_name,John,Aya,Aya,Nadia,Hussein
last_name,Doe,Shaker,Shaker,Fawzy,Adel
age,28,22,22,34,20
gender,Male,Female,Female,Female,Male
marital_status,Single,Single,Single,Divorced,Single
education,Bachelor's in Computer Science,High School,High School,BEd Arabic Literature,Undergraduate BSc Computer Engineering
job,Software Engineer,Intern Graphic Designer,Intern Graphic Designer,High School Teacher,Student
location,"Cairo, Egypt","Cairo, Egypt","Cairo, Egypt","Asyut, Egypt","Zagazig, Egypt"
first_interest,Technology,Design,Design,Education,Technology


In [3]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
ARTIFACT_DIR = "artifacts/cross_encoder_ms_marco"
MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L6-v2"

# --- Download & Store Model in Artifact Directory ---

# def download_and_cache_model():
#     os.makedirs(ARTIFACT_DIR, exist_ok=True)
#     # Download tokenizer and model to artifact directory
#     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
#     model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
#     tokenizer.save_pretrained(ARTIFACT_DIR)
#     model.save_pretrained(ARTIFACT_DIR)
#     return

# download_and_cache_model()

def load_cross_encoder():
    tokenizer = AutoTokenizer.from_pretrained(ARTIFACT_DIR)
    model = AutoModelForSequenceClassification.from_pretrained(ARTIFACT_DIR)
    model.eval()
    return tokenizer, model

tokenizer, model = load_cross_encoder()


In [9]:


def compute_similarity(row, campaign_desc):
    # Build user-side concatenated string
    interests = [
        f"{row['first_interest']} ({row['first_interest_percentage']}%)",
        f"{row['second_interest']} ({row['second_interest_percentage']}%)",
        f"{row['third_interest']} ({row['third_interest_percentage']}%)"
    ]
    user_text = (
        row['personality_summary'] + " \n" +
        ' | '.join(interests) + " \n" +
        'Key Activities: ' + ', '.join(eval(row['key_activities'])) + " \n" +
        'Top Habits: ' + ', '.join(eval(row['top_habits'])) + " \n" +
        f"Top Hobby: {row['top_hobby']} \n" +
        f"Travel Indicator: {row['travel_indicators']} \n" +
        'Life Indicators: ' + ', '.join(eval(row['life_indicators'])) + " \n" +
        'Spending Indicators: ' + ', '.join(eval(row['spending_indicators']))
    )
    # Tokenize inputs as per cross-encoder API
    features = tokenizer(
        [user_text],
        [campaign_desc],
        padding=True,
        truncation=True,
        return_tensors="pt"
    )
    # Model inference
    with torch.no_grad():
        scores = model(**features).logits
        print(scores)


In [10]:
# Apply scoring
campaign_desc = """

    Our product is a platform for users to learn about technology and programming.
"""
df_filtered = user_df.copy()
df_filtered['score'] = df_filtered.apply(lambda r: compute_similarity(r, campaign_desc), axis=1)


tensor([[-6.3092]])
tensor([[-6.4282]])
tensor([[-6.4282]])
tensor([[-8.0495]])
tensor([[-5.3047]])
tensor([[-8.3583]])
tensor([[-7.6297]])
tensor([[-7.1215]])
tensor([[-8.2317]])
tensor([[-9.1109]])
tensor([[-7.6572]])
tensor([[-10.1229]])
tensor([[-9.1442]])
tensor([[-8.8902]])
tensor([[-9.4644]])
tensor([[-8.9396]])
tensor([[-8.6862]])
tensor([[-8.1889]])
tensor([[-8.8608]])
tensor([[-9.8341]])
tensor([[-8.0833]])
tensor([[-8.9438]])
tensor([[-8.3893]])


In [None]:

# Sidebar controls for threshold and top-n
threshold = st.sidebar.slider("Score Threshold", 0.0, 1.0, 0.5, 0.01)
top_n = st.sidebar.number_input("Max Users to Display", min_value=1, max_value=len(df_filtered), value=50)

df_results = df_filtered[df_filtered['score'] >= threshold]
df_results = df_results.nlargest(top_n, 'score')

st.subheader(f"Matched Users: {len(df_results)}")
st.dataframe(df_results[['first_name', 'last_name', 'age', 'gender', 'location', 'score']])

# Export button
csv = df_results.to_csv(index=False).encode('utf-8')
st.download_button(
    label="Download Results as CSV",
    data=csv,
    file_name='matched_users.csv',
    mime='text/csv'
)