# Code: Build Features and Recommender

In [14]:
import pandas as pd
import numpy as np
import math

In [2]:
# Load the three raw tables; each table's timestamp column is parsed as datetime.
users = pd.read_csv(
    "Dataset/users.csv",
    parse_dates=["signup_date"],
)
pings = pd.read_csv(
    "Dataset/pings.csv",
    parse_dates=["created_at"],
)
interactions = pd.read_csv(
    "Dataset/interactions (1).csv",
    parse_dates=["event_timestamp"],
)

In [3]:
# Weight added to interaction_score for each event type (impressions carry zero weight).
weight_map = {
    "impression": 0.0,
    "view": 0.2,
    "like": 1.0,
    "comment": 1.5,
    "share": 2.0,
    "follow_creator": 2.5
}

# Compute interaction_score again
inter = interactions.copy()
# An event type missing from weight_map maps to NaN, which would make the whole
# row's interaction_score NaN and silently drop its watch_ratio from every
# downstream sum. Unknown event types are given weight 0 instead.
inter["event_weight"] = inter["event_type"].map(weight_map).fillna(0.0)

# Bring in each ping's duration so watch time can be expressed as a ratio.
inter = inter.merge(
    pings[["ping_id", "duration_sec"]],
    on="ping_id",
    how="left"
)

# Watched fraction of the ping, bounded to [0, 1]; a NaN ratio (missing watch
# time or duration, or 0/0) counts as 0.
inter["watch_ratio"] = (
    inter["watch_time_sec"] / inter["duration_sec"]
).fillna(0).clip(0, 1)

inter["interaction_score"] = inter["event_weight"] + inter["watch_ratio"]

# Aggregate engagement per (user, ping) pair
eng = (
    inter.groupby(["user_id", "ping_id"], as_index=False)
         .agg({"interaction_score": "sum"})
)

# Total engagement per ping across all users
eng_by_ping = (
    eng.groupby("ping_id")["interaction_score"]
       .sum()
       .reset_index()
)

# Popularity: log-normalized (vectorised log(1 + x) over the whole column)
eng_by_ping["popularity"] = np.log1p(eng_by_ping["interaction_score"])
pop_map = eng_by_ping.set_index("ping_id")["popularity"].to_dict()


# Code: Build user affinities

In [8]:
# Merge metadata
# Attach each ping's content attributes to the scored interaction rows.
_meta_cols = ["ping_id", "category", "main_hashtag", "creator_id"]
inter2 = inter.merge(pings[_meta_cols], on="ping_id", how="left")


def _affinity_table(attribute):
    """Sum interaction_score per (user, attribute value) into a wide table.

    Rows are user_id, columns are the distinct values of ``attribute``;
    combinations with no interactions are filled with 0.
    """
    return inter2.pivot_table(
        index="user_id",
        columns=attribute,
        values="interaction_score",
        aggfunc="sum",
        fill_value=0,
    )


user_cat = _affinity_table("category")        # Category affinity
user_tag = _affinity_table("main_hashtag")    # Hashtag affinity
user_creator = _affinity_table("creator_id")  # Creator affinity

# Normalize each row to sum to 1
def normalize(df):
    """Scale every row of ``df`` so that its entries sum to 1.

    A row whose total is 0 cannot be scaled and is returned as all zeros.
    """
    row_totals = df.sum(axis=1)
    # Blank out zero totals so those rows divide to NaN, which is then zeroed below.
    safe_totals = row_totals.mask(row_totals == 0)
    scaled = df.div(safe_totals, axis=0)
    return scaled.fillna(0)

# Per-user preference distributions over categories, hashtags and creators.
user_cat_pref, user_tag_pref, user_creator_pref = (
    normalize(table) for table in (user_cat, user_tag, user_creator)
)


# Code: Freshness score

In [9]:
# Age is measured against the newest ping in the dataset, in whole days.
pings2 = pings.copy()
latest_created = pings2["created_at"].max()

age_in_days = (latest_created - pings2["created_at"]).dt.days
pings2["days_old"] = age_in_days
pings2["fresh_raw"] = 1 / (1 + age_in_days)

# Normalize freshness (min-max; the small epsilon keeps the denominator non-zero
# when every ping has the same raw freshness)
raw_min = pings2["fresh_raw"].min()
raw_max = pings2["fresh_raw"].max()
pings2["freshness"] = (pings2["fresh_raw"] - raw_min) / (raw_max - raw_min + 1e-9)

# Per-ping lookup table used when scoring candidates.
feature_cols = ["category", "main_hashtag", "creator_id", "freshness"]
ping_features = pings2.set_index("ping_id")[feature_cols]


# Recommendation function

In [15]:
def recommend(user_id, top_k=10, alpha=1.0, beta=3.0, gamma=1.0):
    """Score every ping for one user and return the ``top_k`` best rows.

    The ranking score is

        score = alpha * popularity + beta * affinity + gamma * freshness

    where ``affinity`` is the sum of the user's preference weights for the
    ping's category, main hashtag and creator.

    Reads the notebook-level ``ping_features``, ``pop_map``, ``user_cat_pref``,
    ``user_tag_pref``, ``user_creator_pref`` and ``pings`` objects.

    Parameters
    ----------
    user_id
        Key looked up in the index of the three preference tables. A user that
        is absent from a table gets zero affinity from that table, so an
        unknown user is ranked by popularity and freshness only.
    top_k : int
        Number of rows to return.
    alpha, beta, gamma : float
        Weights of popularity, affinity and freshness respectively.

    Returns
    -------
    pandas.DataFrame
        Columns ``ping_id``, ``score``, ``popularity``, ``affinity`` and
        ``freshness`` followed by the columns of ``pings``, sorted by ``score``
        in descending order and truncated to ``top_k`` rows.

    Notes
    -----
    Every ping in ``ping_features`` is a candidate; pings the user has already
    interacted with are not filtered out.
    """

    def _pref_lookup(pref_table, keys):
        # Preference weight of this user for each ping's attribute value;
        # attribute values the user never engaged with count as 0.
        if user_id not in pref_table.index:
            return np.zeros(len(keys))
        user_prefs = pref_table.loc[user_id]
        return keys.map(user_prefs).fillna(0.0).to_numpy(dtype=float)

    # Pings without any recorded engagement have popularity 0.
    popularity = (
        ping_features.index.to_series()
        .map(pop_map)
        .fillna(0.0)
        .to_numpy(dtype=float)
    )

    # Column-wise lookups replace the former per-row Python loop; the terms are
    # added in the same order as before so the scores are unchanged.
    affinity = (
        _pref_lookup(user_cat_pref, ping_features["category"])
        + _pref_lookup(user_tag_pref, ping_features["main_hashtag"])
        + _pref_lookup(user_creator_pref, ping_features["creator_id"])
    )

    freshness = ping_features["freshness"].to_numpy(dtype=float)

    df = pd.DataFrame({
        "ping_id": ping_features.index.to_numpy(),
        "score": alpha * popularity + beta * affinity + gamma * freshness,
        "popularity": popularity,
        "affinity": affinity,
        "freshness": freshness,
    })
    # Re-attach the full ping metadata for display.
    df = df.merge(pings, on="ping_id", how="left")
    return df.sort_values("score", ascending=False).head(top_k)


In [13]:
# NOTE(review): `sample_users` is first assigned in the next cell (In [12]); on a
# fresh kernel with Run All this cell runs before that assignment and raises
# NameError. Move this cell below the sampling cell.
sample_users

['u16', 'u10', 'u18']

In [12]:
# Draw a few users to spot-check. The draw is seeded so that Restart & Run All
# selects the same users every time, and the sample size is clamped so the cell
# does not fail when fewer than three users exist.
# NOTE(review): the saved outputs for u16, u10 and u18 come from an earlier
# unseeded draw; re-run the notebook to refresh the tables and the write-up.
SAMPLE_SEED = 42
n_sample = min(3, len(users))
sample_users = users["user_id"].sample(n_sample, random_state=SAMPLE_SEED).tolist()

report_cols = [
    "ping_id", "score", "popularity", "affinity", "freshness",
    "category", "main_hashtag", "creator_id",
]

for u in sample_users:
    recs = recommend(u, top_k=10)
    print("User:", u)
    display(recs[report_cols])
    print()


User: u16


Unnamed: 0,ping_id,score,popularity,affinity,freshness,category,main_hashtag,creator_id
28,p29,5.741478,1.812962,0.976172,1.0,dance,food,u19
5,p6,5.221912,2.155982,1.012507,0.028409,dance,comedy,u18
32,p33,4.648854,2.188189,0.818584,0.004914,dance,fitness,u16
13,p14,4.298676,1.774952,0.839846,0.004187,dance,makeup,u18
7,p8,3.917633,1.85468,0.666818,0.0625,beauty,comedy,u4
24,p25,3.89871,2.125583,0.257709,1.0,beauty,makeup,u11
19,p20,3.697792,2.251292,0.148833,1.0,education,travel,u14
35,p36,3.669252,1.68165,0.499655,0.488636,dance,music,u17
20,p21,3.426241,1.435085,0.659119,0.013799,dance,fitness,u15
6,p7,3.412698,2.222419,0.233881,0.488636,travel,music,u8



User: u10


Unnamed: 0,ping_id,score,popularity,affinity,freshness,category,main_hashtag,creator_id
19,p20,4.816257,2.251292,0.521655,1.0,education,travel,u14
24,p25,4.281583,2.125583,0.385333,1.0,beauty,makeup,u11
28,p29,4.254391,1.812962,0.480477,1.0,dance,food,u19
12,p13,4.176997,2.115008,0.683908,0.010264,food,makeup,u18
13,p14,4.027829,1.774952,0.749563,0.004187,dance,makeup,u18
38,p39,3.930257,2.024553,0.63307,0.006494,education,music,u19
5,p6,3.612391,2.155982,0.476,0.028409,dance,comedy,u18
1,p2,3.589946,1.794696,0.520765,0.232955,gaming,coding,u19
0,p1,3.571573,2.168645,0.466696,0.002841,education,football,u3
6,p7,3.47014,2.222419,0.253028,0.488636,travel,music,u8



User: u18


Unnamed: 0,ping_id,score,popularity,affinity,freshness,category,main_hashtag,creator_id
19,p20,6.612449,2.251292,1.120386,1.0,education,travel,u14
24,p25,5.012726,2.125583,0.629048,1.0,beauty,makeup,u11
38,p39,4.72496,2.024553,0.897971,0.006494,education,music,u19
0,p1,4.722352,2.168645,0.850289,0.002841,education,football,u3
21,p22,4.365835,1.498996,0.947693,0.02376,beauty,travel,u14
28,p29,4.168153,1.812962,0.45173,1.0,dance,food,u19
36,p37,3.654448,1.664059,0.662068,0.004187,education,football,u9
33,p34,3.444593,1.547563,0.624423,0.02376,beauty,travel,u2
25,p26,3.429401,2.021793,0.408597,0.181818,travel,travel,u5
30,p31,3.40484,2.272126,0.368913,0.025974,comedy,music,u5





# Goal
- The objective is to build a simple recommendation method that can generate a Top 10 ranked list of pings for any user using only the offline dataset. The solution should rely on available interaction history, content attributes, and simple heuristics that can be computed efficiently.

### Approach
I combined three major signals into a single ranking score: popularity, user affinity, and freshness. Popularity captures what is broadly appealing across the entire platform. User affinity captures how much a user tends to engage with specific categories, hashtags, or creators. Freshness introduces a recency factor so that newer content can surface even if it has not accumulated a lot of popularity yet.

### Popularity Score
I computed the global engagement score for each ping from Task 1 and transformed it using log(1 + engagement). This gives a higher score to items that have accumulated strong interaction while reducing the impact of extreme outliers.

### User Affinity
For each user, I summed interaction_score across the category, main_hashtag, and creator_id of the pings they have interacted with. These were normalized row-wise so that each user's map becomes a distribution. The affinity for a recommended ping is the sum of the user’s preference weight for its category, hashtag, and creator, so it ranges from 0 to 3. The more a user has engaged with similar content, the higher its affinity.

### Freshness
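Freshness was defined as an inverse age measure: 1 divided by (1 + days since creation), where age is counted in whole days relative to the most recently created ping in the dataset. This was then min-max normalized to lie between 0 and 1. This gives higher values to recently uploaded pings and lower values to older ones.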

### Final Score
`score = alpha * popularity + beta * affinity + gamma * freshness`

I set alpha = 1, beta = 3, and gamma = 1. These values give user affinity the strongest influence, since personalization is more important than global popularity or recency.

### Results
Below are examples for three users in the dataset. I list the top recommended pings together with their popularity, affinity, and freshness components. This shows why each ping appears high in the ranking.

* User u16
    - The top items for u16 have strong affinity values, especially p29 and p6. These pings combine high affinity with moderate to high popularity. Items like p29 and p25 also benefit from freshness equal to 1.0 since they were recently uploaded. For this user, dance-related content appears repeatedly, which reflects their strong prior engagement with similar categories and creators.

* User u10
    - Recommendations for u10 are driven by a mix of high freshness and moderate affinity. Items p20, p25, and p29 all have freshness equal to 1.0 and good popularity scores. This user interacts broadly with education and beauty content, and those categories appear near the top. The model correctly places p13 and p14 higher due to stronger affinity even when freshness is low.

* User u18
    - This user has strong and diverse historical interactions, which leads to high affinity scores across several categories. Item p20 ranks first with high values in all three components. Items p25 and p39 also rise due to a combination of good popularity and strong affinity. Education- and beauty-related content shows up frequently, mirroring the user’s previous activity patterns.

### Evaluation Metrics
I would evaluate this recommender offline using two metrics.

- **Hit Rate at 10:** For each user, I check whether any of the pings they engaged with in a held-out time window appear in the Top 10 recommendations produced using earlier data. This measures the system’s ability to retrieve relevant items.
- **Normalized Discounted Cumulative Gain at 10 (NDCG@10):** This compares the order of recommended items with the order of the user’s true interactions. NDCG rewards placing more relevant or more engaged items higher in the ranking.

These two metrics together provide a good view of retrieval quality and ranking quality.