# Task 1: Sequential Next-Streamer Prediction

The goal of this task is to predict which streamer a user will watch next, given their most recent viewing history.
Each row in the dataset represents a 10-minute interval in which a user watched a particular streamer.

To build a supervised learning dataset, we must:

1. Construct chronological sequences of (user, streamer, timestamp)
2. Sort interactions per user by time
3. Create training examples of the form:

\[
(\text{current\_streamer}) \rightarrow (\text{next\_streamer})
\]

Each (user, streamer_start_time) entry becomes a step in the sequence.
We derive transition pairs such that:

- Input: the streamer at time \( t \)  
- Target: the streamer at time \( t + 1 \)

These transitions serve as ground truth for sequence-aware recommenders such as Markov models, item-item CF sequences, and hybrid recommenders.


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict

In [2]:
full_path = "data/100k_a.csv"
colnames = ["user_id", "stream_id", "streamer_username", "time_start", "time_stop"]
CHUNK_SIZE = 2_000_000

user_sequences = defaultdict(list)
for chunk in pd.read_csv(full_path, 
                         header=None, 
                         names=colnames,
                         chunksize=CHUNK_SIZE):

    minimal = chunk[["user_id", "streamer_username", "time_start"]]
    for row in minimal.itertuples(index=False):
        user_sequences[row.user_id].append((row.time_start, row.streamer_username))

    print(f"Processed {len(user_sequences):,} unique users so far...", end="\r")

print("\nFinished building raw user sequences.")


Processed 100,000 unique users so far...
Finished building raw user sequences.


In [3]:
for u in user_sequences:
    user_sequences[u].sort(key=lambda x: x[0])

print("User sequences sorted chronologically.")

User sequences sorted chronologically.


In [4]:
pairs_user = []
pairs_current = []
pairs_next = []
pairs_time = []

for u, seq in user_sequences.items():
    if len(seq) < 2:
        continue

    for i in range(len(seq) - 1):
        t_now, s_now = seq[i]
        t_next, s_next = seq[i + 1]

        pairs_user.append(u)
        pairs_current.append(s_now)
        pairs_next.append(s_next)
        pairs_time.append(t_next)

In [5]:
transitions_df = pd.DataFrame({
    "user_id": pairs_user,
    "current_streamer": pairs_current,
    "next_streamer": pairs_next,
    "time": pairs_time
})

transitions_df = transitions_df.sort_values("time").reset_index(drop=True)
transitions_df.head()

Unnamed: 0,user_id,current_streamer,next_streamer,time
0,13763,fantaplay,aboutseppi,0
1,13763,swagzyox,ilcapitanglitchchannel,0
2,13763,ilcapitanglitchchannel,antoniodistasio,0
3,13763,antoniodistasio,imperius84,0
4,13763,imperius84,nezak_,0


In [6]:
N = len(transitions_df)
train_end = int(0.70 * N)
val_end = int(0.80 * N)

train_df = transitions_df.iloc[:train_end]
val_df   = transitions_df.iloc[train_end:val_end]
test_df  = transitions_df.iloc[val_end:]

In [7]:
train_users = set(train_df["user_id"])

val_df = val_df[val_df["user_id"].isin(train_users)].reset_index(drop=True)
test_df = test_df[test_df["user_id"].isin(train_users)].reset_index(drop=True)

train_df.shape, val_df.shape, test_df.shape

((2066213, 4), (285749, 4), (558362, 4))

## BASELINE

In [8]:
global_counts = train_df["next_streamer"].value_counts()
global_topK = list(global_counts.head(20).index)

def predict_popularity(K=10):
    return global_topK[:K]


from collections import defaultdict, Counter

markov_counts = defaultdict(Counter)

for row in train_df.itertuples(index=False):
    markov_counts[row.current_streamer][row.next_streamer] += 1

markov_probs = {}

for s_now, counts in markov_counts.items():
    total = sum(counts.values())
    markov_probs[s_now] = {s_next: c/total for s_next, c in counts.items()}

len(markov_probs)


In [9]:
from collections import defaultdict, Counter

markov_counts = defaultdict(Counter)

for row in train_df.itertuples(index=False):
    markov_counts[row.current_streamer][row.next_streamer] += 1

markov_probs = {}

for s_now, counts in markov_counts.items():
    total = sum(counts.values())
    markov_probs[s_now] = {s_next: c/total for s_next, c in counts.items()}

len(markov_probs)

129318

In [10]:
def predict_markov(s_now, K=10):
    if s_now not in markov_probs:
        return global_topK[:K]
    ranked = sorted(markov_probs[s_now].items(), key=lambda x:-x[1])
    return [s for s, _ in ranked[:K]]


In [11]:
def hit_at_k(preds, true_item, K):
    return int(true_item in preds[:K])

def mrr(preds, true_item):
    for rank, item in enumerate(preds, start=1):
        if item == true_item:
            return 1/rank
    return 0.0


In [12]:
def evaluate(model_fn, df, K=10):
    hits = 0
    mrr_sum = 0
    N = len(df)

    for row in df.itertuples(index=False):
        preds = model_fn(row.current_streamer, K=K)
        hits += hit_at_k(preds, row.next_streamer, K)
        mrr_sum += mrr(preds, row.next_streamer)

    return hits/N, mrr_sum/N

hit10, mrr10 = evaluate(predict_markov, val_df, K=10)
hit10, mrr10


(0.25780317691400495, 0.12722675861548924)

In [13]:
# Normalize global popularity
global_pop = (global_counts / global_counts.sum()).to_dict()

def predict_hybrid(s_now, w=0.7, K=10):
    scores = Counter()

    if s_now in markov_probs:
        for nxt, p in markov_probs[s_now].items():
            scores[nxt] += w * p

    for streamer, p in global_pop.items():
        scores[streamer] += (1 - w) * p

    ranked = scores.most_common(K)
    return [s for s, _ in ranked]


In [14]:
weights = [0.1, 0.5, 0.9]
results = {}

for w in weights:
    acc, _ = evaluate(lambda s_now, K: predict_hybrid(s_now, w=w, K=K),
                      val_df, K=10)
    results[w] = acc
    print(f"Weight {w}: Hit@10 = {acc:.4f}")

results
best_w = max(results, key=results.get)
best_w

In [None]:
final_hit, final_mrr = evaluate(
    lambda s_now, K: predict_hybrid(s_now, w=best_w, K=K),
    test_df,
    K=10
)

final_hit, final_mrr