In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import os
import datetime as dt
import itertools
import networkx as nx

In [2]:
test_df = pd.read_csv("test_source_events.csv")
train_df = pd.read_csv("train_source_events.csv")

## Step 1: Calculate Drama Popularity Metrics

In [4]:
# Calculate total views for each drama
drama_total_views = train_df['title_id'].value_counts()

# Calculate unique users who watched each drama
drama_unique_users = train_df.groupby('title_id')['user_id'].nunique()

# Calculate average completion ratio for each drama
drama_avg_completion_ratio = train_df.groupby('title_id')['played_duration'].mean()


## Step 2: Filter Drama Connections Based on Shared Users

In [7]:
# Calculate the median number of shared users as the threshold
shared_users_threshold = train_df.groupby(['title_id'])['user_id'].nunique().median()

# Create drama connections dataframe
drama_connections = train_df.groupby('user_id')['title_id'].apply(set).reset_index()

# Generate drama pairs based on shared users
drama_pairs = (
    drama_connections['title_id']
    .apply(lambda x: [(a, b) for a, b in itertools.combinations(x, 2)])
    .explode()
)

# Filter drama pairs based on shared users threshold
filtered_drama_pairs = (
    drama_pairs.groupby(drama_pairs)
    .size()
    .reset_index(name='shared_users')
    .query('shared_users >= @shared_users_threshold')
)


## Step 3: Apply DeepWalk Algorithm for Drama Embeddings

DeepWalk is a graph embedding algorithm that learns low-dimensional representations (embeddings) of nodes in a graph.
This step applies the DeepWalk algorithm to the graph formed by the filtered drama pairs.
The code uses the DeepWalk class (or a similar graph embedding algorithm) to run the DeepWalk algorithm and obtain embeddings for the dramas.
The resulting drama embeddings capture the underlying structure and relationships of the dramas based on their shared viewership patterns.
These embeddings can be used for various downstream tasks, such as similarity analysis, recommendation systems, or clustering of dramas.

In [None]:
import networkx as nx
from node2vec import Node2Vec

# Create graph from filtered drama pairs
G = nx.Graph()
G.add_edges_from(filtered_drama_pairs.to_numpy())

# Generate random walks on the graph using Node2Vec
node2vec = Node2Vec(G, dimensions=32, walk_length=80, num_walks=10)
model = node2vec.fit(window=10, min_count=0, sg=1, workers=1)

# Obtain drama embeddings
drama_embeddings = {int(node): model.wv[str(node)] for node in G.nodes()}


Computing transition probabilities:   0%|          | 0/192712 [00:00<?, ?it/s]

In [None]:
# Step 4: Group by user and derive user-level features
user_grouped = train_df.groupby('user_id')
user_features = user_grouped.agg({
    'played_duration': ['sum', 'count'],
    'title_id': 'nunique'
})
user_features.columns = ['total_duration', 'total_views', 'unique_dramas']

# Step 5: Merge all features with train_df based on user_id
train_merged = pd.merge(train_df, drama_counts, on='title_id', how='left')
train_merged = pd.merge(train_merged, drama_unique_users, on='title_id', how='left')
train_merged = pd.merge(train_merged, drama_total_episodes, on='title_id', how='left')
train_merged = pd.merge(train_merged, drama_avg_completion_ratio, on='title_id', how='left')
train_merged = pd.merge(train_merged, user_features, on='user_id', how='left')

# Step 6: Merge drama embeddings with train_merged based on title_id
train_merged = pd.merge(train_merged, pd.DataFrame.from_dict(drama_embeddings, orient='index'), left_on='title_id', right_index=True, how='left')

# Step 7: Format the data with one user_id per row
user_features_final = user_grouped[['user_id', 'total_duration', 'total_views', 'unique_dramas']].first().reset_index(drop=True)
train_final = pd.merge(user_features_final, train_merged.drop_duplicates(subset=['user_id']), on='user_id', how='inner')

# Step 8: Train your prediction model using train_final

In [14]:
# save the result to file
df.to_csv("Procecssed_test_source_events", index=True)

Extracting frequency counts for time=29201
Extracting frequency counts for time=10103
Extracting frequency counts for time=13313
Extracting frequency counts for time=12271
Extracting frequency counts for time=18061
Extracting frequency counts for time=19153
Extracting frequency counts for time=19151
Extracting frequency counts for time=37143
Extracting frequency counts for time=16211
Extracting frequency counts for time=33180
Extracting frequency counts for time=05051
Extracting frequency counts for time=07150
Extracting frequency counts for time=07151
Extracting frequency counts for time=08211
Extracting frequency counts for time=13302
Extracting frequency counts for time=15162
Extracting frequency counts for time=18020
Extracting frequency counts for time=20192
Extracting frequency counts for time=21281
Extracting frequency counts for time=03173
Extracting frequency counts for time=25203
Extracting frequency counts for time=19100
Extracting frequency counts for time=08210
Extracting 

Extracting frequency counts for time=16202
Extracting frequency counts for time=16221
Extracting frequency counts for time=10112
Extracting frequency counts for time=11191
Extracting frequency counts for time=30261
Extracting frequency counts for time=01063
Extracting frequency counts for time=14071
Extracting frequency counts for time=02110
Extracting frequency counts for time=21251
Extracting frequency counts for time=27100
Extracting frequency counts for time=05020
Extracting frequency counts for time=11182
Extracting frequency counts for time=31063
Extracting frequency counts for time=25221
Extracting frequency counts for time=24191
Extracting frequency counts for time=37121
Extracting frequency counts for time=33171
Extracting frequency counts for time=06120
Extracting frequency counts for time=07142
Extracting frequency counts for time=34231
Extracting frequency counts for time=10121
Extracting frequency counts for time=19102
Extracting frequency counts for time=08222
Extracting 

Extracting frequency counts for time=24181
Extracting frequency counts for time=19091
Extracting frequency counts for time=14043
Extracting frequency counts for time=34270
Extracting frequency counts for time=07192
Extracting frequency counts for time=10111
Extracting frequency counts for time=15122
Extracting frequency counts for time=13011
Extracting frequency counts for time=36083
Extracting frequency counts for time=04303
Extracting frequency counts for time=10131
Extracting frequency counts for time=08273
Extracting frequency counts for time=12252
Extracting frequency counts for time=01090
Extracting frequency counts for time=21270
Extracting frequency counts for time=30282
Extracting frequency counts for time=20191
Extracting frequency counts for time=36111
Extracting frequency counts for time=16182
Extracting frequency counts for time=25212
Extracting frequency counts for time=34232
Extracting frequency counts for time=16232
Extracting frequency counts for time=33170
Extracting 

Extracting frequency counts for time=02150
Extracting frequency counts for time=35311
Extracting frequency counts for time=36053
Extracting frequency counts for time=35040
Extracting frequency counts for time=14060
Extracting frequency counts for time=17282
Extracting frequency counts for time=03172
Extracting frequency counts for time=22031
Extracting frequency counts for time=32091
Extracting frequency counts for time=18032
Extracting frequency counts for time=28120
Extracting frequency counts for time=30263
Extracting frequency counts for time=01083
Extracting frequency counts for time=24183
Extracting frequency counts for time=23100
Extracting frequency counts for time=34242
Extracting frequency counts for time=34243
Extracting frequency counts for time=19130
Extracting frequency counts for time=12261
Extracting frequency counts for time=28112
Extracting frequency counts for time=02113
Extracting frequency counts for time=23101
Extracting frequency counts for time=16192
Extracting 

Extracting frequency counts for time=27080
Extracting frequency counts for time=29180
Extracting frequency counts for time=27083
Extracting frequency counts for time=20222
Extracting frequency counts for time=08223
Extracting frequency counts for time=19140
Extracting frequency counts for time=30260
Extracting frequency counts for time=23071
Extracting frequency counts for time=26292
Extracting frequency counts for time=06091
Extracting frequency counts for time=33213
Extracting frequency counts for time=03183
Extracting frequency counts for time=27043
Extracting frequency counts for time=17263
Extracting frequency counts for time=13023
Extracting frequency counts for time=32122
Extracting frequency counts for time=23081
Extracting frequency counts for time=27071
Extracting frequency counts for time=03203
Extracting frequency counts for time=35023
Extracting frequency counts for time=14072
Extracting frequency counts for time=30312
Extracting frequency counts for time=32081
Extracting 

Extracting frequency counts for time=30302
Extracting frequency counts for time=07202
Extracting frequency counts for time=25223
Extracting frequency counts for time=33162
Extracting frequency counts for time=12272
Extracting frequency counts for time=23073
Extracting frequency counts for time=13293
Extracting frequency counts for time=02112
Extracting frequency counts for time=15141
Extracting frequency counts for time=16242
Extracting frequency counts for time=19143
Extracting frequency counts for time=21283
Extracting frequency counts for time=29221
Extracting frequency counts for time=37131
Extracting frequency counts for time=02141
Extracting frequency counts for time=25242
Extracting frequency counts for time=11142
Extracting frequency counts for time=18021
Extracting frequency counts for time=01072
Extracting frequency counts for time=03221
Extracting frequency counts for time=19141
Extracting frequency counts for time=16181
Extracting frequency counts for time=20162
Extracting 