In [1]:
from pathlib import Path
from collections import Counter, defaultdict

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
import catboost
from sklearn.metrics import roc_auc_score

In [2]:
data_path = Path('data')
songs = pd.read_csv(data_path / 'songs.csv')
listens = pd.read_csv(data_path / 'train.csv')
members = pd.read_csv(data_path / 'members.csv')

In [5]:
users_counter = Counter(listens['msno'])
active_users = [u for u, c in users_counter.most_common() if c > 50]
user_ids = {u: i for i, u in enumerate(active_users)}
n_users = len(active_users)
songs_counter = Counter(listens['song_id'])
popular_songs = [s for s, c in songs_counter.most_common() if c > 50]
song_ids = {s: i for i, s in enumerate(popular_songs)}
n_songs = len(popular_songs)

In [6]:
listens['uid'] = listens['msno'].apply(lambda u: user_ids.get(u, -1))
listens['sid'] = listens['song_id'].apply(lambda s: song_ids.get(s, -1))

In [7]:
genre_counter = Counter(
    g
    for x in songs['genre_ids'] if isinstance(x, str)
    for g in x.split('|')
)
popular_genres = {g for g, c in genre_counter.most_common() if c >= 100}

def extract_genre(genre_ids):
    if not isinstance(genre_ids, str):
        return None
    genres_ids = [g for g in genre_ids.split('|') if g in popular_genres]
    return genres_ids[0] if genres_ids else None
    
listens['gid'] = listens.join(songs.set_index('song_id'), on='song_id', how='left')['genre_ids']\
    .apply(extract_genre)\
    .reset_index()['genre_ids']

In [8]:
users_songs_mask = (listens['uid'] >= 0) & (listens['sid'] >= 0)
data = listens.join(members[['msno', 'gender', 'bd']].set_index('msno'), on='msno', how='left')[users_songs_mask]

In [9]:
data['gender'] = data['gender'].fillna('unk')
data['source_screen_name'] = data['source_screen_name'].fillna('unk')
data['source_system_tab'] = data['source_system_tab'].fillna('unk')
data['source_type'] = data['source_type'].fillna('unk')
data['gid'] = data['gid'].fillna(-1)

In [10]:
def calc_roc(pt):
    pred, target = np.array(list(pt.values)).T
    if len(set(target)) < 2:
        return 0
    return roc_auc_score(target, pred)


def evaluate(val_data, model):
    pred = model.predict(val_data)[:, 1]
    pred_data = val_data[['uid', 'target']].assign(pred=pred)
    pred_data['pt'] = pred_data[['pred', 'target']].apply(tuple, axis=1)
    return pred_data[['uid', 'pt']].groupby('uid').agg(calc_roc)['pt'].mean()
    

def cross_validate(data, model_factory, n_folds=5):
    fold_ids = np.random.randint(0, n_folds, len(data))
    roc_aucs = []
    for i in range(n_folds):
        val_mask = fold_ids == i
        train_data = data[~val_mask]
        val_data = data[val_mask]
        train_users, val_users = set(train_data['uid']), set(val_data['uid'])
        cus = train_users & val_users
        train_songs, val_songs = set(train_data['sid']), set(val_data['sid'])
        css = train_songs & val_songs
        train_data = train_data[train_data['uid'].apply(cus.__contains__) & train_data['sid'].apply(css.__contains__)]
        val_data = val_data[val_data['uid'].apply(cus.__contains__) & val_data['sid'].apply(css.__contains__)]
        
        model = model_factory()
        model.fit(train_data)
        roc_auc = evaluate(val_data, model)
        roc_aucs.append(roc_auc)
        print(f'Fold {i}, ROC AUC {roc_auc}')
    print(f'Average ROC AUC {np.mean(roc_aucs)}')

In [11]:
def collect_listens(data):
    users_listened = defaultdict(list)
    songs_listeners = defaultdict(list)
    users_degrees = np.zeros(n_users, dtype=np.int32)
    songs_degrees = np.zeros(n_songs, dtype=np.int32)
    for _, u, s in data[['uid', 'sid']].itertuples():
        users_listened[u].append(s)
        songs_listeners[s].append(u)
        users_degrees[u] += 1
        songs_degrees[s] += 1
    return users_listened, songs_listeners, users_degrees, songs_degrees
    
def make_edges_list(edges_dict):
    rows, cols, data = [], [], []
    for v, neighbours in edges_dict.items():
        rows.extend([v] * len(neighbours))
        cols.extend(range(len(neighbours)))
        data.extend(neighbours)
    return csr_matrix((data, (rows, cols)))

In [12]:
def make_walk_step(vs, degrees, edges):
    edge_indexes = np.zeros(len(vs), dtype=np.int32)
    mask = degrees[vs] > 0
    edge_indexes[mask] = np.random.randint(degrees[vs][mask])
    return np.array(edges[vs, edge_indexes]).ravel()

def walk(users, users_edges, songs_edges, users_degrees, songs_degrees, steps, walks_per_user):
    pos = np.array(users).repeat(walks_per_user)
    for _ in range(steps):
        pos = make_walk_step(pos, users_degrees, users_edges)
        pos = make_walk_step(pos, songs_degrees, songs_edges)
    return pos.reshape((-1, walks_per_user))

In [13]:
class PageRank:
    def __init__(self):
        self.user_song_reaches = {}
        
    def fit(self, data):
        users_listened, songs_listeners, users_degrees, songs_degrees = collect_listens(data[data['target'] == 1])
        users_edges = make_edges_list(users_listened)
        songs_edges = make_edges_list(songs_listeners)
        user_list = list(set(data['uid']))
        reached = walk(user_list, users_edges, songs_edges, users_degrees, songs_degrees, 25, 800)
        self.user_song_reaches = {}
        for u, u_reaches in zip(user_list, reached):
            r_songs, reaches = np.unique(u_reaches, return_counts=True)
            for s, rs in zip(r_songs, reaches):
                self.user_song_reaches[(u, s)] = rs
        return self
        
    def predict(self, data):
        return [self.user_song_reaches.get((u, s), 0) for _, u, s in data[['uid', 'sid']].itertuples()]

In [14]:
class GBModel:
    _all_features = ['bd', 'gid', 'gender', 'source_screen_name', 'source_system_tab', 'source_type']
    _cat_features = ['gid', 'gender', 'source_screen_name', 'source_system_tab', 'source_type']
    
    def __init__(self):
        self.clf = catboost.CatBoostClassifier(
            learning_rate=.5, task_type='GPU', n_estimators=100, max_depth=10, logging_level='Silent')
        self.pr = PageRank()
    
    def fit(self, data):
        self.pr.fit(data)
        x = data[GBModel._all_features].assign(pr=self.pr.predict(data))
        y = data['target']
        self.clf.fit(x, y, cat_features=GBModel._cat_features)
        return self
    
    def predict(self, data):
        x = data[GBModel._all_features].assign(pr=self.pr.predict(data))
        return self.clf.predict_proba(x)

In [15]:
cross_validate(data, GBModel)

Fold 0, ROC AUC 0.6002626124952338
Fold 1, ROC AUC 0.5990600420025464
Fold 2, ROC AUC 0.5985713389834931
Fold 3, ROC AUC 0.6002730834978369
Fold 4, ROC AUC 0.599017883198402
Average ROC AUC 0.5994369920355024
