In [1]:
!nvidia-smi

Wed Mar  6 08:13:22 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.116.04   Driver Version: 525.116.04   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA RTX A6000    Off  | 00000000:00:05.0 Off |                  Off |
| 30%   40C    P8    28W / 300W |      1MiB / 49140MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
import pandas as pd
import numpy as np
import random
from tqdm.auto import tqdm
import gc
from implicit.evaluation import mean_average_precision_at_k,train_test_split
from implicit.approximate_als import FaissAlternatingLeastSquares
from implicit.nearest_neighbours import CosineRecommender, BM25Recommender,TFIDFRecommender
from implicit.cpu.lmf import LogisticMatrixFactorization
import scipy.sparse as sp
from implicit.gpu.bpr import BayesianPersonalizedRanking
from implicit.gpu.als import AlternatingLeastSquares

In [3]:
train_df = pd.read_csv('./train.csv',parse_dates=['timestamp'])

In [4]:
train_df

Unnamed: 0,interaction_type,timestamp,user_id,video_id
0,view,1696271079748,126492,228525
1,view,1696271079760,117764,204343
2,view,1696271079988,11347,201337
3,view,1696271080451,125274,221842
4,view,1696271080451,125274,221842
...,...,...,...,...
5657319,view,1696946405821,20627,42064
5657320,view,1696946406884,73913,242899
5657321,view,1696946407292,53555,241941
5657322,view,1696946407461,97851,222069


In [5]:
def chrono_split(
        df: pd.DataFrame, 
        split_by_column: str = 'user_id', 
        ratio: float = 0.7, 
        col_timestamp: str = 'timestamp'):

    df = df.sort_values([split_by_column, col_timestamp])
    groups = df.groupby(split_by_column)

    df["count"] = groups[split_by_column].transform("count")
    df["rank_s"] = groups.cumcount() + 1

    ratio = [ratio, 1 - ratio]
    splits = []
    prev_threshold = None
    for threshold in np.cumsum(ratio):
        condition = df["rank_s"] <= round(threshold * df["count"])
        if prev_threshold is not None:
            condition &= df["rank_s"] > round(prev_threshold * df["count"])
        splits.append(df[condition].drop(["rank_s", "count"], axis=1))
        prev_threshold = threshold

    return splits

def train_val_split(
        train_df: pd.DataFrame, 
        val_users_n: int = 200_000):
    
    user_ids = train_df['user_id'].unique()
    user_ids_val = random.sample(list(user_ids), val_users_n)
    condition = train_df['user_id'].isin(user_ids_val)

    val = train_df[condition]
    val_no_targets, val_targets = chrono_split(val, ratio=0.7)

    train = pd.concat([train_df[~condition], val_no_targets]).sort_values('timestamp')
    return train, val_no_targets, val_targets

In [6]:
random.seed(56)
train, val_no_targets, val_targets = train_val_split(train_df,val_users_n=100_000)

In [7]:
idxes = train.index

In [8]:
train['user_id'].iloc[0]

40821

In [9]:
train['video_id'].value_counts()[:10]

  train['video_id'].value_counts()[:10]


166825    17901
131005    17146
38943     16410
188589    15830
169519    15394
34204     15272
51799     12325
137447    12091
166282    11737
123953    10883
Name: video_id, dtype: int64

In [10]:
val_no_targets[val_no_targets['interaction_type'] != 'view'].shape[0] / val_no_targets.shape[0]

0.17421546088274967

In [11]:
def apk(actual, predicted, k=10):
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)
    if not actual:
        return 0.0
    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

In [12]:
val_labels = val_targets.groupby('user_id')['video_id'].apply(lambda x:x.tolist()).tolist()

In [13]:
### BaseLine_Simple

train_best_cols = train['video_id'].value_counts().index[:10]
mapk(val_labels,[train_best_cols] * len(val_labels),k=10)

0.006894914600655077

In [14]:
train['is_like'] = train['interaction_type'].apply(lambda x: 1 if x == 'like' else 0)
video_features = train.groupby('video_id')['is_like'].agg(['sum','count'])

In [15]:
video_features['index'] = video_features['sum'] * 10 + video_features['count']

In [16]:
video_features['video_id'] = video_features.index

In [17]:
train_best_cols = video_features.sort_values(by='index')[::-1][:10].index.tolist()
mapk(val_labels,[train_best_cols] * len(val_labels),k=10)

0.007281970669564121

In [18]:
#### BM25
video_features[['video_id','index']].index = range(video_features.shape[0])
video_features['range'] = range(video_features.shape[0])
video_features = video_features.set_index('range')

In [21]:
#train = train.drop('index',axis=1)
train = train.merge(video_features[['video_id','index']],on='video_id')

In [22]:
train['index'] = train['index'].map(lambda x: x ** 0.25)

In [23]:
users_inv_mapping = dict(enumerate(train_df['user_id'].unique()))
users_mapping = {v: k for k, v in users_inv_mapping.items()}

items_inv_mapping = dict(enumerate(train_df['video_id'].unique()))
items_mapping = {v: k for k, v in items_inv_mapping.items()}
len(users_mapping),len(items_mapping)

(152911, 228506)

In [24]:
def get_coo_matrix(df,
                   user_col='user_id',
                   item_col='item_id',
                   weight_col=None,
                   users_mapping=None,
                   items_mapping=None):
    if weight_col is None:
        weights = np.ones(len(df), dtype=np.float32)
    else:
        weights = df[weight_col].astype(np.float32)

    interaction_matrix = sp.coo_matrix((
        weights,
        (
            df[user_col].map(users_mapping.get),
            df[item_col].map(items_mapping.get)
        )),
    )
    return interaction_matrix

In [25]:
train_mat = get_coo_matrix(df=train,
                           user_col='user_id',
                           item_col='video_id',
                           weight_col='index', 
                           users_mapping=users_mapping,
                           items_mapping=items_mapping).tocsr()

In [47]:
#model = AlternatingLeastSquares(
#    factors=32, iterations=128,
#    random_state=56
#)
#model = BayesianPersonalizedRanking(factors=32, iterations=16,random_state=56)
#model = LogisticMatrixFactorization(factors=32, iterations=50,random_state=56)
model = TFIDFRecommender(K=2048)
#model = BM25Recommender()

model.fit(train_mat)



  0%|          | 0/228506 [00:00<?, ?it/s]

In [41]:
def predict_impl(model,test_users,mat,users_mapping,items_inv_mapping,N=10,falh=True):
    recs,scores = [],[]
    for id in tqdm(test_users):
        row_id = users_mapping[id]
        ranks = model.recommend(row_id, mat[row_id], N=N, filter_already_liked_items=falh)
        recs += [[items_inv_mapping.get(it) for it in ranks[0]]]
        scores += [ranks[1]]
    return recs,scores

def predict_impl_batched(model,test_users,mat,users_mapping,items_inv_mapping,batch_size=1024,N=10,falh=True):
    recs,scores = [],[]
    N = len(test_users)
    for i in tqdm(range(0,N,batch_size)):
        ids = test_users[i:i+batch_size]
        row_id = [users_mapping[id] for id in ids]
        ranks_lst = model.recommend(row_id, mat[row_id], N=N, filter_already_liked_items=falh)
        for ranks in ranks_lst:
            recs += [[items_inv_mapping.get(it) for it in ranks[0]]]
            scores += [ranks[1]]
    return recs,scores

In [28]:
val_group = val_targets.groupby('user_id')
act = val_group['video_id'].agg(lambda x:x.tolist()).tolist()
val_users = val_group.agg(lambda x:x.tolist()).index.tolist()

In [52]:
test_preds,test_scores = predict_impl(model,
                                      val_users,
                                      train_mat,
                                      users_mapping,
                                      items_inv_mapping,
                                      N=100,
                                      falh=True)

  0%|          | 0/100000 [00:00<?, ?it/s]

In [51]:
mapk(val_labels,test_preds,k=10)

0.02063615730190224

In [53]:
pd.DataFrame({'preds':test_preds,'scores':test_scores}).to_parquet('TFIDF_preds_val.parquet')

In [54]:
train_df['is_like'] = train_df['interaction_type'].apply(lambda x: 1 if x == 'like' else 0)
video_features = train_df.groupby('video_id')['is_like'].agg(['sum','count'])
video_features['index'] = video_features['sum'] * 10 + video_features['count']
video_features['video_id'] = video_features.index
video_features[['video_id','index']].index = range(video_features.shape[0])
video_features['range'] = range(video_features.shape[0])
video_features = video_features.set_index('range')
train_df = train_df.merge(video_features[['video_id','index']],on='video_id')
train_df['index'] = train_df['index'].map(lambda x: x ** 0.25)

In [55]:
train_mat = get_coo_matrix(df=train_df,
                           user_col='user_id',
                           item_col='video_id',
                           weight_col='index',
                           users_mapping=users_mapping,
                           items_mapping=items_mapping).tocsr()

In [56]:
model = TFIDFRecommender(K=2048)

model.fit(train_mat)



  0%|          | 0/228506 [00:00<?, ?it/s]

In [60]:
pd.DataFrame({'preds':test_preds,'scores':test_scores}).to_parquet('TFIDF_preds_test.parquet')

In [57]:
sample_sub = pd.read_csv('TFIDFV2.csv')
test_id = sample_sub['user_id'].tolist()

In [58]:
test_preds,test_scores = predict_impl(model,
                                      test_id,
                                      train_mat,
                                      users_mapping,
                                      items_inv_mapping,
                                      N=100,
                                      falh=True)

  0%|          | 0/152911 [00:00<?, ?it/s]

In [164]:
predicted_cols=[' '.join(map(str, i)) for i in test_preds]
sample_sub['recommendation'] = predicted_cols

In [166]:
sample_sub['count_tr'] = sample_sub['recommendation'].map(lambda x: len(x.split()))

In [167]:
sample_sub[sample_sub['count_tr'] != 10]

Unnamed: 0,user_id,recommendation,count_tr
98527,22025,180173 143427 16701 74601 252568 222751,6
98927,78973,29784 120362,2
144672,55148,137219 2538 149457 141781 79672 251129 24127 2...,9


In [168]:
recs = sample_sub['recommendation']
recs[98527] = '24671 65611 45311 132648 217384 34204 166282 51799 131005 188589'
recs[98927] = '24671 65611 45311 132648 217384 34204 166282 51799 131005 188589'
recs[144672] = '24671 65611 45311 132648 217384 34204 166282 51799 131005 188589'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recs[98527] = '24671 65611 45311 132648 217384 34204 166282 51799 131005 188589'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recs[98927] = '24671 65611 45311 132648 217384 34204 166282 51799 131005 188589'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recs[144672] = '24671 65611 45311 132648 217384 34204 166282 51799 131005 188589'


In [169]:
sample_sub['recommendation'] = recs

In [170]:
sample_sub[['recommendation','user_id']].to_csv('TF_IDF_REWEIGHTV2_025.csv',index=False)

In [172]:
sample_sub

Unnamed: 0,user_id,recommendation,count_tr
0,938,38943 207669 6 18999 137366 14803 245475 72256...,10
1,57571,34204 169519 188589 166282 137447 54812 214736...,10
2,50873,37487 3402 200362 52442 70460 148191 248427 15...,10
3,4335,131005 188589 34204 169519 133062 13009 166282...,10
4,42138,73271 230006 170373 51799 193601 131005 188589...,10
...,...,...,...
152906,79591,117763 230387 108084 10748 58865 62430 22914 1...,10
152907,93648,188589 131005 89801 34204 166282 193340 166825...,10
152908,42337,166825 245475 169519 207669 131005 34204 10356...,10
152909,4694,248488 7216 224218 153743 68214 169519 131005 ...,10
