In [52]:
!pip install implicit faiss-gpu

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2
[0m

In [3]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
from implicit.gpu.als import AlternatingLeastSquares
from implicit.approximate_als import FaissAlternatingLeastSquares
import scipy.sparse as sp

In [4]:
df = pd.read_csv("train.gz")

In [5]:
df_rec = df[['device_ip','site_id']]

In [6]:
df_rec['weight'] = df_rec.groupby('device_ip').transform('count')
df_rec['weight'] = df_rec['weight'].map(lambda x:x**0.5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_rec['weight'] = df_rec.groupby('device_ip').transform('count')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_rec['weight'] = df_rec['weight'].map(lambda x:x**0.5)


In [7]:
df_rec = df_rec.drop_duplicates()

In [8]:
del df
df = df_rec

In [9]:
users_inv_mapping = dict(enumerate(df['device_ip'].unique()))
users_mapping = {v: k for k, v in users_inv_mapping.items()}

items_inv_mapping = dict(enumerate(df['site_id'].unique()))
items_mapping = {v: k for k, v in items_inv_mapping.items()}
len(users_mapping),len(items_mapping)

(6729486, 4737)

In [10]:
def get_coo_matrix(df,
                   user_col='user_id',
                   item_col='item_id',
                   weight_col=None,
                   users_mapping=None,
                   items_mapping=None):
    if weight_col is None:
        weights = np.ones(len(df), dtype=np.float32)
    else:
        weights = df[weight_col].astype(np.float32)

    interaction_matrix = sp.coo_matrix((
        weights,
        (
            df[user_col].map(users_mapping.get),
            df[item_col].map(items_mapping.get)
        )),
    )
    return interaction_matrix

In [12]:
train_mat = get_coo_matrix(df=df,
                           user_col='device_ip',
                           item_col='site_id',
                           weight_col='weight',
                           users_mapping=users_mapping,
                           items_mapping=items_mapping).tocsr()

In [13]:
model = AlternatingLeastSquares(factors = 128,
                                iterations = 32,
                                #use_gpu = True ,
                                calculate_training_loss = False,
                                regularization = 0.1)
model.fit(train_mat)

  0%|          | 0/32 [00:00<?, ?it/s]

In [14]:
factors = model.user_factors.to_numpy()

In [15]:
factors_df = pd.DataFrame(factors,columns=[f'factor_{i}' for i in range(128)])

In [16]:
factors_df['user_id'] = users_mapping.keys()

In [17]:
factors_df.to_parquet('als_features.parquet.gzip')