In [2]:
!pip install deepctr_torch

[0m

In [1]:
import numpy as np
import pandas as pd
from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names
from deepctr_torch.models import DeepFM
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tqdm.auto import tqdm

In [2]:
# Print the GPU status table for this runtime (the model is later built with device='cuda').
!nvidia-smi

Sun Nov 12 11:05:25 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.116.04   Driver Version: 525.116.04   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:05.0 Off |                    0 |
| N/A   31C    P0    52W / 400W |      0MiB / 81920MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
# Load only the first 10M rows of the training file. Passing `nrows` makes the
# parser stop after that many rows, instead of decompressing and parsing the
# entire file and then discarding everything past the slice.
df = pd.read_csv("train.gz", nrows=10_000_000)

In [4]:
# NOTE(review): this split runs on the raw, not-yet-encoded frame, and both
# names are assigned again by a later cell that repeats the same call
# (test_size=0.2, random_state=56) after label-encoding. No cell in between
# reads train_df/test_df, so this split looks redundant — confirm and drop.
train_df,test_df = train_test_split(df,test_size=0.2,random_state=56)

In [5]:
# Columns fed to the model; all of them are handled as categorical features.
cat_features = [
    'hour', 'C1', 'banner_pos',
    'site_id', 'site_domain', 'site_category',
    'app_id', 'app_domain', 'app_category',
    'device_id', 'device_ip', 'device_model', 'device_type', 'device_conn_type',
    'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21',
]
# Alias of cat_features (the same list object, not a copy).
features = cat_features
# Name of the target column.
label_col = 'click'

In [6]:
class MultyLabelEncoder():
    """Label-encode several columns, using one ``LabelEncoder`` per column.

    Parameters
    ----------
    cat_cols : sequence of str
        Names of the columns to encode, in processing order.

    Attributes
    ----------
    encoders : list
        Fitted ``LabelEncoder`` objects, aligned index-by-index with
        ``cat_cols``. Empty until ``fit_transform`` has completed.
    """

    def __init__(self,cat_cols):
        self.cat_cols = cat_cols
        # Created up front so an unfitted instance can be detected explicitly
        # instead of failing on a missing attribute.
        self.encoders = []

    def fit_transform(self,df):
        """Fit a fresh encoder on every column in ``cat_cols`` and encode it.

        ``df`` is modified in place: each listed column is overwritten with
        its encoded values. The same object is returned for convenience.

        Parameters
        ----------
        df : DataFrame-like
            Object supporting ``df[col]`` get/set for every name in
            ``cat_cols``.

        Returns
        -------
        The ``df`` object that was passed in, with the columns encoded.
        """
        fitted = []
        for col in tqdm(self.cat_cols):
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col])
            fitted.append(le)
        # Publish only after every column succeeded, so ``encoders`` is never
        # left shorter than ``cat_cols`` by a failure part-way through.
        self.encoders = fitted
        return df

    def transform(self,df):
        """Encode ``cat_cols`` of ``df`` with the already-fitted encoders.

        ``df`` is modified in place and returned. ``LabelEncoder.transform``
        raises ``ValueError`` for values it did not see during fitting.

        Raises
        ------
        AttributeError
            If ``fit_transform`` has not been run (same exception type an
            unfitted instance raised before, now with an actionable message),
            or if ``encoders`` no longer lines up with ``cat_cols``.
        """
        # Without this guard zip() would silently skip columns when the two
        # sequences differ in length.
        if len(self.encoders) != len(self.cat_cols):
            raise AttributeError(
                "MultyLabelEncoder is not fitted for its cat_cols "
                f"({len(self.encoders)} encoders for {len(self.cat_cols)} columns); "
                "call fit_transform() first"
            )
        for col,le in tqdm(zip(self.cat_cols,self.encoders),total=len(self.cat_cols)):
            df[col] = le.transform(df[col])
        return df

In [7]:
# Fit one encoder per categorical column and encode `df`. fit_transform
# assigns into the frame it is given, so the original column values in `df`
# are overwritten by this cell. The encoders are fitted on the full frame,
# i.e. before the train/test split performed later.
le = MultyLabelEncoder(cat_features)
df = le.fit_transform(df)

  0%|          | 0/22 [00:00<?, ?it/s]

In [8]:
# One sparse (embedding) feature per categorical column. The size handed to
# SparseFeat is the number of distinct values found in that column of `df`.
fixlen_feature_columns = [
    SparseFeat(name, df[name].nunique(), embedding_dim=64)
    for name in cat_features
]
feature_names = get_feature_names(fixlen_feature_columns)

In [9]:
# Hold out 20% of the label-encoded rows as the test set; random_state pins
# the shuffle so the split is repeatable.
train_df,test_df = train_test_split(df,test_size=0.2,random_state=56)

In [11]:
# Per-feature inputs keyed by feature name; these dicts are what model.fit and
# model.predict receive further down.
train_model_input = {name: train_df[name] for name in feature_names}
test_model_input = {name: test_df[name] for name in feature_names}

In [12]:
# DeepFM hyper-parameters, gathered in one mapping so they are easy to tweak.
params = dict(
    # The same sparse columns feed both the linear part and the deep part.
    linear_feature_columns=fixlen_feature_columns,
    dnn_feature_columns=fixlen_feature_columns,
    use_fm=True,
    dnn_hidden_units=(256, 128, 64),
    l2_reg_linear=1e-5,
    l2_reg_embedding=1e-5,
    dnn_dropout=0.1,
    dnn_activation='prelu',
    dnn_use_bn=True,
    seed=56,
    task='binary',
    device='cuda',
)
model = DeepFM(**params)
# Bare expression on the last line so the notebook displays the module layout.
model

DeepFM(
  (embedding_dict): ModuleDict(
    (hour): Embedding(53, 64)
    (C1): Embedding(7, 64)
    (banner_pos): Embedding(7, 64)
    (site_id): Embedding(3496, 64)
    (site_domain): Embedding(4585, 64)
    (site_category): Embedding(23, 64)
    (app_id): Embedding(5469, 64)
    (app_domain): Embedding(390, 64)
    (app_category): Embedding(33, 64)
    (device_id): Embedding(786741, 64)
    (device_ip): Embedding(2129662, 64)
    (device_model): Embedding(6863, 64)
    (device_type): Embedding(4, 64)
    (device_conn_type): Embedding(4, 64)
    (C14): Embedding(1030, 64)
    (C15): Embedding(8, 64)
    (C16): Embedding(9, 64)
    (C17): Embedding(226, 64)
    (C18): Embedding(4, 64)
    (C19): Embedding(47, 64)
    (C20): Embedding(168, 64)
    (C21): Embedding(42, 64)
  )
  (linear_model): Linear(
    (embedding_dict): ModuleDict(
      (hour): Embedding(53, 1)
      (C1): Embedding(7, 1)
      (banner_pos): Embedding(7, 1)
      (site_id): Embedding(3496, 1)
      (site_domain): E

In [13]:
# Configure training: Adam optimizer, binary cross-entropy as the loss, and
# both binary cross-entropy and AUC reported as metrics.
model.compile(optimizer = 'adam',
              loss = 'binary_crossentropy',
              metrics = ['binary_crossentropy','auc'])

In [None]:
# Train for a single epoch with batches of 1024 rows. validation_split=0.2
# sets aside part of the training rows for validation (the captured log shows
# 6,400,000 training and 1,600,000 validation samples).
model.fit(train_model_input,
          train_df[label_col].values,
          batch_size=1024,
          epochs=1,
          verbose=1,
          validation_split=0.2)

  return [None if x is None else x[start:stop] for x in arrays]


cuda
Train on 6400000 samples, validate on 1600000 samples, 6250 steps per epoch


4438it [02:45, 28.20it/s]

In [16]:
# Score the held-out test rows with the trained model.
preds = model.predict(test_model_input)

In [21]:
# ROC AUC of the predictions on the held-out test split. The target column is
# looked up through `label_col`, the same way the training cell does, rather
# than repeating the literal column name. `roc_auc_score` is imported in the
# notebook's import cell.
roc_auc_score(test_df[label_col], preds)

0.7916861265077576

In [19]:
# Display the ground-truth labels of the test split (pandas truncates the output).
test_df['click']

6141725    0
8317400    0
6616162    0
7676727    0
5759427    0
          ..
3337546    0
8470632    0
4909823    1
4523224    0
9868897    1
Name: click, Length: 2000000, dtype: int64