## DeepFM demonstration

This notebook demonstrates how to run the DeepFM CTR prediction model on custom datasets.

To use this notebook:


1. Clone the FuxiCTR repo:
```
git clone https://github.com/Bernardo1998/FuxiCTR.git
```

2. Install the required packages:
```
pip install -r requirements.txt
```

3. Keep this notebook in the `demo` folder. For path consistency, please run this notebook in a UNIX environment.



#### Part 1: Make configuration file for custom dataset

In [1]:
import sys
sys.path.append('../')
import logging
from datetime import datetime
from fuxictr import datasets
from fuxictr.utils import load_config, set_logger, print_to_json
from fuxictr.features import FeatureMap
from fuxictr.pytorch.torch_utils import seed_everything
from fuxictr.pytorch.dataloaders import H5DataLoader
from fuxictr.preprocess import FeatureProcessor, build_dataset
from model_zoo import DeepFM

First, save your CSV file under the `data/your_dataset` folder.

In [63]:
# Make configuration automatically for custom datasets:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import yaml

def split_save_data_create_yaml(df, config_dir, random_seed, data_root, dataset_name, label_column=None,num_cols=None,train_size=0.33, val_size=0.33):
    """Split ``df`` into train/valid/test CSV files and write the dataset YAML config.

    The label and numeric feature columns are converted *before* the data is
    split and saved, so the CSV files on disk match the dtypes declared in the
    generated ``dataset_config.yaml``. The caller's DataFrame is not modified.

    Args:
        df: pandas DataFrame holding the features and the label column.
        config_dir: Directory that receives ``dataset_config.yaml``.
        random_seed: Seed passed to both ``train_test_split`` calls.
        data_root: Directory that receives ``train_sample.csv``,
            ``valid_sample.csv`` and ``test_sample.csv``.
        dataset_name: Top-level key of the generated YAML config.
        label_column: Name of the label column. Defaults to the last column.
        num_cols: Names of the continuous feature columns. ``None`` means
            "auto-detect every numeric feature column"; an empty list means
            "treat every feature as categorical".
        train_size: Fraction of rows used for training.
        val_size: Fraction of rows used for validation. The remaining
            ``1 - train_size - val_size`` rows form the test split.

    Raises:
        ValueError: If the fractions do not leave room for all three splits.
    """
    if not (0 < train_size < 1 and 0 < val_size < 1 and train_size + val_size < 1):
        raise ValueError(
            "train_size and val_size must be positive and sum to less than 1 "
            f"(got train_size={train_size}, val_size={val_size})."
        )

    # Create data_root and config_dir if they don't exist
    os.makedirs(data_root, exist_ok=True)
    os.makedirs(config_dir, exist_ok=True)

    # Work on a copy so the conversions below never leak into the caller's frame.
    df = df.copy()

    # Assume the last column is the label when none is given.
    if label_column is None:
        label_column = df.columns[-1]
    feature_names = [c for c in df.columns if c != label_column]

    # Encode a non-numeric label as integer category codes, then store it as float.
    if not pd.api.types.is_numeric_dtype(df[label_column]):
        df[label_column] = df[label_column].astype('category').cat.codes
    df[label_column] = df[label_column].astype(float)

    # Detect numeric columns among the features only, so the label is never
    # listed as an input feature. A plain list keeps yaml.dump output clean.
    if num_cols is None:
        num_cols = df[feature_names].select_dtypes(include='number').columns
    num_cols = [c for c in num_cols if c != label_column]
    if num_cols:
        df[num_cols] = df[num_cols].astype(float)

    numeric_names = set(num_cols)
    cat_cols = [c for c in feature_names if c not in numeric_names]

    # Split the converted frame into train/val/test. In the second call the
    # validation share is passed as train_size because the first returned
    # part is the validation frame.
    train_df, temp_df = train_test_split(df, test_size=1-train_size, random_state=random_seed)
    val_df, test_df = train_test_split(temp_df, train_size=val_size/(1-train_size), random_state=random_seed)

    # File paths
    train_path = os.path.join(data_root, 'train_sample.csv')
    valid_path = os.path.join(data_root, 'valid_sample.csv')
    test_path = os.path.join(data_root, 'test_sample.csv')

    # Save the split dfs (data_root was created above)
    for path, data in zip([train_path, valid_path, test_path], [train_df, val_df, test_df]):
        data.to_csv(path, index=False)

    feature_cols = []
    if len(cat_cols) > 0:
        feature_cols.append({
                "name": cat_cols,
                "active": True,
                "dtype": "str",
                "type": "categorical"
            })
    if len(num_cols) > 0:
        feature_cols.append({
                "name": num_cols,
                "active": True,
                "dtype": "float",
                "type": "continuous"
            })

    # YAML configuration
    yaml_config = {
        dataset_name: {
            "data_root": data_root,
            "data_format": "csv",
            "train_data": train_path,
            "valid_data": valid_path,
            "test_data": test_path,
            "min_categr_count": 1,
            "feature_cols": feature_cols,
            "label_col": {"name": label_column, "dtype": "float"}
        }
    }

    # Save YAML configuration
    yaml_path = os.path.join(config_dir, 'dataset_config.yaml')
    with open(yaml_path, 'w') as yaml_file:
        yaml.dump(yaml_config, yaml_file, default_flow_style=False)

    print(f"Data split and saved. Configuration saved at {yaml_path}.")

def save_model_config(config_dir, experiment_id, dataset_name):
    """Write ``model_config.yaml`` for a DeepFM experiment into ``config_dir``.

    The file contains a shared ``Base`` section plus one section keyed by
    ``experiment_id`` whose ``dataset_id`` is set to ``dataset_name``.
    ``config_dir`` is created when missing.
    """
    # Settings shared by every experiment.
    base_settings = dict(
        model_root="./checkpoints/",
        num_workers=3,
        verbose=1,
        early_stop_patience=5,
        pickle_feature_encoder=True,
        save_best_only=True,
        eval_steps=None,
        debug_mode=False,
        group_id=None,
        use_features=None,
        feature_specs=None,
        feature_config=None,
    )

    # DeepFM hyper-parameters for this particular experiment.
    experiment_settings = dict(
        model="DeepFM",
        dataset_id=dataset_name,
        loss="binary_crossentropy",
        metrics=["logloss", "AUC"],
        task="binary_classification",
        optimizer="adam",
        hidden_units=[64, 32],
        hidden_activations="relu",
        net_regularizer=0,
        embedding_regularizer=1e-8,
        learning_rate=1e-3,
        batch_norm=False,
        net_dropout=0,
        batch_size=128,
        embedding_dim=4,
        epochs=30,
        shuffle=True,
        seed=2023,
        monitor="AUC",
        monitor_mode="max",
    )

    sections = {"Base": base_settings, experiment_id: experiment_settings}

    # Make sure the target directory is there before opening the file.
    os.makedirs(config_dir, exist_ok=True)
    target_path = os.path.join(config_dir, "model_config.yaml")

    # sort_keys=False keeps the sections and keys in the order declared above.
    with open(target_path, 'w') as handle:
        yaml.dump(sections, handle, sort_keys=False, default_flow_style=False)

    print(f"Configuration saved to {target_path}")

In [56]:
# Example usage: every name defined here is a notebook-level variable that later cells read.
csv_path = "../data/test_real_data/real_49998.csv" # Path to your CSV file
# Dataset name = CSV file name up to its first "." ("real_49998" here)
dataset_name = csv_path.split("/")[-1].split(".")[0] 
config_dir = './config/test_real_data' # Directory where the generated config files are saved
random_seed = 42 
data_root = '../data/test_real_data' # Root directory where your CSV file is located. The split CSV files are saved here as well
experiment_id = 'DeepFM_test_real_csv'
label_col = 'label' # The binary label column indicating clicked/not clicked.
num_cols = [] # Numerical feature columns. An empty list treats every feature as categorical; None auto-detects numeric columns
train_size = 0.6 # Proportion of rows used for training
val_size = 0.2 # Proportion of rows used for validation during training (not for final evaluation); the remainder is the test split

df = pd.read_csv(csv_path) 

# Split the data, save the three CSV files plus dataset_config.yaml, then write model_config.yaml
split_save_data_create_yaml(df, config_dir, random_seed, data_root, dataset_name, label_col,num_cols,train_size, val_size)
save_model_config(config_dir, experiment_id, dataset_name)

Data split and saved. Configuration saved at ./config/test_real_data\dataset_config.yaml.
Configuration saved to ./config/test_real_data\model_config.yaml


In [57]:
# Load the parameters of `experiment_id` from the YAML files in config_dir.
# The keys printed below include both the model settings and the dataset settings.
params = load_config(config_dir, experiment_id)
print("params.keys:",params.keys())

params.keys: dict_keys(['model_root', 'num_workers', 'verbose', 'early_stop_patience', 'pickle_feature_encoder', 'save_best_only', 'eval_steps', 'debug_mode', 'group_id', 'use_features', 'feature_specs', 'feature_config', 'model', 'dataset_id', 'loss', 'metrics', 'task', 'optimizer', 'hidden_units', 'hidden_activations', 'net_regularizer', 'embedding_regularizer', 'learning_rate', 'batch_norm', 'net_dropout', 'batch_size', 'embedding_dim', 'epochs', 'shuffle', 'seed', 'monitor', 'monitor_mode', 'model_id', 'data_format', 'data_root', 'feature_cols', 'label_col', 'min_categr_count', 'test_data', 'train_data', 'valid_data'])


In [58]:
# Set up the logger, record the full parameter set in the log, and fix the
# random seed from the experiment config.
set_logger(params)
logging.info("Params: " + print_to_json(params))
seed_everything(seed=params['seed'])

2024-02-04 09:07:17,474 P76144 INFO Params: {
    "batch_norm": "False",
    "batch_size": "128",
    "data_format": "csv",
    "data_root": "../data/test_real_data",
    "dataset_id": "real_49998",
    "debug_mode": "False",
    "early_stop_patience": "3",
    "embedding_dim": "4",
    "embedding_regularizer": "1e-08",
    "epochs": "30",
    "eval_steps": "None",
    "feature_cols": "[{'active': True, 'dtype': 'str', 'name': ['I1', 'I2', 'I3', 'I4', 'I5', 'I6', 'I7', 'I8', 'I9', 'I10', 'I11', 'I12', 'I13', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'C22', 'C23', 'C24', 'C25', 'C26'], 'type': 'categorical'}]",
    "feature_config": "None",
    "feature_specs": "None",
    "group_id": "None",
    "hidden_activations": "relu",
    "hidden_units": "[64, 32]",
    "label_col": "{'dtype': 'float', 'name': 'label'}",
    "learning_rate": "0.001",
    "loss": "binary_crossentropy",
    "metrics": "

#### Part 2: Setup Dataset

In [59]:
# Set feature_encoder that defines how to preprocess data
feature_encoder = FeatureProcessor(feature_cols=params["feature_cols"],
                                    label_col=params["label_col"],
                                    dataset_id=params["dataset_id"], 
                                    data_root=params["data_root"])

# Build dataset from csv to h5, and remap data paths to h5 files
# (params["train_data"/"valid_data"/"test_data"] are overwritten with the returned paths)
params["train_data"], params["valid_data"], params["test_data"] = \
    build_dataset(feature_encoder, 
                    train_data=params["train_data"],
                    valid_data=params["valid_data"],
                    test_data=params["test_data"])

# Get feature_map that defines feature specs; it is loaded from
# <data_root>/<dataset_id>/feature_map.json
data_dir = os.path.join(params['data_root'], params['dataset_id'])
feature_map = FeatureMap(params['dataset_id'], data_dir)
feature_map.load(os.path.join(data_dir, "feature_map.json"), params)
logging.info("Feature specs: " + print_to_json(feature_map.features))

2024-02-04 09:07:19,746 P76144 INFO Set up feature processor...
2024-02-04 09:07:19,751 P76144 INFO Load feature_map from json: ../data/test_real_data\real_49998\feature_map.json
2024-02-04 09:07:19,759 P76144 INFO Set column index...
2024-02-04 09:07:19,759 P76144 INFO Feature specs: {
    "C1": "{'source': '', 'type': 'categorical', 'padding_idx': 0, 'oov_idx': 224, 'vocab_size': 225}",
    "C10": "{'source': '', 'type': 'categorical', 'padding_idx': 0, 'oov_idx': 4397, 'vocab_size': 4398}",
    "C11": "{'source': '', 'type': 'categorical', 'padding_idx': 0, 'oov_idx': 2507, 'vocab_size': 2508}",
    "C12": "{'source': '', 'type': 'categorical', 'padding_idx': 0, 'oov_idx': 5155, 'vocab_size': 5156}",
    "C13": "{'source': '', 'type': 'categorical', 'padding_idx': 0, 'oov_idx': 2038, 'vocab_size': 2039}",
    "C14": "{'source': '', 'type': 'categorical', 'padding_idx': 0, 'oov_idx': 26, 'vocab_size': 27}",
    "C15": "{'source': '', 'type': 'categorical', 'padding_idx': 0, 'oov_idx'

In [60]:
# Get train and validation data generators from h5
# NOTE(review): the recorded output below reports 16499 train samples, which
# does not match train_size = 0.6 on this dataset — possibly h5 files cached
# from an earlier run; confirm by rebuilding the dataset.
train_gen, valid_gen = H5DataLoader(feature_map, 
                                    stage='train', 
                                    train_data=params['train_data'],
                                    valid_data=params['valid_data'],
                                    batch_size=params['batch_size'],
                                    shuffle=params['shuffle']).make_iterator()


2024-02-04 09:07:23,272 P76144 INFO Loading data...
2024-02-04 09:07:23,279 P76144 INFO Loading data from h5: ../data/test_real_data\real_49998\train.h5
2024-02-04 09:07:23,332 P76144 INFO Train samples: total/16499, blocks/1
2024-02-04 09:07:23,333 P76144 INFO Loading data from h5: ../data/test_real_data\real_49998\valid.h5
2024-02-04 09:07:23,353 P76144 INFO Validation samples: total/16999, blocks/1
2024-02-04 09:07:23,353 P76144 INFO Loading train and validation data done.


#### Part 3: Setup model and training loop

In [61]:
# Model initialization and fitting.
# The validation generator is passed to fit(); the recorded log below shows the
# monitored AUC driving best-model saving, learning-rate reduction and early stop.
model = DeepFM(feature_map, **params)
model.fit(train_gen, validation_data=valid_gen, epochs=params['epochs'])

2024-02-04 09:07:25,120 P76144 INFO Start training: 129 batches/epoch
2024-02-04 09:07:25,121 P76144 INFO ************ Epoch=1 start ************


 97%|█████████▋| 125/129 [00:05<00:00, 59.14it/s]

2024-02-04 09:07:30,659 P76144 INFO Train loss: 0.536044
2024-02-04 09:07:30,659 P76144 INFO Evaluation @epoch 1 - batch 129: 


100%|██████████| 133/133 [00:02<00:00, 58.67it/s]

2024-02-04 09:07:32,960 P76144 INFO [Metrics] AUC: 0.727590
2024-02-04 09:07:32,961 P76144 INFO Save best model: monitor(max)=0.727590



100%|██████████| 129/129 [00:08<00:00, 16.05it/s]

2024-02-04 09:07:33,161 P76144 INFO ************ Epoch=1 end ************



 98%|█████████▊| 126/129 [00:03<00:00, 60.14it/s]

2024-02-04 09:07:37,081 P76144 INFO Train loss: 0.382920
2024-02-04 09:07:37,082 P76144 INFO Evaluation @epoch 2 - batch 129: 


100%|██████████| 133/133 [00:02<00:00, 61.84it/s]

2024-02-04 09:07:39,243 P76144 INFO [Metrics] AUC: 0.685390
2024-02-04 09:07:39,243 P76144 INFO Monitor(max)=0.685390 STOP!
2024-02-04 09:07:39,243 P76144 INFO Reduce learning rate on plateau: 0.000100



100%|██████████| 129/129 [00:06<00:00, 20.54it/s]

2024-02-04 09:07:39,443 P76144 INFO ************ Epoch=2 end ************



 99%|█████████▉| 128/129 [00:03<00:00, 62.41it/s]

2024-02-04 09:07:43,253 P76144 INFO Train loss: 0.198809
2024-02-04 09:07:43,262 P76144 INFO Evaluation @epoch 3 - batch 129: 


100%|██████████| 133/133 [00:02<00:00, 60.14it/s]

2024-02-04 09:07:45,485 P76144 INFO [Metrics] AUC: 0.677030
2024-02-04 09:07:45,485 P76144 INFO Monitor(max)=0.677030 STOP!
2024-02-04 09:07:45,485 P76144 INFO Reduce learning rate on plateau: 0.000010



100%|██████████| 129/129 [00:06<00:00, 20.71it/s]

2024-02-04 09:07:45,684 P76144 INFO ************ Epoch=3 end ************



 97%|█████████▋| 125/129 [00:03<00:00, 61.20it/s]

2024-02-04 09:07:49,435 P76144 INFO Train loss: 0.172706
2024-02-04 09:07:49,435 P76144 INFO Evaluation @epoch 4 - batch 129: 


100%|██████████| 133/133 [00:02<00:00, 62.82it/s]

2024-02-04 09:07:51,559 P76144 INFO [Metrics] AUC: 0.676837
2024-02-04 09:07:51,561 P76144 INFO Monitor(max)=0.676837 STOP!
2024-02-04 09:07:51,561 P76144 INFO Reduce learning rate on plateau: 0.000001
2024-02-04 09:07:51,561 P76144 INFO ********* Epoch==4 early stop *********



 99%|█████████▉| 128/129 [00:06<00:00, 21.05it/s]

2024-02-04 09:07:51,765 P76144 INFO Training finished.
2024-02-04 09:07:51,766 P76144 INFO Load best model: c:\Research\FuxiCTR\demo\checkpoints\real_49998\DeepFM_test_real_csv.model





In [None]:
# Show run time record

#### Part 4: Evaluation

In [62]:
# Evaluate the fitted model on the validation data first ...
logging.info('***** Validation evaluation *****')
model.evaluate(valid_gen)

# ... then on the held-out test data, loaded from h5 without shuffling.
logging.info('***** Test evaluation *****')
test_gen = H5DataLoader(feature_map, 
                        stage='test',
                        test_data=params['test_data'],
                        batch_size=params['batch_size'],
                        shuffle=False).make_iterator()
# Last expression of the cell: its return value (the metrics) is displayed as the cell output.
model.evaluate(test_gen)

2024-02-04 09:08:07,463 P76144 INFO ***** Validation evaluation *****


100%|██████████| 133/133 [00:02<00:00, 62.18it/s]

2024-02-04 09:08:09,626 P76144 INFO [Metrics] logloss: 0.503698 - AUC: 0.727590
2024-02-04 09:08:09,626 P76144 INFO ***** Test evaluation *****
2024-02-04 09:08:09,626 P76144 INFO Loading data...
2024-02-04 09:08:09,626 P76144 INFO Loading data from h5: ../data/test_real_data\real_49998\test.h5
2024-02-04 09:08:09,654 P76144 INFO Test samples: total/16500, blocks/1
2024-02-04 09:08:09,654 P76144 INFO Loading test data done.



100%|██████████| 129/129 [00:02<00:00, 61.11it/s]

2024-02-04 09:08:11,788 P76144 INFO [Metrics] logloss: 0.492791 - AUC: 0.741702





OrderedDict([('logloss', 0.49279085242171367), ('AUC', 0.7417023501482575)])