In [1]:
import sys
# Add the local FuxiCTR checkout to the module search path so that the
# `import fuxictr` below can resolve it.
# NOTE(review): the path is relative to the kernel's working directory, and
# append() places it last on sys.path, so a fuxictr found earlier on the path
# would be imported instead -- confirm this is the intended copy.
sys.path.append('./FuxiCTR-main/')
import fuxictr
# Display the imported package version as the cell output.
fuxictr.__version__

'1.2.3'

In [2]:
import os
import logging
from datetime import datetime
from fuxictr import datasets
from fuxictr.datasets.taobao import FeatureEncoder
from fuxictr.features import FeatureMap
from fuxictr.utils import load_config, set_logger, print_to_json
from fuxictr.pytorch.models import DeepFM
from fuxictr.pytorch.torch_utils import seed_everything
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
# Load the reviews from the review_philly Feather file into a DataFrame.
# NOTE(review): the "../FilteredData/../FilteredData/" prefix names the same
# directory twice; it looks redundant -- confirm before simplifying the path.
df_reviews = pd.read_feather("../FilteredData/../FilteredData/review_philly.feather")

In [4]:
def create_target(row):
    """Turn a review's star rating into a binary 0/1 label.

    Args:
        row: Mapping-like object (for example a DataFrame row) that exposes
            a ``"stars"`` entry.

    Returns:
        int: 0 when ``row["stars"] <= 3``, otherwise 1.
    """
    # Any value for which `<= 3` is false (including NaN) maps to 1.
    return 0 if row["stars"] <= 3 else 1

In [5]:
# Derive the binary label in a single vectorised pass rather than invoking a
# Python function once per row through DataFrame.apply(axis=1).
# `~(stars <= 3)` reproduces the create_target rule exactly: ratings of 3 or
# below become 0 and every other value (including NaN) becomes 1.
df_reviews["target"] = (~df_reviews["stars"].le(3)).astype("int64")

In [6]:
# Preview the first three rows to check that the `target` column was added.
df_reviews.head(3)

Unnamed: 0,_id,review_id,user_id,business_id,stars,useful,funny,cool,text,date,compliment_count,target
0,631e9f7fedf65856ab0dfe3b,JrIxlS1TzJ-iCu79ul40cQ,eUta8W_HdHMXPzLBBZhL1A,04UD14gamNjLY0IDYVhHJg,1.0,1.0,2.0,1.0,I am a long term frequent customer of this est...,2015-09-23 23:10:31,,0
1,631e9f7fedf65856ab0dfe3e,8JFGBuHMoiNDyfcxuWNtrA,smOvOajNG0lS4Pq7d8g4JQ,RZtGWDLCAtuipwaZ-UfjmQ,4.0,0.0,0.0,0.0,Good food--loved the gnocchi with marinara\nth...,2009-10-14 19:57:14,,1
2,631e9f7fedf65856ab0dfe42,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5.0,1.0,0.0,1.0,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03,,1


In [7]:
# Hold out 20% of the reviews for validation. Rows are shuffled before the
# split, and the fixed random_state keeps the partition repeatable.
df_train, df_val = train_test_split(df_reviews, test_size=0.2, shuffle=True, random_state=3)

In [8]:
# Create the output directory first: to_csv does not create missing parent
# directories, so this cell would fail on a fresh checkout without ./data.
os.makedirs("./data", exist_ok=True)

# Write both splits as CSV without the index column; these are the files the
# training configuration points at via 'train_data' / 'valid_data'.
df_train.to_csv("./data/train_trial.csv", index=False)
df_val.to_csv("./data/valid_trial.csv", index=False)

In [40]:
# Feature schema: both ID columns are declared as active, string-typed
# categorical features.
feature_cols = [
    {
        "name": ["user_id", "business_id"],
        "active": True,
        "dtype": "str",
        "type": "categorical",
    }
]
# Label schema: the binary `target` column, declared with dtype float.
label_col = {"name": "target", "dtype": float}

# Single configuration dictionary shared by the logger, the seeding helper and
# the model constructor.
params = {
    # Experiment identifiers
    "model_id": "DeepFM_trial",
    "dataset_id": "yelp_trial",
    # Input files and output roots
    "train_data": "./data/train_trial.csv",
    "valid_data": "./data/valid_trial.csv",
    # 'test_data': 'FuxiCTR-main/data/tiny_data/test_sample.csv',
    "model_root": "./models/",
    "data_root": "./data/",
    # Feature / label schema defined above
    "feature_cols": feature_cols,
    "label_col": label_col,
    # Model and optimisation settings
    "embedding_regularizer": 0,
    "net_regularizer": 0,
    "hidden_units": [64, 64],
    "hidden_activations": "relu",
    "learning_rate": 1e-3,
    "net_dropout": 0,
    "batch_norm": False,
    "optimizer": "adam",
    "task": "binary_classification",
    "loss": "binary_crossentropy",
    "metrics": ["logloss", "AUC"],
    "min_categr_count": 1,
    "embedding_dim": 10,
    # Training loop, checkpointing and runtime settings
    "batch_size": 16,
    "epochs": 3,
    "shuffle": True,
    "seed": 2019,
    "monitor": "AUC",
    "monitor_mode": "max",
    "use_hdf5": True,
    "pickle_feature_encoder": True,
    "save_best_only": True,
    "every_x_epochs": 1,
    "patience": 2,
    "num_workers": 1,
    "partition_block_size": -1,
    "verbose": 1,
    "version": "pytorch",
    "gpu": -1,
}

# Configure logging, record the full configuration, then fix the random seed.
set_logger(params)
logging.info(print_to_json(params))
seed_everything(seed=params["seed"])

2022-10-21 17:42:57,318 P75476 INFO {
    "batch_norm": "False",
    "batch_size": "16",
    "data_root": "./data/",
    "dataset_id": "yelp_no_feature",
    "embedding_dim": "10",
    "embedding_regularizer": "0",
    "epochs": "3",
    "every_x_epochs": "1",
    "feature_cols": "[{'name': ['user_id', 'business_id'], 'active': True, 'dtype': 'str', 'type': 'categorical'}]",
    "gpu": "-1",
    "hidden_activations": "relu",
    "hidden_units": "[64, 64]",
    "label_col": "{'name': 'target', 'dtype': <class 'float'>}",
    "learning_rate": "0.001",
    "loss": "binary_crossentropy",
    "metrics": "['logloss', 'AUC']",
    "min_categr_count": "1",
    "model_id": "DeepFM_trial",
    "model_root": "./models/",
    "monitor": "AUC",
    "monitor_mode": "max",
    "net_dropout": "0",
    "net_regularizer": "0",
    "num_workers": "1",
    "optimizer": "adam",
    "partition_block_size": "-1",
    "patience": "2",
    "pickle_feature_encoder": "True",
    "save_best_only": "True",
    "se

In [41]:
# Set feature_encoder that defines how to preprocess data
# Feature encoder: defines how the raw columns are preprocessed.
feature_encoder = FeatureEncoder(
    feature_cols,
    label_col,
    dataset_id=params["dataset_id"],
    data_root=params["data_root"],
)

# Convert the train/validation CSV files to h5.
# No test split is configured yet; the disabled argument is kept for reference:
#   test_data=params["test_data"]
datasets.build_dataset(
    feature_encoder,
    train_data=params["train_data"],
    valid_data=params["valid_data"],
)

2022-10-21 17:42:57,519 P75476 INFO Set up feature encoder...
2022-10-21 17:42:57,521 P75476 INFO Reading file: ./data/train.csv
2022-10-21 17:42:57,623 P75476 INFO Reading file: ./data/valid.csv
2022-10-21 17:42:57,658 P75476 INFO Preprocess feature columns...
2022-10-21 17:42:57,661 P75476 INFO Fit feature encoder...
2022-10-21 17:42:57,662 P75476 INFO Processing column: {'name': 'user_id', 'active': True, 'dtype': 'str', 'type': 'categorical'}
2022-10-21 17:42:57,671 P75476 INFO Processing column: {'name': 'business_id', 'active': True, 'dtype': 'str', 'type': 'categorical'}
2022-10-21 17:42:57,674 P75476 INFO Set feature index...
2022-10-21 17:42:57,675 P75476 INFO Pickle feature_encoder: ./data/yelp_no_feature/feature_encoder.pkl
2022-10-21 17:42:57,676 P75476 INFO Save feature_map to json: ./data/yelp_no_feature/feature_map.json
2022-10-21 17:42:57,677 P75476 INFO Set feature encoder done.
2022-10-21 17:42:57,677 P75476 INFO Transform feature columns...
2022-10-21 17:42:57,679 P7

In [42]:
# Get feature_map that defines feature specs
# The feature map (feature specs) comes from the fitted encoder.
feature_map = feature_encoder.feature_map

# Build the train and validation generators from the h5 files stored under
# <data_root>/<dataset_id>/.
data_dir = os.path.join(params["data_root"], params["dataset_id"])
train_gen, valid_gen = datasets.h5_generator(
    feature_map,
    stage="train",
    train_data=os.path.join(data_dir, "train.h5"),
    valid_data=os.path.join(data_dir, "valid.h5"),
    batch_size=params["batch_size"],
    shuffle=params["shuffle"],
)

2022-10-21 17:43:02,728 P75476 INFO Loading data...
2022-10-21 17:43:02,730 P75476 INFO Loading data from h5: ./data/yelp_no_feature/train.h5
2022-10-21 17:43:02,735 P75476 INFO Loading data from h5: ./data/yelp_no_feature/valid.h5
2022-10-21 17:43:02,738 P75476 INFO Train samples: total/8000, pos/5503, neg/2497, ratio/68.79%, blocks/1
2022-10-21 17:43:02,738 P75476 INFO Validation samples: total/2000, pos/1401, neg/599, ratio/70.05%, blocks/1
2022-10-21 17:43:02,739 P75476 INFO Loading train data done.


In [None]:
# Instantiate DeepFM from the feature map and the shared configuration.
model = DeepFM(feature_map, **params)
model.count_parameters()  # print number of parameters used in model

# Train on the training generator, validating against the validation generator.
model.fit_generator(
    train_gen,
    validation_data=valid_gen,
    epochs=params["epochs"],
    verbose=params["verbose"],
)

In [27]:
# Reload the best checkpoint before scoring the validation generator.
model.load_weights(model.checkpoint)
logging.info("***** validation results *****")
model.evaluate_generator(valid_gen)

# Disabled test-set evaluation, kept for reference until a test split exists:
# logging.info('***** test results *****')
# test_gen = datasets.h5_generator(
#     feature_map,
#     stage='test',
#     test_data=os.path.join(data_dir, 'test.h5'),
#     batch_size=params['batch_size'],
#     shuffle=False,
# )
# model.evaluate_generator(test_gen)

2022-10-15 16:53:51,874 P56629 INFO ***** validation results *****


100%|████████████████████████████████████████████████████████████████████████████| 10689/10689 [00:11<00:00, 939.72it/s]

2022-10-15 16:54:03,367 P56629 INFO [Metrics] logloss: 0.536154 - AUC: 0.739063





{'logloss': 0.5361543050809597, 'AUC': 0.739063152050163}

In [14]:
# NOTE(review): lightfm is a third-party package and the recorded run of this
# cell failed with ModuleNotFoundError, so it must be installed in the kernel's
# environment before this cell can run. These imports also sit apart from the
# notebook's main import cell; LightFM and precision_at_k are not used in the
# visible part of this cell.
from lightfm import LightFM
from lightfm.datasets import fetch_movielens
from lightfm.evaluation import precision_at_k

# Load the MovieLens 100k dataset. Only five
# star ratings are treated as positive.
data = fetch_movielens(min_rating=5.0)

ModuleNotFoundError: No module named 'lightfm'