# DeepFM Inference

In [65]:
import pandas as pd 
import numpy as np
import torch
import sys, os, logging
import importlib

sys.path.append('../FuxiCTR-main/')
import fuxictr
from fuxictr import datasets
from datetime import datetime
from fuxictr.utils import load_config, set_logger, print_to_json, print_to_list
from fuxictr.features import FeatureMap, FeatureEncoder
from fuxictr.pytorch import models
from fuxictr.pytorch.torch_utils import seed_everything
import gc
from pathlib import Path


## Preprocess - Construct Inference Data

### 1. Construct a dataset with one user and all businesses

In [66]:
# Select a user for testing: take the row labelled 1 from the user table.
all_users = pd.read_csv("../data/all_user.csv")
user = all_users.loc[1]
user

user_id                    smOvOajNG0lS4Pq7d8g4JQ
average_stars                                3.46
philly_reviews                                 43
philly_share_of_reviews                  0.244318
gender_score                                  0.0
Name: 1, dtype: object

In [67]:
# Join the user data with all business data and save to a csv file.
# Each of the selected user's fields is broadcast onto every business row, so the
# resulting table holds one (user, business) candidate pair per row.
df_inference = pd.read_csv("../data/all_business.csv")
# Series.items() replaces Series.iteritems(), which was deprecated in pandas 1.5 and
# removed in pandas 2.0; both yield (index label, value) pairs.
for col_name, col_val in user.items():
    df_inference[col_name] = col_val
df_inference["target"] = 0 # a redundant value in order to use the framework
df_inference.to_csv("../data/inference.csv", index=False)

In [79]:
# Load the business table on its own (the same CSV the join cell above reads).
buz = pd.read_csv("../data/all_business.csv")

In [81]:
# Display the (rows, columns) dimensions of the business table.
buz.shape

(10391, 7)

### 2. Transform the dataset into HDF5 files for the data generator

In [68]:
# Read model and data parameters for feature mapping
experiment_id = "DeepFM_yelp_all_feature_019_eabe7106" # change to desired model name
config_dir = "./tuner_config/DeepFM_all_feature"
params = load_config(config_dir, experiment_id)

# Notebook-specific overrides on top of the stored experiment config.
# ('train_data' and 'valid_data' are intentionally left as configured.)
params.update({
    'gpu': 0,
    'version': "pytorch",
    # define inference data location
    'test_data': os.path.join(params['data_root'], 'inference.csv'),
})

# Start logging, record the effective configuration, then fix the random seed.
set_logger(params)
logging.info(print_to_json(params))
seed_everything(seed=params['seed'])

2022-11-25 20:02:29,472 P49649 INFO {
    "batch_norm": "False",
    "batch_size": "1024",
    "data_format": "csv",
    "data_root": "../data/",
    "dataset_id": "yelp_all_feature_2cca3d6c",
    "debug": "False",
    "embedding_dim": "64",
    "embedding_regularizer": "0.01",
    "epochs": "50",
    "every_x_epochs": "1",
    "feature_cols": "[{'active': True, 'dtype': 'str', 'name': ['user_id', 'business_id', 'bucketed_sentiment_scores_received', 'postal_code'], 'type': 'categorical'}, {'active': True, 'dtype': 'float', 'name': ['average_stars', 'philly_reviews', 'philly_share_of_reviews', 'gender_score', 'business_stars', 'business_review_count'], 'type': 'numeric'}]",
    "gpu": "0",
    "hidden_activations": "relu",
    "hidden_units": "[400, 400, 400]",
    "label_col": "{'dtype': 'float', 'name': 'target'}",
    "learning_rate": "0.01",
    "loss": "binary_crossentropy",
    "metrics": "['logloss', 'AUC']",
    "min_categr_count": "1",
    "model": "DeepFM",
    "model_id": "De

In [69]:
# Build the feature encoder from the experiment configuration.
feature_encoder = FeatureEncoder(**params)
# Per the log output recorded below, this call reads train.csv, valid.csv and
# inference.csv, then fits the encoder and processes each feature column.
# NOTE(review): presumably it also writes the encoded .h5 splits under
# data_root/dataset_id that the prediction step loads later — confirm against FuxiCTR.
datasets.build_dataset(feature_encoder, **params)

2022-11-25 20:02:29,566 P49649 INFO Set up feature encoder...
2022-11-25 20:02:29,567 P49649 INFO Reading file: ../data/train.csv
2022-11-25 20:02:30,217 P49649 INFO Reading file: ../data/valid.csv
2022-11-25 20:02:30,383 P49649 INFO Reading file: ../data/inference.csv
2022-11-25 20:02:30,405 P49649 INFO Preprocess feature columns...
2022-11-25 20:02:30,494 P49649 INFO Fit feature encoder...
2022-11-25 20:02:30,495 P49649 INFO Processing column: {'active': True, 'dtype': 'str', 'name': 'user_id', 'type': 'categorical'}
2022-11-25 20:02:30,859 P49649 INFO Processing column: {'active': True, 'dtype': 'str', 'name': 'business_id', 'type': 'categorical'}
2022-11-25 20:02:30,908 P49649 INFO Processing column: {'active': True, 'dtype': 'str', 'name': 'bucketed_sentiment_scores_received', 'type': 'categorical'}
2022-11-25 20:02:30,946 P49649 INFO Processing column: {'active': True, 'dtype': 'str', 'name': 'postal_code', 'type': 'categorical'}
2022-11-25 20:02:30,977 P49649 INFO Processing col

In [70]:
# # preprocess the dataset
# dataset = params['dataset_id'].split('_')[0].lower()
# data_dir = os.path.join(params['data_root'], params['dataset_id'])
# if params.get("data_format") == 'h5': # load data from h5
#     feature_map = FeatureMap(params['dataset_id'], data_dir, params['version'])
#     json_file = os.path.join(os.path.join(params['data_root'], params['dataset_id']), "feature_map.json")
#     if os.path.exists(json_file):
#         feature_map.load(json_file)
#     else:
#         raise RuntimeError('feature_map not exist!')
# else: # load data from csv
#     try:
#         feature_encoder = getattr(datasets, dataset).FeatureEncoder(**params)
#     except:
#         feature_encoder = FeatureEncoder(**params)
#     if os.path.exists(feature_encoder.json_file):
#         feature_encoder.feature_map.load(feature_encoder.json_file)
#     else: # Build feature_map and transform h5 data
#         datasets.build_dataset(feature_encoder, **params)
#     # params["train_data"] = os.path.join(data_dir, 'train*.h5')
#     # params["valid_data"] = os.path.join(data_dir, 'valid*.h5')
#     # params["test_data"] = os.path.join(data_dir, 'test*.h5')
#     feature_map = feature_encoder.feature_map

## Inference

Load model

In [71]:
# Reuse the feature map held by the encoder; the model constructor and the h5 generator
# below both take it as their first argument.
feature_map = feature_encoder.feature_map

In [72]:
# initialize model
# The config's 'model' entry names the class to look up in the models module.
model_class = getattr(models, params['model'])
model = model_class(feature_map, **params)

# print number of parameters used in model
model.count_parameters()

# No training in this notebook (the model.fit_generator call is deliberately skipped);
# the weights come from the best checkpoint saved for this experiment.
best_checkpoint = model.checkpoint
logging.info("Load best model: {}".format(best_checkpoint))
model.load_weights(best_checkpoint)

2022-11-25 20:02:33,066 P49649 INFO Total number of parameters: 15216706.
2022-11-25 20:02:33,067 P49649 INFO Load best model: /Users/jun/Library/Mobile Documents/com~apple~CloudDocs/jun_folder/skills/mids/w210-capstone/proj/deepfm/models/yelp_all_feature_2cca3d6c/DeepFM_yelp_all_feature_019_eabe7106.model


Getting predictions

In [73]:
# Point the train/valid/test entries at the encoded .h5 files inside the dataset folder.
dataset = params['dataset_id'].split('_')[0].lower()
data_dir = os.path.join(params['data_root'], params['dataset_id'])
params.update(
    train_data=os.path.join(data_dir, 'train*.h5'),
    valid_data=os.path.join(data_dir, 'valid*.h5'),
    test_data=os.path.join(data_dir, 'test*.h5'),
)

In [74]:
# # testing prediction for train and valid dataset
# train_gen, valid_gen = datasets.h5_generator(feature_map, stage='train', **params)
# train_predict = model.predict_generator(train_gen)
# valid_predict = model.predict_generator(valid_gen)
# print(train_predict.size)
# print(valid_predict.size)

In [75]:
# making inference
# Build the generator for the 'test' stage (params["test_data"] was pointed at the
# encoded test*.h5 files in the cell above) and run the loaded model over it.
logging.info('******** Making Inference ********')
inference_gen = datasets.h5_generator(feature_map, stage='test', **params)
# NOTE(review): inference_result is later stored as the "pred" column of inference.csv,
# so it is presumably one score per row in file order — confirm that stage='test'
# does not shuffle.
inference_result = model.predict_generator(inference_gen)

2022-11-25 20:02:33,104 P49649 INFO ******** Making Inference ********
2022-11-25 20:02:33,104 P49649 INFO Loading data...
2022-11-25 20:02:33,105 P49649 INFO Loading data from h5: ../data/yelp_all_feature_2cca3d6c/test.h5
2022-11-25 20:02:33,107 P49649 INFO Test samples: total/10391, pos/0, neg/10391, ratio/0.00%, blocks/1
2022-11-25 20:02:33,107 P49649 INFO Loading test data done.


100%|███████████████████████████████████████████████████| 11/11 [00:17<00:00,  1.57s/it]


In [76]:
# Attach the model scores to the inference rows, drop the placeholder label,
# rank businesses from highest to lowest score, and overwrite the CSV with the result.
df_inference = (
    pd.read_csv("../data/inference.csv")
    .assign(pred=inference_result)
    .drop(columns="target")
    .sort_values("pred", ascending=False)
)
df_inference.to_csv("../data/inference.csv", index=False)

## Postprocess

In [82]:
# Reload the inference table from disk (the prediction step above saved it with the
# "pred" column, sorted by score).
df_inference = pd.read_csv("../data/inference.csv")

In [83]:
# Display the (rows, columns) dimensions of the reloaded inference table.
df_inference.shape

(10391, 13)

---
## Construct user and business data

In [14]:
# Flatten the per-type feature specs from the config into one ordered list of column names.
feature_lst = [var_type["name"] for var_type in params['feature_cols']]
features = []
for name in feature_lst:
    # A spec's "name" may be a single string instead of a list of names. `features += "abc"`
    # would splice in the individual characters, so wrap bare strings before extending.
    features.extend([name] if isinstance(name, str) else name)
features

['user_id',
 'business_id',
 'bucketed_sentiment_scores_received',
 'postal_code',
 'average_stars',
 'philly_reviews',
 'philly_share_of_reviews',
 'gender_score',
 'business_stars',
 'business_review_count']

In [15]:
# Feature columns that describe the business side of a (user, business) pair.
buz_features = [
    'business_id',
    'bucketed_sentiment_scores_received',
    'postal_code',
    'business_stars',
    'business_review_count',
]
buz_features

['business_id',
 'bucketed_sentiment_scores_received',
 'postal_code',
 'business_stars',
 'business_review_count']

In [16]:
# User-side features are whatever remains of `features` once the business columns
# are excluded; the original ordering is preserved.
user_features = [name for name in features if name not in buz_features]
user_features

['user_id',
 'average_stars',
 'philly_reviews',
 'philly_share_of_reviews',
 'gender_score']

In [17]:
# NOTE(review): df_merged is only assigned in the data-construction cell further down, so
# on a fresh top-to-bottom run this line raises NameError (as the recorded output shows).
df_merged

NameError: name 'df_merged' is not defined

In [213]:
# Rebuild the user and business lookup CSVs from the filtered review/user/business feather files.
extra_buz_cols = ["business_name", "categories"]

df_reviews = pd.read_feather("../../FilteredData/review_philly.feather")
df_users = pd.read_feather("../../FilteredData/user_philly.feather")
df_buz = pd.read_feather("../../FilteredData/business_philly.feather")

# Give the user and business columns distinct names before the tables are merged.
df_users = df_users.rename(columns={"review_count": "user_review_count"})
df_buz = df_buz.rename(
    columns={
        "name": "business_name",
        "stars": "business_stars",
        "review_count": "business_review_count",
    }
)

# Basic cleaning: fill missing postal codes, keep only reviews that have a star rating,
# and replace infinite values in the user table with 1.
df_buz["postal_code"] = df_buz["postal_code"].fillna(0)
df_reviews = df_reviews[df_reviews["stars"].notna()]
df_users = df_users.replace([np.inf, -np.inf], 1)

# Left-join business attributes, then user attributes, onto every review.
df_merged = (
    df_reviews
    .merge(df_buz, how="left", on="business_id")
    .merge(df_users, how="left", on="user_id")
)
df_merged = df_merged[features + extra_buz_cols]

# Split into business-side and user-side tables, one row per distinct combination.
df_buz_from_review = df_merged[buz_features + extra_buz_cols].drop_duplicates()
df_user_from_review = df_merged[user_features].drop_duplicates()

df_buz_from_review.to_csv("../data/all_business.csv", index=False)
df_user_from_review.to_csv("../data/all_user.csv", index=False)