In [None]:
import os
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

# Run configuration shared by the cells below.
LOG_NAME = "recommender-model-1"  # presumably names the training run/log — not read by the visible cells
REMARK = ""
BATCH_SIZE = 128  # passed to Dataset.batch() in the batching cells
LEARNING_RATE = 1e-5
EPOCH = 45
# Optimizer and loss objects are created once here; the loss is mean absolute error.
OPTIMIZER = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
LOSS_FN = tf.keras.losses.MeanAbsoluteError()

# Skip this: Avg Rating ver

## Data Extraction

### Load data from CSV

Owner (user) data is not loaded for now; only the influencer and history tables are used.

The data is synthetic: it was generated randomly in Excel.

In [None]:
# Locations of the synthetic CSV inputs, relative to the notebook's working directory.
DATA_DIR = "../data/synt_data_with_average_rating/"
INFLUENCER_FILE, OWNER_FILE, HISTORY_FILE = (
    os.path.join(DATA_DIR, csv_name)
    for csv_name in (
        "data_content_influencer_categ.csv",
        "data_content_owner_categ.csv",
        "historical_data.csv",
    )
)

# The owner CSV (OWNER_FILE) is deliberately not read; only influencers and history are loaded.
df_influencer, df_history = pd.read_csv(INFLUENCER_FILE), pd.read_csv(HISTORY_FILE)

### Data Exploration

In [None]:
# Preview the first five influencer rows.
df_influencer.head(5)

In [None]:
# Column dtypes and non-null counts of the influencer table.
df_influencer.info()

In [None]:
# Summary statistics of the influencer table.
df_influencer.describe()

In [None]:
# Preview the first five history rows.
df_history.head(5)

In [None]:
# Column dtypes and non-null counts of the history table.
df_history.info()

In [None]:
# Summary statistics of the history table.
df_history.describe()

## Data Transformation

### Data cleaning

#### Missing value

In [None]:
# Number of missing values per influencer column.
df_influencer.isnull().sum()

In [None]:
# Number of missing values per history column.
df_history.isnull().sum()

There are no missing values.

#### Irrelevant Data / Invalid Data

Check that every history row refers to a valid influencer ID (owner IDs are not checked, because the owner table is not loaded).

In [None]:
# True only if every inf_id in the history exists in the influencer table (own_id is not checked here).
df_history["inf_id"].isin(df_influencer["id"]).all()

All history rows have a valid influencer ID (owner IDs were not checked).

### Data Normalization

Normalize influencer data: scale the average rating and one-hot encode the price category, the content categories, and the follower-count columns.

In [None]:
def one_hot(df, column):
    """Replace a string column with 0/1 indicator columns named ``<column>_<value>``.

    Args:
        df: DataFrame that contains ``column``. It is not modified.
        column: Name of the string-valued column to encode.

    Returns:
        A new DataFrame with ``column`` removed and the indicator columns
        appended on the right.
    """
    # str.get_dummies() splits on "|" by default, so a cell such as "a|b" sets two indicators.
    indicators = df[column].str.get_dummies().add_prefix(column + "_")
    return pd.concat([df, indicators], axis=1).drop(column, axis=1)


In [None]:
# Min-max scale avg_rating; the fitted scaler object stays available as `rating_scaler`.
rating_scaler = MinMaxScaler()
df_inf_norm = df_influencer.copy()
df_inf_norm["avg_rating"] = rating_scaler.fit_transform(df_inf_norm[["avg_rating"]])

df_inf_norm = one_hot(df_inf_norm, "price_category")

# `categories` is split on commas, so one influencer can set several indicator columns.
# These columns keep the bare category names (no prefix), unlike the one_hot() helper.
one_hot_categories = df_inf_norm["categories"].str.get_dummies(sep=",")
df_inf_norm = pd.concat([df_inf_norm, one_hot_categories], axis=1).drop("categories", axis=1)

# The follower columns are encoded as categories here (one_hot() reads them through the
# .str accessor) instead of being min-max scaled.
for follower_column in ("youtube", "tiktok", "insta_follower"):
    df_inf_norm = one_hot(df_inf_norm, follower_column)

df_inf_norm.head()

Combine star and sentiment rating

In [None]:
STAR_WEIGHT, SENTIMENT_WEIGHT = 0.6, 0.4

# Stars are divided by 5 before being weighted; the sentiment score is weighted as-is
# (assumed to already lie on a 0-1 scale — TODO confirm).
star_part = df_history["star_rating"].mul(STAR_WEIGHT).div(5)
sentiment_part = df_history["sentiment_rating"].mul(SENTIMENT_WEIGHT)
df_history["combined_rating"] = star_part.add(sentiment_part)
df_history

In [None]:
# Non-null row count of every column, grouped by star rating (one row per distinct star value).
rating_count = df_history.groupby("star_rating").count()
rating_count

In [None]:
# Chart labels
# Count the history rows per star value straight from df_history instead of overwriting the
# DataFrame built in the previous cell, so this cell can be re-run without an indexing error.
ratings_per_star = df_history.groupby("star_rating")["own_id"].count()

# Take the x positions from the star values that actually occur rather than a hard-coded 1..5;
# a star level with no rows would otherwise make x and height differ in length.
ratings = ratings_per_star.index.to_numpy()
rating_count = ratings_per_star.to_numpy()

# Show bar chart
plt.title("Label Distribution")
plt.xlabel("star_rating")
plt.ylabel("Number of ratings")
plt.bar(x=ratings, height=rating_count)
plt.show()

### Data Splitting

##### Creating user profile

In [None]:
# The raw ratings are no longer needed once combined_rating exists. errors="ignore" lets this
# cell be re-run after the columns are already gone.
df_history = df_history.drop(["star_rating", "sentiment_rating"], axis=1, errors="ignore")

# Attach the normalized influencer features to every history row. Later cells pair this frame
# row-for-row with labels taken from df_history, so the merge must not duplicate rows:
# validate="many_to_one" raises a MergeError if an influencer id occurs more than once.
df_inf_features = pd.merge(
    df_history,
    df_inf_norm,
    left_on='inf_id',
    right_on='id',
    how='left',
    validate="many_to_one",
)
df_inf_features.head()

In [None]:
# Every influencer column except the first one (assumed to be `id`) feeds the owner profile.
OWNER_FEATURES = df_inf_norm.columns[1:]

# Weight each history row's influencer features by the rating the owner gave in that row.
rating_weighted = df_inf_features.copy()
rating_weighted[OWNER_FEATURES] = rating_weighted[OWNER_FEATURES].mul(rating_weighted["combined_rating"], axis=0)

# Owner profile = per-owner mean of the weighted features, keyed by `id`.
# avg_rating is removed from the profile at the end.
df_own_norm = (
    rating_weighted
    .drop(columns=["inf_id", "id", "combined_rating"])
    .groupby("own_id")
    .mean()
    .reset_index()
    .rename(columns={"own_id": "id"})
    .drop(columns=["avg_rating"])
)
df_own_norm.head()

##### Process feature and label

Influencer features

In [None]:
# Remove the ID columns; the label (combined_rating) and the influencer features remain.
df_inf_features = df_inf_features.drop(columns=["own_id", "inf_id", "id"])

df_inf_features.head()

In [None]:
# Rows rated above 0.6 are "positive", rows at or below 0.6 are "negative".
# The label column is removed from both halves.
inf_is_positive = df_inf_features["combined_rating"] > 0.6
inf_is_negative = df_inf_features["combined_rating"] <= 0.6
df_inf_features_pos = df_inf_features.loc[inf_is_positive].drop(columns=["combined_rating"])
df_inf_features_neg = df_inf_features.loc[inf_is_negative].drop(columns=["combined_rating"])

df_inf_features_pos

In [None]:
# Number of influencer input columns, i.e. every column except the label.
INFLUENCER_FEATURE_COUNT = len(df_inf_features.columns.drop("combined_rating"))

Owner features

In [None]:
# Attach each history row's owner profile by matching own_id against the profile id (left join).
df_own_features = df_history.merge(df_own_norm, how="left", left_on="own_id", right_on="id")

df_own_features.head()

In [None]:
# Remove the ID columns; the label (combined_rating) and the owner features remain.
df_own_features = df_own_features.drop(columns=["own_id", "inf_id", "id"])

df_own_features.head()

In [None]:
# Same 0.6 threshold as for the influencer features; the label column is removed from both halves.
own_is_positive = df_own_features["combined_rating"] > 0.6
own_is_negative = df_own_features["combined_rating"] <= 0.6
df_own_features_pos = df_own_features.loc[own_is_positive].drop(columns=["combined_rating"])
df_own_features_neg = df_own_features.loc[own_is_negative].drop(columns=["combined_rating"])

df_own_features_pos

In [None]:
# Number of owner input columns, i.e. every column except the label.
OWNER_FEATURE_COUNT = len(df_own_features.columns.drop("combined_rating"))

Labels

In [None]:
# Get labels from history data
# The label is the combined rating column, in df_history row order.
df_labels = df_history["combined_rating"]
df_labels.head()

In [None]:
# Split the labels with the same 0.6 threshold that is used for the feature frames.
label_is_positive = df_labels > 0.6
label_is_negative = df_labels <= 0.6
df_labels_pos = df_labels.loc[label_is_positive]
df_labels_neg = df_labels.loc[label_is_negative]

df_labels_pos

##### Generate train, validation, and test dataset

In [None]:
SHUFFLE_BUFFER = 1000

# df_inf_features / df_own_features still contain the "combined_rating" label column here.
# Drop it so the target is not fed to the model as an input feature and the feature widths
# agree with INFLUENCER_FEATURE_COUNT / OWNER_FEATURE_COUNT.
dataset = tf.data.Dataset.from_tensor_slices((
    {
        "inf_feature": df_inf_features.drop(columns=["combined_rating"]),
        "own_feature": df_own_features.drop(columns=["combined_rating"]),
    },
    df_labels,
))
# dataset = dataset.shuffle(SHUFFLE_BUFFER)

dataset.element_spec

In [None]:
# 90% / 5% / remainder split, taken in dataset order.
DATASET_SIZE = dataset.cardinality().numpy()
TRAIN_SIZE, VAL_SIZE = int(DATASET_SIZE * 0.9), int(DATASET_SIZE * 0.05)
TEST_SIZE = DATASET_SIZE - TRAIN_SIZE - VAL_SIZE

train_dataset = dataset.take(TRAIN_SIZE)
after_train = dataset.skip(TRAIN_SIZE)
val_dataset = after_train.take(VAL_SIZE)
test_dataset = after_train.skip(VAL_SIZE).take(TEST_SIZE)

print(f"Training dataset has {train_dataset.cardinality().numpy()} data")
print(f"Validation dataset has {val_dataset.cardinality().numpy()} data")
print(f"Testing dataset has {test_dataset.cardinality().numpy()} data")

In [None]:
# Batching
# NOTE: this cell reassigns the split variables, so running it twice batches the batches.
REPEAT = 2  # how many times the batched training split is repeated

# Only the training split is repeated; validation and test are batched once.
train_dataset = train_dataset.batch(BATCH_SIZE).repeat(REPEAT)
val_dataset = val_dataset.batch(BATCH_SIZE)
test_dataset = test_dataset.batch(BATCH_SIZE)

##### Generate train, validation, and test dataset (positive and negative)

In [None]:
SHUFFLE_BUFFER = 1000  # not applied in this cell: shuffling is disabled

# One dataset per rating class; each element is ({"inf_feature", "own_feature"}, label).
# Neither dataset is shuffled, so elements keep the row order of the source frames.
positive_inputs = {"inf_feature": df_inf_features_pos, "own_feature": df_own_features_pos}
negative_inputs = {"inf_feature": df_inf_features_neg, "own_feature": df_own_features_neg}
dataset_pos = tf.data.Dataset.from_tensor_slices((positive_inputs, df_labels_pos))
dataset_neg = tf.data.Dataset.from_tensor_slices((negative_inputs, df_labels_neg))

print("Positive data:", dataset_pos.element_spec)
print("Count positive data:", dataset_pos.cardinality().numpy())

print("\nNegative data:", dataset_neg.element_spec)
print("Count negative data:", dataset_neg.cardinality().numpy())

In [None]:
# Positive examples: 90% / 5% / remainder split, taken in dataset order.
POSITIVE_SIZE = dataset_pos.cardinality().numpy()
TRAIN_POS_SIZE, VAL_POS_SIZE = int(POSITIVE_SIZE * 0.9), int(POSITIVE_SIZE * 0.05)
TEST_POS_SIZE = POSITIVE_SIZE - TRAIN_POS_SIZE - VAL_POS_SIZE

train_dataset_pos = dataset_pos.take(TRAIN_POS_SIZE)
pos_after_train = dataset_pos.skip(TRAIN_POS_SIZE)
val_dataset_pos = pos_after_train.take(VAL_POS_SIZE)
test_dataset_pos = pos_after_train.skip(VAL_POS_SIZE).take(TEST_POS_SIZE)

print(f"Positive training dataset has {train_dataset_pos.cardinality().numpy()} data")
print(f"Positive validation dataset has {val_dataset_pos.cardinality().numpy()} data")
print(f"Positive testing dataset has {test_dataset_pos.cardinality().numpy()} data")

In [None]:
# Negative examples use a different ratio: 50% / 25% / remainder, taken in dataset order.
NEGATIVE_SIZE = dataset_neg.cardinality().numpy()
TRAIN_NEG_SIZE, VAL_NEG_SIZE = int(NEGATIVE_SIZE * 0.5), int(NEGATIVE_SIZE * 0.25)
TEST_NEG_SIZE = NEGATIVE_SIZE - TRAIN_NEG_SIZE - VAL_NEG_SIZE

train_dataset_neg = dataset_neg.take(TRAIN_NEG_SIZE)
neg_after_train = dataset_neg.skip(TRAIN_NEG_SIZE)
val_dataset_neg = neg_after_train.take(VAL_NEG_SIZE)
test_dataset_neg = neg_after_train.skip(VAL_NEG_SIZE).take(TEST_NEG_SIZE)

print(f"Negative training dataset has {train_dataset_neg.cardinality().numpy()} data")
print(f"Negative validation dataset has {val_dataset_neg.cardinality().numpy()} data")
print(f"Negative testing dataset has {test_dataset_neg.cardinality().numpy()} data")

In [None]:
# Each split is the positive part followed by the negative part. Nothing is shuffled here,
# so every split yields all positives first and then all negatives.
# These assignments replace the train/val/test datasets built from `dataset` earlier.
train_dataset = train_dataset_pos.concatenate(train_dataset_neg)
val_dataset = val_dataset_pos.concatenate(val_dataset_neg)
test_dataset = test_dataset_pos.concatenate(test_dataset_neg)

print(f"Training dataset has {train_dataset.cardinality().numpy()} data")
print(f"Validation dataset has {val_dataset.cardinality().numpy()} data")
print(f"Testing dataset has {test_dataset.cardinality().numpy()} data")

In [None]:
# Batching
# NOTE: this cell reassigns the split variables, so running it twice batches the batches.
REPEAT = 2  # NOTE(review): defined but not applied in this cell (no .repeat() call) — confirm intent

train_dataset = train_dataset.batch(BATCH_SIZE)
val_dataset = val_dataset.batch(BATCH_SIZE)
test_dataset = test_dataset.batch(BATCH_SIZE)

# Error Analysis

In [None]:
import os

# Name of the exported model; used below to build the summary file path and the SavedModel path.
MODEL = "recommender-avgrating-moredropout"

In [None]:
# Print the saved text summary of the selected model.
summary_path = os.path.join("model/summary/", MODEL + ".txt")
with open(summary_path) as summary_file:
    print(summary_file.read())

In [None]:
import tensorflow as tf

print(tf.__version__)

# Load the exported SavedModel and take its default serving signature as the inference function.
export_path = f"./model/savedmodel/{MODEL}/"
model = tf.saved_model.load(export_path)
infer = model.signatures["serving_default"]
# Show the signature's input and output tensors.
print(infer.inputs, "\n\n", infer.outputs)

Helpers

In [None]:
def convert_tensor(array):
    """Convert ``array`` to a float32 tensor and prepend an axis of size 1."""
    as_float32 = tf.convert_to_tensor(array, dtype=tf.float32)
    return tf.expand_dims(as_float32, axis=0)

def convert_dataset_to_numpy(dataset):
    """Flatten a batched dataset into per-example model inputs and one label array.

    Args:
        dataset: Iterable of ``(features, labels)`` batches, where ``features`` is a
            dict holding ``'inf_feature'`` and ``'own_feature'`` tensors.

    Returns:
        A tuple ``(inputs, labels)``. ``inputs`` is a list with one dict per example;
        each value is the example's feature row passed through ``convert_tensor``.
        ``labels`` is a NumPy array of all batch labels concatenated in iteration order.
    """
    inputs = []
    # The empty seed array keeps the result a float64 array, even when there are no batches.
    label_chunks = [np.array([])]
    for batch in dataset:
        inf_rows = batch[0]['inf_feature'].numpy()
        own_rows = batch[0]['own_feature'].numpy()

        # Pair the two feature blocks row by row: one input dict per example.
        for inf_row, own_row in zip(inf_rows, own_rows):
            inputs.append({
                "inf_feature": convert_tensor(inf_row),
                "own_feature": convert_tensor(own_row),
            })

        label_chunks.append(batch[1].numpy())

    return inputs, np.concatenate(label_chunks)

def get_comparation_from_dataset(dataset, infer, output_key="dot_2"):
    """Run ``infer`` on every example of ``dataset`` and compare predictions with labels.

    Args:
        dataset: Batched dataset accepted by ``convert_dataset_to_numpy``.
        infer: Callable invoked as ``infer(inf_feature=..., own_feature=...)`` that
            returns a mapping of output name to tensor (here a SavedModel signature).
        output_key: Name of the entry in ``infer``'s result that holds the prediction.
            Defaults to ``"dot_2"``, the key this notebook was written against. Pass
            another key when the loaded model names its output differently (the
            available names are shown by ``infer.outputs``).

    Returns:
        DataFrame with one row per example, in dataset order, and the columns
        ``predicted``, ``real`` and ``error`` (absolute difference of the two).
    """
    inputs, labels = convert_dataset_to_numpy(dataset)

    # One inference call per example; [0] strips the leading axis added by convert_tensor.
    predict = [infer(**example)[output_key].numpy()[0] for example in inputs]

    compare = pd.concat(
        [
            pd.DataFrame(predict, columns=["predicted"]),
            pd.DataFrame(labels, columns=["real"]),
        ],
        axis=1,
    )
    compare["error"] = (compare["predicted"] - compare["real"]).abs()

    return compare


In [None]:
# compare_train = get_comparation_from_dataset(train_dataset, infer)
# compare_val = get_comparation_from_dataset(val_dataset, infer)
# Evaluate on ALL examples (not only the test split): every positive batch first, then every
# negative batch. Later cells rely on this order to line rows up with the owner/influencer IDs.
compare_test = get_comparation_from_dataset(dataset_pos.batch(BATCH_SIZE).concatenate(dataset_neg.batch(BATCH_SIZE)), infer)

In [None]:
# Predicted vs. real rating and the absolute error of every example.
compare_test

In [None]:
# Mean absolute error over all evaluated examples.
compare_test['error'].mean()

In [None]:
# Get owner and influencer data
pos_owner_inf = df_history.iloc[df_inf_features_pos.index]
neg_owner_inf = df_history.iloc[df_inf_features_neg.index]
combined_owner_inf = pd.concat([pos_owner_inf, neg_owner_inf], axis=0).drop('combined_rating', axis=1)
combined_owner_inf

In [None]:
# Put (own_id, inf_id) next to the predictions row by row. Both frames are renumbered 0..n-1
# first, so the column-wise concat aligns them by position instead of by old index labels.
test = pd.concat(
    [combined_owner_inf.reset_index(drop=True), compare_test.reset_index(drop=True)],
    axis=1,
)
test

In [None]:
def _inf_ids_by_score(frame, score_column):
    """Per owner, list the influencer IDs ordered by ``score_column``, highest first."""
    return frame.groupby('own_id').apply(
        lambda owner_rows: owner_rows.sort_values(by=score_column, ascending=False)['inf_id'].tolist()
    )


real_rank = _inf_ids_by_score(test, 'real')
predicted_rank = _inf_ids_by_score(test, 'predicted')

In [None]:
# Per owner: influencer IDs ordered by the real rating, highest first.
real_rank

In [None]:
from scipy.stats import kendalltau

corr_score = []
count = []
# Series.iteritems() was removed in pandas 2.0; Series.items() yields the same (index, value)
# pairs on old and new versions. The loop variable is no longer called `id`, so the builtin
# id() is not shadowed in the notebook namespace.
for owner_id, rank in real_rank.items():
    # NOTE(review): tau is computed between the two ID *sequences* (the i-th real ID against
    # the i-th predicted ID), not between each influencer's real and predicted rank position.
    # Confirm this is the intended agreement measure.
    correlation, _p_value = kendalltau(list(rank), list(predicted_rank[owner_id]))
    corr_score.append(correlation)
    count.append(len(rank))

# Columns, in order: real ranking, predicted ranking, Kendall tau, number of rated influencers.
df_corr = pd.concat(
    [
        real_rank,
        predicted_rank,
        pd.Series(corr_score, index=real_rank.index),
        pd.Series(count, index=real_rank.index),
    ],
    axis=1,
)
df_corr

In [None]:
# Enrich each evaluated pair with the raw influencer attributes and the owner profile.
# Column names that occur on both sides of a merge get pandas' default _x / _y suffixes.
test = (
    test
    .merge(df_influencer, how="left", left_on="inf_id", right_on="id")
    .merge(df_own_norm, how="left", left_on="own_id", right_on="id")
)

In [None]:
# Write the per-example comparison and the per-owner correlations to the working directory.
test.to_csv("test.csv")
df_corr.to_csv("correlations.csv")

In [None]:
# compare_val['error'].mean()

In [None]:
# compare_test['error'].mean()

In [None]:
# NOTE(review): `Stopper` is not defined in any visible cell, so this cell raises NameError —
# presumably on purpose, to halt "Run All" before the legacy section below. Confirm.
Stopper

# Skip this

## Data Extraction

### Load data from CSV

Owner (user) data is not loaded for now; only the influencer and history tables are used.

The data is synthetic: it was generated randomly in Excel.

In [None]:
# Locations of the synthetic CSV inputs, relative to the notebook's working directory.
DATA_DIR = "../data/synt_data_with_average_rating/"
INFLUENCER_FILE, OWNER_FILE, HISTORY_FILE = (
    os.path.join(DATA_DIR, csv_name)
    for csv_name in (
        "data_content_influencer_categ.csv",
        "data_content_owner_categ.csv",
        "historical_data.csv",
    )
)

# The owner CSV (OWNER_FILE) is deliberately not read; only influencers and history are loaded.
df_influencer, df_history = pd.read_csv(INFLUENCER_FILE), pd.read_csv(HISTORY_FILE)

### Data Exploration

In [None]:
# Preview the first five influencer rows.
df_influencer.head(5)

In [None]:
# Column dtypes and non-null counts of the influencer table.
df_influencer.info()

In [None]:
# Summary statistics of the influencer table.
df_influencer.describe()

In [None]:
# Preview the first five history rows.
df_history.head(5)

In [None]:
# Column dtypes and non-null counts of the history table.
df_history.info()

In [None]:
# Summary statistics of the history table.
df_history.describe()

## Data Transformation

### Data cleaning

#### Missing value

In [None]:
# Number of missing values per influencer column.
df_influencer.isnull().sum()

In [None]:
# Number of missing values per history column.
df_history.isnull().sum()

There are no missing values.

#### Irrelevant Data / Invalid Data

Check that every history row refers to a valid influencer ID (owner IDs are not checked, because the owner table is not loaded).

In [None]:
# True only if every inf_id in the history exists in the influencer table (own_id is not checked here).
df_history["inf_id"].isin(df_influencer["id"]).all()

All history rows have a valid influencer ID (owner IDs were not checked).

### Data Normalization

Normalize influencer data: min-max scale the follower counts and one-hot encode the price category and the content categories.

In [None]:
# Min-max scale the three follower-count columns together with one scaler.
# NOTE(review): the "Avg Rating" section above reads the same CSV paths and treats these columns
# as strings (one-hot encoded). Scaling needs them to be numeric — confirm which data version
# this legacy section expects.
follower_scaler = MinMaxScaler()
FOLLOWER_COLUMNS = ["insta_follower", "tiktok", "youtube"]

df_inf_norm = df_influencer.copy()
df_inf_norm[FOLLOWER_COLUMNS] = follower_scaler.fit_transform(df_inf_norm[FOLLOWER_COLUMNS])

# Price category: one indicator column per value, named by the bare value (no prefix).
one_hot_price = df_inf_norm["price_category"].str.get_dummies()
df_inf_norm = pd.concat([df_inf_norm, one_hot_price], axis=1).drop("price_category", axis=1)

# Categories are split on commas, so one influencer can set several indicator columns.
one_hot_categories = df_inf_norm["categories"].str.get_dummies(sep=",")
df_inf_norm = pd.concat([df_inf_norm, one_hot_categories], axis=1).drop("categories", axis=1)

df_inf_norm.head()

Find the most frequent categories (the ten largest are stored in `column_name`); nothing is removed yet.

In [None]:
# Column sums from the 9th column onwards, largest first.
# NOTE(review): the offset 8 is presumably where the category indicator columns start —
# confirm against the actual column layout of df_inf_norm.
categories_count = df_inf_norm[df_inf_norm.columns[8:]].sum().sort_values(ascending=False)

# Names of the ten columns with the largest sums.
column_name = list(categories_count.nlargest(10).index)
column_name
# plt.plot

Combine star and sentiment rating

In [None]:
STAR_WEIGHT, SENTIMENT_WEIGHT = 0.6, 0.4

# Stars are divided by 5 before being weighted; the sentiment score is weighted as-is
# (assumed to already lie on a 0-1 scale — TODO confirm).
star_part = df_history["star_rating"].mul(STAR_WEIGHT).div(5)
sentiment_part = df_history["sentiment_rating"].mul(SENTIMENT_WEIGHT)
df_history["combined_rating"] = star_part.add(sentiment_part)
df_history

In [None]:
# Non-null row count of every column, grouped by star rating (one row per distinct star value).
rating_count = df_history.groupby("star_rating").count()
rating_count

In [None]:
# Chart labels
# Count the history rows per star value straight from df_history instead of overwriting the
# DataFrame built in the previous cell, so this cell can be re-run without an indexing error.
ratings_per_star = df_history.groupby("star_rating")["own_id"].count()

# Take the x positions from the star values that actually occur rather than a hard-coded 1..5;
# a star level with no rows would otherwise make x and height differ in length.
ratings = ratings_per_star.index.to_numpy()
rating_count = ratings_per_star.to_numpy()

# Show bar chart
plt.title("Label Distribution")
plt.xlabel("star_rating")
plt.ylabel("Number of ratings")
plt.bar(x=ratings, height=rating_count)
plt.show()

### Data Splitting

##### Creating user profile

In [None]:
# The raw ratings are no longer needed once combined_rating exists. errors="ignore" lets this
# cell be re-run after the columns are already gone.
df_history = df_history.drop(["star_rating", "sentiment_rating"], axis=1, errors="ignore")

# Attach the normalized influencer features to every history row. Later cells pair this frame
# row-for-row with labels taken from df_history, so the merge must not duplicate rows:
# validate="many_to_one" raises a MergeError if an influencer id occurs more than once.
df_inf_features = pd.merge(
    df_history,
    df_inf_norm,
    left_on='inf_id',
    right_on='id',
    how='left',
    validate="many_to_one",
)
df_inf_features.head()

In [None]:
# Every influencer column except the first one (assumed to be `id`) feeds the owner profile.
OWNER_FEATURES = df_inf_norm.columns[1:]

# Weight each history row's influencer features by the rating the owner gave in that row.
rating_weighted = df_inf_features.copy()
rating_weighted[OWNER_FEATURES] = rating_weighted[OWNER_FEATURES].mul(rating_weighted["combined_rating"], axis=0)

# Owner profile = per-owner mean of the weighted features, keyed by `id`.
df_own_norm = (
    rating_weighted
    .drop(columns=["inf_id", "id", "combined_rating"])
    .groupby("own_id")
    .mean()
    .reset_index()
    .rename(columns={"own_id": "id"})
)

df_own_norm.head()

##### Process feature and label

Influencer features

In [None]:
# Remove the ID columns; the label (combined_rating) and the influencer features remain.
df_inf_features = df_inf_features.drop(columns=["own_id", "inf_id", "id"])

df_inf_features.head()

In [None]:
# Rows rated above 0.6 are "positive", rows at or below 0.6 are "negative".
# The label column is removed from both halves.
inf_is_positive = df_inf_features["combined_rating"] > 0.6
inf_is_negative = df_inf_features["combined_rating"] <= 0.6
df_inf_features_pos = df_inf_features.loc[inf_is_positive].drop(columns=["combined_rating"])
df_inf_features_neg = df_inf_features.loc[inf_is_negative].drop(columns=["combined_rating"])

df_inf_features_pos

In [None]:
# Number of influencer input columns, i.e. every column except the label.
INFLUENCER_FEATURE_COUNT = len(df_inf_features.columns.drop("combined_rating"))

Owner features

In [None]:
# Attach each history row's owner profile by matching own_id against the profile id (left join).
df_own_features = df_history.merge(df_own_norm, how="left", left_on="own_id", right_on="id")

df_own_features.head()

In [None]:
# Remove the ID columns; the label (combined_rating) and the owner features remain.
df_own_features = df_own_features.drop(columns=["own_id", "inf_id", "id"])

df_own_features.head()

In [None]:
# Same 0.6 threshold as for the influencer features; the label column is removed from both halves.
own_is_positive = df_own_features["combined_rating"] > 0.6
own_is_negative = df_own_features["combined_rating"] <= 0.6
df_own_features_pos = df_own_features.loc[own_is_positive].drop(columns=["combined_rating"])
df_own_features_neg = df_own_features.loc[own_is_negative].drop(columns=["combined_rating"])

df_own_features_pos

In [None]:
# Number of owner input columns, i.e. every column except the label.
OWNER_FEATURE_COUNT = len(df_own_features.columns.drop("combined_rating"))

Labels

In [None]:
# Get labels from history data
# The label is the combined rating column, in df_history row order.
df_labels = df_history["combined_rating"]
df_labels.head()

In [None]:
# Split the labels with the same 0.6 threshold that is used for the feature frames.
label_is_positive = df_labels > 0.6
label_is_negative = df_labels <= 0.6
df_labels_pos = df_labels.loc[label_is_positive]
df_labels_neg = df_labels.loc[label_is_negative]

df_labels_pos

##### Generate train, validation, and test dataset

In [None]:
SHUFFLE_BUFFER = 1000
SHUFFLE_SEED = 42

# df_inf_features / df_own_features still contain the "combined_rating" label column here.
# Drop it so the target is not fed to the model as an input feature and the feature widths
# agree with INFLUENCER_FEATURE_COUNT / OWNER_FEATURE_COUNT.
dataset = tf.data.Dataset.from_tensor_slices((
    {
        "inf_feature": df_inf_features.drop(columns=["combined_rating"]),
        "own_feature": df_own_features.drop(columns=["combined_rating"]),
    },
    df_labels,
))

# The dataset is split with take()/skip() in the next cell. With the default
# reshuffle_each_iteration=True every pass over a split draws a fresh order, so the train,
# validation and test splits would exchange examples between passes. A fixed seed with
# reshuffling disabled keeps one stable order and therefore disjoint splits.
dataset = dataset.shuffle(SHUFFLE_BUFFER, seed=SHUFFLE_SEED, reshuffle_each_iteration=False)

dataset.element_spec

In [None]:
# 90% / 5% / remainder split, taken in dataset order.
DATASET_SIZE = dataset.cardinality().numpy()
TRAIN_SIZE, VAL_SIZE = int(DATASET_SIZE * 0.9), int(DATASET_SIZE * 0.05)
TEST_SIZE = DATASET_SIZE - TRAIN_SIZE - VAL_SIZE

train_dataset = dataset.take(TRAIN_SIZE)
after_train = dataset.skip(TRAIN_SIZE)
val_dataset = after_train.take(VAL_SIZE)
test_dataset = after_train.skip(VAL_SIZE).take(TEST_SIZE)

print(f"Training dataset has {train_dataset.cardinality().numpy()} data")
print(f"Validation dataset has {val_dataset.cardinality().numpy()} data")
print(f"Testing dataset has {test_dataset.cardinality().numpy()} data")

In [None]:
# Batching
# NOTE: this cell reassigns the split variables, so running it twice batches the batches.
REPEAT = 2  # how many times the batched training split is repeated

# Only the training split is repeated; validation and test are batched once.
train_dataset = train_dataset.batch(BATCH_SIZE).repeat(REPEAT)
val_dataset = val_dataset.batch(BATCH_SIZE)
test_dataset = test_dataset.batch(BATCH_SIZE)

##### Generate train, validation, and test dataset (positive and negative)

In [None]:
SHUFFLE_BUFFER = 1000  # not applied in this cell: shuffling is disabled

# One dataset per rating class; each element is ({"inf_feature", "own_feature"}, label).
# Neither dataset is shuffled, so elements keep the row order of the source frames.
positive_inputs = {"inf_feature": df_inf_features_pos, "own_feature": df_own_features_pos}
negative_inputs = {"inf_feature": df_inf_features_neg, "own_feature": df_own_features_neg}
dataset_pos = tf.data.Dataset.from_tensor_slices((positive_inputs, df_labels_pos))
dataset_neg = tf.data.Dataset.from_tensor_slices((negative_inputs, df_labels_neg))

print("Positive data:", dataset_pos.element_spec)
print("Count positive data:", dataset_pos.cardinality().numpy())

print("\nNegative data:", dataset_neg.element_spec)
print("Count negative data:", dataset_neg.cardinality().numpy())

In [None]:
# Positive examples: 90% / 5% / remainder split, taken in dataset order.
POSITIVE_SIZE = dataset_pos.cardinality().numpy()
TRAIN_POS_SIZE, VAL_POS_SIZE = int(POSITIVE_SIZE * 0.9), int(POSITIVE_SIZE * 0.05)
TEST_POS_SIZE = POSITIVE_SIZE - TRAIN_POS_SIZE - VAL_POS_SIZE

train_dataset_pos = dataset_pos.take(TRAIN_POS_SIZE)
pos_after_train = dataset_pos.skip(TRAIN_POS_SIZE)
val_dataset_pos = pos_after_train.take(VAL_POS_SIZE)
test_dataset_pos = pos_after_train.skip(VAL_POS_SIZE).take(TEST_POS_SIZE)

print(f"Positive training dataset has {train_dataset_pos.cardinality().numpy()} data")
print(f"Positive validation dataset has {val_dataset_pos.cardinality().numpy()} data")
print(f"Positive testing dataset has {test_dataset_pos.cardinality().numpy()} data")

In [None]:
# Negative examples use a different ratio: 50% / 25% / remainder, taken in dataset order.
NEGATIVE_SIZE = dataset_neg.cardinality().numpy()
TRAIN_NEG_SIZE, VAL_NEG_SIZE = int(NEGATIVE_SIZE * 0.5), int(NEGATIVE_SIZE * 0.25)
TEST_NEG_SIZE = NEGATIVE_SIZE - TRAIN_NEG_SIZE - VAL_NEG_SIZE

train_dataset_neg = dataset_neg.take(TRAIN_NEG_SIZE)
neg_after_train = dataset_neg.skip(TRAIN_NEG_SIZE)
val_dataset_neg = neg_after_train.take(VAL_NEG_SIZE)
test_dataset_neg = neg_after_train.skip(VAL_NEG_SIZE).take(TEST_NEG_SIZE)

print(f"Negative training dataset has {train_dataset_neg.cardinality().numpy()} data")
print(f"Negative validation dataset has {val_dataset_neg.cardinality().numpy()} data")
print(f"Negative testing dataset has {test_dataset_neg.cardinality().numpy()} data")

In [None]:
# Each split is the positive part followed by the negative part. Nothing is shuffled here,
# so every split yields all positives first and then all negatives.
# These assignments replace the train/val/test datasets built from `dataset` earlier.
train_dataset = train_dataset_pos.concatenate(train_dataset_neg)
val_dataset = val_dataset_pos.concatenate(val_dataset_neg)
test_dataset = test_dataset_pos.concatenate(test_dataset_neg)

print(f"Training dataset has {train_dataset.cardinality().numpy()} data")
print(f"Validation dataset has {val_dataset.cardinality().numpy()} data")
print(f"Testing dataset has {test_dataset.cardinality().numpy()} data")

In [None]:
# Batching
# NOTE: this cell reassigns the split variables, so running it twice batches the batches.
REPEAT = 2  # NOTE(review): defined but not applied in this cell (no .repeat() call) — confirm intent

train_dataset = train_dataset.batch(BATCH_SIZE)
val_dataset = val_dataset.batch(BATCH_SIZE)
test_dataset = test_dataset.batch(BATCH_SIZE)