# <center> Recommender System </center>

## Data Extraction

### Load data from CSV

Not using user data for a while

Data is generated randomly using Excel

In [143]:
import pandas as pd
import os

DATA_DIR = os.path.join("data/synt_data/")
INFLUENCER_FILE = os.path.join(DATA_DIR, "data_content_influencer_categ.csv")
OWNER_FILE = os.path.join(DATA_DIR, "data_content_owner_categ.csv")
HISTORY_FILE = os.path.join(DATA_DIR, "historical_data.csv")

df_influencer = pd.read_csv(INFLUENCER_FILE)
# df_owner = pd.read_csv(OWNER_FILE)
df_history = pd.read_csv(HISTORY_FILE)

### Data Exploration

In [144]:
df_influencer.head(5)

Unnamed: 0,id,insta_follower,tiktok,youtube,price_normalized,categories
0,1723,200630,116775,188693,0.723575,"Drama,Romance,School,Supernatural"
1,82,793665,549225,827288,0.896555,"Action,Adventure,Drama,Fantasy,Magic,Military,..."
2,296,114262,113731,107155,0.684796,"Action,Comedy,Historical,Parody,Samurai,Sci-Fi..."
3,127,673572,548014,633078,0.817022,"Sci-Fi,Thriller"
4,137,151266,109278,75716,0.692096,"Action,Comedy,Historical,Parody,Samurai,Sci-Fi..."


In [145]:
df_influencer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6406 entries, 0 to 6405
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                6406 non-null   int64  
 1   insta_follower    6406 non-null   int64  
 2   tiktok            6406 non-null   int64  
 3   youtube           6406 non-null   int64  
 4   price_normalized  6406 non-null   float64
 5   categories        6406 non-null   object 
dtypes: float64(1), int64(4), object(1)
memory usage: 300.4+ KB


In [146]:
df_influencer.describe()

Unnamed: 0,id,insta_follower,tiktok,youtube,price_normalized
count,6406.0,6406.0,6406.0,6406.0,6406.0
mean,3505.326881,33120.34,29526.59616,30101.9,0.49672
std,2153.76109,71933.96,65957.336585,68447.43,0.068586
min,1.0,43.0,36.0,41.0,0.142142
25%,1607.25,2005.5,1612.0,1726.25,0.456907
50%,3312.5,7290.5,6249.0,6372.5,0.494965
75%,5384.75,29825.25,25697.5,25972.0,0.533565
max,7387.0,1013917.0,805312.0,1188454.0,0.8978


In [147]:
df_history.head(5)

Unnamed: 0,own_id,inf_id,star_rating,sentiment_rating
0,1,1,4,0.874921
1,1,3,3,0.566991
2,1,5,5,0.945937
3,1,6,5,0.83295
4,1,7,4,0.826545


In [148]:
df_history.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 413446 entries, 0 to 413445
Data columns (total 4 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   own_id            413446 non-null  int64  
 1   inf_id            413446 non-null  int64  
 2   star_rating       413446 non-null  int64  
 3   sentiment_rating  413446 non-null  float64
dtypes: float64(1), int64(3)
memory usage: 12.6 MB


In [149]:
df_history.describe()

Unnamed: 0,own_id,inf_id,star_rating,sentiment_rating
count,413446.0,413446.0,413446.0,413446.0
mean,2443.219533,1076.748458,4.14117,0.777523
std,1438.596784,1148.850785,0.789503,0.161688
min,1.0,1.0,1.0,0.00045
25%,1214.0,260.0,4.0,0.679722
50%,2456.0,639.0,4.0,0.791544
75%,3699.0,1532.0,5.0,0.900134
max,5000.0,7385.0,5.0,1.0


## Data Transformation

### Data cleaning

#### Missing value

In [150]:
df_influencer.isnull().sum()

id                  0
insta_follower      0
tiktok              0
youtube             0
price_normalized    0
categories          0
dtype: int64

In [151]:
df_history.isnull().sum()

own_id              0
inf_id              0
star_rating         0
sentiment_rating    0
dtype: int64

No missing value

#### Irrelevant Data / Invalid Data

Check if all history has valid influencer and owner ID

In [152]:
df_history["inf_id"].isin(df_influencer["id"]).all()

True

All history data has valid influencer and owner ID

### Data Normalization

This process will be using Tensorflow Dataset (when I actually have time lol)

Normalize influencer data: Scale follower count and One-hot categories

In [153]:
from sklearn.preprocessing import MinMaxScaler

follower_scaler = MinMaxScaler()

df_inf_norm = df_influencer.copy()
df_inf_norm[["insta_follower", "tiktok", "youtube"]] = follower_scaler.fit_transform(df_inf_norm[["insta_follower", "tiktok", "youtube"]])

one_hot_categories = df_inf_norm['categories'].str.get_dummies(sep=',')
df_inf_norm = pd.concat([df_inf_norm, one_hot_categories], axis=1)
df_inf_norm = df_inf_norm.drop('categories', axis=1)
df_inf_norm.head()

Unnamed: 0,id,insta_follower,tiktok,youtube,price_normalized,Action,Adventure,Cars,Comedy,Dementia,...,Shoujo Ai,Shounen,Shounen Ai,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire
0,1723,0.197842,0.144968,0.158743,0.723575,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,82,0.782762,0.681989,0.696094,0.896555,1,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,296,0.112656,0.141188,0.090132,0.684796,1,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
3,127,0.664312,0.680485,0.532674,0.817022,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,137,0.149154,0.135658,0.063677,0.692096,1,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0


I give up documenting

In [154]:
STAR_WEIGHT = 0.6
SENTIMENT_WEIGHT = 0.

df_history["combined_rating"] = STAR_WEIGHT * df_history["star_rating"] / 5 + SENTIMENT_WEIGHT * df_history["sentiment_rating"]
df_history

Unnamed: 0,own_id,inf_id,star_rating,sentiment_rating,combined_rating
0,1,1,4,0.874921,0.48
1,1,3,3,0.566991,0.36
2,1,5,5,0.945937,0.60
3,1,6,5,0.832950,0.60
4,1,7,4,0.826545,0.48
...,...,...,...,...,...
413441,4999,49,5,0.987383,0.60
413442,4999,670,5,0.823814,0.60
413443,4999,62,5,0.985877,0.60
413444,4999,770,5,0.847512,0.60


### Data Splitting

##### Creating user profile

In [155]:
df_history = df_history.drop(["star_rating", "sentiment_rating"], axis=1)
df_inf_features = pd.merge(df_history, df_inf_norm, left_on='inf_id', right_on='id', how='left')
df_inf_features.head()

Unnamed: 0,own_id,inf_id,combined_rating,id,insta_follower,tiktok,youtube,price_normalized,Action,Adventure,...,Shoujo Ai,Shounen,Shounen Ai,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire
0,1,1,0.48,1,0.125135,0.165684,0.059806,0.552815,1,0,...,0,0,0,0,0,0,0,0,0,0
1,1,3,0.36,3,0.115435,0.113069,0.121231,0.523593,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,5,0.6,5,0.123297,0.079398,0.116411,0.567288,0,0,...,0,0,0,1,0,0,0,0,0,0
3,1,6,0.6,6,1.0,0.917396,1.0,0.885963,0,0,...,0,0,0,0,0,0,0,1,1,0
4,1,7,0.48,7,0.558893,0.816575,0.273925,0.743871,0,0,...,0,0,0,1,0,0,0,1,0,0


In [156]:
OWNER_FEATURES = df_inf_norm.columns[1:]

# Copy influencer features combined with history data
df_own_norm = df_inf_features.copy()

# Multiply influencer feature with user rating
df_own_norm[OWNER_FEATURES] = df_own_norm[OWNER_FEATURES].mul(df_own_norm['combined_rating'], axis=0) 

# Drop unimportant features
df_own_norm = df_own_norm.drop(["inf_id", "id", "combined_rating"], axis=1)

# Average those with same owner id to make user profile
df_own_norm = df_own_norm.groupby('own_id').mean().reset_index()
df_own_norm.rename(columns={'own_id': 'id'}, inplace=True)

df_own_norm.head()

Unnamed: 0,id,insta_follower,tiktok,youtube,price_normalized,Action,Adventure,Cars,Comedy,Dementia,...,Shoujo Ai,Shounen,Shounen Ai,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire
0,1,0.218517,0.240286,0.206128,0.348299,0.18,0.0,0.0,0.225,0.0,...,0.0,0.0,0.0,0.27,0.0,0.0,0.0,0.27,0.075,0.0
1,2,0.096795,0.107419,0.075397,0.324532,0.228144,0.111377,0.0,0.251856,0.0,...,0.001078,0.174611,0.011138,0.089102,0.003234,0.036287,0.052455,0.148743,0.013293,0.018323
2,3,0.150886,0.163595,0.116479,0.349016,0.158824,0.134118,0.0,0.321176,0.0,...,0.0,0.091765,0.0,0.067059,0.0,0.0,0.049412,0.116471,0.014118,0.0
3,4,0.201203,0.214352,0.165891,0.345844,0.389268,0.163902,0.0,0.242927,0.0,...,0.011707,0.196098,0.0,0.0,0.0,0.0,0.12878,0.24,0.040976,0.026341
4,5,0.148409,0.163049,0.11486,0.32423,0.173933,0.086292,0.0,0.265618,0.004045,...,0.004045,0.082247,0.031011,0.13618,0.0,0.013483,0.033708,0.167191,0.051236,0.017528


##### Process feature and label

In [157]:
# Remove ID and labels
df_inf_features = df_inf_features.drop(["own_id", "inf_id", "id", "combined_rating"], axis=1)
INFLUENCER_FEATURE_COUNT = len(df_inf_features.columns)

df_inf_features.head()

Unnamed: 0,insta_follower,tiktok,youtube,price_normalized,Action,Adventure,Cars,Comedy,Dementia,Demons,...,Shoujo Ai,Shounen,Shounen Ai,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire
0,0.125135,0.165684,0.059806,0.552815,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.115435,0.113069,0.121231,0.523593,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.123297,0.079398,0.116411,0.567288,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,1.0,0.917396,1.0,0.885963,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
4,0.558893,0.816575,0.273925,0.743871,0,0,0,1,0,0,...,0,0,0,1,0,0,0,1,0,0


In [158]:
# Join history and owner data by own_id
df_own_features = pd.merge(df_history, df_own_norm, left_on='own_id', right_on='id', how='left')

df_own_features.head()

Unnamed: 0,own_id,inf_id,combined_rating,id,insta_follower,tiktok,youtube,price_normalized,Action,Adventure,...,Shoujo Ai,Shounen,Shounen Ai,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire
0,1,1,0.48,1,0.218517,0.240286,0.206128,0.348299,0.18,0.0,...,0.0,0.0,0.0,0.27,0.0,0.0,0.0,0.27,0.075,0.0
1,1,3,0.36,1,0.218517,0.240286,0.206128,0.348299,0.18,0.0,...,0.0,0.0,0.0,0.27,0.0,0.0,0.0,0.27,0.075,0.0
2,1,5,0.6,1,0.218517,0.240286,0.206128,0.348299,0.18,0.0,...,0.0,0.0,0.0,0.27,0.0,0.0,0.0,0.27,0.075,0.0
3,1,6,0.6,1,0.218517,0.240286,0.206128,0.348299,0.18,0.0,...,0.0,0.0,0.0,0.27,0.0,0.0,0.0,0.27,0.075,0.0
4,1,7,0.48,1,0.218517,0.240286,0.206128,0.348299,0.18,0.0,...,0.0,0.0,0.0,0.27,0.0,0.0,0.0,0.27,0.075,0.0


In [159]:
# Remove ID and labels
df_own_features = df_own_features.drop(["own_id", "inf_id", "id", "combined_rating"], axis=1)
OWNER_FEATURE_COUNT = len(df_own_features.columns)

df_own_features.head()

Unnamed: 0,insta_follower,tiktok,youtube,price_normalized,Action,Adventure,Cars,Comedy,Dementia,Demons,...,Shoujo Ai,Shounen,Shounen Ai,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire
0,0.218517,0.240286,0.206128,0.348299,0.18,0.0,0.0,0.225,0.0,0.0,...,0.0,0.0,0.0,0.27,0.0,0.0,0.0,0.27,0.075,0.0
1,0.218517,0.240286,0.206128,0.348299,0.18,0.0,0.0,0.225,0.0,0.0,...,0.0,0.0,0.0,0.27,0.0,0.0,0.0,0.27,0.075,0.0
2,0.218517,0.240286,0.206128,0.348299,0.18,0.0,0.0,0.225,0.0,0.0,...,0.0,0.0,0.0,0.27,0.0,0.0,0.0,0.27,0.075,0.0
3,0.218517,0.240286,0.206128,0.348299,0.18,0.0,0.0,0.225,0.0,0.0,...,0.0,0.0,0.0,0.27,0.0,0.0,0.0,0.27,0.075,0.0
4,0.218517,0.240286,0.206128,0.348299,0.18,0.0,0.0,0.225,0.0,0.0,...,0.0,0.0,0.0,0.27,0.0,0.0,0.0,0.27,0.075,0.0


In [160]:
# Get labels from history data
df_labels = df_history["combined_rating"]
df_labels.head()

0    0.48
1    0.36
2    0.60
3    0.60
4    0.48
Name: combined_rating, dtype: float64

##### Generate train, validation, and test dataset

In [161]:
# Shuffle and batch data
import tensorflow as tf

SHUFFLE_BUFFER = 1000

dataset = tf.data.Dataset.from_tensor_slices(({"inf_feature": df_inf_features, "own_feature": df_own_features}, df_labels))
dataset = dataset.shuffle(SHUFFLE_BUFFER) 

dataset.element_spec

({'inf_feature': TensorSpec(shape=(44,), dtype=tf.float64, name=None),
  'own_feature': TensorSpec(shape=(44,), dtype=tf.float64, name=None)},
 TensorSpec(shape=(), dtype=tf.float64, name=None))

In [162]:
# Generate training, validation, and testing data
DATASET_SIZE = dataset.cardinality().numpy()
TRAIN_SIZE = int(DATASET_SIZE * 0.8)
VAL_SIZE = int(DATASET_SIZE * 0.1)
TEST_SIZE = DATASET_SIZE - TRAIN_SIZE - VAL_SIZE

train_dataset = dataset.take(TRAIN_SIZE)
val_dataset = dataset.skip(TRAIN_SIZE).take(VAL_SIZE)
test_dataset = dataset.skip(TRAIN_SIZE + VAL_SIZE).take(TEST_SIZE)

print(f"Training dataset has {train_dataset.cardinality().numpy()} data")
print(f"Validation dataset has {val_dataset.cardinality().numpy()} data")
print(f"Testing dataset has {test_dataset.cardinality().numpy()} data")

Training dataset has 330756 data
Validation dataset has 41344 data
Testing dataset has 41346 data


In [163]:
# Batching
BATCH_SIZE = 128
REPEAT = 2

train_dataset = train_dataset.batch(BATCH_SIZE).repeat(REPEAT)
val_dataset = val_dataset.batch(BATCH_SIZE)
test_dataset = test_dataset.batch(BATCH_SIZE)

## Creating Model

Model consists of two neural networks that would be combined with Dot layer. The first neural network has influencer features as input and a vector as an output. The second one has owner features as input and a vector as an output. These two vectors will be combined with Dot layer and produces a single combined rating

In [198]:
VECTOR_SIZE = 128
# tf.random.set_seed(1)

model_influencer = tf.keras.models.Sequential([
    tf.keras.layers.Dense(units=256, activation='relu'),
    tf.keras.layers.Dense(units=128, activation='relu'),
    tf.keras.layers.Dense(units=VECTOR_SIZE, activation='linear'),
])

# create the influencer input and point to the base network
input_influencer = tf.keras.layers.Input(shape=(INFLUENCER_FEATURE_COUNT), name="inf_feature")
vi = model_influencer(input_influencer)
vi = tf.linalg.l2_normalize(vi, axis=1)

model_owner = tf.keras.models.Sequential([
    tf.keras.layers.Dense(units=256, activation='relu'),
    tf.keras.layers.Dense(units=128, activation='relu'),
    tf.keras.layers.Dense(units=VECTOR_SIZE, activation='linear'),
])

# create the owner input and point to the base network
input_owner = tf.keras.layers.Input(shape=(OWNER_FEATURE_COUNT), name="own_feature")
vo = model_owner(input_owner)
vo = tf.linalg.l2_normalize(vo, axis=1)

# compute the dot product of the two vectors vi and vo
output = tf.keras.layers.Dot(axes=1)([vi, vo])

# specify the inputs and output of the model
model = tf.keras.Model([input_influencer, input_owner], output)

model.summary()

Model: "model_5"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 inf_feature (InputLayer)       [(None, 44)]         0           []                               
                                                                                                  
 own_feature (InputLayer)       [(None, 44)]         0           []                               
                                                                                                  
 sequential_10 (Sequential)     (None, 128)          60928       ['inf_feature[0][0]']            
                                                                                                  
 sequential_11 (Sequential)     (None, 128)          60928       ['own_feature[0][0]']            
                                                                                            

In [199]:
LEARNING_RATE = 1e-5

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE), 
              loss=tf.keras.losses.MeanSquaredError(),
              metrics=["accuracy", "mae"])

model.fit(train_dataset, validation_data=val_dataset, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1a3b7fb62c0>

In [200]:
test_predict = model.predict(test_dataset)
test_predict



array([[0.50647813],
       [0.47089675],
       [0.36893234],
       ...,
       [0.5339343 ],
       [0.55262804],
       [0.46460187]], dtype=float32)

In [201]:
import numpy as np

# Get real label
labels = np.array([])
for batch in test_dataset:
    labels = np.concatenate([labels, batch[1].numpy()])

labels

array([0.48, 0.36, 0.48, ..., 0.48, 0.6 , 0.48])

In [202]:
compare = pd.concat([pd.DataFrame(test_predict, columns=["predicted"]), 
                     pd.DataFrame(labels, columns=["real"])], axis=1)

compare["error"] = abs(compare["predicted"] - compare["real"])

compare

Unnamed: 0,predicted,real,error
0,0.506478,0.48,0.026478
1,0.470897,0.36,0.110897
2,0.368932,0.48,0.111068
3,0.475623,0.48,0.004377
4,0.497700,0.48,0.017700
...,...,...,...
41341,0.516915,0.48,0.036915
41342,0.504742,0.60,0.095258
41343,0.533934,0.48,0.053934
41344,0.552628,0.60,0.047372


In [208]:
compare[compare["error"] >= 5e-1]

Unnamed: 0,predicted,real,error
21135,0.627383,0.12,0.507383
