# <center> Recommender System </center>

## Data Extraction

### Load data from CSV

The data was randomly generated in Excel.

In [217]:
import pandas as pd
import os

# Directory that holds the three CSV exports.
DATA_DIR = "data/random/"

# One CSV per table: influencers, owners, and the rating history.
INFLUENCER_FILE, OWNER_FILE, HISTORY_FILE = (
    os.path.join(DATA_DIR, csv_name)
    for csv_name in (
        "data_content_influencer_categ.csv",
        "data_content_owner_categ.csv",
        "historical_data.csv",
    )
)

# Load every table into its own DataFrame.
df_influencer, df_owner, df_history = (
    pd.read_csv(csv_path)
    for csv_path in (INFLUENCER_FILE, OWNER_FILE, HISTORY_FILE)
)

### Data Exploration

In [218]:
# Preview the first five influencer rows.
df_influencer.head(5)

Unnamed: 0,id,insta_follower,tiktok,youtube,price_normalized,categories
0,INF1,868194,969648,616323,0.643695,Music;Fashion
1,INF2,362656,122314,693275,0.572601,Game;General
2,INF3,209193,828402,76364,0.441951,Fashion;Music;Tech
3,INF4,628052,265953,529438,0.445078,Game;Music;Tech;Sport
4,INF5,271472,531116,951829,0.590743,General;Fashion;Music;Pop Culture;Game;Sport


In [219]:
# Column dtypes and non-null counts of the influencer table.
df_influencer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4924 entries, 0 to 4923
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                4924 non-null   object 
 1   insta_follower    4924 non-null   int64  
 2   tiktok            4924 non-null   int64  
 3   youtube           4924 non-null   int64  
 4   price_normalized  4924 non-null   float64
 5   categories        4924 non-null   object 
dtypes: float64(1), int64(3), object(2)
memory usage: 230.9+ KB


In [220]:
# Summary statistics of the numeric influencer columns (follower counts and price).
df_influencer.describe()

Unnamed: 0,insta_follower,tiktok,youtube,price_normalized
count,4924.0,4924.0,4924.0,4924.0
mean,497558.71446,503623.4671,507027.295491,0.499488
std,287672.99961,284878.137868,287343.08838,0.291946
min,4409.0,4003.0,4504.0,3.8e-05
25%,246925.0,257822.25,260310.5,0.245207
50%,496332.5,505953.5,508959.0,0.499055
75%,742327.75,745958.5,753429.0,0.753487
max,999940.0,999910.0,999710.0,0.999777


In [221]:
# Preview the first five owner rows.
df_owner.head(5)

Unnamed: 0,id,categories
0,OWN1,Music;Sport;General;Tech;Fashion;Game;Pop Culture
1,OWN2,Pop Culture;Game
2,OWN3,Fashion;Pop Culture;Music;Tech;General;Game;Sport
3,OWN4,Fashion;Pop Culture;Game;General
4,OWN5,General;Sport;Game


In [222]:
# Column dtypes and non-null counts of the owner table.
df_owner.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1792 entries, 0 to 1791
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          1792 non-null   object
 1   categories  1792 non-null   object
dtypes: object(2)
memory usage: 28.1+ KB


In [223]:
# Both owner columns are strings, so describe() reports count/unique/top/freq
# instead of numeric statistics.
df_owner.describe()

Unnamed: 0,id,categories
count,1792,1792
unique,1792,1156
top,OWN1,Music
freq,1,43


In [224]:
# Preview the first five history rows (one owner-influencer rating per row).
df_history.head(5)

Unnamed: 0,own_id,inf_id,star_rating,sentiment_rating
0,OWN1735,INF341,5,0.921058
1,OWN275,INF1253,2,0.0014
2,OWN1430,INF1423,4,0.702913
3,OWN176,INF580,2,0.398565
4,OWN1005,INF1495,3,0.122339


In [225]:
# Column dtypes and non-null counts of the history table.
df_history.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17280 entries, 0 to 17279
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   own_id            17280 non-null  object 
 1   inf_id            17280 non-null  object 
 2   star_rating       17280 non-null  int64  
 3   sentiment_rating  17280 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 540.1+ KB


In [226]:
# Summary statistics of the two rating columns.
df_history.describe()

Unnamed: 0,star_rating,sentiment_rating
count,17280.0,17280.0
mean,3.010822,0.5002024
std,1.406766,0.2890345
min,1.0,6.9297e-07
25%,2.0,0.2501317
50%,3.0,0.5012116
75%,4.0,0.7483516
max,5.0,0.9998797


## Data Transformation

### Data cleaning

#### Missing value

In [227]:
# Count missing values per influencer column.
df_influencer.isnull().sum()

id                  0
insta_follower      0
tiktok              0
youtube             0
price_normalized    0
categories          0
dtype: int64

In [228]:
# Count missing values per owner column.
df_owner.isnull().sum()

id            0
categories    0
dtype: int64

In [229]:
# Count missing values per history column.
df_history.isnull().sum()

own_id              0
inf_id              0
star_rating         0
sentiment_rating    0
dtype: int64

No missing values were found in any of the three tables.

#### Irrelevant Data / Invalid Data

Check that every history row references a valid influencer ID and a valid owner ID.

In [230]:
# True only if every inf_id in the history also appears in the influencer table's id column.
df_history["inf_id"].isin(df_influencer["id"]).all()

True

In [231]:
# True only if every own_id in the history also appears in the owner table's id column.
df_history["own_id"].isin(df_owner["id"]).all()

True

Every history row references a valid influencer ID and a valid owner ID.

## Data Normalization

**TODO:** move this preprocessing into a TensorFlow Dataset pipeline.

Normalize influencer data: Scale follower count and One-hot categories

In [232]:
from sklearn.preprocessing import StandardScaler

# Raw follower-count columns that are standardised together.
FOLLOWER_COLUMNS = ["insta_follower", "tiktok", "youtube"]

follower_scaler = StandardScaler()

# Work on a copy so the raw influencer frame stays untouched.
df_inf_norm = df_influencer.copy()
df_inf_norm[FOLLOWER_COLUMNS] = follower_scaler.fit_transform(df_inf_norm[FOLLOWER_COLUMNS])

# Expand the ';'-separated category string into one 0/1 indicator column per
# category, then swap the original text column for those indicators.
one_hot_categories = df_inf_norm["categories"].str.get_dummies(sep=";")
df_inf_norm = pd.concat(
    [df_inf_norm.drop(columns="categories"), one_hot_categories],
    axis=1,
)
df_inf_norm.head()

Unnamed: 0,id,insta_follower,tiktok,youtube,price_normalized,Fashion,Game,General,Music,Pop Culture,Sport,Tech
0,INF1,1.288522,1.636039,0.380405,0.643695,1,0,0,1,0,0,0
1,INF2,-0.468992,-1.338636,0.648238,0.572601,0,1,1,0,0,0,0
2,INF3,-1.00251,1.140177,-1.49893,0.441951,1,0,0,1,0,0,1
3,INF4,0.453663,-0.834373,0.078001,0.445078,0,1,0,1,0,1,1
4,INF5,-0.785995,0.096516,1.548138,0.590743,1,1,1,1,1,1,0


Normalize owner data: One-hot categories

In [233]:
# Work on a copy so the raw owner frame stays untouched.
df_own_norm = df_owner.copy()

# Expand the ';'-separated category string into one 0/1 indicator column per
# category, then swap the original text column for those indicators.
one_hot_categories = df_own_norm["categories"].str.get_dummies(sep=";")
df_own_norm = pd.concat(
    [df_own_norm.drop(columns="categories"), one_hot_categories],
    axis=1,
)
df_own_norm.head()

Unnamed: 0,id,Fashion,Game,General,Music,Pop Culture,Sport,Tech
0,OWN1,1,1,1,1,1,1,1
1,OWN2,0,1,0,0,1,0,0
2,OWN3,1,1,1,1,1,1,1
3,OWN4,1,1,1,0,1,0,0
4,OWN5,0,1,1,0,0,1,0


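### Combined rating label

Combine the star rating (divided by 5) and the sentiment rating into a single weighted label: 70% star rating, 30% sentiment rating.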

In [234]:
# Relative contribution of each signal to the combined label.
STAR_WEIGHT = 0.7
SENTIMENT_WEIGHT = 0.3

# Stars are weighted and then divided by 5; sentiment is weighted as-is.
star_component = df_history["star_rating"].mul(STAR_WEIGHT).div(5)
sentiment_component = df_history["sentiment_rating"].mul(SENTIMENT_WEIGHT)
df_history["combined_rating"] = star_component + sentiment_component
df_history

Unnamed: 0,own_id,inf_id,star_rating,sentiment_rating,combined_rating
0,OWN1735,INF341,5,0.921058,0.976317
1,OWN275,INF1253,2,0.001400,0.280420
2,OWN1430,INF1423,4,0.702913,0.770874
3,OWN176,INF580,2,0.398565,0.399570
4,OWN1005,INF1495,3,0.122339,0.456702
...,...,...,...,...,...
17275,OWN1141,INF99,5,0.164265,0.749279
17276,OWN136,INF719,2,0.464404,0.419321
17277,OWN777,INF337,4,0.062841,0.578852
17278,OWN974,INF482,1,0.916436,0.414931


#### Data Splitting

##### Process feature and label

In [235]:
# Attach the influencer features to every history row (left join on the influencer ID).
df_inf_features = pd.merge(df_history, df_inf_norm, left_on='inf_id', right_on='id', how='left')

# Features and labels are later paired purely by row position, so the join must
# neither add nor drop history rows (duplicate influencer IDs would multiply rows).
assert len(df_inf_features) == len(df_history), "influencer merge changed the number of history rows"

df_inf_features.head()

Unnamed: 0,own_id,inf_id,star_rating,sentiment_rating,combined_rating,id,insta_follower,tiktok,youtube,price_normalized,Fashion,Game,General,Music,Pop Culture,Sport,Tech
0,OWN1735,INF341,5,0.921058,0.976317,INF341,-0.218179,-0.920987,0.301485,0.980579,1,0,0,0,1,1,1
1,OWN275,INF1253,2,0.0014,0.28042,INF1253,0.862352,-0.493326,0.826565,0.900812,1,1,1,1,1,1,1
2,OWN1430,INF1423,4,0.702913,0.770874,INF1423,-0.069119,1.681685,-0.899213,0.365875,1,1,1,1,1,1,1
3,OWN176,INF580,2,0.398565,0.39957,INF580,1.425987,-0.90177,1.669671,0.404617,1,1,0,1,0,0,0
4,OWN1005,INF1495,3,0.122339,0.456702,INF1495,-1.603896,-0.487319,-1.36833,0.128867,0,1,0,1,0,1,0


In [236]:
# Strip identifier and rating columns so only influencer model inputs remain.
df_inf_features = df_inf_features.drop(
    columns=["own_id", "inf_id", "id", "star_rating", "sentiment_rating", "combined_rating"]
)
# Number of influencer input columns; sizes the influencer input layer of the model.
INFLUENCER_FEATURE_COUNT = df_inf_features.shape[1]

df_inf_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17280 entries, 0 to 17279
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   insta_follower    17280 non-null  float64
 1   tiktok            17280 non-null  float64
 2   youtube           17280 non-null  float64
 3   price_normalized  17280 non-null  float64
 4   Fashion           17280 non-null  int64  
 5   Game              17280 non-null  int64  
 6   General           17280 non-null  int64  
 7   Music             17280 non-null  int64  
 8   Pop Culture       17280 non-null  int64  
 9   Sport             17280 non-null  int64  
 10  Tech              17280 non-null  int64  
dtypes: float64(4), int64(7)
memory usage: 1.6 MB


In [237]:
# Attach the owner features to every history row (left join on the owner ID).
df_own_features = pd.merge(df_history, df_own_norm, left_on='own_id', right_on='id', how='left')

# Features and labels are later paired purely by row position, so the join must
# neither add nor drop history rows (duplicate owner IDs would multiply rows).
assert len(df_own_features) == len(df_history), "owner merge changed the number of history rows"

df_own_features.head()

Unnamed: 0,own_id,inf_id,star_rating,sentiment_rating,combined_rating,id,Fashion,Game,General,Music,Pop Culture,Sport,Tech
0,OWN1735,INF341,5,0.921058,0.976317,OWN1735,0,1,0,1,0,0,0
1,OWN275,INF1253,2,0.0014,0.28042,OWN275,0,0,1,0,1,1,0
2,OWN1430,INF1423,4,0.702913,0.770874,OWN1430,0,0,0,1,0,0,0
3,OWN176,INF580,2,0.398565,0.39957,OWN176,1,1,1,1,1,1,1
4,OWN1005,INF1495,3,0.122339,0.456702,OWN1005,1,0,1,0,0,1,1


In [238]:
# Strip identifier and rating columns so only owner model inputs remain.
df_own_features = df_own_features.drop(
    columns=["own_id", "inf_id", "id", "star_rating", "sentiment_rating", "combined_rating"]
)
# Number of owner input columns; sizes the owner input layer of the model.
OWNER_FEATURE_COUNT = df_own_features.shape[1]

df_own_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17280 entries, 0 to 17279
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   Fashion      17280 non-null  int64
 1   Game         17280 non-null  int64
 2   General      17280 non-null  int64
 3   Music        17280 non-null  int64
 4   Pop Culture  17280 non-null  int64
 5   Sport        17280 non-null  int64
 6   Tech         17280 non-null  int64
dtypes: int64(7)
memory usage: 1.1 MB


In [239]:
# The regression target is the weighted rating stored on the history frame.
df_labels = df_history.loc[:, "combined_rating"]
df_labels.head(5)

0    0.976317
1    0.280420
2    0.770874
3    0.399570
4    0.456702
Name: combined_rating, dtype: float64

##### Generate train, validation, and test dataset

In [240]:
# Shuffle and batch data
import tensorflow as tf

# A buffer as large as the dataset gives a uniform shuffle; a smaller buffer only
# mixes elements locally.
SHUFFLE_BUFFER = len(df_labels)
SHUFFLE_SEED = 42

# Each element is ({"inf_feature": ..., "own_feature": ...}, label); the dict keys
# are matched to the model's input layers by name.
dataset = tf.data.Dataset.from_tensor_slices(({"inf_feature": df_inf_features, "own_feature": df_own_features}, df_labels))

# Shuffle exactly once, with a fixed seed. With the default
# reshuffle_each_iteration=True the order would change on every pass, so the
# take()/skip() based train/validation/test splits would be redrawn each epoch and
# validation/test rows would leak into training.
dataset = dataset.shuffle(SHUFFLE_BUFFER, seed=SHUFFLE_SEED, reshuffle_each_iteration=False)

dataset.element_spec

({'inf_feature': TensorSpec(shape=(11,), dtype=tf.float64, name=None),
  'own_feature': TensorSpec(shape=(7,), dtype=tf.int64, name=None)},
 TensorSpec(shape=(), dtype=tf.float64, name=None))

In [241]:
# Split sizes: 80% training, 10% validation, whatever is left for testing.
DATASET_SIZE = dataset.cardinality().numpy()
TRAIN_SIZE = int(0.8 * DATASET_SIZE)
VAL_SIZE = int(0.1 * DATASET_SIZE)
TEST_SIZE = DATASET_SIZE - (TRAIN_SIZE + VAL_SIZE)

# Carve consecutive slices out of the dataset.
# NOTE: the slices are only disjoint if `dataset` yields the same order on every iteration.
train_dataset = dataset.take(TRAIN_SIZE)
held_out = dataset.skip(TRAIN_SIZE)
val_dataset = held_out.take(VAL_SIZE)
test_dataset = held_out.skip(VAL_SIZE).take(TEST_SIZE)

print(f"Training dataset has {train_dataset.cardinality().numpy()} data")
print(f"Validation dataset has {val_dataset.cardinality().numpy()} data")
print(f"Testing dataset has {test_dataset.cardinality().numpy()} data")

Training dataset has 13824 data
Validation dataset has 1728 data
Testing dataset has 1728 data


In [242]:
# Batch configuration
BATCH_SIZE = 32
REPEAT = 2

# The evaluation splits are only batched.
val_dataset, test_dataset = val_dataset.batch(BATCH_SIZE), test_dataset.batch(BATCH_SIZE)

# The training split is batched and then replayed REPEAT times, so a single pass
# over train_dataset covers the training data REPEAT times.
train_batches = train_dataset.batch(BATCH_SIZE)
train_dataset = train_batches.repeat(REPEAT)

## Creating Model

The model consists of two neural networks whose outputs are combined by a Dot layer. The first network takes the influencer features as input and outputs a vector; the second takes the owner features as input and outputs a vector. The Dot layer combines these two vectors to produce a single combined rating.

In [243]:
VECTOR_SIZE = 32  # length of the embedding produced by each tower
MODEL_SEED = 1

# Fix TensorFlow's global seed so weight initialisation is repeatable between runs.
tf.random.set_seed(MODEL_SEED)

# Influencer tower: maps the influencer features to a VECTOR_SIZE-dimensional embedding.
model_influencer = tf.keras.models.Sequential([
    tf.keras.layers.Dense(units=256, activation='relu'),
    tf.keras.layers.Dense(units=128, activation='relu'),
    tf.keras.layers.Dense(units=VECTOR_SIZE, activation='linear'),
])

# create the influencer input and point to the base network
# The input name matches the "inf_feature" key of the dataset's feature dict.
# `shape` must be a tuple: `(n)` without the trailing comma is just the int `n`.
input_influencer = tf.keras.layers.Input(shape=(INFLUENCER_FEATURE_COUNT,), name="inf_feature")
vi = model_influencer(input_influencer)

# Owner tower: same architecture, applied to the owner features.
model_owner = tf.keras.models.Sequential([
    tf.keras.layers.Dense(units=256, activation='relu'),
    tf.keras.layers.Dense(units=128, activation='relu'),
    tf.keras.layers.Dense(units=VECTOR_SIZE, activation='linear'),
])

# create the owner input and point to the base network
# The input name matches the "own_feature" key of the dataset's feature dict.
input_owner = tf.keras.layers.Input(shape=(OWNER_FEATURE_COUNT,), name="own_feature")
vo = model_owner(input_owner)

# Cosine similarity of the two embeddings. normalize=True L2-normalises both
# vectors inside the layer, which replaces the raw tf.linalg.l2_normalize calls on
# symbolic Keras tensors (vi and vo therefore hold the un-normalised embeddings).
dot_product = tf.keras.layers.Dot(axes=1, normalize=True)([vi, vo])

# Cosine similarity lies in [-1, 1]. A sigmoid would squash that range to roughly
# [0.27, 0.73], so ratings outside that band could never be predicted.
# Rescale linearly to [0, 1] instead.
output = tf.keras.layers.Lambda(lambda cosine: (cosine + 1.0) / 2.0, name="rating")(dot_product)

# specify the inputs and output of the model
model = tf.keras.Model([input_influencer, input_owner], output)

model.summary()

Model: "model_6"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 inf_feature (InputLayer)       [(None, 11)]         0           []                               
                                                                                                  
 own_feature (InputLayer)       [(None, 7)]          0           []                               
                                                                                                  
 sequential_12 (Sequential)     (None, 32)           40096       ['inf_feature[0][0]']            
                                                                                                  
 sequential_13 (Sequential)     (None, 32)           39072       ['own_feature[0][0]']            
                                                                                            

In [244]:
LEARNING_RATE = 1e-2
EPOCHS = 10

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE),
    loss=tf.keras.losses.MeanSquaredError(),
    # The target is a continuous rating, so report regression errors.
    # Classification "accuracy" is not meaningful for a continuous target.
    metrics=[
        tf.keras.metrics.MeanAbsoluteError(name="mae"),
        tf.keras.metrics.RootMeanSquaredError(name="rmse"),
    ],
)

# Keep the History object so the per-epoch losses and metrics can be inspected later.
history = model.fit(train_dataset, validation_data=val_dataset, epochs=EPOCHS)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x229618874f0>

Training on random data fails because there is no pattern to learn.