# Project Assignment: Short Video Recommender System (KuaiRec)

Dataset Source: [KuaiRec](https://kuairec.com/)

Arxiv Paper: [KuaiRec: A Fully-observed Dataset and Insights for Evaluating Recommender Systems](https://arxiv.org/pdf/2202.10842)

## Dataset import

In [None]:
# Download the KuaiRec archive; --no-check-certificate turns off TLS certificate verification for this request
!wget https://nas.chongminggao.top:4430/datasets/KuaiRec.zip --no-check-certificate
# Extract the archive into the current working directory
!unzip KuaiRec.zip

In [None]:
import ast
import os

import numpy as np
import pandas as pd
import plotly.express as px
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

# Locations probed in order: the Kaggle input mount first, then the two
# folder names the archive may extract to in the current working directory.
for DATA_PATH in (
    "/kaggle/input/kuairec/KuaiRec 2.0/data",
    f"{os.getcwd()}/KuaiRec/data",
    f"{os.getcwd()}/KuaiRec 2.0/data",
):
    if os.path.exists(DATA_PATH):
        # First existing candidate wins
        break
else:
    # The loop finished without a break: none of the candidates exists
    raise FileNotFoundError("KuaiRec dataset not found. Please check the path.")

DATA_PATH

# Exploratory Data Analysis (EDA)

Each part is associated with a specific observation.

## Observe the dataset

In [None]:
def data_clear(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean a raw interaction table and return the result as a new DataFrame.

    Steps, in order:
      1. drop the ``time`` and ``date`` columns (redundant with ``timestamp``);
      2. drop rows that contain missing values;
      3. cast ``user_id``, ``video_id``, ``play_duration`` to int32,
         ``timestamp`` to int64 and ``watch_ratio`` to float32;
      4. drop exact duplicate rows, then rows with a negative timestamp;
      5. convert ``timestamp`` from epoch seconds to datetime.

    The frame passed in is never modified.

    Args:
        df: Interaction table containing at least the columns ``time``,
            ``date``, ``user_id``, ``video_id``, ``play_duration``,
            ``timestamp`` and ``watch_ratio``.

    Returns:
        The cleaned table, independent of ``df``.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    # Time and Date duplicate timestamp, we can drop them.
    # Non-inplace drop so the caller's DataFrame keeps its columns.
    cleaned = df.drop(columns=["time", "date"])

    # Remove missing values BEFORE casting: an integer cast on a column that
    # still holds NaN fails, and errors="ignore" would then silently leave
    # that column with its original dtype.
    cleaned = cleaned.dropna()

    cleaned = cleaned.astype(
        {
            "user_id": "int32",
            "video_id": "int32",
            "play_duration": "int32",
            "timestamp": "int64",
            "watch_ratio": "float32",
        },
        errors="ignore",
    )

    # Drop duplicates (after the cast, so rows are compared on their final values)
    cleaned = cleaned.drop_duplicates()

    # .copy() detaches the filtered rows, so the column assignment below does
    # not write into a slice (no SettingWithCopyWarning).
    cleaned = cleaned[cleaned["timestamp"] >= 0].copy()
    cleaned["timestamp"] = pd.to_datetime(cleaned["timestamp"], unit="s")

    return cleaned

In [None]:
def my_describe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Print a short summary of an interaction table and return its describe().

    Printed values: the frame's shape, the number of distinct ``user_id`` and
    ``video_id`` values, and the matrix density, i.e. the row count divided by
    (distinct users x distinct videos), as a percentage. The density is only
    meaningful when each (user, video) pair appears at most once.

    Args:
        df: Table containing ``user_id`` and ``video_id`` columns.

    Returns:
        The result of ``df.describe()``.
    """
    unique_users = df["user_id"].nunique()
    unique_posts = df["video_id"].nunique()
    total_cells = unique_posts * unique_users
    # An empty table has no cells at all; report 0 instead of dividing by zero
    density = len(df) / total_cells * 100 if total_cells else 0.0

    # The function is used for every matrix, so the label does not name one
    print(f"Shape of the matrix: {df.shape}")
    print(f"Number of unique users: {unique_users}")
    print(f"Number of unique posts: {unique_posts}")
    # This ratio is the share of filled cells (density), not the share of empty ones
    print(f"Matrix density: {density}%")
    return df.describe()

## Small matrix

This table has a density of 99.6%. This means that 99.6% of the entries in the matrix are non-zero, indicating that most users have interacted with most items.

In [None]:
# Read the small interaction table and pass it straight through data_clear
small_matrix = data_clear(pd.read_csv(f"{DATA_PATH}/small_matrix.csv"))


#### General statistics

In [None]:
# Independent copy for exploration: columns added to it later (e.g. `hour`) do not touch small_matrix
eda_small_matrix = small_matrix.copy()

In [None]:
# Preview the first five rows
eda_small_matrix.head(5)

In [None]:
# Prints the shape, the unique user/video counts and the filled-cell percentage; the cell shows describe()
my_describe(eda_small_matrix)

We have a matrix of interaction.

Let's see the distribution of our video related values (timestamp, watch_ratio etc.)

In [None]:

# User side: value_counts() gives one interaction count per user,
# and the histogram shows how those counts are distributed.
fig = px.histogram(eda_small_matrix["user_id"].value_counts()).update_layout(
    title="Distribution of Interactions per User",
    xaxis_title="Number of Interactions",
    yaxis_title="Count of Users",
)
fig.show()

# Video side: same construction, counting interactions per video.
fig = px.histogram(eda_small_matrix["video_id"].value_counts()).update_layout(
    title="Distribution of Interactions per Video",
    xaxis_title="Number of Interactions",
    yaxis_title="Count of Videos",
)
fig.show()



#### Time trend

In [None]:
# Helper: keep only the rows that belong to the most frequent values of a column
def get_n_top_info(df: pd.DataFrame, info: str = "user_id", top_n: int = 10) -> pd.DataFrame:
    """
    Return the rows of ``df`` whose ``info`` value is among the ``top_n`` most frequent.

    Args:
        df: Source table.
        info: Name of the column whose values are ranked by frequency.
        top_n: How many of the most frequent values to keep.

    Returns:
        A new DataFrame (original index preserved) holding the matching rows.
        It is an independent copy, so callers can add columns to it without
        touching ``df`` and without a SettingWithCopyWarning.
    """
    most_frequent = df[info].value_counts().nlargest(top_n).index
    # .copy() detaches the selection from df: callers add columns to the result
    return df[df[info].isin(most_frequent)].copy()

In [None]:
# Restrict the hourly analysis to the 10 users with the most interactions
top_users_df = get_n_top_info(eda_small_matrix, "user_id", 10)

# .assign() builds a new frame carrying the extra column, so nothing is written
# into a filtered slice of eda_small_matrix (which raises SettingWithCopyWarning).
top_users_df = top_users_df.assign(hour=top_users_df["timestamp"].dt.hour)

# One group of bars per hour of day, one colour per user
fig = px.histogram(
    top_users_df,
    x="hour",
    color="user_id",
    title="Distribution of Interactions per Hour for Top Users",
    labels={"hour": "Hour of Day", "user_id": "User ID"},
    barmode="group",
)
fig.show()

It seems that top users interact with videos mostly around 5 pm. This is probably when users have the most free time and are more likely to watch videos.

We will map out the distribution of interactions to see if this is a trend or just a coincidence.

In [None]:
# Hour of day of every interaction, read from the parsed timestamp
eda_small_matrix["hour"] = eda_small_matrix["timestamp"].dt.hour

# Log-scaled counts; left as the last expression so the notebook renders the figure
px.histogram(
    data_frame=eda_small_matrix,
    x="hour",
    log_y=True,
    title="Distribution of Interactions per Hour",
)

Overall, most interactions happen between 11pm and 2 am, without considering the day or location of users.

This insight could be interesting if we consider time as a feature. We can have perfect moments of the day to recommend videos to users.

## Big matrix

This table has a density of 16.3%. We will use this matrix for our training and testing.

It contains more interactions for the same users/items as the small matrix. We do not need to subtract the small matrix.

In [None]:
# Read the big interaction table and pass it straight through data_clear
big_matrix = data_clear(pd.read_csv(f"{DATA_PATH}/big_matrix.csv"))


#### General statistics

In [None]:
# Independent copy for exploration: columns added to it later (e.g. `hour`) do not touch big_matrix
eda_big_matrix = big_matrix.copy()

In [None]:
# Preview the first five rows
eda_big_matrix.head(5)

In [None]:
# Prints the shape, the unique user/video counts and the filled-cell percentage; the cell shows describe()
my_describe(eda_big_matrix)

In [None]:

# User side: value_counts() gives one interaction count per user,
# and the histogram shows how those counts are distributed.
fig = px.histogram(eda_big_matrix["user_id"].value_counts()).update_layout(
    title="Distribution of Interactions per User",
    xaxis_title="Number of Interactions",
    yaxis_title="Count of Users",
)
fig.show()

# Video side: same construction, counting interactions per video.
fig = px.histogram(eda_big_matrix["video_id"].value_counts()).update_layout(
    title="Distribution of Interactions per Video",
    xaxis_title="Number of Interactions",
    yaxis_title="Count of Videos",
)
fig.show()



We have a matrix of interaction.

Let's see the distribution of our video related values (timestamp, watch_ratio etc.)

#### Time trend

In [None]:
# Helper: keep only the rows that belong to the most frequent values of a column
def get_n_top_info(df: pd.DataFrame, info: str = "user_id", top_n: int = 10) -> pd.DataFrame:
    """
    Return the rows of ``df`` whose ``info`` value is among the ``top_n`` most frequent.

    Args:
        df: Source table.
        info: Name of the column whose values are ranked by frequency.
        top_n: How many of the most frequent values to keep.

    Returns:
        A new DataFrame (original index preserved) holding the matching rows.
        It is an independent copy, so callers can add columns to it without
        touching ``df`` and without a SettingWithCopyWarning.
    """
    most_frequent = df[info].value_counts().nlargest(top_n).index
    # .copy() detaches the selection from df: callers add columns to the result
    return df[df[info].isin(most_frequent)].copy()

In [None]:
# Restrict the hourly analysis to the 10 users with the most interactions
top_users_df = get_n_top_info(eda_big_matrix, "user_id", 10)

# .assign() builds a new frame carrying the extra column, so nothing is written
# into a filtered slice of eda_big_matrix (which raises SettingWithCopyWarning).
top_users_df = top_users_df.assign(hour=top_users_df["timestamp"].dt.hour)

# One group of bars per hour of day, one colour per user
fig = px.histogram(
    top_users_df,
    x="hour",
    color="user_id",
    title="Distribution of Interactions per Hour for Top Users",
    labels={"hour": "Hour of Day", "user_id": "User ID"},
    barmode="group",
)
fig.show()

It seems that top users interact with videos mostly around 5 pm. This is probably when users have the most free time and are more likely to watch videos.

We will map out the distribution of interactions to see if this is a trend or just a coincidence.

In [None]:
# Hour of day of every interaction, read from the parsed timestamp
eda_big_matrix["hour"] = eda_big_matrix["timestamp"].dt.hour

# Log-scaled counts; left as the last expression so the notebook renders the figure
px.histogram(
    data_frame=eda_big_matrix,
    x="hour",
    log_y=True,
    title="Distribution of Interactions per Hour",
)

Overall, most interactions happen between 11pm and 2 am, without considering the day or location of users.

This insight could be interesting if we consider time as a feature. We can have perfect moments of the day to recommend videos to users.

## Misc

In [None]:
# Row count of small_matrix as a percentage of big_matrix's row count, printed with two decimals
print(f"Proportion of small_matrix relative to big_matrix: {small_matrix.shape[0] * 100 / big_matrix.shape[0]:.2f}%")

## Item category encoding

We have the characteristics of the videos (author_id, video_type...) but this part requires less preprocessing.

For Content-based filtering, we need to use features of the videos (list of tags). No need for TF-IDF, we will use a simple one-hot encoding.

In [None]:
# No missing values for this data
item_categories = pd.read_csv(f"{DATA_PATH}/item_categories.csv")

# Transform the feat column from its string form to a real list.
# ast.literal_eval only accepts Python literals, whereas eval() would execute
# any expression found in the downloaded CSV.
item_categories["feat"] = item_categories["feat"].apply(ast.literal_eval)

In [None]:
# MultiLabelBinarizer turns each video's list in `feat` into a row of 0/1 indicators
mlb = MultiLabelBinarizer()

# `data` stays the first argument: fit_transform has to run before mlb.classes_
# is read for the column names. Rows are indexed by video_id.
matrix_item_category = pd.DataFrame(
    data=mlb.fit_transform(item_categories["feat"]),
    index=item_categories["video_id"],
    columns=mlb.classes_,
)


In [None]:
# Column sums of the indicator matrix: for each feature, the number of videos carrying it
nb_of_features = matrix_item_category.sum()
fig = px.bar(
    x=nb_of_features.index,
    y=nb_of_features.values,
    # The bars count videos per feature, so the title says so
    title="Number of Videos per Feature",
    # x and y are passed as bare arrays, so plotly names them "x" and "y";
    # the labels mapping must use those keys to rename the axes.
    labels={"x": "Feature", "y": "Number of Videos"},
)
fig

In [None]:
# Display the encoded video x feature indicator table as the cell output
matrix_item_category

## Item daily features

This dataset is also interesting for content-based filtering.

Mostly composed of textual data, we will use a TF-IDF vectorizer to encode the features of the videos.

In [None]:
# Parse with '\n' as the only line terminator
# (presumably so a stray '\r' inside a field is not treated as a row break — TODO confirm)
item_daily_features = pd.read_csv(f"{DATA_PATH}/item_daily_features.csv", lineterminator='\n')
# Display the table as the cell output
item_daily_features

## Caption Category

In [None]:
# Parse with '\n' as the only line terminator
# (presumably so a stray '\r' inside a caption is not treated as a row break — TODO confirm)
caption_category = pd.read_csv(f"{DATA_PATH}/kuairec_caption_category.csv", lineterminator='\n')
# Display the table as the cell output
caption_category