In [1]:
import os
import pandas as pd

# Define the URL and the local path to save the dataset
dataset_url = "https://nas.chongminggao.top:4430/datasets/KuaiRec.zip"
dataset_zip_path = "../data/raw/KuaiRec.zip"
dataset_extracted_path = "../data/raw/"

# Create the data directory if it doesn't exist
os.makedirs("../data/raw", exist_ok=True)
os.makedirs("../data/recommendations", exist_ok=True)
os.makedirs("../data/processed", exist_ok=True)

In [2]:
# Download the dataset
! wget -O {dataset_zip_path} {dataset_url}
! unzip -o {dataset_zip_path} -d {dataset_extracted_path}

--2025-05-16 15:21:37--  https://nas.chongminggao.top:4430/datasets/KuaiRec.zip
Resolving nas.chongminggao.top (nas.chongminggao.top)... 211.86.155.249
Connecting to nas.chongminggao.top (nas.chongminggao.top)|211.86.155.249|:4430... failed: Connection refused.
Archive:  ../data/raw/KuaiRec.zip
  End-of-central-directory signature not found.  Either this file is not
  a zipfile, or it constitutes one disk of a multi-part archive.  In the
  latter case the central directory and zipfile comment will be found on
  the last disk(s) of this archive.
unzip:  cannot find zipfile directory in one of ../data/raw/KuaiRec.zip or
        ../data/raw/KuaiRec.zip.zip, and cannot find ../data/raw/KuaiRec.zip.ZIP, period.


In [3]:
# List all files in the extracted dataset
print("Files in the extracted dataset:")
for file_name in os.listdir(dataset_extracted_path):
    print(file_name)

Files in the extracted dataset:
kuairec_caption_category.csv
.DS_Store
item_daily_features.csv
KuaiRec.zip
item_categories.csv
user_features.csv
small_matrix.csv
social_network.csv
big_matrix.csv
KuaiRec 2.0


In [4]:
# Load and inspect each file in the dataset
files_to_inspect = [
    "big_matrix.csv",
    "small_matrix.csv",
    "social_network.csv",
    "user_features.csv",
    "item_daily_features.csv",
    "item_categories.csv",
]

for file_name in files_to_inspect:
    file_path = os.path.join(dataset_extracted_path, file_name)
    print(f"\nInspecting {file_name}...")
    df = pd.read_csv(file_path)
    #print(df.head())
    print(df.info())




Inspecting big_matrix.csv...
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12530806 entries, 0 to 12530805
Data columns (total 8 columns):
 #   Column          Dtype  
---  ------          -----  
 0   user_id         int64  
 1   video_id        int64  
 2   play_duration   int64  
 3   video_duration  int64  
 4   time            object 
 5   date            int64  
 6   timestamp       float64
 7   watch_ratio     float64
dtypes: float64(2), int64(5), object(1)
memory usage: 764.8+ MB
None

Inspecting small_matrix.csv...
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4676570 entries, 0 to 4676569
Data columns (total 8 columns):
 #   Column          Dtype  
---  ------          -----  
 0   user_id         int64  
 1   video_id        int64  
 2   play_duration   int64  
 3   video_duration  int64  
 4   time            object 
 5   date            float64
 6   timestamp       float64
 7   watch_ratio     float64
dtypes: float64(3), int64(4), object(1)
memory usage: 285.4+ MB

In [5]:
# Define the paths to the dataset files
user_features_path = os.path.join(dataset_extracted_path, "user_features.csv")
item_daily_features_path = os.path.join(dataset_extracted_path, "item_daily_features.csv")
small_matrix_path = os.path.join(dataset_extracted_path, "small_matrix.csv")
big_matrix_path = os.path.join(dataset_extracted_path, "big_matrix.csv")
social_network_path = os.path.join(dataset_extracted_path, "social_network.csv")
item_categories_path = os.path.join(dataset_extracted_path, "item_categories.csv")

# Load the datasets
user_features = pd.read_csv(user_features_path)
item_daily_features = pd.read_csv(item_daily_features_path)
small_matrix = pd.read_csv(small_matrix_path)
big_matrix = pd.read_csv(big_matrix_path)
social_network = pd.read_csv(social_network_path)
item_categories = pd.read_csv(item_categories_path)

In [6]:

# Fill missing values in video_duration with the mean
print("Processing item_daily_features...")
item_daily_features['video_duration'] = item_daily_features['video_duration'].fillna(item_daily_features['video_duration'].mean())
item_daily_features['date'] = pd.to_datetime(item_daily_features['date'], format='%Y%m%d')
item_daily_features.drop(columns=['video_tag_name', 'collect_cnt', 'collect_user_num', 'cancel_collect_cnt', 'cancel_collect_user_num'], inplace=True)
item_daily_features["video_tag_id"] = item_daily_features["video_tag_id"].fillna(0)
item_daily_features["video_tag_id"] = item_daily_features["video_tag_id"].apply(lambda x: x if x >= 0 else 0)
if item_daily_features["video_tag_id"].dtype == float:
    item_daily_features = item_daily_features.astype({"video_tag_id": int})

# Convert the 'time' column to datetime format and drop unnecessary columns
print("Processing big_matrix...")
big_matrix['datetime'] = pd.to_datetime(big_matrix['time'])
big_matrix.drop(columns=['time', 'date', 'timestamp'], inplace=True)

# Convert the 'time' column to datetime format and drop unnecessary columns
print("Processing small_matrix...")
small_matrix['datetime'] = pd.to_datetime(small_matrix['time'])
small_matrix.drop(columns=['time', 'date', 'timestamp'], inplace=True)

# Convert friend_list to a list of integers
print("Processing social_network...")
social_network['friend_list'] = social_network['friend_list'].apply(lambda x: [int(i) for i in x.strip('[]').split(',')] if pd.notnull(x) else [])

# Merge user_features with social_network
print("Merging user_features with social_network...")
user_features = user_features.merge(social_network, on='user_id', how='left')
for i in range(1, 18):
    user_features[f"onehot_feat{i}"] = user_features[f"onehot_feat{i}"].fillna(0)
    user_features[f"onehot_feat{i}"] = user_features[f"onehot_feat{i}"].apply(lambda x: x if x >= 0 else 0)
    if user_features[f"onehot_feat{i}"].dtype == float:
        user_features = user_features.astype({f"onehot_feat{i}": int})

# Merge item_daily_features with item_categories
print("Merging item_daily_features with item_categories...")
item_daily_features = item_daily_features.merge(item_categories, on='video_id', how='left')

Processing item_daily_features...
Processing big_matrix...
Processing small_matrix...
Processing social_network...
Merging user_features with social_network...
Merging item_daily_features with item_categories...


In [7]:
# Save the processed datasets
print("Saving processed datasets...")
processed_path = "../data/processed/"

big_matrix.to_csv(os.path.join(processed_path, "big_matrix_processed.csv"), index=False)
small_matrix.to_csv(os.path.join(processed_path, "small_matrix_processed.csv"), index=False)
user_features.to_csv(os.path.join(processed_path, "user_features_processed.csv"), index=False)
item_daily_features.to_csv(os.path.join(processed_path, "item_daily_features_processed.csv"), index=False)

print("Data preprocessing completed successfully!")

Saving processed datasets...
Data preprocessing completed successfully!
