In [23]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
import pickle
from sklearn.cluster import DBSCAN


In [24]:
# Load the filtered interaction table; later cells drop columns from it and
# draw the DBSCAN sample out of it.
df = pd.read_csv("interaction_filtered.csv")

In [25]:
# Columns removed from `df` before rows are sampled for clustering
# (consumed by the `df.drop(columns=columns_to_drop)` call further down).
columns_to_drop = [
    "user_id",
    "click",
    "mod_price",
    "root_id",
    "category_id",
    "exposed_time",
    "p_hour",
    "p_date",
]

In [26]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from internal_logics.freq_mapper import SimplifiedFrequencyMapper as SimplifiedFrequencyMapper
from internal_logics.label_encode import SimplifiedLabelEncoder as SimplifiedLabelEncoder
from internal_logics.gender_transform import GenderTransformer as GenderTransformer
from internal_logics.log_transform import LogTransformer as LogTransformer                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           

# Feature groups; each group is routed to its own transformer below.
boolean_cols = [
    "cvm_like", "comment", "follow",
    "collect", "forward", "hate",
]
categorical_cols = [
    "tag_name",
    "fre_community_type",
    "fre_city_level",
    "fre_city",
]
skewed_numeric_cols = [
    "watch_time",
    "duration",
    "author_fans_count",
]

# Skewed numeric columns: LogTransformer first, then StandardScaler.
skewed_numeric_pipeline = Pipeline(steps=[
    ("log", LogTransformer()),
    ("scaler", StandardScaler()),
])

# Categorical columns: SimplifiedFrequencyMapper first, then SimplifiedLabelEncoder.
categorical_pipeline = Pipeline(steps=[
    ("freq_map", SimplifiedFrequencyMapper()),
    ("label_enc", SimplifiedLabelEncoder()),
])

# (name, transformer, columns) triples consumed by ColumnTransformer.
column_transformers = [
    ("booleans", SimplifiedLabelEncoder(), boolean_cols),
    ("categoricals", categorical_pipeline, categorical_cols),
    ("gender", GenderTransformer(), ["gender"]),
    ("skewed_numeric", skewed_numeric_pipeline, skewed_numeric_cols),
]

# Columns not named in any group are passed through untouched.
preprocessor = ColumnTransformer(
    transformers=column_transformers,
    remainder="passthrough",
)

In [27]:
# Unfitted template: preprocessing followed by PCA. A float n_components asks
# PCA to keep enough components to explain that fraction of the variance.
# Note that `full_pipeline` is rebound further down to a fitted pipeline
# loaded from "fitted_preprocessor_5.pkl".
pipeline_steps = [
    ("preprocessor", preprocessor),
    ("pca", PCA(n_components=0.95)),
]
full_pipeline = Pipeline(steps=pipeline_steps)

In [28]:
# 1. Load the previously fitted pipeline from disk.
# NOTE(review): pickle.load can execute arbitrary code; only load artifacts
# produced by this project.
with open("fitted_preprocessor_5.pkl", "rb") as pipeline_file:
    loaded_pipeline = pickle.load(pipeline_file)

# 2. Look up the PCA step and read how many components it kept when fitted.
pca_step = loaded_pipeline['pca']
num_components_from_file = pca_step.n_components_

print(f"After loading the file, we can confirm PCA chose {num_components_from_file} components.")

After loading the file, we can confirm PCA chose 2 components.


What did PCA select?


Applying DBSCAN

In [29]:

# Build the raw sample that the stored DBSCAN labels are attached to later.
# Nothing is transformed in this cell: the fitted pipeline is only loaded, and
# the transformed array is read from disk in a later cell.
df_cleaned = df.drop(columns=columns_to_drop)

# Fixed size and seed so the same rows are drawn on every run.
sample_size = 200000
sample_seed = 42
dbscan_sample_raw = df_cleaned.sample(n=sample_size, random_state=sample_seed)

# Rebind `full_pipeline` to the fitted pipeline stored on disk.
with open("fitted_preprocessor_5.pkl", "rb") as pipeline_file:
    full_pipeline = pickle.load(pipeline_file)

In [30]:
# One-off export that produced the cached array; kept disabled for reference.
# np.save("transformed_data_for_dbscan.npy", transformed_data)

# Load the cached array instead of recomputing it.
# NOTE(review): presumably the output of full_pipeline.transform on
# dbscan_sample_raw — the visible cells never compute it; confirm.
transformed_data = np.load("transformed_data_for_dbscan.npy")

In [31]:
# Sanity check: the row count should equal the 200000-row sample, and the
# column count should equal the number of PCA components reported above.
transformed_data.shape

(200000, 2)

If the model has already been trained, just load it from disk.

In [32]:
# Reuse the already-fitted DBSCAN model instead of clustering again.
# NOTE(review): pickle.load can execute arbitrary code; only load artifacts
# produced by this project.
with open("dbscan_model.pkl", "rb") as model_file:
    loaded_dbscan_model = pickle.load(model_file)

# Labels assigned during fitting; -1 marks noise points.
cluster_labels = loaded_dbscan_model.labels_

# Labels are attached by position, so this assumes the model was fitted on
# exactly these sampled rows in this order — TODO confirm.
dbscan_sample_raw['dbscan_cluster_label'] = cluster_labels

In [33]:
# Confirm the sample still has 200000 rows after the label column was attached.
print(dbscan_sample_raw.shape)

(200000, 21)


In [34]:
# Per-cluster means of the RAW (untransformed) numeric features.
# Noise points (label == -1) belong to no cluster, so they are excluded first.
core_data = dbscan_sample_raw[dbscan_sample_raw['dbscan_cluster_label'] != -1]

# Grouping by the label column keeps it out of the averaged columns and uses it
# as the index of the result.
cluster_centroids = core_data.groupby('dbscan_cluster_label').mean(numeric_only=True)

# Persist under a dedicated filename. "cluster_centroids.pkl" is written by the
# next cell with centroids computed on the PCA-transformed data and is the file
# the prediction step reads; writing raw-feature centroids to that same path
# would leave centroids from the wrong feature space on disk whenever the next
# cell is skipped or fails.
with open("cluster_centroids_raw.pkl", "wb") as f:
    pickle.dump(cluster_centroids, f)

Predicting New Data

In [35]:
# Inputs expected from earlier cells:
#   transformed_data - array loaded from "transformed_data_for_dbscan.npy"
#   cluster_labels   - labels_ of the fitted DBSCAN model
# Centroids are computed on the transformed data here; presumably
# predict_with_fallback compares pipeline output against them — confirm.

# Wrap the transformed array in a frame and attach one label per row.
transformed_df = pd.DataFrame(transformed_data)
transformed_df['dbscan_cluster_label'] = cluster_labels

# Drop noise points (label == -1) so they do not contribute to any centroid.
is_clustered = transformed_df['dbscan_cluster_label'] != -1
core_data_transformed = transformed_df.loc[is_clustered]

# Mean of every transformed column within each cluster.
cluster_centroids_transformed = (
    core_data_transformed
    .groupby('dbscan_cluster_label')
    .mean()
)

# Persist the transformed-space centroids for the prediction step.
with open("cluster_centroids.pkl", "wb") as centroid_file:
    pickle.dump(cluster_centroids_transformed, centroid_file)

In [36]:
# Read the saved centroids back from disk; this rebinds `cluster_centroids`.
# Only the centroids are loaded in this cell — the fitted pipeline was loaded
# in an earlier cell.
with open('cluster_centroids.pkl', 'rb') as centroid_file:
    cluster_centroids = pickle.load(centroid_file)

print("Preprocessor and cluster centroids loaded successfully.")

Preprocessor and cluster centroids loaded successfully.


In [37]:
# A single hand-written interaction used to exercise the prediction path.
# Key order is kept exactly, because it becomes the column order of the frame.
sample_interaction = {
    "cvm_like": True,
    "comment": False,
    "follow": True,
    "collect": False,
    "forward": False,
    "hate": False,
    "tag_name": "some_tag",
    "fre_community_type": "type_A",
    "fre_city_level": "level_1",
    "fre_city": "city_X",
    "duration": 1000.0,
    "gender": 'M',
    "author_fans_count": 10000000,
    "watch_time": 0.0,
    "parent_id": 30,
    "age": 20,
    "category_level": 2,
}

# Wrap each value in a one-element list so the frame has exactly one row.
new_data = pd.DataFrame(
    {column: [value] for column, value in sample_interaction.items()}
)



In [38]:
from internal_logics.fallback import predict_with_fallback

# Assign the hand-written interaction to a cluster using the fitted pipeline
# and the saved centroids. The result is passed on as `target_cluster_label`
# when recommendations are requested.
final_cluster = predict_with_fallback(
    new_data,
    full_pipeline,
    cluster_centroids,
)

Video Prediction Logic

Creating the video_clusters.csv file

In [39]:
# Disabled one-off step; the next cell reads "video_clusters.csv" from disk.
# NOTE(review): presumably this call is what produced that file — confirm.
# from internal_logics.video_clusters import video_clusters as vc
# df_video_clusters = vc.makingVideoClusters(dbscan_sample_raw)

In [40]:
# Load the saved video/cluster table; it is passed to get_recommendations as
# `video_cluster_map`.
df_video_clusters = pd.read_csv("video_clusters.csv")

Testing

In [41]:
from internal_logics.get_recomendations import get_recommendations

# Collect the arguments first so each input to the recommender is easy to inspect.
recommendation_inputs = {
    "target_video_pid": 9999999,           # pid of the video being queried
    "target_cluster_label": final_cluster,  # cluster predicted for new_data
    "all_videos_df": dbscan_sample_raw,     # sampled interactions with cluster labels
    "video_cluster_map": df_video_clusters,  # table read from video_clusters.csv
}
recommended_videos = get_recommendations(**recommendation_inputs)

Getting Actual Videos

In [42]:
import pandas as pd
from internal_logics.url_for_recomend import url_for_videos

# Keep only the columns needed to present a recommendation.
display_columns = ('pid', 'title', 'author_id', 'watch_time')
data = {column: recommended_videos[column] for column in display_columns}

# One row per video: drop repeated pids and renumber the index from zero.
recommended_videos = (
    pd.DataFrame(data)
    .drop_duplicates(subset='pid')
    .reset_index(drop=True)
)

# Hand the de-duplicated frame to the project helper that attaches video URLs.
all_recomended_videos = url_for_videos(recommended_videos=recommended_videos)


Recommendation #1:
  Title: 不要离开我
  PID: 146585
  Direct Link: https://fi.ee.tsinghua.edu.cn/datasets/short-video-dataset/raw_file/146585.mp4

Recommendation #2:
  Title: 弟弟回外婆家
  PID: 67901
  Direct Link: https://fi.ee.tsinghua.edu.cn/datasets/short-video-dataset/raw_file/67901.mp4

Recommendation #3:
  Title: 《究竟为什么》
  PID: 81311
  Direct Link: https://fi.ee.tsinghua.edu.cn/datasets/short-video-dataset/raw_file/81311.mp4

---------------------------------------------------------
