In [1]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [2]:
# 加载数据
data = pd.read_csv("/Users/cookie/Desktop/Test/chunk/chunk_0_final.csv")

In [3]:
# 定义需要提取的类别
categories_of_interest = {
    "construction", "environment", "kitchen", "computers", "camera",
    "video", "audio", "apparel", "furniture", "auto", "kids", 
    "medicine", "country_yard", "sport"
}

In [4]:
# 定义函数提取类别
def extract_main_category(category_code):
    """提取主要类别"""
    if pd.isna(category_code):
        return None
    split_code = category_code.split(".")
    if split_code[0] in categories_of_interest:
        return split_code[0]  # 提取第一列
    if len(split_code) > 1 and split_code[1] in categories_of_interest:
        return split_code[1]  # 提取第二列
    return None

# 处理每个 `most_freq_category` 列，提取主要类别
for col in ["most_freq_category_1", "most_freq_category_2", "most_freq_category_3"]:
    data[col] = data[col].apply(extract_main_category)

# 将三个类别列合并为单一列
def merge_categories(row):
    """按权重顺序返回主要类别"""
    categories = [
        (row["most_freq_category_1"], row["category_1_activity_weight"]),
        (row["most_freq_category_2"], row["category_2_activity_weight"]),
        (row["most_freq_category_3"], row["category_3_activity_weight"])
    ]
    # 按权重降序排序
    categories = sorted(categories, key=lambda x: x[1], reverse=True)
    # 返回第一个非空类别
    for category, weight in categories:
        if category:
            return category
    return None


In [5]:
# 创建主类别列
data["main_category"] = data.apply(merge_categories, axis=1)

# 打印检查结果
print(data[["user_id", "main_category"]])

         user_id main_category
0      518753807     computers
1      543495796     furniture
2      522131081       kitchen
3      557224931   environment
4      560897827     computers
...          ...           ...
19995  541809725         audio
19996  520011224         audio
19997  513205210     computers
19998  513678024       kitchen
19999  559275712         audio

[20000 rows x 2 columns]


In [6]:
# 对类别进行编码
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
data["main_category_encoded"] = le.fit_transform(data["main_category"].fillna("unknown"))

# 选择需要的列进行聚类
features = ["main_category_encoded"]

In [7]:
# 应用 K-means
kmeans = KMeans(n_clusters=14, random_state=42)  # 假设分3类
data["cluster"] = kmeans.fit_predict(data[features])

In [8]:
# Check clustering result
print(data[["user_id", "main_category", "cluster"]])

         user_id main_category  cluster
0      518753807     computers        3
1      543495796     furniture        2
2      522131081       kitchen        6
3      557224931   environment        9
4      560897827     computers        3
...          ...           ...      ...
19995  541809725         audio        7
19996  520011224         audio        7
19997  513205210     computers        3
19998  513678024       kitchen        6
19999  559275712         audio        7

[20000 rows x 3 columns]


In [13]:
# 聚类中心特征分析
import numpy as np

# 聚类中心
cluster_centers = kmeans.cluster_centers_

# 聚类数
num_clusters = len(cluster_centers)

# 按 cluster 分组，统计各类特征
grouped = data.groupby("cluster")

# 统计每个聚类中 `main_category` 的分布
for cluster_id, group in grouped:
    print(f"Cluster {cluster_id}:")
    print(group["main_category"].value_counts(normalize=True))  # 输出类别分布占比


Cluster 0:
main_category
medicine    1.0
Name: proportion, dtype: float64
Cluster 1:
main_category
auto    1.0
Name: proportion, dtype: float64
Cluster 2:
main_category
furniture    1.0
Name: proportion, dtype: float64
Cluster 3:
main_category
computers    1.0
Name: proportion, dtype: float64
Cluster 4:
main_category
video    1.0
Name: proportion, dtype: float64
Cluster 5:
main_category
apparel    1.0
Name: proportion, dtype: float64
Cluster 6:
main_category
kitchen    1.0
Name: proportion, dtype: float64
Cluster 7:
main_category
audio    1.0
Name: proportion, dtype: float64
Cluster 8:
main_category
construction    0.97772
country_yard    0.02228
Name: proportion, dtype: float64
Cluster 9:
main_category
environment    1.0
Name: proportion, dtype: float64
Cluster 10:
Series([], Name: proportion, dtype: float64)
Cluster 11:
main_category
kids    1.0
Name: proportion, dtype: float64
Cluster 12:
main_category
sport    1.0
Name: proportion, dtype: float64
Cluster 13:
main_category
camera   

# F1 score

In [14]:
# 定义 cluster 到 label 的映射
cluster_to_label = {
    0: "Maker",            # medicine
    1: "Car_Enthusiast",   # auto
    2: "Homebody",         # furniture
    3: "Geek",             # computers
    4: "Film_Buff",        # video
    5: "Fashionista",      # apparel
    6: "Culinarian",       # kitchen
    7: "Audiophile",       # audio
    8: "Builder",          # construction / country_yard
    9: "Environmentalist", # environment
    10: "Unknown",         # Empty cluster
    11: "Parent",          # kids
    12: "Athlete",         # sport
    13: "Photographer",    # camera
}

# 将 cluster 转换为标签
data["predicted_label"] = data["cluster"].map(cluster_to_label)

In [15]:
import pandas as pd
from sklearn.metrics import classification_report, f1_score
from scipy.optimize import linear_sum_assignment
from sklearn.preprocessing import LabelEncoder

In [26]:
label_data = pd.read_csv("/Users/cookie/Desktop/Test/chunk_0_final_labeled.csv")  # 包含 user_id 和 labels 列
need = ['user_id', 'labels']
label_data = label_data[need]

label_data = label_data[label_data['labels'].notna()]

label_data

Unnamed: 0,user_id,labels
0,519770564,Maker.Sport
1,514292868,Culinarian
2,565527133,Culinarian
3,514066699,Fashionista.Geek
4,542691448,Fashionista.Homebody
...,...,...
1010,549530613,Fashionista
1011,514358357,Car_Enthusiast.Media_Aficionado
1012,546309510,Culinarian.Geek
1013,562298413,Audiophile.Geek.Media_Aficionado


In [27]:
# 提取主标签
label_data['true_label'] = label_data['labels'].str.split('.').str[0]

In [28]:
# 合并原始标签和预测标签
labeled_user_ids = label_data['user_id'].tolist()
predicted_labels_df = data[data['user_id'].isin(labeled_user_ids)].copy()

merged_data = label_data.merge(predicted_labels_df, on="user_id", how="inner")

# 检查合并结果
print(merged_data[["user_id", "predicted_label", "true_label"]])

        user_id predicted_label      true_label
0     519770564         Builder           Maker
1     514292868      Culinarian      Culinarian
2     565527133      Culinarian      Culinarian
3     514066699     Fashionista     Fashionista
4     542691448     Fashionista     Fashionista
...         ...             ...             ...
1010  549530613     Fashionista     Fashionista
1011  514358357  Car_Enthusiast  Car_Enthusiast
1012  546309510            Geek      Culinarian
1013  562298413            Geek      Audiophile
1014  527406214          Parent       Caregiver

[1015 rows x 3 columns]


In [30]:
from sklearn.metrics import precision_recall_fscore_support

true_labels = merged_data["true_label"]
predicted_labels = merged_data["predicted_label"]


In [32]:
# Calculate Precision, Recall, F1 Score
precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predicted_labels, average='weighted')
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Precision: 0.7881
Recall: 0.6512
F1 Score: 0.7032


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# all-featuers

In [5]:
# 特征工程：聚合用户行为
user_features = filtered_data.groupby('user_id').agg({
    'filtered_category': lambda x: x.value_counts().to_dict(),  # 统计用户浏览的种类
    'brand': 'nunique',  # 品牌数量
    'price': ['mean', 'sum'],  # 平均价格和总消费
    'event_type': 'count'  # 总事件数
}).reset_index()

user_features.columns = ['user_id', 'category_counts', 'brand_count', 'avg_price', 'total_spent', 'total_events']


In [6]:
# 转换 category_counts 为特征列
category_counts = pd.json_normalize(user_features['category_counts']).fillna(0)
user_features = pd.concat([user_features.drop(columns=['category_counts']), category_counts], axis=1)

In [8]:
# 数据标准化
scaler = StandardScaler()
scaled_features = scaler.fit_transform(user_features.drop(columns=['user_id']))

In [9]:
# 应用 K-means
kmeans = KMeans(n_clusters=5, random_state=42)  # 假设分3类
user_features['cluster'] = kmeans.fit_predict(scaled_features)

# 聚类结果
print(user_features[['user_id', 'cluster']])


         user_id  cluster
0      315720851        0
1      340041246        0
2      367138781        0
3      370076704        0
4      384989212        0
...          ...      ...
76525  566253970        0
76526  566254699        0
76527  566257884        3
76528  566259665        0
76529  566275254        0

[76530 rows x 2 columns]
