# F1 Score

In [1]:
from sklearn.metrics import precision_recall_fscore_support, classification_report
import pandas as pd

In [2]:
ori_df = pd.read_csv('/Users/cookie/Desktop/FYP/1000k/parameter_labeled.csv')
need = ['user_id', 'labels']
ori_df = ori_df[need]

ori_df = ori_df[ori_df['labels'].notna()]

ori_df

Unnamed: 0,user_id,labels
0,574332796,Homebody.Mid_Consumer.Photophile
1,580039020,Culinarian.Mid_Consumer
2,512771282,Brand_loyalty.Fashionista
3,541502840,Fashionista.Geek.Lifestyle.Mid_Consumer
4,572140044,Audiophile.Brand_loyalty.Culinarian.Fashionist...
...,...,...
176,571668107,Fashionista.Geek.Night_owl
177,528061810,Fashionista.Night_owl.Sport
178,515202238,Audiophile.Caregiver.Culinarian.high_consumer
179,514778515,Audiophile.Early_bird.Fashionista.Geek.Mid_Con...


In [3]:
pred_df = pd.read_csv('/Users/cookie/Desktop/labeled_users.csv')
pred_df

Unnamed: 0,user_id,labels
0,574332796,Culinarian.Homebody.Mid_Consumer.Photophile
1,580039020,Culinarian.Maker.Mid_Consumer
2,512771282,Brand_loyalty.Fashionista.Homebody.Sport
3,541502840,Fashionista.Geek
4,572140044,Audiophile.Brand_loyalty.Culinarian.Fashionist...
...,...,...
3597,514055809,Audiophile.Culinarian.Fashionista.Maker.Mid_Co...
3598,530089063,Audiophile.Fashionista.Geek.Lifestyle.Maker.Ni...
3599,575446586,Audiophile.Caregiver.Culinarian.Geek.Media_Afi...
3600,512384791,Culinarian.Fashionista.Lifestyle.Maker.Mid_Con...


In [4]:
labeled_user_ids = ori_df['user_id'].tolist()
predicted_labels_df = pred_df[pred_df['user_id'].isin(labeled_user_ids)]

In [5]:
merged_df = ori_df.merge(pred_df, on='user_id', how='left', suffixes=('_true', '_pred'))
merged_df['labels_pred'].fillna('', inplace=True)

merged_df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_df['labels_pred'].fillna('', inplace=True)


Unnamed: 0,user_id,labels_true,labels_pred
0,574332796,Homebody.Mid_Consumer.Photophile,Culinarian.Homebody.Mid_Consumer.Photophile
1,580039020,Culinarian.Mid_Consumer,Culinarian.Maker.Mid_Consumer
2,512771282,Brand_loyalty.Fashionista,Brand_loyalty.Fashionista.Homebody.Sport
3,541502840,Fashionista.Geek.Lifestyle.Mid_Consumer,Fashionista.Geek
4,572140044,Audiophile.Brand_loyalty.Culinarian.Fashionist...,Audiophile.Brand_loyalty.Culinarian.Fashionist...
...,...,...,...
176,571668107,Fashionista.Geek.Night_owl,Fashionista.Geek.Night_owl
177,528061810,Fashionista.Night_owl.Sport,Culinarian.Fashionista.Mid_Consumer.Night_owl....
178,515202238,Audiophile.Caregiver.Culinarian.high_consumer,Audiophile.Caregiver.Culinarian.Geek.high_cons...
179,514778515,Audiophile.Early_bird.Fashionista.Geek.Mid_Con...,Audiophile.Culinarian.Early_bird.Fashionista.M...


In [6]:
merged_df['labels_true'] = merged_df['labels_true'].apply(lambda x: set(x.split('.')))
merged_df['labels_pred'] = merged_df['labels_pred'].apply(lambda x: set(x.split('.')))

merged_df

Unnamed: 0,user_id,labels_true,labels_pred
0,574332796,"{Photophile, Homebody, Mid_Consumer}","{Culinarian, Photophile, Homebody, Mid_Consumer}"
1,580039020,"{Culinarian, Mid_Consumer}","{Maker, Culinarian, Mid_Consumer}"
2,512771282,"{Fashionista, Brand_loyalty}","{Sport, Fashionista, Brand_loyalty, Homebody}"
3,541502840,"{Lifestyle, Fashionista, Geek, Mid_Consumer}","{Fashionista, Geek}"
4,572140044,"{Night_owl, Mid_Consumer, Brand_loyalty, Culin...","{Night_owl, Mid_Consumer, Brand_loyalty, Culin..."
...,...,...,...
176,571668107,"{Night_owl, Fashionista, Geek}","{Night_owl, Fashionista, Geek}"
177,528061810,"{Night_owl, Sport, Fashionista}","{Night_owl, Sport, Mid_Consumer, Culinarian, F..."
178,515202238,"{Audiophile, Culinarian, Caregiver, high_consu...","{Caregiver, Geek, Culinarian, high_consumer, A..."
179,514778515,"{Geek, Mid_Consumer, Early_bird, Fashionista, ...","{Mid_Consumer, Early_bird, Culinarian, Fashion..."


In [7]:
# Step 4: 计算多标签分类指标
y_true = merged_df['labels_true'].tolist()
y_pred = merged_df['labels_pred'].tolist()

In [8]:
# 创建全集标签
all_labels = set.union(*y_true, *y_pred)
all_labels = sorted(all_labels)  # 确保标签顺序一致

In [9]:
# 将标签集转换为多热编码
def encode_labels(labels, all_labels):
    return [1 if label in labels else 0 for label in all_labels]

In [10]:
y_true_encoded = [encode_labels(labels, all_labels) for labels in y_true]
y_pred_encoded = [encode_labels(labels, all_labels) for labels in y_pred]

In [11]:
# 计算 Precision, Recall, F1 分数
precision, recall, f1, _ = precision_recall_fscore_support(y_true_encoded, y_pred_encoded, average='micro')
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Precision: 0.7880
Recall: 0.8297
F1 Score: 0.8083


# Modularity

In [3]:
import sys
import os
import numpy as np
import networkx as nx
from datasets import load_dataset

sys.path.append(os.path.abspath("/Users/cookie/Desktop/LLMRecSys/Personality_Analysis/Model_training"))


In [4]:
from SSGB import preprocessing, GraphClustering  # 替换为你要调用的函数名


In [5]:
fi = '/Users/cookie/Desktop/FYP/1000k/parameter_labeled.csv'

access_token = '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'
original_embedding = load_dataset(
    "CookieLyu/Category_Codes",
    revision="1000k_average_embedded",
    token=access_token
)

augmented_embedding = load_dataset(
    "CookieLyu/Category_Codes",
    revision="1000k_average_embedded_aug",
    token=access_token
)

label_types = ['Night_owl', 'Early_bird', 'Decisive', 'Brand_loyalty', 'Maker', 'Homebody', 'Culinarian', 'Geek',
               'Photophile', 'Media_Aficionado', 'Audiophile', 'Fashionista', 'Lifestyle', 'Car_Enthusiast',
               'Caregiver', 'Health_Enthusiast', 'Farm', 'Sport', 'high_consumer', 'Mid_Consumer']  # 提供你的标签类别
k_neighbors = 5
threshold = 0.1
step = 3

preprocessor = preprocessing(fi, label_types, original_embedding, augmented_embedding)
all_features, seed_indices, merged_labels, consistency_loss = preprocessor.run()

graph_clustering = GraphClustering(all_features, k_neighbors, threshold, merged_labels, seed_indices, consistency_loss, step)
candidates, adj_matrix = graph_clustering.run()

# 构建图
G = graph_clustering.construct_graph()
G = nx.from_numpy_array(G)

# 计算图的模块度
from networkx.algorithms.community.quality import modularity

communities = nx.algorithms.community.label_propagation_communities(G)
modularity_score = modularity(G, communities)
print(f"Modularity: {modularity_score}")


[*] Reading from parameter_labeled.csv...
[*] Data preview:
------------------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3621 entries, 0 to 3620
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   user_id                     3621 non-null   int64  
 1   average_time_float          3621 non-null   float64
 2   purchase_ratio              3621 non-null   float64
 3   average_price               3621 non-null   float64
 4   brand_loyalty_ratio         3621 non-null   float64
 5   most_freq_category_1        3621 non-null   object 
 6   category_1_activity_weight  3621 non-null   float64
 7   most_freq_category_2        3621 non-null   object 
 8   category_2_activity_weight  3621 non-null   float64
 9   most_freq_category_3        3621 non-null   object 
 10  category_3_activity_weight  3621 non-null   float64
 11  labels        

100%|██████████| 181/181 [00:00<00:00, 10505.49it/s]

[*] Preprocessing completed in 0.20 seconds.
[*] The result of OneHotEncoder is:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0




Modularity: 0.6345088485711736
