In [1]:
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple LightFM

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
import numpy as np
from lightfm import LightFM 
from lightfm.data import Dataset
from scipy.sparse import csr_matrix
import pickle
from sklearn.model_selection import train_test_split
from lightfm.evaluation import precision_at_k, recall_at_k
from lightfm.evaluation import auc_score



In [None]:
# Load the pickle files into DataFrames
prod_f1 = pd.read_pickle('./product_features1.pkl')
u_f = pd.read_pickle("./user_features1.pkl")
u_prod_f = pd.read_pickle('./user_product_features1.pkl')

# Display the DataFrames (optional)
print(prod_f1.head())
print(f"Number of rows in prod_f1: {len(prod_f1)}")
len(prod_f1)


   product_id  p_users_unique  p_users_total  p_reordered_total  \
0           1             716           1852               1136   
1           2              78             90                 12   
2           3              74            277                203   
3           4             182            329                147   
4           5               6             15                  9   

   p_reordered_percentage  p_avg_cart  p_order_first_cnt  p_order_second_cnt  \
0                0.613391    5.801836                716                 276   
1                0.133333    9.888889                 78                   8   
2                0.732852    6.415162                 74                  36   
3                0.446809    9.507599                182                  64   
4                0.600000    6.466667                  6                   4   

   p_order_second_percent                                       product_name  \
0                0.385475           

49677

In [5]:
print(u_prod_f.head())
len(u_prod_f)

   user_id  product_id  u_p_avg_cart  u_p_avg_days_since_prior  \
0        1         196      1.400000                 17.600000   
1        1       10258      3.333333                 19.555555   
2        1       10326      5.000000                 28.000000   
3        1       12427      3.300000                 17.600000   
4        1       13032      6.333333                 21.666666   

   u_p_orders_total  u_p_reordered_total  u_p_reordered_percentage  \
0                10                    9                  0.900000   
1                 9                    8                  0.888889   
2                 1                    0                  0.000000   
3                10                    9                  0.900000   
4                 3                    2                  0.666667   

   u_p_last_order  is_reorder_3  is_reorder_2  is_reorder_1  
0              10           1.0           1.0           1.0  
1              10           1.0           1.0           1.

13307953

In [6]:
print(u_f.head())
len(u_f)

   user_id  u_mean_dow  u_std_dow  u_avg_hour  u_std_hour  \
0        1    2.644068   1.256194   10.542373    3.500355   
1        2    2.005128   0.971222   10.441026    1.649854   
2        3    1.011364   1.245630   16.352273    1.454599   
3        4    4.722222   0.826442   13.111111    1.745208   
4        5    1.621622   1.276961   15.729730    2.588958   

   u_avg_days_since_prior  u_std_days_since_prior  u_orders_total  \
0               18.542374               10.559065              10   
1               14.902564                9.671712              14   
2               10.181818                5.867396              12   
3               11.944445                9.973330               5   
4               10.189189                7.600577               4   

   u_products_total  u_products_unique  u_reordered_total  \
0                59                 18                 41   
1               195                102                 93   
2                88                

206209

In [9]:
correlation_matrix = u_f.corr()
print(correlation_matrix)

                         user_id  u_mean_dow  u_std_dow  u_avg_hour  \
user_id                 1.000000   -0.001486   0.000847    0.001170   
u_mean_dow             -0.001486    1.000000   0.167551   -0.002256   
u_std_dow               0.000847    0.167551   1.000000    0.069197   
u_avg_hour              0.001170   -0.002256   0.069197    1.000000   
u_std_hour              0.002463    0.059094   0.172756   -0.006721   
u_avg_days_since_prior  0.003232   -0.036810  -0.040949    0.042678   
u_std_days_since_prior  0.003901   -0.031917  -0.077813    0.060456   
u_orders_total         -0.002122    0.019901   0.166557   -0.062973   
u_products_total       -0.002171   -0.008535   0.158198   -0.042106   
u_products_unique      -0.002037   -0.012439   0.193353   -0.013795   
u_reordered_total      -0.002071   -0.006559   0.134868   -0.049353   
u_reordered_percentage -0.002306   -0.012274   0.160319   -0.074130   
u_avg_order_size       -0.002283   -0.065106   0.036460    0.025225   
u_avg_

In [None]:
import seaborn as sns

import matplotlib.pyplot as plt

# 绘制相关性矩阵的热力图
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap="coolwarm", cbar=True)
plt.title("Correlation Matrix Heatmap")
plt.show()

In [8]:
# Number of unique products
num_unique_products = prod_f1['product_id'].nunique()
print(f"Number of unique products: {num_unique_products}")

# Number of unique customers
num_unique_customers = u_f['user_id'].nunique()
print(f"Number of unique customers: {num_unique_customers}")

Number of unique products: 49677
Number of unique customers: 206209


In [9]:
# Number of unique users
num_unique_users = u_f['user_id'].nunique()
print(f"Number of unique users: {num_unique_users}")

# Display unique user IDs
unique_user_ids = u_f['user_id'].unique()
print(f"Unique user IDs: {unique_user_ids}")

Number of unique users: 206209
Unique user IDs: [     1      2      3 ... 206207 206208 206209]


In [10]:
cs= u_prod_f[['user_id', 'product_id', 'u_p_orders_total']]

In [11]:
# Step 1: Prepare the interaction matrix
# Use the user-product interaction data (u_prod_f) to create the interaction matrix
# Create index mappings for product_id
unique_product_ids = u_prod_f['product_id'].unique()
id_index = {product_id: idx for idx, product_id in enumerate(unique_product_ids)}
index_id = {idx: product_id for product_id, idx in id_index.items()}

# Map product_id to integer indices
u_prod_f['product_id'] = u_prod_f['product_id'].map(id_index)
u_prod_f = u_prod_f.sample(frac=1, random_state=42)
interaction_data = u_prod_f[['user_id', 'product_id', 'u_p_orders_total']]
# Create the interaction matrix
interaction_matrix = csr_matrix(
    (interaction_data['u_p_orders_total'], 
     (interaction_data['user_id'], interaction_data['product_id'])), shape=(num_unique_users + 1, num_unique_products + 1)
)
# Split the interaction matrix into train and test sets
train_data, test_data = train_test_split(interaction_data, test_size=0.2, random_state=42)

train_interaction_matrix = csr_matrix(
    (train_data['u_p_orders_total'], 
     (train_data['user_id'], train_data['product_id'])), shape=(num_unique_users + 1, num_unique_products + 1)
)

test_interaction_matrix = csr_matrix(
    (test_data['u_p_orders_total'], 
     (test_data['user_id'], test_data['product_id'])), shape=(num_unique_users + 1, num_unique_products + 1)
)

In [None]:
# 将 train_interaction_matrix 转换为稀疏矩阵的 COO 格式
train_coo = train_interaction_matrix.tocoo()

# 创建 DataFrame 表示用户与产品的交互数据
interaction_df = pd.DataFrame({
    'user_id': train_coo.row,
    'product_id': train_coo.col,
    'interaction_value': train_coo.data
})

# 按 user_id 聚合交互数据，计算每个用户的总交互值
user_interaction_summary = interaction_df.groupby('user_id')['interaction_value'].sum().reset_index()
user_interaction_summary.rename(columns={'interaction_value': 'total_interaction'}, inplace=True)

# 将用户交互数据与 u_f 合并
u_f_with_interaction = pd.merge(u_f, user_interaction_summary, left_on='user_id', right_on='user_id', how='left')

# 计算相关性矩阵
correlation_matrix = u_f_with_interaction.corr()

# 输出相关性矩阵
print(correlation_matrix['total_interaction'].sort_values(ascending=False))
u_f_with_interaction = u_f_with_interaction[['user_id', 'u_products_total']]
print(u_f_with_interaction.head())

In [12]:
print(train_interaction_matrix)

  (1, 0)	10
  (1, 3)	10
  (1, 4)	3
  (1, 6)	1
  (1, 7)	1
  (1, 9)	2
  (1, 11)	1
  (1, 12)	1
  (1, 13)	1
  (1, 14)	1
  (1, 15)	1
  (1, 16)	3
  (1, 17)	2
  (2, 5)	1
  (2, 18)	1
  (2, 19)	1
  (2, 20)	6
  (2, 21)	4
  (2, 22)	2
  (2, 23)	1
  (2, 24)	1
  (2, 25)	1
  (2, 26)	1
  (2, 30)	1
  (2, 31)	3
  :	:
  (206209, 5214)	1
  (206209, 5221)	2
  (206209, 5276)	1
  (206209, 5307)	2
  (206209, 5742)	1
  (206209, 5823)	1
  (206209, 6367)	1
  (206209, 6372)	1
  (206209, 6754)	1
  (206209, 6941)	10
  (206209, 7204)	1
  (206209, 7368)	1
  (206209, 8519)	2
  (206209, 9174)	3
  (206209, 10175)	1
  (206209, 11550)	2
  (206209, 11690)	3
  (206209, 12001)	4
  (206209, 13042)	2
  (206209, 13164)	1
  (206209, 13910)	1
  (206209, 19457)	1
  (206209, 22799)	2
  (206209, 23026)	1
  (206209, 23954)	1


In [13]:
# Check the shape of the interaction matrix
print("Train Interaction Matrix Shape:", train_interaction_matrix.shape)
print("Test Interaction Matrix Shape:", test_interaction_matrix.shape)

Train Interaction Matrix Shape: (206210, 49678)
Test Interaction Matrix Shape: (206210, 49678)


In [14]:
# Initialize the LightFM model
model = LightFM(loss='warp')

# Fit the model using the training interaction matrix
model.fit(train_interaction_matrix, epochs=5, num_threads=4)

<lightfm.lightfm.LightFM at 0x79e2fe478b80>

In [15]:
import time
start = time.time()

auc = auc_score(
    model=model,
    test_interactions=test_interaction_matrix,
    train_interactions=train_interaction_matrix,
    num_threads=4,
    check_intersections=False 
)

end = time.time()

print("Evaluation Time: {:.2f} seconds".format(end - start))
print("AUC Score: {:.4f}".format(auc.mean()))

Evaluation Time: 258.65 seconds
AUC Score: 0.9522


In [16]:
pickle.dump(model, open( "model.p", "wb" ) )

In [17]:
#precision evaluation
start = time.time()
model=precision_at_k(model = model, 
                        test_interactions = test_interaction_matrix,k=10,
                        num_threads = 4, check_intersections = False)
end = time.time()
pickle.dump(model, open( "model.p", "wb" ) )
print("Precision at k score = {0:.{1}f}".format(auc.mean(), 2))
print("time taken for precision at k evaluation = {0:.{1}f} seconds".format(end - start, 2))

Precision at k score = 0.95
time taken for precision at k evaluation = 211.40 seconds


In [18]:
prod_f1

Unnamed: 0,product_id,p_users_unique,p_users_total,p_reordered_total,p_reordered_percentage,p_avg_cart,p_order_first_cnt,p_order_second_cnt,p_order_second_percent,product_name,...,a_users_total,a_reordered_total,a_reordered_percentage,a_avg_cart,d_users_unique,d_users_total,d_reordered_total,d_reordered_percentage,d_avg_cart,department
0,1,716,1852,1136,0.613391,5.801836,716,276,0.385475,Chocolate Sandwich Cookies,...,234065,128431,0.548698,9.253092,174219,2887550,1657973,0.574180,9.187743,snacks
1,2,78,90,12,0.133333,9.888889,78,8,0.102564,All-Seasons Salt,...,212092,32321,0.152391,9.996181,172755,1875577,650301,0.346721,9.593425,pantry
2,3,74,277,203,0.732852,6.415162,74,36,0.486486,Robust Golden Unsweetened Oolong Tea,...,249341,131556,0.527615,8.519846,172795,2690129,1757892,0.653460,6.976699,beverages
3,4,182,329,147,0.446809,9.507599,182,64,0.351648,Smart Ones Classic Favorites Mini Rigatoni Wit...,...,390299,217262,0.556655,9.207741,163233,2236432,1211890,0.541885,8.996414,frozen
4,5,6,15,9,0.600000,6.466667,6,4,0.666667,Green Chile Anytime Sauce,...,62510,17542,0.280627,10.297600,172755,1875577,650301,0.346721,9.593425,pantry
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49672,49684,8,9,1,0.111111,4.333333,8,1,0.125000,"Vodka, Triple Distilled, Twist of Vanilla",...,28102,16084,0.572344,4.852751,15798,153696,87595,0.569924,5.428346,alcohol
49673,49685,43,49,6,0.122449,9.571429,43,6,0.139535,En Croute Roast Hazelnut Cranberry,...,99369,53875,0.542171,9.294961,163233,2236432,1211890,0.541885,8.996414,frozen
49674,49686,36,120,84,0.700000,7.500000,36,16,0.444444,Artisan Baguette,...,584834,391937,0.670168,7.515117,140612,1176787,739188,0.628141,8.084397,bakery
49675,49687,7,13,6,0.461538,7.538462,7,4,0.571429,Smartblend Healthy Metabolism Dry Cat Food,...,63421,39377,0.620883,7.512244,14986,97724,58760,0.601285,7.718544,pets


In [19]:
print(u_f)

        user_id  u_mean_dow  u_std_dow  u_avg_hour  u_std_hour  \
0             1    2.644068   1.256194   10.542373    3.500355   
1             2    2.005128   0.971222   10.441026    1.649854   
2             3    1.011364   1.245630   16.352273    1.454599   
3             4    4.722222   0.826442   13.111111    1.745208   
4             5    1.621622   1.276961   15.729730    2.588958   
...         ...         ...        ...         ...         ...   
206204   206205    3.718750   1.084625   13.625000    1.791557   
206205   206206    2.312281   1.929394   16.796491    2.282972   
206206   206207    2.896861   2.051837   13.130045    4.499864   
206207   206208    2.760709   1.734285   13.968981    3.802901   
206208   206209    2.658915   1.538547   12.922481    2.737507   

        u_avg_days_since_prior  u_std_days_since_prior  u_orders_total  \
0                    18.542374               10.559065              10   
1                    14.902564                9.671712     

In [20]:
print(u_f.dtypes)
print(u_f.shape)

user_id                     int32
u_mean_dow                float64
u_std_dow                 float64
u_avg_hour                float64
u_std_hour                float64
u_avg_days_since_prior    float32
u_std_days_since_prior    float64
u_orders_total              int64
u_products_total            int64
u_products_unique           int64
u_reordered_total           int64
u_reordered_percentage    float64
u_avg_order_size          float64
u_avg_reordered_orders    float64
orders_3                  float64
orders_2                  float64
orders_1                  float64
reorder_3                 float64
reorder_2                 float64
reorder_1                 float64
dtype: object
(206209, 20)


In [21]:
print(prod_f1.dtypes)
print(prod_f1.shape)
prod_f1=prod_f1.drop(columns =['department','aisle','product_name'])

product_id                  int32
p_users_unique              int64
p_users_total               int64
p_reordered_total           int64
p_reordered_percentage    float64
p_avg_cart                float64
p_order_first_cnt           int64
p_order_second_cnt          int64
p_order_second_percent    float64
product_name               object
aisle_id                    int16
department_id                int8
aisle                      object
a_users_unique              int64
a_users_total               int64
a_reordered_total           int64
a_reordered_percentage    float64
a_avg_cart                float64
d_users_unique              int64
d_users_total               int64
d_reordered_total           int64
d_reordered_percentage    float64
d_avg_cart                float64
department                 object
dtype: object
(49677, 24)


In [3]:
print(u_prod_f.dtypes)

user_id                       int32
product_id                    int32
u_p_avg_cart                float64
u_p_avg_days_since_prior    float32
u_p_orders_total              int64
u_p_reordered_total            int8
u_p_reordered_percentage    float64
u_p_last_order                 int8
is_reorder_3                float64
is_reorder_2                float64
is_reorder_1                float64
dtype: object


In [22]:
user_features_col = u_f.drop(columns =['user_id']).columns.values

In [23]:
print(user_features_col)

['u_mean_dow' 'u_std_dow' 'u_avg_hour' 'u_std_hour'
 'u_avg_days_since_prior' 'u_std_days_since_prior' 'u_orders_total'
 'u_products_total' 'u_products_unique' 'u_reordered_total'
 'u_reordered_percentage' 'u_avg_order_size' 'u_avg_reordered_orders'
 'orders_3' 'orders_2' 'orders_1' 'reorder_3' 'reorder_2' 'reorder_1']


In [24]:
user_feat = u_f.drop(columns =['user_id']).to_dict(orient='records')

In [25]:
user_feat[:5] 

[{'u_mean_dow': 2.6440677966101696,
  'u_std_dow': 1.256194470234808,
  'u_avg_hour': 10.542372881355933,
  'u_std_hour': 3.5003548288034687,
  'u_avg_days_since_prior': 18.542373657226562,
  'u_std_days_since_prior': 10.559065452864308,
  'u_orders_total': 10,
  'u_products_total': 59,
  'u_products_unique': 18,
  'u_reordered_total': 41,
  'u_reordered_percentage': 0.6949152542372882,
  'u_avg_order_size': 5.9,
  'u_avg_reordered_orders': 0.7058333333333333,
  'orders_3': 6.0,
  'orders_2': 6.0,
  'orders_1': 9.0,
  'reorder_3': 0.6666666666666666,
  'reorder_2': 1.0,
  'reorder_1': 0.6666666666666666},
 {'u_mean_dow': 2.005128205128205,
  'u_std_dow': 0.9712221862298618,
  'u_avg_hour': 10.441025641025641,
  'u_std_hour': 1.649854406059765,
  'u_avg_days_since_prior': 14.90256404876709,
  'u_std_days_since_prior': 9.671711518690296,
  'u_orders_total': 14,
  'u_products_total': 195,
  'u_products_unique': 102,
  'u_reordered_total': 93,
  'u_reordered_percentage': 0.4769230769230769

In [26]:
item_features_col = prod_f1.drop(columns=['product_id']).columns.values

In [27]:
item_feat = prod_f1.drop(columns=['product_id']).to_dict(orient='records')

In [28]:
# Step 1: Prepare user and product features
dataset = Dataset()

# Build user and item feature mappings
dataset.fit(
    users=[x for x in u_f['user_id']],
    items=[x for x in prod_f1['product_id']],
    user_features=user_features_col,  # Exclude 'user_id'
    item_features=item_features_col   # Exclude 'product_id'
)


In [29]:
num_users, num_items = dataset.interactions_shape()
print('Num users: {}, num_items {}.'.format(num_users, num_items))

Num users: 206209, num_items 49677.


In [30]:
item_features = dataset.build_item_features((x,y) for x,y in zip(prod_f1['product_id'],item_feat))

In [31]:
user_features = dataset.build_user_features((x,y) for x,y in zip(u_f['user_id'],user_feat))

In [32]:
print(user_features.dtype)
print(item_features.dtype)

float32
float32


In [33]:
print(user_features.shape)
print(item_features.shape)

(206209, 206228)
(49677, 49697)


In [34]:
print(user_features)

  (0, 0)	0.004838432185351849
  (0, 206209)	0.012793143279850483
  (0, 206210)	0.006078011821955442
  (0, 206211)	0.05100855603814125
  (0, 206212)	0.016936229541897774
  (0, 206213)	0.08971602469682693
  (0, 206214)	0.0510893240571022
  (0, 206215)	0.048384323716163635
  (0, 206216)	0.2854675054550171
  (0, 206217)	0.08709178119897842
  (0, 206218)	0.19837573170661926
  (0, 206219)	0.0033623003400862217
  (0, 206220)	0.028546752408146858
  (0, 206221)	0.003415126819163561
  (0, 206222)	0.02903059497475624
  (0, 206223)	0.02903059497475624
  (0, 206224)	0.04354589059948921
  (0, 206225)	0.0032256217673420906
  (0, 206226)	0.004838432185351849
  (0, 206227)	0.0032256217673420906
  (1, 1)	0.0019813794642686844
  (1, 206209)	0.003972919657826424
  (1, 206210)	0.0019243595888838172
  (1, 206211)	0.020687632262706757
  (1, 206212)	0.0032689874060451984
  :	:
  (206207, 206223)	0.005320543423295021
  (206207, 206224)	0.011306154541671276
  (206207, 206225)	0.0006650679279118776
  (206207, 20

In [35]:
print(item_features)

  (0, 0)	1.9450547483756964e-07
  (0, 49677)	0.00013926591782364994
  (0, 49678)	0.0003602241340558976
  (0, 49679)	0.00022095821623224765
  (0, 49680)	1.1930789867165004e-07
  (0, 49681)	1.1284888614682131e-06
  (0, 49682)	0.00013926591782364994
  (0, 49683)	5.368351048673503e-05
  (0, 49684)	7.497697396274816e-08
  (0, 49685)	1.1864834050356876e-05
  (0, 49686)	3.6956041640223702e-06
  (0, 49687)	0.010542585514485836
  (0, 49688)	0.04552692547440529
  (0, 49689)	0.024980533868074417
  (0, 49690)	1.0672476946638199e-07
  (0, 49691)	1.7997770100919297e-06
  (0, 49692)	0.03388654813170433
  (0, 49693)	0.5616443157196045
  (0, 49694)	0.3224848210811615
  (0, 49695)	1.1168112479253978e-07
  (0, 49696)	1.7870662532004644e-06
  (1, 1)	3.3114093866970506e-07
  (1, 49677)	2.5828992875176482e-05
  (1, 49678)	2.9802684366586618e-05
  (1, 49679)	3.973691036662785e-06
  :	:
  (49675, 49693)	0.3461854159832001
  (49675, 49694)	0.20815619826316833
  (49675, 49695)	2.130041821146733e-06
  (49675, 49

In [36]:
print(user_features.shape)

(206209, 206228)


In [37]:
print(user_features[:, 0])

  (0, 0)	0.004838432185351849


In [38]:
data = user_features.data
rows, cols = user_features.nonzero()
print(f"len(data): {len(data)}")
print(f"len(rows): {len(rows)}")
print(f"len(cols): {len(cols)}")


len(data): 4124180
len(rows): 4039477
len(cols): 4039477


In [39]:
user_features_coo = user_features.tocoo()
rows = user_features_coo.row
cols = user_features_coo.col
data = user_features_coo.data

# 再检查是否长度一致
print(f"len(data): {len(data)}")
print(f"len(rows): {len(rows)}")
print(f"len(cols): {len(cols)}")

len(data): 4124180
len(rows): 4124180
len(cols): 4124180


In [40]:
rows_shifted = rows + 1

# 新的矩阵大小（行多一行）
new_shape = (user_features.shape[0] + 1, user_features.shape[1])

# 构建新的 csr_matrix
user_features_row_shifted = csr_matrix((data, (rows_shifted, cols)), shape=new_shape)

In [41]:
print(user_features_row_shifted)

  (1, 0)	0.004838432185351849
  (1, 206209)	0.012793143279850483
  (1, 206210)	0.006078011821955442
  (1, 206211)	0.05100855603814125
  (1, 206212)	0.016936229541897774
  (1, 206213)	0.08971602469682693
  (1, 206214)	0.0510893240571022
  (1, 206215)	0.048384323716163635
  (1, 206216)	0.2854675054550171
  (1, 206217)	0.08709178119897842
  (1, 206218)	0.19837573170661926
  (1, 206219)	0.0033623003400862217
  (1, 206220)	0.028546752408146858
  (1, 206221)	0.003415126819163561
  (1, 206222)	0.02903059497475624
  (1, 206223)	0.02903059497475624
  (1, 206224)	0.04354589059948921
  (1, 206225)	0.0032256217673420906
  (1, 206226)	0.004838432185351849
  (1, 206227)	0.0032256217673420906
  (2, 1)	0.0019813794642686844
  (2, 206209)	0.003972919657826424
  (2, 206210)	0.0019243595888838172
  (2, 206211)	0.020687632262706757
  (2, 206212)	0.0032689874060451984
  :	:
  (206208, 206223)	0.005320543423295021
  (206208, 206224)	0.011306154541671276
  (206208, 206225)	0.0006650679279118776
  (206208, 20

In [42]:
data1 = item_features.data
rows1, cols1 =item_features.nonzero()
print(f"len(data): {len(data1)}")
print(f"len(rows): {len(rows1)}")
print(f"len(cols): {len(cols1)}")

len(data): 1043217
len(rows): 1025729
len(cols): 1025729


In [43]:
item_features_coo = item_features.tocoo()
rows1 = item_features_coo.row
cols1 = item_features_coo.col
data1 = item_features_coo.data

# 再检查是否长度一致
print(f"len(data): {len(data1)}")
print(f"len(rows): {len(rows1)}")
print(f"len(cols): {len(cols1)}")

len(data): 1043217
len(rows): 1043217
len(cols): 1043217


In [44]:
rows_shifted1 = rows1 + 1

# 新的矩阵大小（行多一行）
new_shape1 = (item_features.shape[0] + 1, item_features.shape[1])

# 构建新的 csr_matrix
item_features_row_shifted = csr_matrix((data1, (rows_shifted1, cols1)), shape=new_shape1)

In [45]:
print(item_features_row_shifted.shape)

(49678, 49697)


In [None]:
from sklearn.model_selection import ParameterGrid

# Step 2: Initialize the LightFM model
model2 = LightFM(loss='warp')
# Define the parameter grid
param_grid = {
    'no_components': [10, 20, 30],
    'learning_rate': [0.01, 0.05, 0.1],
    'item_alpha': [1e-6, 1e-5],
    'user_alpha': [1e-6, 1e-5]
}

# Initialize variables to store the best parameters and score
best_params = None
best_precision = 0

# Perform grid search
for params in ParameterGrid(param_grid):
    model2 = LightFM(
        loss='warp',
        no_components=params['no_components'],
        learning_rate=params['learning_rate'],
        item_alpha=params['item_alpha'],
        user_alpha=params['user_alpha']
    )
    model2.fit(
        train_interaction_matrix,
        user_features=user_features_row_shifted,
        item_features=item_features_row_shifted,
        epochs=5,
        num_threads=1
    )
    precision = precision_at_k(
        model2,
        test_interactions=test_interaction_matrix,
        user_features=user_features_row_shifted,
        item_features=item_features_row_shifted,
        k=10,
        num_threads=4
    ).mean()
    
    if precision > best_precision:
        best_precision = precision
        best_params = params

print("Best Parameters:", best_params)
print("Best Precision:", best_precision)
# Step 3: Fit the model with user and product features
model2.fit(
    train_interaction_matrix,
    user_features=user_features_row_shifted,
    item_features=item_features_row_shifted,
    epochs=5,
    num_threads=1
)


<lightfm.lightfm.LightFM at 0x79e2f16f57e0>

In [47]:
start = time.time()
model_precision_hybrid2=precision_at_k(model = model2, 
                        test_interactions = test_interaction_matrix,k=10,user_features=user_features_row_shifted, item_features=item_features_row_shifted,
                        num_threads = 4, check_intersections = False)
end = time.time()
#pickle.dump(model_precision_hybrid2, open("model_precision_hybrid2.p", "wb" ) )
print("precision score for hybrid method= {0:.{1}f}".format(model_precision_hybrid2.mean(), 2))
print("time taken for precision at k evaluation = {0:.{1}f} seconds".format(end - start, 2))

precision score for hybrid method= 0.00
time taken for precision at k evaluation = 1076.86 seconds


In [None]:
# Step 2: Initialize the LightFM model
model3 = LightFM(loss='warp')
# Initial training with model.fit()
model3.fit(
    train_interaction_matrix,
    user_features=user_features_row_shifted,
    item_features=None,
    epochs=5,
    num_threads=1
)

# Further training with model.partial_fit()
model3.partial_fit(
    train_interaction_matrix,
    user_features=user_features_row_shifted,
    item_features=None,
    epochs=3,
    num_threads=1
)

<lightfm.lightfm.LightFM at 0x79e2f16f4220>

In [49]:
start = time.time()
model_precision_hybrid3=precision_at_k(model = model3, 
                        test_interactions = test_interaction_matrix,k=10,user_features=user_features_row_shifted,item_features=None,
                        num_threads = 4, check_intersections = False)
end = time.time()
#pickle.dump(model_precision_hybrid2, open("model_precision_hybrid2.p", "wb" ) )
print("precision score for hybrid method= {0:.{1}f}".format(model_precision_hybrid3.mean(), 2))
print("time taken for precision at k evaluation = {0:.{1}f} seconds".format(end - start, 2))

precision score for hybrid method= 0.05
time taken for precision at k evaluation = 214.94 seconds


In [50]:
start = time.time()
model_precision_hybrid3=precision_at_k(model = model3, 
                        test_interactions = test_interaction_matrix,k=5,user_features=user_features_row_shifted,item_features=None,
                        num_threads = 4, check_intersections = False)
end = time.time()
#pickle.dump(model_precision_hybrid2, open("model_precision_hybrid2.p", "wb" ) )
print("precision score for hybrid method= {0:.{1}f}".format(model_precision_hybrid3.mean(), 2))
print("time taken for precision at k evaluation = {0:.{1}f} seconds".format(end - start, 2))

precision score for hybrid method= 0.06
time taken for precision at k evaluation = 214.56 seconds


In [51]:
# Step 2: Initialize the LightFM model
model4 = LightFM(loss='warp')

# Step 3: Fit the model with user and product features
model4.fit(
    train_interaction_matrix,
    user_features=None,
    item_features=item_features_row_shifted,
    epochs=5,
    num_threads=1
)

<lightfm.lightfm.LightFM at 0x79e2f16f5270>

In [52]:
start = time.time()
model_precision_hybrid4=precision_at_k(model = model4, 
                        test_interactions = test_interaction_matrix,k=10,user_features=None, item_features=item_features_row_shifted,
                        num_threads = 4, check_intersections = False)
end = time.time()
#pickle.dump(model_precision_hybrid2, open("model_precision_hybrid2.p", "wb" ) )
print("precision score for hybrid method= {0:.{1}f}".format(model_precision_hybrid4.mean(), 2))
print("time taken for precision at k evaluation = {0:.{1}f} seconds".format(end - start, 2))

precision score for hybrid method= 0.00
time taken for precision at k evaluation = 1078.37 seconds


In [53]:
start = time.time()
model_precision_hybrid4=precision_at_k(model = model4, 
                        test_interactions = test_interaction_matrix,k=5,user_features=None, item_features=item_features_row_shifted,
                        num_threads = 4, check_intersections = False)
end = time.time()
#pickle.dump(model_precision_hybrid2, open("model_precision_hybrid2.p", "wb" ) )
print("precision score for hybrid method= {0:.{1}f}".format(model_precision_hybrid4.mean(), 2))
print("time taken for precision at k evaluation = {0:.{1}f} seconds".format(end - start, 2))

precision score for hybrid method= 0.00
time taken for precision at k evaluation = 1079.16 seconds


In [54]:
import pandas as pd
from sklearn.decomposition import PCA
u_f_features = u_f.drop(columns=['user_id'])  # 去掉 user_id 列
pca_u_f = PCA(n_components=0.99)  
u_f_pca_features = pca_u_f.fit_transform(u_f_features)

# 将 PCA 结果转换为 DataFrame 并添加 user_id
u_f_pca = pd.DataFrame(u_f_pca_features, columns=[f'pc_{i+1}' for i in range(u_f_pca_features.shape[1])])
u_f_pca['user_id'] = u_f['user_id']

# 对 prod_f1 进行 PCA 特征选择
prod_f1_features = prod_f1.drop(columns=['product_id'])  # 去掉 product_id 列
pca_prod_f1 = PCA(n_components=0.99) 
prod_f1_pca_features = pca_prod_f1.fit_transform(prod_f1_features)

# 将 PCA 结果转换为 DataFrame 并添加 product_id
prod_f1_pca = pd.DataFrame(prod_f1_pca_features, columns=[f'pc_{i+1}' for i in range(prod_f1_pca_features.shape[1])])
prod_f1_pca['product_id'] = prod_f1['product_id']

# 输出结果
print(u_f_pca.shape)
print(prod_f1_pca.shape)

(206209, 3)
(49677, 3)


In [55]:
user_features_col_pca =u_f_pca.drop(columns =['user_id']).columns.values
user_feat_pca = u_f_pca.drop(columns =['user_id']).to_dict(orient='records')
item_features_col_pca = prod_f1_pca.drop(columns=['product_id']).columns.values
item_feat_pca = prod_f1_pca.drop(columns=['product_id']).to_dict(orient='records')
# Step 1: Prepare user and product features
dataset = Dataset()

# Build user and item feature mappings
dataset.fit(
    users=[x for x in u_f_pca['user_id']],
    items=[x for x in prod_f1_pca['product_id']],
    user_features=user_features_col_pca,  # Exclude 'user_id'
    item_features=item_features_col_pca   # Exclude 'product_id'
)
item_features = dataset.build_item_features((x,y) for x,y in zip(prod_f1_pca['product_id'],item_feat_pca))
user_features = dataset.build_user_features((x,y) for x,y in zip(u_f_pca['user_id'],user_feat_pca))

user_features_coo_pca = user_features.tocoo()
rows_pca = user_features_coo_pca.row
cols_pca = user_features_coo_pca.col
data_pca = user_features_coo_pca.data

# 再检查是否长度一致
print(f"len(data): {len(data_pca)}")
print(f"len(rows): {len(rows_pca)}")
print(f"len(cols): {len(cols_pca)}")
rows_shifted_pca = rows_pca + 1

# 新的矩阵大小（行多一行）
new_shape_pca = (user_features.shape[0] + 1, user_features.shape[1])

# 构建新的 csr_matrix
user_features_pca = csr_matrix((data_pca, (rows_shifted_pca, cols_pca)), shape=new_shape_pca)
item_features_coo_pca = item_features.tocoo()
rows1_pca = item_features_coo_pca.row
cols1_pca = item_features_coo_pca.col
data1_pca = item_features_coo_pca.data

# 再检查是否长度一致
print(f"len(data): {len(data1_pca)}")
print(f"len(rows): {len(rows1_pca)}")
print(f"len(cols): {len(cols1_pca)}")

rows_shifted1_pca = rows1_pca + 1

# 新的矩阵大小（行多一行）
new_shape1_pca = (item_features.shape[0] + 1, item_features.shape[1])

# 构建新的 csr_matrix
item_features_pca = csr_matrix((data1_pca, (rows_shifted1_pca, cols1_pca)), shape=new_shape1_pca)

len(data): 618627
len(rows): 618627
len(cols): 618627
len(data): 149031
len(rows): 149031
len(cols): 149031


In [56]:
# Step 2: Initialize the LightFM model
model5 = LightFM(loss='warp')

# Step 3: Fit the model with user and product features
model5.fit(
    train_interaction_matrix,
    user_features=user_features_pca,
    item_features=item_features_pca,
    epochs=5,
    num_threads=4
)


<lightfm.lightfm.LightFM at 0x79e2fe478940>

In [57]:
import time
start = time.time()
model_precision_hybrid5=precision_at_k(model = model5, 
                        test_interactions = test_interaction_matrix,k=5,user_features=user_features_pca, item_features=item_features_pca,
                        num_threads = 4, check_intersections = False)
end = time.time()
#pickle.dump(model_precision_hybrid2, open("model_precision_hybrid2.p", "wb" ) )
print("precision score for hybrid method= {0:.{1}f}".format(model_precision_hybrid5.mean(), 2))
print("time taken for precision at k evaluation = {0:.{1}f} seconds".format(end - start, 2))

precision score for hybrid method= 0.00
time taken for precision at k evaluation = 333.79 seconds


In [58]:
# Step 2: Initialize the LightFM model
model6 = LightFM(loss='warp')

# Step 3: Fit the model with user and product features
model6.fit(
    train_interaction_matrix,
    user_features=None,
    item_features=item_features_pca,
    epochs=5,
    num_threads=1
)

<lightfm.lightfm.LightFM at 0x79e2fe478fa0>

In [59]:
start = time.time()
model_precision_hybrid6=precision_at_k(model = model6, 
                        test_interactions = test_interaction_matrix,k=5,user_features=None,item_features=item_features_pca,
                        num_threads = 4, check_intersections = False)
end = time.time()
#pickle.dump(model_precision_hybrid2, open("model_precision_hybrid2.p", "wb" ) )
print("precision score for hybrid method= {0:.{1}f}".format(model_precision_hybrid6.mean(), 2))
print("time taken for precision at k evaluation = {0:.{1}f} seconds".format(end - start, 2))

precision score for hybrid method= 0.00
time taken for precision at k evaluation = 333.19 seconds


In [60]:
# Step 2: Initialize the LightFM model
model7 = LightFM(loss='warp')

# Step 3: Fit the model with user and product features
model7.fit(
    train_interaction_matrix,
    user_features=user_features_pca,
    item_features=None,
    epochs=100,
    num_threads=1,
    verbose=True
)

Epoch: 100%|██████████| 100/100 [29:58<00:00, 17.99s/it]


<lightfm.lightfm.LightFM at 0x79e2e59c6a10>

In [61]:
import time
start = time.time()
model_precision_hybrid7=precision_at_k(model = model7, 
                        test_interactions = test_interaction_matrix,k=5,user_features=user_features_pca,item_features=None,
                        num_threads = 4, check_intersections = False)
end = time.time()
#pickle.dump(model_precision_hybrid2, open("model_precision_hybrid2.p", "wb" ) )
print("precision score for hybrid method= {0:.{1}f}".format(model_precision_hybrid7.mean(), 2))
print("time taken for precision at k evaluation = {0:.{1}f} seconds".format(end - start, 2))

precision score for hybrid method= 0.06
time taken for precision at k evaluation = 214.30 seconds
