# 嵌入数据分析 - Embeddings 50D

本笔记本用于分析 `ml-1m/multimodal_datasets/embeddings_50d` 目录中的 parquet 文件，帮助理解数据结构。

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# 设置显示选项
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', None)

# 设置绘图样式
plt.style.use('default')
sns.set_palette("husl")

# 数据路径
data_dir = Path('embeddings_50d')

## 1. 图像嵌入数据分析 (Image Embeddings)

In [None]:
# 读取图像嵌入数据
image_embeddings_path = data_dir / 'image_embeddings_50d.parquet'
df_image = pd.read_parquet(image_embeddings_path)
print(df_image.head())

print("=" * 80)
print("图像嵌入数据基本信息")
print("=" * 80)
print(f"\n数据形状: {df_image.shape}")
print(f"行数: {len(df_image)}")
print(f"列数: {len(df_image.columns)}")
print(f"\n列名: {list(df_image.columns)}")

   movie_id     emb_0     emb_1     emb_2     emb_3     emb_4     emb_5  \
0         1  6.122087  3.800552  1.215208  5.515930 -9.530784  0.357612   
1        10  4.050676 -4.215814  4.934200 -2.403425 -1.234904  0.052328   
2       100 -7.610673  1.448131 -0.200280  1.818382  2.324749  0.274551   
3      1000 -4.156527 -0.697786  0.662406  1.541152 -1.610273 -4.961243   
4      1001  2.865229  3.966596 -1.951353  1.324681  1.444917  1.961245   

      emb_6     emb_7     emb_8     emb_9    emb_10    emb_11    emb_12  \
0 -3.964445  2.029935  3.390905 -0.517166 -2.810322 -2.257475  1.865410   
1 -4.190878  1.086988  3.193662 -0.277129 -5.471847 -0.309380 -2.493725   
2  4.048800 -2.380233 -0.544547  0.172007 -0.509357 -1.075543 -0.851279   
3  0.884443 -0.972738  3.833170 -4.831747  2.335052 -0.343443 -0.043749   
4  3.249363 -0.898588 -2.382261 -1.867482  0.023159 -4.170153  0.500814   

     emb_13    emb_14    emb_15    emb_16    emb_17    emb_18    emb_19  \
0 -1.802562 -1.003019  

In [8]:
# 基本统计信息
print("\n基本统计信息:")
df_image.describe()


基本统计信息:


Unnamed: 0,movie_id,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,emb_8,emb_9,emb_10,emb_11,emb_12,emb_13,emb_14,emb_15,emb_16,emb_17,emb_18,emb_19,emb_20,emb_21,emb_22,emb_23,emb_24,emb_25,emb_26,emb_27,emb_28,emb_29,emb_30,emb_31,emb_32,emb_33,emb_34,emb_35,emb_36,emb_37,emb_38,emb_39,emb_40,emb_41,emb_42,emb_43,emb_44,emb_45,emb_46,emb_47,emb_48,emb_49
count,3882.0,3882.0,3882.0,3882.0,3882.0,3882.0,3882.0,3882.0,3882.0,3882.0,3882.0,3882.0,3882.0,3882.0,3882.0,3882.0,3882.0,3882.0,3882.0,3882.0,3882.0,3882.0,3882.0,3882.0,3882.0,3882.0,3882.0,3882.0,3882.0,3882.0,3882.0,3882.0,3882.0,3882.0,3882.0,3882.0,3882.0,3882.0,3882.0,3882.0,3882.0,3882.0,3882.0,3882.0,3882.0,3882.0,3882.0,3882.0,3882.0,3882.0,3882.0
mean,1985.89052,1.356075e-07,-3.537586e-08,5.424299e-07,3.223134e-07,1.159542e-07,6.289042e-08,2.672843e-07,3.832385e-08,2.554923e-08,-3.537586e-08,1.611567e-07,1.542781e-07,2.260125e-08,-1.218502e-07,6.682107e-08,1.120236e-07,2.161858e-08,-4.962448e-08,7.566504e-08,-7.861303e-09,1.429774e-07,-6.976907e-08,-5.502912e-08,-2.063592e-08,5.502912e-08,1.473994e-08,-8.156102e-08,-1.198849e-07,-2.751456e-08,8.5983e-08,-4.274584e-08,-4.815048e-08,-2.161858e-08,9.630096e-08,1.464168e-07,-4.913314e-08,-1.338878e-07,-7.369972e-08,-4.643082e-08,-7.271705e-08,-1.768793e-08,1.277462e-08,-5.208113e-08,-4.028918e-08,-4.028918e-08,-6.87864e-09,-4.913314e-08,-1.159542e-07,0.0,-1.206219e-07
std,1146.883315,3.798834,3.41402,3.239007,2.921398,2.708747,2.387886,2.372876,2.295573,2.197237,2.160636,2.05451,2.007156,1.950151,1.932329,1.862979,1.769458,1.709988,1.687118,1.658667,1.621217,1.596277,1.545818,1.537069,1.527768,1.499743,1.495004,1.465414,1.442042,1.431255,1.390615,1.382702,1.361788,1.337891,1.332818,1.327387,1.292278,1.288831,1.264427,1.249247,1.237431,1.231764,1.207974,1.199305,1.191995,1.183986,1.169386,1.15494,1.146069,1.141238,1.133662
min,1.0,-12.39949,-10.68168,-9.965386,-10.81141,-9.530784,-8.048719,-7.662282,-8.08173,-8.912818,-7.1778,-6.591005,-8.126117,-8.444752,-10.67275,-6.434633,-7.059103,-6.520772,-6.634412,-5.794857,-6.707428,-5.098803,-6.498062,-5.895929,-6.184476,-5.807355,-5.288855,-5.121744,-6.079494,-7.227887,-4.932028,-6.474805,-5.862,-6.421957,-5.075693,-5.420047,-5.094073,-5.337527,-4.589982,-4.915793,-5.850954,-5.341143,-4.266122,-4.178255,-3.908796,-4.467739,-4.717782,-3.759288,-6.855003,-4.419488,-4.69307
25%,982.25,-2.491468,-2.343568,-2.229814,-1.857939,-1.745629,-1.598247,-1.589706,-1.580059,-1.4721,-1.487931,-1.362254,-1.305568,-1.27186,-1.23849,-1.239848,-1.20131,-1.149959,-1.076707,-1.114972,-1.064182,-1.064066,-0.9603199,-1.007428,-1.019834,-0.9920738,-0.9994497,-0.9678545,-0.9346103,-0.9159685,-0.9291215,-0.8706105,-0.9130322,-0.8689329,-0.8988,-0.8591814,-0.8145744,-0.8239058,-0.8252915,-0.787659,-0.8163808,-0.8003779,-0.810025,-0.8021855,-0.7595299,-0.7895496,-0.7626948,-0.7656395,-0.7331168,-0.742188,-0.7313065
50%,2009.5,0.2678642,-0.145192,-0.1000335,0.1807822,0.08306944,0.002508422,-4.402917e-05,-0.07948274,0.0127828,-0.02794869,-0.04402282,0.0282627,0.02750195,-0.0388927,-0.05789442,-0.01385543,0.02363667,0.0189176,-0.004432422,-0.01451297,-0.01016543,0.03071539,0.00405178,-0.008004731,-0.02207626,-0.01006991,0.009932598,0.003264749,-0.02233226,-0.02712899,-0.009097128,-0.008853295,0.008782346,-0.002808691,0.01567832,-0.01297543,0.01804482,-0.003234126,-0.002510737,0.02761957,-0.001988882,-0.01155256,0.002794024,-0.001402221,-0.01024642,-0.01050778,-0.007955662,0.01423319,0.000363,-0.01158175
75%,2980.75,2.726577,2.189215,2.194866,1.973335,1.869177,1.613266,1.51813,1.428842,1.471021,1.455234,1.354228,1.328818,1.298746,1.30012,1.170947,1.151071,1.14348,1.074833,1.046394,1.062839,1.052442,1.01981,1.005808,0.9821381,0.9551424,0.9646835,0.9772762,0.9269246,0.8912679,0.9050155,0.8943385,0.9031429,0.8785877,0.8617781,0.8512262,0.8120761,0.8204512,0.8186496,0.7895512,0.8116627,0.8193362,0.7840007,0.7984402,0.7529207,0.7703601,0.7889307,0.7389005,0.7430544,0.726605,0.7241487
max,3952.0,10.45461,13.27078,9.559536,11.87245,8.581712,9.619331,8.637212,9.226597,7.283813,8.117242,8.648738,7.540692,7.151769,6.945608,9.928967,7.400547,6.488667,6.484322,6.546899,6.655606,7.610367,5.6728,6.78992,6.619087,5.843015,5.031377,5.545691,5.90157,6.58491,6.654459,6.812237,5.024515,7.313225,5.24298,5.230511,4.737367,4.792642,5.066227,4.546007,5.042493,5.250912,4.996135,5.523388,4.222965,5.938553,5.475853,4.211509,6.102327,4.741985,3.977823


In [9]:
# 检查缺失值
print("\n缺失值统计:")
missing_info = pd.DataFrame({
    '缺失数量': df_image.isnull().sum(),
    '缺失比例': df_image.isnull().sum() / len(df_image) * 100
})
print(missing_info[missing_info['缺失数量'] > 0])
if missing_info['缺失数量'].sum() == 0:
    print("没有缺失值！")


缺失值统计:
Empty DataFrame
Columns: [缺失数量, 缺失比例]
Index: []
没有缺失值！


In [10]:
# 如果有嵌入向量列，分析其维度和统计信息
embedding_cols = [col for col in df_image.columns if 'embedding' in col.lower() or 'vector' in col.lower()]

if embedding_cols:
    print(f"\n找到嵌入向量列: {embedding_cols}")
    for col in embedding_cols:
        sample_embedding = df_image[col].iloc[0]
        if isinstance(sample_embedding, (list, np.ndarray)):
            print(f"\n{col} 维度: {len(sample_embedding)}")
            print(f"示例向量 (前10维): {sample_embedding[:10]}")
            
            # 计算所有嵌入向量的统计信息
            all_embeddings = np.array(df_image[col].tolist())
            print(f"\n嵌入向量统计:")
            print(f"  形状: {all_embeddings.shape}")
            print(f"  均值: {all_embeddings.mean():.4f}")
            print(f"  标准差: {all_embeddings.std():.4f}")
            print(f"  最小值: {all_embeddings.min():.4f}")
            print(f"  最大值: {all_embeddings.max():.4f}")
else:
    print("\n未找到明显的嵌入向量列，显示所有列的详细信息:")
    for col in df_image.columns:
        print(f"\n列名: {col}")
        print(f"  数据类型: {df_image[col].dtype}")
        print(f"  唯一值数量: {df_image[col].nunique()}")
        sample_value = df_image[col].iloc[0]
        print(f"  示例值: {sample_value}")
        if isinstance(sample_value, (list, np.ndarray)):
            print(f"  向量维度: {len(sample_value)}")


未找到明显的嵌入向量列，显示所有列的详细信息:

列名: movie_id
  数据类型: int64
  唯一值数量: 3882
  示例值: 1

列名: emb_0
  数据类型: float32
  唯一值数量: 3873
  示例值: 6.122086524963379

列名: emb_1
  数据类型: float32
  唯一值数量: 3873
  示例值: 3.800551652908325

列名: emb_2
  数据类型: float32
  唯一值数量: 3873
  示例值: 1.2152082920074463

列名: emb_3
  数据类型: float32
  唯一值数量: 3873
  示例值: 5.515929698944092

列名: emb_4
  数据类型: float32
  唯一值数量: 3873
  示例值: -9.530783653259277

列名: emb_5
  数据类型: float32
  唯一值数量: 3873
  示例值: 0.35761216282844543

列名: emb_6
  数据类型: float32
  唯一值数量: 3873
  示例值: -3.9644453525543213

列名: emb_7
  数据类型: float32
  唯一值数量: 3873
  示例值: 2.029935121536255

列名: emb_8
  数据类型: float32
  唯一值数量: 3873
  示例值: 3.390904664993286

列名: emb_9
  数据类型: float32
  唯一值数量: 3873
  示例值: -0.5171663165092468

列名: emb_10
  数据类型: float32
  唯一值数量: 3873
  示例值: -2.8103220462799072

列名: emb_11
  数据类型: float32
  唯一值数量: 3873
  示例值: -2.257474660873413

列名: emb_12
  数据类型: float32
  唯一值数量: 3872
  示例值: 1.8654104471206665

列名: emb_13
  数据类型: float32
  唯一值数量: 3873
  示例值: -1

## 2. 文本嵌入数据分析 (Text Embeddings)

In [11]:
# 读取文本嵌入数据
text_embeddings_path = data_dir / 'text_embeddings_50d.parquet'
df_text = pd.read_parquet(text_embeddings_path)

print("=" * 80)
print("文本嵌入数据基本信息")
print("=" * 80)
print(f"\n数据形状: {df_text.shape}")
print(f"行数: {len(df_text)}")
print(f"列数: {len(df_text.columns)}")
print(f"\n列名: {list(df_text.columns)}")

文本嵌入数据基本信息

数据形状: (3883, 51)
行数: 3883
列数: 51

列名: ['movie_id', 'emb_0', 'emb_1', 'emb_2', 'emb_3', 'emb_4', 'emb_5', 'emb_6', 'emb_7', 'emb_8', 'emb_9', 'emb_10', 'emb_11', 'emb_12', 'emb_13', 'emb_14', 'emb_15', 'emb_16', 'emb_17', 'emb_18', 'emb_19', 'emb_20', 'emb_21', 'emb_22', 'emb_23', 'emb_24', 'emb_25', 'emb_26', 'emb_27', 'emb_28', 'emb_29', 'emb_30', 'emb_31', 'emb_32', 'emb_33', 'emb_34', 'emb_35', 'emb_36', 'emb_37', 'emb_38', 'emb_39', 'emb_40', 'emb_41', 'emb_42', 'emb_43', 'emb_44', 'emb_45', 'emb_46', 'emb_47', 'emb_48', 'emb_49']


In [12]:
# 基本统计信息
print("\n基本统计信息:")
df_text.describe()


基本统计信息:


Unnamed: 0,movie_id,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,emb_8,emb_9,emb_10,emb_11,emb_12,emb_13,emb_14,emb_15,emb_16,emb_17,emb_18,emb_19,emb_20,emb_21,emb_22,emb_23,emb_24,emb_25,emb_26,emb_27,emb_28,emb_29,emb_30,emb_31,emb_32,emb_33,emb_34,emb_35,emb_36,emb_37,emb_38,emb_39,emb_40,emb_41,emb_42,emb_43,emb_44,emb_45,emb_46,emb_47,emb_48,emb_49
count,3883.0,3883.0,3883.0,3883.0,3883.0,3883.0,3883.0,3883.0,3883.0,3883.0,3883.0,3883.0,3883.0,3883.0,3883.0,3883.0,3883.0,3883.0,3883.0,3883.0,3883.0,3883.0,3883.0,3883.0,3883.0,3883.0,3883.0,3883.0,3883.0,3883.0,3883.0,3883.0,3883.0,3883.0,3883.0,3883.0,3883.0,3883.0,3883.0,3883.0,3883.0,3883.0,3883.0,3883.0,3883.0,3883.0,3883.0,3883.0,3883.0,3883.0,3883.0
mean,1986.049446,-1.100299e-07,-3.929639e-09,-6.483905e-08,2.161302e-08,1.375374e-08,-2.898109e-08,3.04547e-08,3.585796e-08,2.750747e-08,-4.126121e-08,1.449054e-08,-5.255892e-08,7.957519e-08,-8.350483e-09,-4.813808e-08,-4.617326e-08,-2.652506e-08,5.452375e-08,5.501495e-08,3.929639e-09,-4.617326e-08,-7.368074e-09,-4.98573e-08,-6.385664e-09,-1.080651e-08,2.333223e-09,7.269833e-08,1.817458e-08,-7.171592e-08,5.526055e-08,-2.075341e-08,2.701627e-09,4.605046e-08,-6.925989e-08,-5.452375e-08,-1.203452e-08,1.326253e-08,-6.385664e-09,1.96482e-09,5.071691e-08,2.775308e-08,4.912049e-09,5.833058e-08,-6.876868e-08,-3.634916e-08,-4.469965e-08,-2.00166e-08,1.080651e-08,-5.833058e-09,-5.894459e-09
std,1146.778349,2.638638,2.086772,1.549236,1.483986,1.305942,1.287798,1.143535,1.113504,1.014151,0.9899521,0.9139537,0.8755124,0.8489232,0.8072243,0.7928983,0.7749908,0.7394227,0.7015085,0.679218,0.6689841,0.6380668,0.6234101,0.6079443,0.5974584,0.5771583,0.5660998,0.5518146,0.541539,0.5343422,0.5267762,0.5181388,0.5050689,0.4929465,0.4890699,0.4790872,0.4687163,0.4613469,0.4536903,0.4447857,0.4434336,0.4377486,0.4342952,0.4309579,0.4271682,0.4179942,0.4113229,0.4048111,0.4018734,0.3998204,0.3936301
min,1.0,-8.186671,-5.289219,-6.628344,-3.663813,-5.642734,-4.350369,-3.968802,-4.261932,-3.571239,-3.036309,-3.59383,-4.039468,-3.118671,-2.862685,-3.372697,-3.960113,-2.65643,-2.493454,-2.503066,-2.343222,-3.097566,-2.17467,-3.060199,-2.203025,-2.670004,-1.975357,-1.910761,-1.690859,-2.216962,-2.041089,-2.028276,-2.095566,-2.230863,-2.206091,-2.472812,-1.811771,-1.708104,-2.267195,-2.145212,-1.560166,-1.580811,-1.866318,-1.564193,-1.872899,-1.339164,-1.792853,-1.739853,-1.613165,-1.352715,-1.834829
25%,982.5,-2.011176,-1.359007,-0.9588127,-1.089361,-0.8986492,-0.8684812,-0.7366146,-0.7476568,-0.6771129,-0.6858288,-0.596541,-0.5620063,-0.5717358,-0.5560368,-0.5395639,-0.5218821,-0.4808983,-0.4733438,-0.4356733,-0.4479651,-0.4121129,-0.4158873,-0.4014857,-0.4021237,-0.3735015,-0.3752158,-0.3596874,-0.3567261,-0.3383831,-0.3449471,-0.333731,-0.3317159,-0.3178704,-0.3194695,-0.307691,-0.300339,-0.2944842,-0.2906171,-0.287602,-0.2885687,-0.2878215,-0.2793525,-0.2761484,-0.2768558,-0.2758242,-0.2592487,-0.2646003,-0.2542403,-0.2711044,-0.2476515
50%,2010.0,0.1773442,-0.1122973,0.146861,-0.1975464,-0.005967319,-0.07100928,0.01105106,-0.006821796,-0.01129492,-0.07560498,0.04745085,-0.004108593,0.01559755,-0.0109759,-0.01140838,-0.00428243,0.01259624,0.01054789,0.009844476,0.02139121,-0.01302609,0.005857281,0.008454084,-0.0107134,0.00467317,-0.01655157,0.001010209,-0.003165089,-0.002167719,0.01411608,0.006400947,0.00776815,0.007484049,-0.003948092,0.004574114,0.009357877,0.01100329,0.005049124,0.004703298,-0.005625471,-0.007170051,0.006488062,0.0008136556,0.001377329,-0.001583572,-0.000200998,0.0005801967,-0.0003207102,-0.004266717,0.00491214
75%,2980.5,2.135538,1.263046,1.073306,0.9771213,0.8633924,0.8446105,0.7784758,0.7479107,0.6698855,0.636187,0.6275656,0.5700189,0.5799223,0.5451177,0.4946112,0.499225,0.4812189,0.4682945,0.4319136,0.4658044,0.3996983,0.3970352,0.3999612,0.3845628,0.3758436,0.368856,0.3704332,0.3464192,0.3452929,0.3539206,0.331654,0.3444687,0.3227816,0.3197677,0.2940678,0.3160225,0.3059226,0.2904827,0.2927656,0.2858681,0.2886882,0.2926531,0.2729187,0.2806053,0.2684661,0.2627751,0.2567722,0.2735417,0.2632561,0.2524069
max,3952.0,6.109449,10.11045,4.167048,6.053966,4.471469,4.17317,3.654921,4.424485,3.558247,3.641684,2.707936,3.690877,2.969241,2.930299,3.073215,2.48779,2.864385,2.489317,2.477002,2.667549,3.084263,2.713951,2.432714,2.594264,2.186693,2.667435,1.924398,2.049068,2.047071,2.001987,2.265653,1.648498,2.18686,1.986343,1.833819,1.66141,1.596982,1.695749,1.922461,1.695772,1.630205,1.505876,1.442147,1.847386,1.624995,1.722108,2.125527,2.117026,1.971302,1.912184


In [13]:
# 检查缺失值
print("\n缺失值统计:")
missing_info = pd.DataFrame({
    '缺失数量': df_text.isnull().sum(),
    '缺失比例': df_text.isnull().sum() / len(df_text) * 100
})
print(missing_info[missing_info['缺失数量'] > 0])
if missing_info['缺失数量'].sum() == 0:
    print("没有缺失值！")


缺失值统计:
Empty DataFrame
Columns: [缺失数量, 缺失比例]
Index: []
没有缺失值！


In [14]:
# 分析文本嵌入向量
embedding_cols = [col for col in df_text.columns if 'embedding' in col.lower() or 'vector' in col.lower()]

if embedding_cols:
    print(f"\n找到嵌入向量列: {embedding_cols}")
    for col in embedding_cols:
        sample_embedding = df_text[col].iloc[0]
        if isinstance(sample_embedding, (list, np.ndarray)):
            print(f"\n{col} 维度: {len(sample_embedding)}")
            print(f"示例向量 (前10维): {sample_embedding[:10]}")
            
            # 计算所有嵌入向量的统计信息
            all_embeddings = np.array(df_text[col].tolist())
            print(f"\n嵌入向量统计:")
            print(f"  形状: {all_embeddings.shape}")
            print(f"  均值: {all_embeddings.mean():.4f}")
            print(f"  标准差: {all_embeddings.std():.4f}")
            print(f"  最小值: {all_embeddings.min():.4f}")
            print(f"  最大值: {all_embeddings.max():.4f}")
else:
    print("\n未找到明显的嵌入向量列，显示所有列的详细信息:")
    for col in df_text.columns:
        print(f"\n列名: {col}")
        print(f"  数据类型: {df_text[col].dtype}")
        print(f"  唯一值数量: {df_text[col].nunique()}")
        sample_value = df_text[col].iloc[0]
        print(f"  示例值: {sample_value}")
        if isinstance(sample_value, (list, np.ndarray)):
            print(f"  向量维度: {len(sample_value)}")


未找到明显的嵌入向量列，显示所有列的详细信息:

列名: movie_id
  数据类型: int64
  唯一值数量: 3883
  示例值: 1

列名: emb_0
  数据类型: float32
  唯一值数量: 3881
  示例值: 0.5745090246200562

列名: emb_1
  数据类型: float32
  唯一值数量: 3881
  示例值: 0.06517046689987183

列名: emb_2
  数据类型: float32
  唯一值数量: 3881
  示例值: 1.0747010707855225

列名: emb_3
  数据类型: float32
  唯一值数量: 3881
  示例值: 1.4308173656463623

列名: emb_4
  数据类型: float32
  唯一值数量: 3881
  示例值: -0.1625608205795288

列名: emb_5
  数据类型: float32
  唯一值数量: 3880
  示例值: -0.03542358800768852

列名: emb_6
  数据类型: float32
  唯一值数量: 3881
  示例值: 2.4707908630371094

列名: emb_7
  数据类型: float32
  唯一值数量: 3881
  示例值: 1.2785171270370483

列名: emb_8
  数据类型: float32
  唯一值数量: 3881
  示例值: 0.9358195066452026

列名: emb_9
  数据类型: float32
  唯一值数量: 3880
  示例值: 1.6259171962738037

列名: emb_10
  数据类型: float32
  唯一值数量: 3881
  示例值: -0.34745657444000244

列名: emb_11
  数据类型: float32
  唯一值数量: 3881
  示例值: -0.14405575394630432

列名: emb_12
  数据类型: float32
  唯一值数量: 3880
  示例值: -0.3101930618286133

列名: emb_13
  数据类型: float32
  唯一值数量: 3880

In [19]:
# image 的movieid少一个
image_movieid = df_image['movie_id'].unique()
text_movieid = df_text['movie_id'].unique()
print(len(image_movieid), len(text_movieid))
print(set(text_movieid) - set(image_movieid))

3882 3883
{2603}


## 3. 数据对比分析

In [15]:
# 对比两个数据集
print("=" * 80)
print("图像嵌入 vs 文本嵌入对比")
print("=" * 80)

comparison = pd.DataFrame({
    '指标': ['行数', '列数', '列名'],
    '图像嵌入': [len(df_image), len(df_image.columns), ', '.join(df_image.columns)],
    '文本嵌入': [len(df_text), len(df_text.columns), ', '.join(df_text.columns)]
})

print(comparison.to_string(index=False))

图像嵌入 vs 文本嵌入对比
指标                                                                                                                                                                                                                                                                                                                                                                                                           图像嵌入                                                                                                                                                                                                                                                                                                                                                                                                           文本嵌入
行数                                                                                                                                                                                      

In [16]:
# 检查是否有共同的ID列用于关联
common_cols = set(df_image.columns) & set(df_text.columns)
print(f"\n共同列: {common_cols}")

if common_cols:
    for col in common_cols:
        print(f"\n列 '{col}' 的对比:")
        print(f"  图像嵌入唯一值: {df_image[col].nunique()}")
        print(f"  文本嵌入唯一值: {df_text[col].nunique()}")
        
        # 检查是否有重叠的值
        image_values = set(df_image[col].unique())
        text_values = set(df_text[col].unique())
        overlap = image_values & text_values
        print(f"  重叠值数量: {len(overlap)}")
        print(f"  仅在图像中: {len(image_values - text_values)}")
        print(f"  仅在文本中: {len(text_values - image_values)}")


共同列: {'emb_44', 'emb_22', 'emb_7', 'emb_12', 'emb_27', 'emb_19', 'emb_30', 'emb_0', 'emb_21', 'emb_24', 'movie_id', 'emb_34', 'emb_15', 'emb_41', 'emb_14', 'emb_1', 'emb_26', 'emb_20', 'emb_29', 'emb_9', 'emb_5', 'emb_35', 'emb_37', 'emb_3', 'emb_8', 'emb_17', 'emb_40', 'emb_48', 'emb_39', 'emb_49', 'emb_6', 'emb_11', 'emb_13', 'emb_45', 'emb_4', 'emb_36', 'emb_16', 'emb_33', 'emb_10', 'emb_18', 'emb_23', 'emb_2', 'emb_42', 'emb_47', 'emb_46', 'emb_38', 'emb_28', 'emb_31', 'emb_43', 'emb_32', 'emb_25'}

列 'emb_44' 的对比:
  图像嵌入唯一值: 3873
  文本嵌入唯一值: 3881
  重叠值数量: 0
  仅在图像中: 3873
  仅在文本中: 3881

列 'emb_22' 的对比:
  图像嵌入唯一值: 3872
  文本嵌入唯一值: 3881
  重叠值数量: 0
  仅在图像中: 3872
  仅在文本中: 3881

列 'emb_7' 的对比:
  图像嵌入唯一值: 3873
  文本嵌入唯一值: 3881
  重叠值数量: 0
  仅在图像中: 3873
  仅在文本中: 3881

列 'emb_12' 的对比:
  图像嵌入唯一值: 3872
  文本嵌入唯一值: 3880
  重叠值数量: 0
  仅在图像中: 3872
  仅在文本中: 3880

列 'emb_27' 的对比:
  图像嵌入唯一值: 3873
  文本嵌入唯一值: 3880
  重叠值数量: 0
  仅在图像中: 3873
  仅在文本中: 3880

列 'emb_19' 的对比:
  图像嵌入唯一值: 3873
  文本嵌入唯一值: 3881
  重

## 4. 可视化分析

In [None]:
# 可视化嵌入向量的分布
def visualize_embeddings(df, title_prefix):
    """可视化嵌入向量的统计分布"""
    # 查找嵌入向量列
    embedding_col = None
    for col in df.columns:
        sample = df[col].iloc[0]
        if isinstance(sample, (list, np.ndarray)):
            embedding_col = col
            break
    
    if embedding_col is None:
        print(f"未找到 {title_prefix} 的嵌入向量列")
        return
    
    embeddings = np.array(df[embedding_col].tolist())
    
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    fig.suptitle(f'{title_prefix} - 嵌入向量分析', fontsize=16, fontweight='bold')
    
    # 1. 每个维度的均值分布
    dim_means = embeddings.mean(axis=0)
    axes[0, 0].plot(dim_means)
    axes[0, 0].set_title('各维度均值分布')
    axes[0, 0].set_xlabel('维度')
    axes[0, 0].set_ylabel('均值')
    axes[0, 0].grid(True, alpha=0.3)
    
    # 2. 每个维度的标准差分布
    dim_stds = embeddings.std(axis=0)
    axes[0, 1].plot(dim_stds)
    axes[0, 1].set_title('各维度标准差分布')
    axes[0, 1].set_xlabel('维度')
    axes[0, 1].set_ylabel('标准差')
    axes[0, 1].grid(True, alpha=0.3)
    
    # 3. 所有嵌入值的直方图
    axes[1, 0].hist(embeddings.flatten(), bins=50, edgecolor='black', alpha=0.7)
    axes[1, 0].set_title('所有嵌入值分布')
    axes[1, 0].set_xlabel('值')
    axes[1, 0].set_ylabel('频数')
    axes[1, 0].grid(True, alpha=0.3)
    
    # 4. 嵌入向量范数分布
    norms = np.linalg.norm(embeddings, axis=1)
    axes[1, 1].hist(norms, bins=50, edgecolor='black', alpha=0.7, color='green')
    axes[1, 1].set_title('嵌入向量L2范数分布')
    axes[1, 1].set_xlabel('L2范数')
    axes[1, 1].set_ylabel('频数')
    axes[1, 1].grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.show()
    
    print(f"\n{title_prefix} 统计摘要:")
    print(f"  嵌入维度: {embeddings.shape[1]}")
    print(f"  样本数量: {embeddings.shape[0]}")
    print(f"  L2范数 - 均值: {norms.mean():.4f}, 标准差: {norms.std():.4f}")
    print(f"  L2范数 - 最小值: {norms.min():.4f}, 最大值: {norms.max():.4f}")

In [None]:
# 可视化图像嵌入
visualize_embeddings(df_image, "图像嵌入")

In [None]:
# 可视化文本嵌入
visualize_embeddings(df_text, "文本嵌入")

## 5. 数据结构总结

In [None]:
print("=" * 80)
print("数据结构总结")
print("=" * 80)

print("\n📊 图像嵌入数据 (image_embeddings_50d.parquet):")
print(f"  - 形状: {df_image.shape}")
print(f"  - 列: {list(df_image.columns)}")
print(f"  - 内存使用: {df_image.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

print("\n📝 文本嵌入数据 (text_embeddings_50d.parquet):")
print(f"  - 形状: {df_text.shape}")
print(f"  - 列: {list(df_text.columns)}")
print(f"  - 内存使用: {df_text.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

print("\n✅ 分析完成！")