In [2]:
# 导入必要库
import os
import numpy as np
import pandas as pd
from PIL import Image
import jieba
from collections import Counter
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input  
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the metadata table and list the image files available for analysis.
master = pd.read_csv('/course325/BiliBili/data/master.csv')  # master metadata file
img_list = os.listdir('/course325/BiliBili/data/Images/')   # file names of all images
N = len(img_list)  # number of entries in the image directory

# Process at most 3000 samples, but never more than the number of metadata
# rows or image files, so the per-sample loop cannot index past the data.
n = min(3000, N, len(master))

# Accumulators filled by the per-sample loop.
imgs_features = []      # preprocessed image arrays
followers_array = []    # log-transformed follower counts
exts_array = []         # NOTE(review): unused in the visible code; looks like a typo of texts_array - confirm before removing
texts_array = []        # raw video titles

# Target size used for every image and for the VGG16 input layer.
# Loop-invariant, so it is set once (and is defined even when n == 0).
dx, dy = 224, 224

# Walk the first n rows of the metadata table in order (no random sampling).
sample_rows = zip(master.aid.iloc[:n], master.follower.iloc[:n], master.title.iloc[:n])
for pic_id, follower, title in sample_rows:
    img_path = f'/course325/BiliBili/data/Images/{pic_id}.jpg'

    # Image preprocessing. The context manager closes the file handle, and
    # convert('RGB') guarantees a 3-channel array: grayscale, palette or RGBA
    # files would otherwise produce arrays that cannot be stacked into one
    # batch or fed to VGG16.
    with Image.open(img_path) as img:
        img_resized = img.convert('RGB').resize((dx, dy))
        img_array = np.array(img_resized)
    img_preprocessed = preprocess_input(img_array)  # VGG16-specific normalisation
    imgs_features.append(img_preprocessed)

    # log(1 + x) keeps a follower count of zero finite and compresses the range.
    followers_array.append(np.log1p(follower))

    # Keep the raw title; tokenisation happens after the loop.
    texts_array.append(title)
# VGG16 convolutional backbone, built without its classifier head.
base_model = VGG16(
    include_top=False,           # drop the top fully connected layers
    weights='imagenet',          # ImageNet pre-trained weights
    input_shape=(dx, dy, 3),     # image size used during preprocessing, 3 channels
)

# Global average pooling collapses the spatial axes of the backbone output,
# leaving one feature vector per image; wrap backbone + pooling as a model.
pooling_layer = GlobalAveragePooling2D()
x = pooling_layer(base_model.output)
vgg_model = Model(outputs=x, inputs=base_model.input)

# Turn the list of preprocessed images into a single batch array, then run
# the batch through the VGG model to obtain one feature vector per image.
imgs_features = np.asarray(imgs_features)
image_features = vgg_model.predict(x=imgs_features)

# Text preprocessing: tokenise every title with jieba and drop noise tokens.
# A set gives O(1) membership tests inside the per-token filter.
stop_words = {',', '!', '：', '《', '》', '?', '(', ')', '，', '【', '】', '！', '？', '。', '#', ' ', '.'}
分词后的文本 = []  # one list of kept tokens per title
for text in texts_array:
    # A title that is not a string (e.g. a missing CSV value, which pandas
    # loads as NaN) cannot be tokenised by jieba; treat it as an empty title.
    words = jieba.lcut(text) if isinstance(text, str) else []
    # Keep tokens longer than one character that are not stop words.
    filtered_words = [word for word in words if len(word) > 1 and word not in stop_words]
    分词后的文本.append(filtered_words)
    
# Build the vocabulary and encode each title as a sequence of integer ids.
# Fit on the token lists themselves: Tokenizer treats list inputs as already
# tokenised, so the vocabulary holds exactly the tokens that
# texts_to_sequences looks up below. Fitting on bare strings would re-split
# each token and strip punctuation such as '.', '-' or '+', so tokens
# containing those characters would be absent from the vocabulary and
# silently dropped during encoding.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(分词后的文本)
text_sequences = tokenizer.texts_to_sequences(分词后的文本)

# Bring every sequence to a fixed length.
max_text_length = 10  # example value; tune it to the title-length distribution
# padding='post' appends zeros to short sequences; the truncation side for
# long sequences is left at the pad_sequences default.
texts_padded = pad_sequences(text_sequences, maxlen=max_text_length, padding='post')

# Final array layout: follower targets become a 2-D column, one row per sample.
# image_features and texts_padded are used exactly as produced above.
followers_array = np.reshape(followers_array, (-1, 1))

# Sanity check: report the shape of each of the three arrays, one per line.
print(
    f"图像特征维度: {image_features.shape}",
    f"粉丝数维度: {followers_array.shape}",
    f"文本特征维度: {texts_padded.shape}",
    sep="\n",
)

2025-05-01 04:37:10.194024: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
2025-05-01 04:37:33.277089: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
2025-05-01 04:37:33.332256: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: 
pciBusID: 0000:1e:00.0 name: Tesla P100-SXM2-16GB computeCapability: 6.0
coreClock: 1.4805GHz coreCount: 56 deviceMemorySize: 15.90GiB deviceMemoryBandwidth: 681.88GiB/s
2025-05-01 04:37:33.332293: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
2025-05-01 04:37:33.344220: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11
2025-05-01 04:37:33.344273: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcubl

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5


2025-05-01 04:46:13.498236: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)
2025-05-01 04:46:13.531243: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 2600000000 Hz
2025-05-01 04:46:13.803813: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudnn.so.8
2025-05-01 04:46:14.161741: I tensorflow/stream_executor/cuda/cuda_dnn.cc:359] Loaded cuDNN version 8101
2025-05-01 04:46:15.256393: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11
2025-05-01 04:46:15.603121: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11
Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.629 seconds.
Prefix dict has been built successfully.


图像特征维度: (3000, 512)
粉丝数维度: (3000, 1)
文本特征维度: (3000, 10)
