# In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Load the train / validation / test CSV files.
train_data, val_data, test_data = map(
    pd.read_csv,
    (
        "../dataset/preprocessed/netflix_train.csv",
        "../dataset/preprocessed/netflix_val.csv",
        "../dataset/preprocessed/netflix_test.csv",
    ),
)

# Select the text feature columns and the target column from each split.
feature_cols = ['title', 'description']
target_col = 'target_ages'

X_train, y_train = train_data[feature_cols], train_data[target_col]
X_val, y_val = val_data[feature_cols], val_data[target_col]
X_test, y_test = test_data[feature_cols], test_data[target_col]

# Encode the target labels as integers. The encoder is fitted on the training
# labels only; validation and test labels are mapped with that same fit.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_val, y_test = (
    label_encoder.transform(labels) for labels in (y_val, y_test)
)

from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features for the two text columns.
#
# Each column gets its own CountVectorizer so that both fitted vocabularies
# remain available after this step; re-fitting a single shared instance on the
# descriptions would discard the title vocabulary.
#
# Missing cells (NaN after read_csv) are treated as empty documents, because
# CountVectorizer cannot tokenize NaN values.
title_vectorizer = CountVectorizer()
X_train_title = title_vectorizer.fit_transform(X_train['title'].fillna(''))
X_val_title = title_vectorizer.transform(X_val['title'].fillna(''))
X_test_title = title_vectorizer.transform(X_test['title'].fillna(''))

desc_vectorizer = CountVectorizer()
X_train_desc = desc_vectorizer.fit_transform(X_train['description'].fillna(''))
X_val_desc = desc_vectorizer.transform(X_val['description'].fillna(''))
X_test_desc = desc_vectorizer.transform(X_test['description'].fillna(''))

# Keep the original name bound to the description-fitted vectorizer, which is
# what `vectorizer` referred to at this point before the split.
vectorizer = desc_vectorizer

# Merge features: densify the title and description count matrices and place
# them side by side (title columns first, then description columns).
X_train = pd.concat(
    [pd.DataFrame(counts.toarray()) for counts in (X_train_title, X_train_desc)],
    axis=1,
)
X_val = pd.concat(
    [pd.DataFrame(counts.toarray()) for counts in (X_val_title, X_val_desc)],
    axis=1,
)
X_test = pd.concat(
    [pd.DataFrame(counts.toarray()) for counts in (X_test_title, X_test_desc)],
    axis=1,
)

# Standardize features. The scaler's statistics are learned from the training
# split only and then applied unchanged to all three splits.
scaler = StandardScaler().fit(X_train)
X_train, X_val, X_test = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

# Build the MLP: two hidden ReLU layers followed by a softmax output layer.
# The output width is taken from the fitted label encoder instead of being
# hard-coded to 4, so every encoded label indexes a valid output unit.
num_classes = len(label_encoder.classes_)
model = Sequential([
    Dense(128, activation='relu', input_dim=X_train.shape[1]),
    Dense(64, activation='relu'),
    Dense(num_classes, activation='softmax'),
])

# Compile the model. The sparse loss is used because the targets are integer
# class indices produced by the label encoder, not one-hot vectors.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Train the model, reporting validation metrics after every epoch.
model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_data=(X_val, y_val),
)

# Evaluate the model on the held-out test split.
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy}")

# Print the classification report.
# `Sequential.predict_classes` was removed in TensorFlow 2.6, so the predicted
# class is taken as the argmax over the softmax outputs instead.
y_pred = model.predict(X_test).argmax(axis=-1)

# Map integer class indices back to the original label values for the report.
y_test_original = label_encoder.inverse_transform(y_test)
y_pred_original = label_encoder.inverse_transform(y_pred)

print(classification_report(y_test_original, y_pred_original))

# Save the model (disabled).
# model.save("mlp_model.h5")
