In [1]:
import json
import pandas as pd
from sklearn.decomposition import PCA

def dim_reduction(n_components, embedding_dict): # input: dict; output: dict
    keys = list(embedding_dict.keys())
    embeddings = list(embedding_dict.values())
    pca = PCA(n_components=n_components)
    pca_embeddings = pca.fit_transform(embeddings).tolist()
    pca_embedding_dict = dict(zip(keys, pca_embeddings))
    return pca_embedding_dict

book_df = pd.read_excel('Book.xlsx')
book_test_df = pd.read_excel('Book_test.xlsx')
with open('embedding/book_embedding.json', 'r') as f:
    diction = json.load(f)

dim = 64
diction = dim_reduction(dim, diction)

df = book_df
df['tags_emotion'] = df['tags_emotion'].map(diction)
df = df[['tags_emotion', 'label']]
df = df.dropna()

df_test = book_test_df
book_test_df['tags_emotion'] = book_test_df['tags_emotion'].map(diction)

In [2]:
train_df = df.copy()
test_df = df_test.copy()

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [4]:
lengths = train_df['tags_emotion'].apply(len)
all_lengths_same = lengths.nunique() == 1
num_features = lengths.iloc[0] if all_lengths_same else None
all_lengths_same, num_features

(True, 64)

In [5]:
feature_columns = [f'feature_{i}' for i in range(num_features)]
train_df[feature_columns] = pd.DataFrame(train_df['tags_emotion'].tolist(), index=train_df.index)
train_df.drop('tags_emotion', axis=1, inplace=True)
train_df.head()

Unnamed: 0,label,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_54,feature_55,feature_56,feature_57,feature_58,feature_59,feature_60,feature_61,feature_62,feature_63
0,1,-0.079846,0.151734,-0.117649,-0.038118,0.063766,0.160557,0.050273,0.014925,0.132145,...,-0.017757,-0.007905,0.000447,-0.023104,-0.005614,0.022447,0.00191,0.040961,0.001846,0.002751
1,1,-0.05186,0.162282,-0.072282,-0.02229,0.055591,0.135468,0.039086,-0.005943,0.114176,...,-9.5e-05,-0.011506,0.0031,-0.028905,-0.014337,-0.004721,-0.028982,-0.004804,-0.001171,-0.016863
2,2,-0.101277,0.161681,-0.052881,-0.058777,0.047182,-0.060434,0.103294,-0.083601,-0.065294,...,0.012365,-0.017172,-0.016049,0.021671,0.001635,-0.002531,0.034684,0.007085,-0.003571,-0.001683
3,2,-0.147157,0.031508,-0.023745,-0.154682,-0.011095,0.0469,0.032782,0.027216,-0.026969,...,0.00761,-0.018175,0.023014,-0.016051,0.022654,-0.060257,-0.0154,-0.000844,-0.020046,-0.025367
4,2,-0.078391,0.081369,-0.034141,-0.101931,0.072935,-0.003035,0.082386,-0.08856,-0.002257,...,0.010516,0.026563,0.019128,-0.03319,0.048003,-0.02211,0.016347,0.001329,-0.034278,0.02398


In [6]:
X = train_df.drop('label', axis=1)
y = train_df['label']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
rf_clf = RandomForestClassifier(random_state=42)
rf_clf.fit(X_train, y_train)

y_pred = rf_clf.predict(X_val)
evaluation_report = classification_report(y_val, y_pred, output_dict=False)
print(evaluation_report)

              precision    recall  f1-score   support

           1       0.00      0.00      0.00         1
           2       0.50      0.25      0.33         4
           3       0.00      0.00      0.00         1
           4       0.38      1.00      0.55         3
           5       0.77      1.00      0.87        10
           7       0.00      0.00      0.00         1
           8       0.00      0.00      0.00         1
           9       0.00      0.00      0.00         1
          10       0.00      0.00      0.00         1

    accuracy                           0.61        23
   macro avg       0.18      0.25      0.19        23
weighted avg       0.47      0.61      0.51        23



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
test_df[feature_columns] = pd.DataFrame(test_df['tags_emotion'].tolist(), index=test_df.index)
test_df.drop('tags_emotion', axis=1, inplace=True)
test_predictions = rf_clf.predict(test_df)
test_predictions

array([5, 5, 5, 5, 2, 4, 5, 5, 4, 4, 4, 4, 9, 5, 5, 5, 5, 4, 4, 5, 3, 3,
       5, 5, 5, 5, 5, 5, 2, 2, 5, 5, 5, 5, 4, 2, 5, 5, 5, 2, 4, 5, 4, 4,
       2, 2, 2, 5, 4, 4, 4, 5, 4, 4, 4, 7, 4, 5, 7, 5, 5, 5, 5, 5, 4, 3,
       8, 5, 4, 5, 5, 7, 4, 5, 5, 5, 5, 7, 4, 2, 2, 4, 4, 9, 4, 4, 4, 4,
       4, 5, 5, 4, 4, 5, 5, 5, 8, 2, 7, 5, 4, 9, 9, 5, 5, 4, 5, 4, 4, 2,
       5, 4, 5, 5, 5, 5, 5, 4, 2, 4, 8, 5, 2, 5, 5, 2, 5, 4, 1, 4, 7, 4,
       4, 5, 5, 5, 4, 5, 4, 4, 5, 4, 5, 5, 5, 5, 5, 4, 5, 5, 5, 4, 5, 4,
       4, 2, 5, 4, 5, 5, 5, 4, 4, 5, 5, 5, 4, 5, 5, 4, 5, 4, 4, 5, 5, 5,
       7, 3, 5, 5, 5, 5, 5, 5, 5, 4, 5, 2, 2, 5, 4, 5])

In [8]:
predicted_labels = test_predictions.tolist()
df_test['predicted'] = pd.Series(predicted_labels)
tuple_pca_embedding_dict = {k: tuple(v) for k, v in diction.items()}
inverse_pca_embedding_dict = {v: k for k, v in tuple_pca_embedding_dict.items()}
df_test['tags_emotion'] = df_test['tags_emotion'].apply(lambda x: inverse_pca_embedding_dict[tuple(x)] if tuple(x) in inverse_pca_embedding_dict else None)
diction2 = {1:'搞笑组', 2:'情绪组', 3:'害怕组', 4:'正能量组', 5:'负能量组', 6:'烂片组', 7:'励志组', 8:'荒诞组', 9:'思考组', 10:'经典组'}
df_test['predicted'] = df_test['predicted'].map(diction2)
df_test.sort_values('predicted').head(50)

Unnamed: 0,tags_emotion,predicted
71,爱国,励志组
130,鼓舞,励志组
58,揭示,励志组
55,坚持,励志组
77,挑战,励志组
98,唯美,励志组
176,自豪,励志组
21,恐慌,害怕组
20,惊慌,害怕组
177,紧张,害怕组


In [9]:
df_test.sort_values('predicted').to_excel('Book_output_64.xlsx', index=False)