In [5]:
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

from transformers import BertTokenizer, BertModel
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

level1 = pd.read_csv('level1.csv')
level2 = pd.read_csv('level2.csv')
level3 = pd.read_csv('level3.csv')

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Load BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
model = BertModel.from_pretrained('bert-base-chinese')

In [7]:
def get_bert_embedding(label):
    inputs = tokenizer(label, return_tensors="pt", padding=True, truncation=True, max_length=20)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs['last_hidden_state'][:,0,:].numpy()
# Convert labels to BERT embeddings
label_embeddings = [get_bert_embedding(label) for label in level3['art_style']]
X_bert = np.array(label_embeddings).squeeze()

In [20]:
# Clustering for the second level (80 clusters)
kmeans_2nd_level_bert = KMeans(n_clusters=50, init='k-means++', random_state=42)
level3['second_level_labels_bert'] = kmeans_2nd_level_bert.fit_predict(X_bert)

# Clustering for the first level (30 clusters) based on the second level cluster centers
kmeans_1st_level_bert = KMeans(n_clusters=30, init='k-means++', random_state=42)
second_to_first_mapping_bert = dict(zip(range(80), kmeans_1st_level_bert.fit_predict(kmeans_2nd_level_bert.cluster_centers_)))
level3['first_level_labels_bert'] = level3['second_level_labels_bert'].map(second_to_first_mapping_bert)

# Generate representative labels for each cluster
used_labels = set()
def get_representative_label_bert(cluster_labels, original_labels):
    counter = Counter(original_labels[cluster_labels.index])
    common_labels = counter.most_common()
    for label, _ in common_labels:
        if label not in used_labels:
            used_labels.add(label)
            return label
    return None

level3['second_level_labels_text_bert'] = level3.groupby('second_level_labels_bert')['art_style'].transform(get_representative_label_bert, level3['art_style'])
used_labels = set()
level3['first_level_labels_text_bert'] = level3.groupby('first_level_labels_bert')['second_level_labels_text_bert'].transform(get_representative_label_bert, level3['second_level_labels_text_bert'])
# level3[['art_style', 'second_level_labels_text_bert', 'first_level_labels_text_bert']]

df = level3[['second_level_labels_text_bert','art_style']]
df = df.drop_duplicates() 
level = df.groupby('second_level_labels_text_bert')['art_style'].apply(list).reset_index(name='art_style').head(50)

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


In [None]:
words = ['英勇', '英勇无畏', '英勇奋斗', '热血激昂', '英勇壮烈', '勇敢抗争', '正义凛然', '英勇激昂']               
embeddings = [get_bert_embedding(word) for word in words]
average_embedding = np.mean(embeddings, axis=0)

vocabulary = ["word1", "word2", ...]  # Predefined vocabulary
vocab_embeddings = [get_bert_embedding(word) for word in vocabulary]
# define 一个提前确定好范围的标签，比如情感标签（一级库和二级库）
# 计算相似度得到最优的情感词汇

from sklearn.metrics.pairwise import cosine_similarity # 使用cosine相似
similarities = [cosine_similarity([average_embedding], [vocab_embed]) for vocab_embed in vocab_embeddings]
most_similar_word = vocabulary[np.argmax(similarities)]

In [21]:
em2 = pd.read_csv('emotion1.csv')
em3 = pd.read_csv('emotion2.csv')
# Convert labels to BERT embeddings
label_embeddings = [get_bert_embedding(label) for label in em3['emotion']]
X_bert = np.array(label_embeddings).squeeze()
# Clustering for the second level (80 clusters)
kmeans_2nd_level_bert = KMeans(n_clusters=35, init='k-means++', random_state=42)
em3['second_level_labels_bert'] = kmeans_2nd_level_bert.fit_predict(X_bert)
# Generate representative labels for each cluster
used_labels = set()
def get_representative_label_bert(cluster_labels, original_labels):
    counter = Counter(original_labels[cluster_labels.index])
    common_labels = counter.most_common()
    for label, _ in common_labels:
        if label not in used_labels:
            used_labels.add(label)
            return label
    return None
em3['second_level_labels_text_bert'] = em3.groupby('second_level_labels_bert')['emotion'].transform(get_representative_label_bert, em3['emotion'])
# em3[['emotion', 'second_level_labels_text_bert']]
df = em3[['emotion', 'second_level_labels_text_bert']]
df = df.drop_duplicates() 
emotion = df.groupby('second_level_labels_text_bert')['emotion'].apply(list).reset_index(name='emotion').head(50)

  super()._check_params_vs_input(X, default_n_init=10)


In [22]:
pr2 = pd.read_csv('plot_rhythm1.csv')
pr3 = pd.read_csv('plot_rhythm2.csv')
# Convert labels to BERT embeddings
label_embeddings = [get_bert_embedding(label) for label in pr3['plot_rhythm']]
X_bert = np.array(label_embeddings).squeeze()
# Clustering for the second level (80 clusters)
kmeans_2nd_level_bert = KMeans(n_clusters=25, init='k-means++', random_state=42)
pr3['second_level_labels_bert'] = kmeans_2nd_level_bert.fit_predict(X_bert)
# Generate representative labels for each cluster
used_labels = set()
def get_representative_label_bert(cluster_labels, original_labels):
    counter = Counter(original_labels[cluster_labels.index])
    common_labels = counter.most_common()
    for label, _ in common_labels:
        if label not in used_labels:
            used_labels.add(label)
            return label
    return None
pr3['second_level_labels_text_bert'] = pr3.groupby('second_level_labels_bert')['plot_rhythm'].transform(get_representative_label_bert, pr3['plot_rhythm'])
df = pr3[['plot_rhythm', 'second_level_labels_text_bert']]
df = df.drop_duplicates() 
plt = df.groupby('second_level_labels_text_bert')['plot_rhythm'].apply(list).reset_index(name='plot_rhythm').head(50)

  super()._check_params_vs_input(X, default_n_init=10)


In [23]:
vl2 = pd.read_csv('value1.csv')
vl3 = pd.read_csv('value2.csv')
# Convert labels to BERT embeddings
label_embeddings = [get_bert_embedding(label) for label in vl3['value']]
X_bert = np.array(label_embeddings).squeeze()
# Clustering for the second level (20 clusters)
kmeans_2nd_level_bert = KMeans(n_clusters=20, init='k-means++', random_state=42)
vl3['second_level_labels_bert'] = kmeans_2nd_level_bert.fit_predict(X_bert)
# Generate revlesentative labels for each cluster
used_labels = set()
def get_revlesentative_label_bert(cluster_labels, original_labels):
    counter = Counter(original_labels[cluster_labels.index])
    common_labels = counter.most_common()
    for label, _ in common_labels:
        if label not in used_labels:
            used_labels.add(label)
            return label
    return None
vl3['second_level_labels_text_bert'] = vl3.groupby('second_level_labels_bert')['value'].transform(get_revlesentative_label_bert, vl3['value'])
df = vl3[['value', 'second_level_labels_text_bert']]
df = df.drop_duplicates() 
value = df.groupby('second_level_labels_text_bert')['value'].apply(list).reset_index(name='value').head(50)

  super()._check_params_vs_input(X, default_n_init=10)


In [11]:
from sklearn.cluster import DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.cluster import SpectralClustering
from sklearn.cluster import OPTICS
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import MeanShift
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.cluster.hierarchy import fcluster

vl2 = pd.read_csv('value1.csv')
vl3 = pd.read_csv('value2.csv')

# Convert labels to BERT embeddings
label_embeddings = [get_bert_embedding(label) for label in vl3['value']]
X_bert = np.array(label_embeddings).squeeze()
# Clustering for the second level (20 clusters)
# affinity_propagation = AffinityPropagation(random_state=42)
# vl3['second_level_labels_bert'] = affinity_propagation.fit_predict(X_bert)

# Convert labels to BERT embeddings
label_embeddings = [get_bert_embedding(label) for label in vl3['value']]
X_bert = np.array(label_embeddings).squeeze()

# Use Agglomerative Clustering to build the full dendrogram
linkage_matrix = linkage(X_bert, method='ward')  # Ward method can be replaced with other linkage metrics

# Using fcluster to "cut" the dendrogram at a specified depth to simulate divisive clustering
max_d = 3  # This can be adjusted based on the desired number of clusters or dendrogram inspection
clusters = fcluster(linkage_matrix, max_d, criterion='maxclust')

vl3['second_level_labels_bert'] = clusters

# Generate revlesentative labels for each cluster
used_labels = set()
def get_revlesentative_label_bert(cluster_labels, original_labels):
    counter = Counter(original_labels[cluster_labels.index])
    common_labels = counter.most_common()
    for label, _ in common_labels:
        if label not in used_labels:
            used_labels.add(label)
            return label
    return None

vl3['second_level_labels_text_bert'] = vl3.groupby('second_level_labels_bert')['value'].transform(get_revlesentative_label_bert, vl3['value'])
df = vl3[['value', 'second_level_labels_text_bert']]
df = df.drop_duplicates() 
value = df.groupby('second_level_labels_text_bert')['value'].apply(list).reset_index(name='value').head(50)
value

Unnamed: 0,second_level_labels_text_bert,value
0,勇敢面对,"[勇敢面对, 正义, 亲情至上, 勇敢, 正义必胜, 正义战胜邪恶, 责任担当, 正义与邪恶..."
1,勇气,"[勇气, 自我成长, 团结, 忠诚, 勇敢无畏, 真爱, 自信, 法治精神, 坚持, 真爱无..."
2,团结协作,"[团结协作, 团结互助, 团队合作, 团队精神, 人性挣扎, 智慧, 自强不息, 家庭和睦,..."


In [69]:
words = eval(vl2['1'][0])        
embeddings = [get_bert_embedding(word) for word in words]
embeddings = [emb.reshape(-1,) for emb in embeddings]
average_embedding = np.mean(embeddings, axis=0)

vocabulary = ['勇敢正义', '团结合作', '家庭亲情', '道德伦理', '友谊情感','自我成长','智慧能力','责任担当','生命观念','正义与邪恶','法治观念','信任关系','独立自主','爱情观','人与自然','人类团结','环境保护','坚持奋斗','信仰传承','和谐共生']  # Predefined vocabulary
vocab_embeddings = [get_bert_embedding(word).reshape(-1,) for word in vocabulary]
# define 一个提前确定好范围的标签，比如情感标签（一级库和二级库）
# 计算相似度得到最优的情感词汇

In [62]:
print("Shape of a sample word embedding:", embeddings[0].shape)
print("Shape of a sample word embedding:", vocab_embeddings[0].shape)

Shape of a sample word embedding: (768,)
Shape of a sample word embedding: (768,)


In [63]:
if len(embeddings[0].shape) > 1 and embeddings[0].shape[0] == 1:
    embeddings = [emb[0] for emb in embeddings]

if len(vocab_embeddings[0].shape) > 1 and vocab_embeddings[0].shape[0] == 1:
    vocab_embeddings = [emb[0] for emb in vocab_embeddings]


In [70]:
from sklearn.metrics.pairwise import cosine_similarity # 使用cosine相似
similarities = [cosine_similarity([average_embedding], [vocab_embed]) for vocab_embed in vocab_embeddings]
most_similar_word = vocabulary[np.argmax(similarities)]

In [71]:
most_similar_word

'勇敢正义'

In [93]:
value.to_excel('board.xlsx')

In [65]:
# 数据格式处理
df = pd.read_excel('plot_rhythm_一级标签_gpt.xlsx')
df["1"] = df["1"].apply(eval)
# df_exploded_corrected = df.explode("二级")
df_exploded = df.explode("1").reset_index(drop=True)
df_exploded.to_excel('plot_rhythm_一级标签_gpt.xlsx')


In [3]:
# 数据转格式
import pandas as pd
df = pd.read_csv('plot_rhythm1.csv')
df.to_excel('plot_rhythm_一级标签_gpt.xlsx') 

In [12]:
# 数据读取显示tool
df = pd.read_csv('value1.csv')
df['0'].tolist()


['勇敢正义',
 '团结合作',
 '家庭亲情',
 '道德伦理',
 '友谊情感',
 '自我成长',
 '智慧能力',
 '责任担当',
 '生命观念',
 '正义与邪恶',
 '法治观念',
 '信任关系',
 '独立自主',
 '爱情观',
 '人与自然',
 '人类团结',
 '环境保护',
 '坚持奋斗',
 '信仰传承',
 '和谐共生']

In [26]:
df1 = pd.read_csv('emotion2.csv')
df2 = pd.read_csv('value2.csv')
df2.rename(columns={'value': 'emotion'}, inplace=True)
df3 = pd.concat([df1, df2])

In [27]:
df3 = df3['emotion']
df3

0         紧张
1         感人
2         欢乐
3         温馨
4         感动
       ...  
155     尊重传统
156     坚守信仰
157     母爱伟大
158    成长与挑战
159       敬业
Name: emotion, Length: 351, dtype: object

In [30]:
df3.drop_duplicates(inplace=True)
df3


0         紧张
1         感人
2         欢乐
3         温馨
4         感动
       ...  
153     人道主义
154    跨文化理解
155     尊重传统
156     坚守信仰
157     母爱伟大
Name: emotion, Length: 331, dtype: object

In [31]:
df3.to_excel('total.xlsx')

In [34]:
pr3 = pd.read_excel('total.xlsx')
# Convert labels to BERT embeddings
label_embeddings = [get_bert_embedding(label) for label in pr3['emotion']]
X_bert = np.array(label_embeddings).squeeze()
# Clustering for the second level (80 clusters)
kmeans_2nd_level_bert = KMeans(n_clusters=25, init='k-means++', random_state=42)
pr3['second_level_labels_bert'] = kmeans_2nd_level_bert.fit_predict(X_bert)
# Generate representative labels for each cluster
used_labels = set()
def get_representative_label_bert(cluster_labels, original_labels):
    counter = Counter(original_labels[cluster_labels.index])
    common_labels = counter.most_common()
    for label, _ in common_labels:
        if label not in used_labels:
            used_labels.add(label)
            return label
    return None
pr3['second_level_labels_text_bert'] = pr3.groupby('second_level_labels_bert')['emotion'].transform(get_representative_label_bert, pr3['emotion'])
df = pr3[['emotion', 'second_level_labels_text_bert']]
df = df.drop_duplicates() 
plt = df.groupby('second_level_labels_text_bert')['emotion'].apply(list).reset_index(name='emotion').head(50)

  super()._check_params_vs_input(X, default_n_init=10)


In [35]:
plt

Unnamed: 0,second_level_labels_text_bert,emotion
0,乐观,"[乐观, 成长, 情感波动, 自我成长, 传统美德, 创新, 自我认知, 自我实现, 传承,..."
1,亲情至上,"[亲情至上, 友谊至上, 真爱至上, 生命至上, 生命尊严, 爱情至上, 友情至上, 家庭至上]"
2,克服困难,"[克服困难, 勇敢无畏, 忠诚, 勇气, 坚持, 自信, 信任, 责任感, 善良, 生命尊重..."
3,刺激,"[刺激, 激烈, 冲突, 冲击, 触动, 挑战]"
4,勇敢,"[勇敢, 勇敢坚定, 正义感, 乐观坚韧, 正义, 勇敢抵抗, 正义必胜, 正义战胜邪恶, ..."
5,勇敢面对,"[勇敢面对, 勇敢面对恐惧, 勇敢面对困境, 勇敢面对困难]"
6,压抑,"[压抑, 情感冲突, 压抑紧张, 挣扎, 心理冲突, 好奇心, 心理挣扎, 人性挣扎, 亲子关系]"
7,友谊,"[友谊, 团结友情, 团结友爱, 团结友谊, 友谊长存, 忠诚友谊, 团结与友谊]"
8,哲学思考,"[哲学思考, 敬畏自然, 思考人性, 人性, 自我价值, 生命意义, 权力欲望, 自我发现,..."
9,团结,"[团结, 内省, 团结一致, 人类团结]"


In [58]:
words = ['乐观','希望','坚韧','感悟','反思','敬仰','敬畏','勇敢','好奇','热血','震撼','激昂','同情','感人','温暖','真诚','欢乐','轻松','自豪','激动','压抑','紧张','恐惧','挫败','绝望','孤独','悲伤','无奈','愤怒']
pr3 = pd.DataFrame({
    'emotion': pd.Series(words),
})
# Convert labels to BERT embeddings
label_embeddings = [get_bert_embedding(label) for label in pr3['emotion']]
X_bert = np.array(label_embeddings).squeeze()
# Clustering for the second level (80 clusters)
kmeans_2nd_level_bert = KMeans(n_clusters=7, init='k-means++', random_state=42)
pr3['second_level_labels_bert'] = kmeans_2nd_level_bert.fit_predict(X_bert)
# Generate representative labels for each cluster
used_labels = set()
def get_representative_label_bert(cluster_labels, original_labels):
    counter = Counter(original_labels[cluster_labels.index])
    common_labels = counter.most_common()
    for label, _ in common_labels:
        if label not in used_labels:
            used_labels.add(label)
            return label
    return None
pr3['second_level_labels_text_bert'] = pr3.groupby('second_level_labels_bert')['emotion'].transform(get_representative_label_bert, pr3['emotion'])
df = pr3[['emotion', 'second_level_labels_text_bert']]
df = df.drop_duplicates() 
plt = df.groupby('second_level_labels_text_bert')['emotion'].apply(list).reset_index(name='emotion').head(50)

  super()._check_params_vs_input(X, default_n_init=10)


In [None]:
pr3 = pd.read_excel('total.xlsx')
# Convert labels to BERT embeddings
label_embeddings = [get_bert_embedding(label) for label in pr3['emotion']]
X_bert = np.array(label_embeddings).squeeze()
# Clustering for the second level (80 clusters)
kmeans_2nd_level_bert = KMeans(n_clusters=7, init='k-means++', random_state=42)
pr3['second_level_labels_bert'] = kmeans_2nd_level_bert.fit_predict(X_bert)
# Generate representative labels for each cluster
used_labels = set()
def get_representative_label_bert(cluster_labels, original_labels):
    counter = Counter(original_labels[cluster_labels.index])
    common_labels = counter.most_common()
    for label, _ in common_labels:
        if label not in used_labels:
            used_labels.add(label)
            return label
    return None
pr3['second_level_labels_text_bert'] = pr3.groupby('second_level_labels_bert')['emotion'].transform(get_representative_label_bert, pr3['emotion'])
df = pr3[['emotion', 'second_level_labels_text_bert']]
df = df.drop_duplicates() 
plt = df.groupby('second_level_labels_text_bert')['emotion'].apply(list).reset_index(name='emotion').head(50)

In [38]:
plt.rename(columns={'emotion': '二级', 'second_level_labels_text_bert': '一级'}, inplace=True)
plt.to_excel('board.xlsx')
df = pd.read_excel('board.xlsx')
df["二级"] = df["二级"].apply(eval)
# df_exploded_corrected = df.explode("二级")
df_exploded = df.explode("二级").reset_index(drop=True)
df_exploded.to_excel('total_一级标签.xlsx')


In [74]:
txt

'家庭与亲情,爱情与浪漫,成长与探索,社会与文化,冲突与斗争,历史与传统,友情与背叛,科技与未来,环境与自然,权力与统治,信仰与宗教,家庭与家庭纷争,自由与束缚,教育与成长,战争与和平,多元文化与种族,青春与成人,竞争与合作,社会正义与不平等,探索与冒险,科学与伦理,自我探索与身份认同,生命与死亡,人性与道德,艺术与创造力,家庭与传承,革命与抵抗,时间与记忆,幻想与现实,孤独与连接,幸福与追求,冒险与发现,科学与探索,人工智能与未来科技,信任与背叛,人际关系与社会交往,童年与成年,宿命与自由意志,历史回顾与反思,社交媒体与虚拟世界,家庭暴力与复仇,幽默与讽刺,革命与政治改革,社会隔离与融合,伙伴关系与团队合作,艰苦奋斗与成功,疾病与健康,梦想与现实,传奇与英雄,时尚与文化,困境与逆境,犯罪与正义,科幻与未知,心理学与精神健康,传统与现代,生活哲学与意义,旅行与冒险,科幻与幻想,个人自由与社会责任,风景与自然美,灾难与重建,救赎与赎罪,神话与传说,艺术与表演,科学幻想与探索未知,人类与自然界,权谋与背叛,道德困境与伦理冲突,理想主义与现实主义,幻想与现实,宿命与自由意志,心理恐怖与精神扭曲,欲望与满足,传统文化与现代社会,命运与改变,军事与战争,社交与婚姻,人际关系与交流,心理分析与自我认识,历险与探险,革新与变革,竞争与成功,善恶与道德,愿望与追求,隐秘与秘密,纪实与虚构,权威与反叛,爱情与遗憾,时间旅行与时光,掌控与失控,冒险与自我发现,儿童与童真,媒体与社交网络,信仰与怀疑,科学发现与奇迹,心理战与心智较量,青少年问题与挑战,善与恶的斗争,文化冲突与文化交流,新生活与重新开始,社交期望与自我价值,团队合作与领导力,神秘事件与解谜,抗争与抵抗,未来愿景与科技进步,艺术家与创造过程,季节与自然周期,情感控制与失控,世界观与哲学思考,文学与创意写作,跨国合作,家庭观念,家庭伦理,家族观念,亲子关系,民间传说,社会现象,地域文化,青春校园,女性力量,社会风貌,社会现象,社会伦理,跨国合作,都市生活,校园生活,运动精神,青春与成长,历史传承,科学探索,环保意识,自然灾害,成长与救赎,成长与冒险,青春成长,自我探索,心理成长,生命意义,成长与复仇,探讨人性,犯罪心理,心理变态,命运抗争,职业道德,拯救地球,心理探究,种族关系,勇气与友谊,友情与合作,青春回忆,江湖情义,忠诚

In [69]:
df = pd.read_csv('theme_new.csv')
txt = ''
for i in df['theme'].to_list():
    txt = txt + str(i)+','
txt

'家庭与亲情,爱情与浪漫,成长与探索,社会与文化,冲突与斗争,历史与传统,友情与背叛,科技与未来,环境与自然,权力与统治,信仰与宗教,家庭与家庭纷争,自由与束缚,教育与成长,战争与和平,多元文化与种族,青春与成人,竞争与合作,社会正义与不平等,探索与冒险,科学与伦理,自我探索与身份认同,生命与死亡,人性与道德,艺术与创造力,家庭与传承,革命与抵抗,时间与记忆,幻想与现实,孤独与连接,幸福与追求,冒险与发现,科学与探索,人工智能与未来科技,信任与背叛,人际关系与社会交往,童年与成年,宿命与自由意志,历史回顾与反思,社交媒体与虚拟世界,家庭暴力与复仇,幽默与讽刺,革命与政治改革,社会隔离与融合,伙伴关系与团队合作,艰苦奋斗与成功,疾病与健康,梦想与现实,传奇与英雄,时尚与文化,困境与逆境,犯罪与正义,科幻与未知,心理学与精神健康,传统与现代,生活哲学与意义,旅行与冒险,科幻与幻想,个人自由与社会责任,风景与自然美,灾难与重建,救赎与赎罪,神话与传说,艺术与表演,科学幻想与探索未知,人类与自然界,权谋与背叛,道德困境与伦理冲突,理想主义与现实主义,幻想与现实,宿命与自由意志,心理恐怖与精神扭曲,欲望与满足,传统文化与现代社会,命运与改变,军事与战争,社交与婚姻,人际关系与交流,心理分析与自我认识,历险与探险,革新与变革,竞争与成功,善恶与道德,愿望与追求,隐秘与秘密,纪实与虚构,权威与反叛,爱情与遗憾,时间旅行与时光,掌控与失控,冒险与自我发现,儿童与童真,媒体与社交网络,信仰与怀疑,科学发现与奇迹,心理战与心智较量,青少年问题与挑战,善与恶的斗争,文化冲突与文化交流,新生活与重新开始,社交期望与自我价值,团队合作与领导力,神秘事件与解谜,抗争与抵抗,未来愿景与科技进步,艺术家与创造过程,季节与自然周期,情感控制与失控,世界观与哲学思考,文学与创意写作,跨国合作,家庭观念,家庭伦理,家族观念,亲子关系,民间传说,社会现象,地域文化,青春校园,女性力量,社会风貌,社会现象,社会伦理,跨国合作,都市生活,校园生活,运动精神,青春与成长,历史传承,科学探索,环保意识,自然灾害,成长与救赎,成长与冒险,青春成长,自我探索,心理成长,生命意义,成长与复仇,探讨人性,犯罪心理,心理变态,命运抗争,职业道德,拯救地球,心理探究,种族关系,勇气与友谊,友情与合作,青春回忆,江湖情义,忠诚

In [73]:
df = pd.read_csv('theme_new.csv')
df.to_excel('/Users/qiaochufeng/Desktop/theme.xlsx')

In [5]:
list1 = [1, 5, 4]
list2 = [4, 5, 6]
list1.extend(list2)

In [6]:
list1

[1, 5, 4, 4, 5, 6]

In [12]:
import ast

# List of string representations of lists with tags
tags_list = [
    "['剧情' '动作' '奇幻' '战争' '古装']",
    "['历史' '动画']",
    "['动作']",
    "['冒险' '科幻' '动作']",
    "['悬疑' '犯罪']",
    "['动作' '惊悚' '悬疑' '冒险' '犯罪']",
    "['奇幻' '冒险' '剧情' '动画']"
]
all_tags = []

# Go through each string in the list
for tag_str in tags_list:
    formatted_str = tag_str.replace("' '", "', '")
    try:
        tag_list = ast.literal_eval(formatted_str)
        all_tags.extend(tag_list)
    except ValueError as e:
        print(f"Error converting {tag_str}: {e}")

# Print the resulting big list of tags
print(all_tags)


['剧情', '动作', '奇幻', '战争', '古装', '历史', '动画', '动作', '冒险', '科幻', '动作', '悬疑', '犯罪', '动作', '惊悚', '悬疑', '冒险', '犯罪', '奇幻', '冒险', '剧情', '动画']


In [11]:
import ast

# Original string
tag_str = "['剧情' '动作' '奇幻' '战争' '古装']"
formatted_str = tag_str.replace("' '", "', '")

# Use ast.literal_eval() to convert the string to a list
tag_list = ast.literal_eval(formatted_str)

print(tag_list)


['剧情', '动作', '奇幻', '战争', '古装']


In [36]:
import pandas as pd
import ast
import math
from collections import Counter
mt = pd.read_csv('data/movie_themes_map.csv')
themes = [i for i in mt['thems_map'].tolist() if not (isinstance(i, float) and math.isnan(i))]
# emos = [i for i in mt['tags_emotion'].tolist() if not (isinstance(i, float) and math.isnan(i))]
theme_tags = []
emo_tags = []

for th in themes:
    formatted_th = th.replace("' '", "', '")
    try:
        theme_list = ast.literal_eval(formatted_th)
        theme_tags.extend(theme_list)
    except ValueError as e:
        print(f"Error converting {formatted_th}: {e}")

# for th in emos:
#     formatted_th = th.replace("' '", "', '")
#     try:
#         emos_list = ast.literal_eval(formatted_th)
#         emo_tags.extend(emos_list)
#     except ValueError as e:
#         print(f"Error converting {formatted_th}: {e}")
      
theme_tags.extend(emo_tags)

In [37]:
len(set(theme_tags))

338

In [34]:
import pandas as pd

L = theme_tags
L = list(set(L))

df_tag = pd.read_excel('DEC_c170_new.xlsx')
df_tag['tag'] = pd.Series(L)

L1 = theme_tags
tag_counts = {tag: L1.count(tag) for tag in df_tag['tag'].unique()}
df_tag['count'] = df_tag['tag'].map(tag_counts)

# Show the resulting DataFrame
df_tag.to_excel('DEC_c170_new.xlsx')

['神话与传说',
 '信仰与选择',
 '忠诚与团结',
 '忠诚与承诺',
 '友谊',
 '战争与和平',
 '政治与权力',
 '忠诚与承诺',
 '团结与支持',
 '科技与未来',
 '探索与冒险',
 '勇气与冒险',
 '真相与谎言',
 '家庭与团圆',
 '友谊',
 '忠诚与承诺',
 '友谊',
 '儿童与成长',
 '探索与冒险',
 '亲情与成长',
 '环境与自然',
 '儿童与成长',
 '友谊',
 '成长与挑战',
 '儿童与成长',
 '犯罪与黑帮',
 '政治与权力',
 '困境与挑战',
 '儿童与成长',
 '友谊',
 '团队合作',
 '复仇与救赎',
 '父子/父女',
 '信任与背叛',
 '青春与成长',
 '探索与冒险',
 '神秘与超自然',
 '环境与自然',
 '友谊',
 '成长与挑战',
 '成长与自我发现',
 '真诚与友谊',
 '环境与自然',
 '勇敢面对困境',
 '成长与自我发现',
 '信任与背叛',
 '勇敢面对困境',
 '勇气与冒险',
 '团结与支持',
 '政治与权力',
 '犯罪与黑帮',
 '心理与健康',
 '信任',
 '社会平等',
 '友谊',
 '勇敢面对困境',
 '成长与自我发现',
 '真诚与友谊',
 '青春与成长',
 '勇气与冒险',
 '忠诚与团结',
 '生死与和平',
 '女性力量与平等',
 '家庭矛盾',
 '法律与正义',
 '团结与支持',
 '勇敢面对困境',
 '文化与传承',
 '灾难与生存',
 '环境与自然',
 '勇气与冒险',
 '成长与自我发现',
 '勇敢面对困境',
 '真诚与友谊',
 '家庭与团圆',
 '姐妹情谊',
 '勇气与冒险',
 '女性力量与平等',
 '勇气与冒险',
 '成长与自我发现',
 '复仇与救赎',
 '信任与背叛',
 '勇气与冒险',
 '环保意识',
 '友谊',
 '团结与支持',
 '生存与挑战',
 '困境与挑战',
 '灾难与生存',
 '环境与自然',
 '人与自然',
 '科技与未来',
 '灵异现象',
 '战争与和平',
 '忠诚与承诺',
 '团结与支持',
 '亲情与成长',
 '自我与成长',
 '信念与坚持',
 '战争与和平',
 '忠诚与团结',
 '历史