In [1]:
import pandas as pd
import sqlite3
import networkx as nx
from itertools import combinations
from collections import Counter
from build_network import config as cfg

In [2]:
G = nx.Graph()

In [3]:

con = sqlite3.connect(cfg.FILE['video_info'])
# sql = '''
# select bvid, uid, labels
# from video_info
# limit 1000
# '''
sql = '''
select bvid, uid, labels
from videos
'''
df = pd.read_sql(sql=sql, con=con, index_col='bvid', )
con.close()

In [4]:
df['labels'] = df['labels'].apply(lambda x: [kw.strip() for kw in x.split(',')] if x is not None else [])

In [5]:
df

Unnamed: 0_level_0,uid,labels
bvid,Unnamed: 1_level_1,Unnamed: 2_level_1
BV111421f7WW,67299944,"[美国, 韩国, 新年新知识, 海外, 春节, 2024bilibili迎春会]"
BV111421k75P,546189,"[战争, 国际人道法, ICRC, 法律科普局, 法律人专属加油站]"
BV111421q7oW,11280389,"[历史, 皇帝, 人文历史档案馆]"
BV111421t7db,13897629,"[伪纪录片, 模拟恐怖, 史蒂芬市, #24-01, #99空间, #10空间, 新怪谈, ..."
BV111421X7Xx,1959209,"[FPS, 180hz, 宿舍神屏, 24寸小金刚, 24寸1080P, PS5, 创维, ..."
...,...,...
BV1ZZ4y1x7qQ,7458285,"[测评, 手机, 爱否速描, 爱否科技, 一加8 Pro]"
BV1Zz4y1Z7PK,546418,"[为家乡打call, 4K, 短片, 自制, 旅游, VLOG, 古街, 松阳, HDR, ..."
BV1ZzCZYCEEv,2351309,"[2024诺贝尔奖, 镜头, 摄影器材, 摄影技术, 专利, 微距, 移轴, 老蛙, 扯闲篇..."
BV1Zzx9eKETS,10850097,"[原创, 独夫之心, 钥匙扣, 预售, 定制版, 青年军]"


In [6]:
G.size()

0

In [7]:
# Add document and author nodes
for bvid, row in df.iterrows():
    G.add_node(bvid, type='document')
    G.add_node(row['uid'], type='author')

    # Add doc-author edge
    G.add_edge(bvid, row['uid'], attribute='author')
    # G.add_edge(row['bvid'], row['uid'], weight=row['duration'], attribute='author')

In [8]:
# Add keyword nodes and doc-keyword edges
for bvid, row in df.iterrows():
    keywords = row['labels']

    # Add keyword nodes and edges
    for keyword in keywords:
        G.add_node(keyword, type='keyword')

        # Doc-keyword edge
        keyword_count = keywords.count(keyword)
        G.add_edge(bvid, keyword, weight=keyword_count, attribute='contain')

    # Add keyword-keyword edges
    for kw1, kw2 in combinations(keywords, 2):
        if G.has_edge(kw1, kw2):
            G[kw1][kw2]['weight'] += 1
        else:
            G.add_edge(kw1, kw2, weight=1, attribute='co-occurrence')

In [9]:
import math

# Update weights of edges between keywords
for u, v, data in G.edges(data=True):
    if data.get('attribute') == 'co-occurrence':  # Check if the edge is of type 'mentioned'
        original_weight = data['weight']
        # Avoid math domain error for log(0)
        new_weight = math.log(original_weight+1) 
        G[u][v]['weight'] = new_weight


In [35]:
# for u, v, data in G.edges(data=True):
#     if data.get('attribute') == 'mentioned':  # Check if the edge attribute is 'mentioned'
#         G[u][v]['attribute'] = 'co-occurrence'

In [11]:
from collections import defaultdict

# Step 1: Aggregate keyword counts for each author
author_keyword_counts = defaultdict(Counter)

for _, row in df.iterrows():
    author = row['uid']
    keywords = row['labels']
    for keyword in keywords:
        author_keyword_counts[author][keyword] += keywords.count(keyword)

# Step 2: Add edges between authors and keywords
for author, keyword_counts in author_keyword_counts.items():
    for keyword, count in keyword_counts.items():
        if G.has_edge(author, keyword):
            # G[author][keyword]['weight'] += count
            pass
        else:
            G.add_edge(author, keyword, weight=count, attribute='mention')

# log the weight
for u, v, data in G.edges(data=True):
    if data.get('attribute') == 'mention':  
        original_weight = data['weight']
        new_weight = math.log(original_weight+1) 
        G[u][v]['weight'] = new_weight


In [38]:
# import importlib 
# importlib.reload(cfg)
nx.write_graphml(G, cfg.FILE['graph_label'])

In [None]:
G = nx.read_graphml(cfg.FILE['graph_label'])

In [15]:
# Print sample author-keyword edges
random = 0
for u, v, data in G.edges(data=True):
    if data.get('attribute') == 'mention' and G.nodes[u].get('type') == 'author':
        if random % 50 == 0:
            print(f"uid: {u}, keyword: {v:7}, Weight: {round(data['weight'], 3)}")
        random += 1
        if random == 10000:
            break

uid: 67299944, keyword: 美国     , Weight: 6.998
uid: 67299944, keyword: 潮流     , Weight: 4.263
uid: 67299944, keyword: 癌症     , Weight: 1.386
uid: 67299944, keyword: 码农     , Weight: 2.398
uid: 67299944, keyword: 耐克     , Weight: 1.099
uid: 67299944, keyword: 灾难     , Weight: 2.996
uid: 67299944, keyword: 戴口罩    , Weight: 1.099
uid: 67299944, keyword: 科技与狠活  , Weight: 1.792
uid: 67299944, keyword: 大阪世博会吉祥物, Weight: 0.693
uid: 67299944, keyword: ai     , Weight: 1.099
uid: 67299944, keyword: 韩国人    , Weight: 1.609
uid: 67299944, keyword: 社会洞察计划 , Weight: 1.946
uid: 67299944, keyword: 真实故事   , Weight: 2.079
uid: 67299944, keyword: IPHONE12, Weight: 1.099
uid: 67299944, keyword: 鼠头鸭脖   , Weight: 0.693
uid: 67299944, keyword: 哈维尔穿越  , Weight: 0.693
uid: 67299944, keyword: 老人     , Weight: 1.609
uid: 67299944, keyword: 小红书    , Weight: 0.693
uid: 67299944, keyword: 小姐姐    , Weight: 1.386
uid: 67299944, keyword: 股市     , Weight: 1.099
uid: 67299944, keyword: 虚拟     , Weight: 0.693
uid: 672999

In [31]:
def recommend_based_on_keywords(graph, keywords, max_len=None):
    if len(keywords) < 2:
        raise ValueError("At least two keywords are required.")
    
    # Step 1: Find all pairs of keywords
    pairs = [(keywords[i], keywords[j]) for i in range(len(keywords)) for j in range(i+1, len(keywords))]
    recommendation_counts = Counter()

    for kw1, kw2 in pairs:
        try:
            # Step 2: Find the shortest path
            paths = list(nx.all_shortest_paths(graph, source=kw1, target=kw2))
            # paths = nx.all_simple_paths(graph, source=kw1, target=kw2, cutoff=max_len)
            
            for path in paths:
                # Ensure path includes at least one doc or author
                docs_authors = [node for node in path if graph.nodes[node].get('type') in {'document', 'author'}]
                if docs_authors:
                    recommendation_counts.update(docs_authors)

        except nx.NetworkXNoPath:
            continue  # Skip if no path exists between the pair

    # Step 3: Rank authors and docs by their presence in paths
    recommendations = sorted(recommendation_counts.items(), key=lambda x: x[1], reverse=True)

    return recommendations


In [32]:
recommend_based_on_keywords(G, ['美国', '潮流'])

[]

In [34]:
recommend_based_on_keywords(G_without_kw_edges_vid_nodes, ['美国', '潮流'])

[(67299944, 1), (11336264, 1), (946974, 1), (8736958, 1), (3743849, 1)]

In [19]:
import copy
G_without_kw_edges = copy.deepcopy(G)
edges_to_remove = [(u, v) for u, v, data in G.edges(data=True) if data.get('attribute') == 'co-occurrence']
# Remove the edges from the graph
G_without_kw_edges.remove_edges_from(edges_to_remove)

In [20]:
recommend_based_on_keywords(G_without_kw_edges, ['华为', '小米', '手机'])

[('BV1154y1a7e6', 3),
 ('BV11E411j7jq', 3),
 ('BV11S411N7xz', 3),
 ('BV11t411g78A', 3),
 ('BV1254y1H7Nx', 3),
 ('BV1254y1p7m9', 3),
 ('BV12hxzeLEof', 3),
 ('BV12R4y1M78b', 3),
 ('BV1354y117fx', 3),
 ('BV13A411B78e', 3),
 ('BV13J411v7tg', 3),
 ('BV13J411y7y1', 3),
 ('BV13J411z7yD', 3),
 ('BV13t411w7yv', 3),
 ('BV13T4y1574a', 3),
 ('BV1434y1U7fR', 3),
 ('BV14E41127CS', 3),
 ('BV14P41157z6', 3),
 ('BV15A411F7NP', 3),
 ('BV15a4y1g7VX', 3),
 ('BV15K4y1k7QH', 3),
 ('BV15t411c7D5', 3),
 ('BV15t411R72q', 3),
 ('BV15w411d7n3', 3),
 ('BV15Z4y1G7TJ', 3),
 ('BV1644y1y7kY', 3),
 ('BV16b411h7Bd', 3),
 ('BV16b411Y7Ly', 3),
 ('BV16J411e7CW', 3),
 ('BV16J411J7tz', 3),
 ('BV16J411P7px', 3),
 ('BV16T41117Zn', 3),
 ('BV16v411w72x', 3),
 ('BV17SsyeyEXE', 3),
 ('BV17x4y147qK', 3),
 ('BV184411U71b', 3),
 ('BV187411u7D7', 3),
 ('BV18a411s79j', 3),
 ('BV18L2gYiEt1', 3),
 ('BV18w4m167Xd', 3),
 ('BV194411c7DB', 3),
 ('BV19i4y197jr', 3),
 ('BV19t41167L5', 3),
 ('BV19V411E7CU', 3),
 ('BV19x411h7Jx', 3),
 ('BV1A341

In [26]:
G_without_kw_edges_vid_nodes = copy.deepcopy(G_without_kw_edges)
nodes_to_remove = [u for u, data in G_without_kw_edges.nodes(data=True) if data.get('type') == 'document']
# Remove the edges from the graph
G_without_kw_edges_vid_nodes.remove_nodes_from(nodes_to_remove)

In [33]:
recommend_based_on_keywords(G_without_kw_edges_vid_nodes, ['华为', '小米', '手机'])

[(67299944, 3),
 (1959209, 3),
 (475961, 3),
 (8969156, 3),
 (8372353, 3),
 (8482768, 3),
 (67991584, 3),
 (3766866, 3),
 (546418, 3),
 (367877, 3),
 (2233213, 3),
 (10850097, 3),
 (61086273, 3),
 (50444181, 3),
 (11336264, 3),
 (8784855, 3),
 (12590, 3),
 (946974, 3),
 (41820, 3),
 (4401694, 3),
 (13565996, 3),
 (13407784, 3),
 (9321359, 3),
 (67368617, 3),
 (12145493, 3),
 (287795639, 3),
 (957060, 3),
 (5626102, 3),
 (65764710, 3),
 (12300996, 3),
 (18937162, 3),
 (3530725, 3),
 (7458285, 3),
 (11978336, 3),
 (3743849, 3),
 (10698051, 3),
 (37029661, 1),
 (12119849, 1),
 (62986950, 1),
 (453972, 1),
 (2351309, 1),
 (12434430, 1),
 (11914415, 1),
 (14583962, 1),
 (4835329, 1),
 (5128788, 1),
 (79888370, 1),
 (11021614, 1),
 (6314588, 1),
 (6905829, 1),
 (74529609, 1),
 (64219557, 1)]

In [27]:
print(G.size(), G_without_kw_edges.size(), G_without_kw_edges_vid_nodes.size())

1218166 565302 97744


In [74]:
G_without_kw_edges.size()

571727

In [35]:
keywords = [node for node, data in G.nodes(data=True) if data.get('type') == 'keyword']

# Step 2: Save keywords to a file
with open("keywords.txt", "w") as file:
    for keyword in keywords:
        file.write(f"{keyword}\n")

In [37]:
keywords.__len__()

53048

In [None]:
### rec up by up

In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: Build a TF-IDF matrix based on authors and keywords
def create_tfidf_matrix(graph):
    # Extract author-keyword edges
    author_keyword_edges = [(u, v) for u, v, data in graph.edges(data=True) if graph.nodes[u].get('type') == 'author' and graph.nodes[v].get('type') == 'keyword']
    
    # Create a dictionary of authors and their associated keywords
    author_keywords = {}
    for author, keyword in author_keyword_edges:
        if author not in author_keywords:
            author_keywords[author] = []
        author_keywords[author].append(keyword)
    
    # Convert the dictionary into a list of keyword lists for each author
    author_keyword_lists = [' '.join(keywords) for keywords in author_keywords.values()]
    
    # Step 2: Create the TF-IDF matrix using TfidfVectorizer
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(author_keyword_lists)
    
    return tfidf_matrix, list(author_keywords.keys()), vectorizer

# Step 3: Compute the similarity between the given author and all other authors
def compute_similarity(tfidf_matrix, author_list, given_author, vectorizer):
    # Find the index of the given author
    author_index = author_list.index(given_author)
    
    # Compute cosine similarities between the given author and all others
    cosine_sim = cosine_similarity(tfidf_matrix[author_index], tfidf_matrix).flatten()
    
    # Sort the similarities in descending order and get the most similar authors
    similar_authors_indices = cosine_sim.argsort()[-2:][::-1]  # Top 2, excluding the given author
    similar_authors = [author_list[i] for i in similar_authors_indices]
    similarity_scores = [cosine_sim[i] for i in similar_authors_indices]
    
    return similar_authors, similarity_scores

# Step 4: Find the shortest path between the given author and the recommended author
def find_shortest_path(graph, author1, author2):
    try:
        # Find the shortest path between the two authors
        path = nx.shortest_path(graph, source=author1, target=author2)
        path_length = len(path) - 1
    except nx.NetworkXNoPath:
        path = None
        path_length = None
    
    return path, path_length

# Example usage
# Assuming you already have the graph 'G' built with author-keyword edges

# Create the TF-IDF matrix
tfidf_matrix, author_list, vectorizer = create_tfidf_matrix(G)

# Given an author (replace with a real author name)
given_author = 4401694

# Compute similarity
similar_authors, similarity_scores = compute_similarity(tfidf_matrix, author_list, given_author, vectorizer)

# Get the most similar author
recommended_author = similar_authors[1:6]
similarity_score = similarity_scores[1:6]

# Find the shortest path between the given author and the recommended author
for i in range(len(recommended_author)):
    au = recommended_author[i]
    shortest_path, path_length = find_shortest_path(G, given_author, au)
    # Display the results
    print(f"Recommended Author: {recommended_author[i]}")
    print(f"Similarity Score: {similarity_score[i]}")
    print(f"Shortest Path: {shortest_path}")
    print(f"Path Length: {path_length}")


Recommended Author: 546418
Similarity Score: 0.26127305013244956
Shortest Path: [4401694, 'APPLE', 546418]
Path Length: 2
