## 时间戳转换

In [15]:
from datetime import datetime

# Convert human-readable date to Unix timestamp
def date_to_timestamp(year, month, day):
    """Return the Unix timestamp, in whole seconds, of midnight on the given date.

    The date is built as a naive ``datetime``, so it is interpreted in the
    local time zone of the machine running the code.
    """
    return int(datetime(year, month, day).timestamp())

# Convert Unix timestamp to human-readable date
def timestamp_to_date(timestamp):
    """Return ``(year, month, day)`` for a Unix timestamp.

    The timestamp is converted with ``datetime.fromtimestamp``, i.e. into
    the local time zone of the machine running the code.
    """
    local_dt = datetime.fromtimestamp(timestamp)
    return local_dt.year, local_dt.month, local_dt.day

## 一组句子中相同的最长连续序列

### 测试（已弃用）

In [12]:
from collections import defaultdict

# Path of the stop-word file handed to find_common_substrings in the example below.
stopwords_path = r"D:\毕业论文\bilibili_data\bilibili_data\data\addition\stopwords_cn.txt"
    
def find_common_substrings(sentences, stopwords: str = '', filter: int = 0, min_length: int = 2, max_length: int = 5):
    """Count the contiguous substrings that occur in a group of sentences.

    Each sentence is split into its individual elements (characters, for a
    string). Every run of ``min_length`` to ``max_length`` consecutive
    elements (both bounds inclusive) is joined into a substring and counted
    once per occurrence. A substring is ignored when it is a stop word
    itself or when any of its elements is a stop word.

    Args:
        sentences: Iterable of sentences to scan.
        stopwords: Path of a UTF-8 stop-word file with one entry per line.
            The default ``''`` disables stop-word filtering.
        filter: Minimum count a substring needs to be returned. ``0`` (the
            default) returns every counted substring.
        min_length: Shortest substring length to count. Values below 1 are
            treated as 1 so that empty substrings are never counted.
        max_length: Longest substring length to count.

    Returns:
        The ``defaultdict`` of all counts when ``filter`` is 0, otherwise a
        plain ``dict`` holding only substrings whose count is >= ``filter``.
    """
    # A set gives O(1) membership tests; it stays empty when no path is given.
    stopword_set = set()
    if stopwords != '':
        with open(stopwords, 'r', encoding='utf-8') as f:
            stopword_set = set(f.read().splitlines())

    shortest = max(min_length, 1)
    common_substrings_count = defaultdict(int)

    for sentence in sentences:
        words = list(sentence)
        n = len(words)

        for i in range(n):
            # j is an exclusive slice end, so it must be allowed to reach n;
            # otherwise substrings ending on the last element are lost.
            for j in range(i + shortest, min(n, i + max_length) + 1):
                window = words[i:j]
                if any(k in stopword_set for k in window):
                    continue
                substring = ''.join(window)
                if substring not in stopword_set:
                    common_substrings_count[substring] += 1

    if filter == 0:
        return common_substrings_count
    # Keep only the substrings seen at least `filter` times.
    return {
        substring: count
        for substring, count in common_substrings_count.items()
        if count >= filter
    }

# Example: call find_common_substrings with filter=2, min_length=2 and
# max_length=3 on four sample sentences, then print each returned substring
# with its count.
sentences = [
    "我是谁",
    "你说我是谁",
    "我是天下第一",
    "天下第一是谁"
]

common_substrings = find_common_substrings(sentences, stopwords_path, 2, 2, 3)
print("Common substrings and their counts:")
for substring, count in common_substrings.items():
    print(f"{substring}: {count}")

Common substrings and their counts:
天下: 2
天下第: 2
下第: 2


### 优化

In [None]:
from collections import defaultdict

# Path of the stop-word file.
stopwords_path = r"D:\毕业论文\bilibili_data\bilibili_data\data\addition\stopwords_cn.txt"

# Load the stop words into a set, one entry per line of the file.
with open(stopwords_path, 'r', encoding='utf-8') as f:
    stopwords = set(f.read().splitlines())

def find_common_substrings(sentences, filter=0, min_length=2, max_length=5):
    """Count the stop-word-free contiguous substrings in a group of sentences.

    Each sentence is split into its individual elements (characters, for a
    string). Every run of ``min_length`` to ``max_length`` consecutive
    elements (both bounds inclusive) is joined into a substring and counted
    once per occurrence, unless one of its elements is in the module-level
    ``stopwords`` set.

    Args:
        sentences: Iterable of sentences to scan.
        filter: Minimum count a substring needs to be returned. ``0`` (the
            default) returns every counted substring.
        min_length: Shortest substring length to count. Values below 1 are
            treated as 1 so that empty substrings are never counted.
        max_length: Longest substring length to count.

    Returns:
        The ``defaultdict`` of all counts when ``filter`` is 0, otherwise a
        plain ``dict`` holding only substrings whose count is >= ``filter``.
    """
    shortest = max(min_length, 1)
    common_substrings_count = defaultdict(int)

    for sentence in sentences:
        words = list(sentence)
        n = len(words)

        for i in range(n):
            # j is an exclusive slice end, so it must be allowed to reach n;
            # otherwise substrings ending on the last element are lost.
            for j in range(i + shortest, min(n, i + max_length) + 1):
                window = words[i:j]
                # Skip any candidate that contains a stop word.
                if any(char in stopwords for char in window):
                    continue
                common_substrings_count[''.join(window)] += 1

    if filter == 0:
        return common_substrings_count
    # Keep only the substrings seen at least `filter` times.
    return {
        substring: count
        for substring, count in common_substrings_count.items()
        if count >= filter
    }


### 多线程（最终版本）

In [7]:
##滑动窗口
def find_subsequences(sentence, window_size):
    """Slide a window of ``window_size`` elements over ``sentence``.

    Returns a list of ``(window, start, end)`` tuples in left-to-right
    order, where ``window`` is ``sentence[start:end]`` and
    ``end == start + window_size``. The list is empty when the sentence is
    shorter than the window.
    """
    last_start = len(sentence) - window_size
    return [
        (sentence[start:start + window_size], start, start + window_size)
        for start in range(last_start + 1)
    ]

# Example: list every 3-character window of the sentence together with its
# start and end offsets.
sentence = "我是天下第一"
window_size = 3
subsequences = find_subsequences(sentence, window_size)
print(f"Subsequences of size {window_size}: {subsequences}")

Subsequences of size 3: [('我是天', 0, 3), ('是天下', 1, 4), ('天下第', 2, 5), ('下第一', 3, 6)]


In [33]:
from collections import defaultdict
import concurrent.futures


# Path of the stop-word file.
stopwords_path = r"D:\毕业论文\bilibili_data\bilibili_data\data\addition\stopwords_cn.txt"

# Load the stop words into a set, one entry per line of the file.
with open(stopwords_path, 'r', encoding='utf-8') as f:
    stopwords = set(f.read().splitlines())

def judge_max_common_sentence(location, max_common_substrings_location):
    """Report whether ``location`` lies outside every recorded span.

    Args:
        location: ``(start, end)`` pair of the candidate substring.
        max_common_substrings_location: Previously recorded ``(start, end)``
            spans.

    Returns:
        ``False`` when some recorded span fully contains ``location``
        (``span_start <= start`` and ``end <= span_end``); ``True``
        otherwise, including when no span has been recorded yet.
    """
    return not any(
        span[0] <= location[0] and location[1] <= span[1]
        for span in max_common_substrings_location
    )

import threading

# Serialises the check-then-update on the counter that all worker threads
# share; without it two threads can both see a substring as "new" and one
# increment is lost.
_COUNT_LOCK = threading.Lock()


def process_sentence(sentence, min_length, max_length, common_substrings_count):
    """Add one sentence's stop-word-free substrings to the shared counter.

    Windows are examined from the longest allowed size down to
    ``min_length`` (both ``min_length`` and ``max_length`` inclusive). For
    each window that is not a stop word and contains no stop-word character:

    * if the substring has never been counted before (by any sentence), it
      is recorded with count 1;
    * otherwise it is counted only when its span is not contained in a span
      already counted for this sentence, so the parts of a longer match are
      not counted again.

    Args:
        sentence: Text to scan; newline characters are removed first.
        min_length: Shortest window size. Values below 1 are treated as 1.
        max_length: Longest window size.
        common_substrings_count: Mapping from substring to count, updated in
            place. It may be shared with other threads.

    Returns:
        None. The result is the mutation of ``common_substrings_count``.
    """
    sentence = sentence.replace('\n', '')
    counted_spans = []  # (start, end) spans already counted in this sentence
    longest = min(max_length, len(sentence))
    shortest = max(min_length, 1)

    # Longest windows first, so a long match is recorded before its parts.
    for window_size in range(longest, shortest - 1, -1):
        for substring, start, end in find_subsequences(sentence, window_size):
            if substring in stopwords or any(char in stopwords for char in substring):
                continue
            with _COUNT_LOCK:
                if substring not in common_substrings_count:
                    common_substrings_count[substring] = 1
                elif judge_max_common_sentence((start, end), counted_spans):
                    common_substrings_count[substring] += 1
                else:
                    # Covered by a longer substring already counted here.
                    continue
            counted_spans.append((start, end))



def find_common_substrings(sentences, filter=0, min_length=2, max_length=50):
    """Count common substrings across ``sentences`` using a thread pool.

    One ``process_sentence`` task is submitted per sentence; all tasks
    update the same counter. Because the tasks share that counter, the
    counts may depend on the order in which sentences are processed.

    Args:
        sentences: Iterable of sentences to scan.
        filter: Minimum count a substring needs to be returned. ``0`` (the
            default) returns every counted substring.
        min_length: Passed through to ``process_sentence``.
        max_length: Passed through to ``process_sentence``.

    Returns:
        The ``defaultdict`` of all counts when ``filter`` is 0, otherwise a
        plain ``dict`` holding only substrings whose count is >= ``filter``.

    Raises:
        Exception: Whatever a worker task raised is re-raised here.
    """
    common_substrings_count = defaultdict(int)

    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(process_sentence, sentence, min_length, max_length, common_substrings_count)
            for sentence in sentences
        ]
        # result() blocks until the task is done and re-raises its exception;
        # wait() alone would let a failed task pass unnoticed.
        for future in futures:
            future.result()

    if filter == 0:
        return common_substrings_count

    return {
        substring: count
        for substring, count in common_substrings_count.items()
        if count >= filter
    }

# Test: keep only the substrings counted at least twice (filter=2) across
# these four sentences and print the result.
sentences = ["我喜欢吃苹果和橙子", "她也喜欢吃苹果但更喜欢橙子", "他不喜欢吃苹果也不喜欢橙子", "我喜欢吃苹果也喜欢橙子"]
result = find_common_substrings(sentences, filter=2)
print(result)

{'喜欢吃苹果': 4, '喜欢橙子': 3}


## 按时间排列

In [17]:
import pandas as pd

# Read the CSV file.
df = pd.read_csv(r"D:\毕业论文\bilibili_data\bilibili_data\data\results\merged_data_with_time_all.csv")

# Sort the rows by the "ctime" column in ascending order.
df_sorted = df.sort_values(by="ctime")

# Write the sorted rows to a new CSV file, without the index column.
df_sorted.to_csv(r"D:\毕业论文\bilibili_data\bilibili_data\data\results\merged_data_with_time_all_sorted.csv", index=False)


## 按点赞数排列

In [1]:
import pandas as pd

# Read the CSV file.
df = pd.read_csv(r"D:\毕业论文\bilibili_data\bilibili_data\data\results\merged_data_with_time_all.csv")

# Sort the rows by the "like" column in descending order.
df_sorted = df.sort_values(by="like", ascending=False)

# Write the sorted rows to a new CSV file, without the index column.
df_sorted.to_csv(r"D:\毕业论文\bilibili_data\bilibili_data\data\results\merged_data_with_time_all_sorted_like.csv", index=False)


In [2]:
# Read only the first 10000 rows of the CSV file.
df = pd.read_csv(r"D:\毕业论文\bilibili_data\bilibili_data\data\results\merged_data_with_time_all_sorted_like.csv", nrows=10000)

# Write those 10000 rows to a new CSV file, without the index column.
df.to_csv(r"D:\毕业论文\bilibili_data\bilibili_data\data\results\merged_data_with_time_all_sorted_like_10000.csv", index=False)

In [3]:
import pandas as pd
# Read only the first 100 rows of the CSV file.
df = pd.read_csv(r"D:\毕业论文\bilibili_data\bilibili_data\data\results\merged_data_with_time_all_sorted_like.csv", nrows=100)

# Write those 100 rows to a new CSV file, without the index column.
df.to_csv(r"D:\毕业论文\bilibili_data\bilibili_data\data\results\merged_data_with_time_all_sorted_like_100.csv", index=False)

## 一组句子中相同的最长连续序列——>批量处理

In [None]:
import csv


# Collect the "content" of every row whose "ctime" is not later than the
# cutoff. The cutoff is computed once instead of once per row.
cutoff_timestamp = date_to_timestamp(2024, 2, 3)

com_sentences = []
# The file is written by pandas, whose to_csv default encoding is UTF-8, so
# it is decoded as UTF-8 here instead of with the platform default.
# newline='' lets the csv module handle line breaks inside quoted fields.
with open(r'D:\毕业论文\bilibili_data\bilibili_data\data\results\merged_data_with_time_all_sorted.csv', 'r', encoding='utf-8', newline='') as file:
    reader = csv.DictReader(file)
    for row in reader:
        if int(row['ctime']) <= cutoff_timestamp:
            com_sentences.append(row['content'])

In [34]:
import csv
from datetime import datetime, timedelta

# For every day from start_date to end_date (inclusive), collect the comments
# created during that day, count their common substrings and write the counts,
# highest first, to one CSV file named after that day.
start_date = datetime(2024, 2, 2)
end_date = datetime(2024, 3, 8)

current_date = start_date
while current_date <= end_date:
    # yesterday_data is the day being processed; current_date becomes the
    # midnight that ends it.
    yesterday_data = current_date
    current_date += timedelta(days=1)
    year, month, day = current_date.year, current_date.month, current_date.day

    # Window bounds are computed once per day rather than once per row.
    window_start = date_to_timestamp(yesterday_data.year, yesterday_data.month, yesterday_data.day)
    window_end = date_to_timestamp(year, month, day)

    com_sentences = []
    # The file is written by pandas, whose to_csv default encoding is UTF-8,
    # so it is decoded as UTF-8 here instead of with the platform default.
    # newline='' lets the csv module handle line breaks inside quoted fields.
    with open(r'D:\毕业论文\bilibili_data\bilibili_data\data\results\merged_data_with_time_all_sorted.csv', 'r', encoding='utf-8', newline='') as file:
        reader = csv.DictReader(file)
        for row in reader:
            # NOTE(review): both bounds are inclusive, so a row stamped exactly
            # at midnight is included in two consecutive days.
            if window_start <= int(row['ctime']) <= window_end:
                com_sentences.append(row['content'])

    com_dict = find_common_substrings(com_sentences, 2, 2, 50)
    # Sort the substrings by count, highest first, and write them to CSV.
    com_dict_sorted = sorted(com_dict.items(), key=lambda x: x[1], reverse=True)
    com_data_sorted = pd.DataFrame(com_dict_sorted, columns=['common_substrings', 'count'])
    # Name the file after the processed day. Using `day - 1` produced
    # "2024-3-0.csv" for 29 February, because day is 1 on the first of a month.
    file_path = "D:/毕业论文/bilibili_data/bilibili_data/data/results/common_sentence/%s-%s-%s.csv" % (yesterday_data.year, yesterday_data.month, yesterday_data.day)
    com_data_sorted.to_csv(file_path, mode='w', index=False)