# 下载原始txt文件，转换为csv格式 

In [1]:
import re
import pandas as pd

def parse_chat_file(file_path):
    """Parse an exported chat log into a list of message dicts.

    A line of the form ``YYYY-MM-DD HH:MM:SS <sender>`` (with an optional
    trailing colon) starts a new message. Every following line up to the next
    header belongs to that message: a line starting with ``引用:`` is stored
    as the message's special content (a later one overwrites an earlier one),
    any other line is stripped and appended to the message content followed
    by a single space. Lines that appear before the first header are ignored.

    Args:
        file_path: Path to the UTF-8 encoded text export (a leading BOM is
            accepted).

    Returns:
        A list of dicts with the keys ``发送人``, ``消息内容``, ``特殊内容``
        and the six integer timestamp fields ``消息时间年`` ... ``消息时间秒``.
    """
    header_pattern = re.compile(
        r'^(\d{4})-(\d{2})-(\d{2}) (\d{2}):(\d{2}):(\d{2}) (.+?):?$'
    )
    time_keys = ('消息时间年', '消息时间月', '消息时间日',
                 '消息时间时', '消息时间分', '消息时间秒')

    messages = []
    current = None  # message being assembled; None until the first header
    # 'utf-8-sig' strips a leading BOM. With plain 'utf-8' the BOM stays glued
    # to the first line, the header regex fails and the first message is lost.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        for line in file:
            match = header_pattern.match(line)
            if match:
                if current is not None:
                    messages.append(current)
                *time_parts, sender = match.groups()
                current = {'发送人': sender, '消息内容': '', '特殊内容': ''}
                current.update(zip(time_keys, map(int, time_parts)))
                continue

            if current is None:
                # Text before the first header has no message to attach to.
                continue

            text = line.strip()
            if text.startswith('引用:'):
                current['特殊内容'] = text
            else:
                current['消息内容'] += text + ' '

    # Flush the last message, which has no following header to trigger it.
    if current is not None:
        messages.append(current)
    return messages

def create_dataframe(messages):
    """Build a DataFrame holding the chat columns in a fixed order.

    Args:
        messages: List of message dicts keyed by the column names below.
            Keys other than these columns are dropped.

    Returns:
        A DataFrame with exactly the nine chat columns, in the order listed.
        An empty ``messages`` list yields an empty frame that still has the
        columns, so it can be exported with a header row.
    """
    columns = ['发送人', '消息内容', '特殊内容', '消息时间年', '消息时间月',
               '消息时间日', '消息时间时', '消息时间分', '消息时间秒']
    # Passing ``columns`` to the constructor keeps the header for empty input;
    # selecting the columns afterwards raises KeyError on a column-less frame.
    return pd.DataFrame(messages, columns=columns)

def main():
    """Convert the raw chat export ``4.24bar.txt`` into ``output.csv``."""
    output_file = 'output.csv'
    # Parse the text export and arrange the result into ordered columns.
    frame = create_dataframe(parse_chat_file('4.24bar.txt'))
    # The BOM written by 'utf-8-sig' lets Excel detect the encoding.
    frame.to_csv(output_file, index=False, encoding='utf-8-sig')
    print(f"导出成功，保存为 {output_file}")


if __name__ == "__main__":
    main()

导出成功，保存为 output.csv


# 对数据进行进一步清洗：把撤回、拍一拍等系统消息统一归到发送人“系统”名下

In [2]:
# Names of the real chat participants. Rows whose sender is not in this list
# are reassigned to the pseudo-sender "系统" by process_csv below.
valid_senders = ['dimi', '张钰瑶', '绍煜', '四辩杨婧', '徐铭', '张润涵', '刘垭君', '王佳', '郑欣宇', '凯文师兄', '梓卉师姐', 'Oreo（看到我请提醒我学雅思）']

def process_csv(file_path, valid_senders):
    """Reassign rows from unknown senders to the pseudo-sender "系统".

    For every row whose ``发送人`` is not in ``valid_senders``, the original
    sender and message content are joined as ``"<sender>: <content>"`` and
    written to ``特殊内容``, and ``发送人`` is replaced by ``系统``. The result
    is saved to ``outputpro.csv`` in the current working directory.

    Args:
        file_path: Path of the CSV file to clean.
        valid_senders: List of sender names that are kept unchanged.
    """
    text_columns = ('发送人', '消息内容', '特殊内容')
    # Read the text columns as strings: an all-empty column would otherwise be
    # inferred as float64 and could not hold the text written below.
    df = pd.read_csv(file_path, dtype={column: str for column in text_columns})

    is_system = ~df['发送人'].isin(valid_senders)
    # Empty CSV cells come back as NaN; treat them as empty text so that the
    # string concatenation cannot fail on a float.
    sender = df.loc[is_system, '发送人'].fillna('')
    content = df.loc[is_system, '消息内容'].fillna('')
    df.loc[is_system, '特殊内容'] = sender + ': ' + content
    df.loc[is_system, '发送人'] = '系统'

    output_file = 'outputpro.csv'  # change the file name/format here if needed
    df.to_csv(output_file, index=False, encoding='utf-8-sig')
    print(f"修改完成，保存为 {output_file}")

def main():
    """Run the sender clean-up on ``output.csv``."""
    process_csv('output.csv', valid_senders)


if __name__ == "__main__":
    main()

修改完成，保存为 outputpro.csv


# 特殊导出：统计每个人每天的发言频数，方便进一步绘图
需要注意的是，这里的消息日期经过特殊处理：仅保留 yy/mm/dd 格式的一列，列名为“消息日期”；保存方式也改为了 xlsx。

In [4]:
import pandas as pd

# 1. Load the cleaned chat log (a CSV file; only the export below is Excel).
# NOTE: per the original author's comment, this was run against a manually
# prepared copy of the file that already contained a '消息日期' column.
df = pd.read_csv('outputpro.csv')

# 2. Count messages per calendar day and sender.
# Use the hand-made '消息日期' column when the file has one; otherwise build
# the date from the separate year/month/day columns.
if '消息日期' in df.columns:
    df['消息日期'] = pd.to_datetime(df['消息日期'])
else:
    date_parts = df[['消息时间年', '消息时间月', '消息时间日']].rename(
        columns={'消息时间年': 'year', '消息时间月': 'month', '消息时间日': 'day'}
    )
    # to_datetime assembles dates from a frame with year/month/day columns.
    df['消息日期'] = pd.to_datetime(date_parts)
daily_message_count = df.groupby([df['消息日期'].dt.date, '发送人']).size().unstack(fill_value=0)

# 3. Add the per-day total next to the per-sender counts and turn the date
#    index back into an ordinary column.
daily_message_count['总发言数量'] = daily_message_count.sum(axis=1)
daily_message_count.reset_index(inplace=True)

# 4. Export to Excel. ``to_excel`` no longer accepts an ``encoding`` argument
#    (removed in pandas 2.0), and xlsx files do not need one.
output_file = 'daily_message_count_with_individual_counts.xlsx'
daily_message_count.to_excel(output_file, index=False)
print(f"导出成功，保存为 {output_file}")


导出成功，保存为 daily_message_count_with_individual_counts.xlsx
