In [None]:
import os
import pandas as pd
import py7zr
import shutil
from tqdm import tqdm
import regex as re
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix
import numpy as np

In [None]:
def extract_md_files(archive_path, extract_to):
    """Extract the markdown members of a 7z archive.

    Args:
        archive_path: Path of the ``.7z`` archive to open.
        extract_to: Directory the ``.md`` members are extracted into.

    Only members whose name ends with ``.md`` are extracted. When the
    archive has no such member, a notice is printed and nothing is written.
    """
    with py7zr.SevenZipFile(archive_path, 'r') as archive:
        markdown_members = [name for name in archive.getnames() if name.endswith('.md')]
        if not markdown_members:
            print(f"No markdown files to extract in {archive_path}.")
            return
        archive.extract(targets=markdown_members, path=extract_to)

def read_md_files_from_folder(folder_path):
    """Read every ``.md`` file found under ``folder_path``.

    Args:
        folder_path: Root directory to search recursively.

    Returns:
        A list with the text of each markdown file, decoded as UTF-8.
        Directories and files are visited in sorted name order so the
        result does not depend on the filesystem's listing order.
    """
    md_contents = []
    for root, dirs, files in os.walk(folder_path):
        # Sorting `dirs` in place steers the (top-down) walk itself, which
        # makes the order of the returned contents reproducible.
        dirs.sort()
        for filename in sorted(files):
            if not filename.endswith('.md'):
                continue
            file_path = os.path.join(root, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                md_contents.append(file.read())
    return md_contents

# NOTE(review): machine-specific absolute path; point it at the local copy of
# the archive folder before running.
folder_path = '/Users/caojie/Desktop/MACSS/2024 spring/Perspective3/rmrb-master/7z'
extract_to = 'mediate'

# Always start from an empty scratch directory: files left behind by an
# interrupted earlier run would otherwise be read back and attributed to the
# first archive processed below.
if os.path.exists(extract_to):
    shutil.rmtree(extract_to)
os.makedirs(extract_to)

data = []
all_files = []

# Collect the March ("03月") archives; sorted so the row order of the
# resulting DataFrame/CSV is reproducible across machines.
for root, dirs, files in os.walk(folder_path):
    for file in files:
        if file.endswith('.7z') and "03月" in file:
            all_files.append(os.path.join(root, file))
all_files.sort()

for file_path in tqdm(all_files, desc="Processing .7z files"):
    try:
        extract_md_files(file_path, extract_to)
        md_contents = read_md_files_from_folder(extract_to)
    finally:
        # Reset the scratch directory even when extraction fails, so one
        # archive's files can never leak into the next one.
        shutil.rmtree(extract_to, ignore_errors=True)
        os.makedirs(extract_to, exist_ok=True)

    if md_contents:
        archive_name = os.path.basename(file_path)
        data.extend({'filename': archive_name, 'content': content} for content in md_contents)
    else:
        print(f"No Markdown files found in: {file_path}")

if not data:
    print("No data extracted from any files.")
else:
    print(f"Extracted data from {len(data)} entries.")

# Explicit columns keep the expected schema even when nothing was extracted,
# so later cells do not fail with a KeyError on 'filename' / 'content'.
df = pd.DataFrame(data, columns=['filename', 'content'])

if df.empty:
    print("DataFrame is empty.")
else:
    print("DataFrame created successfully.")

df.to_csv('rmrb_march.csv', index=False)
shutil.rmtree(extract_to)

In [None]:
# Year comes from the archive file name (four digits followed by "年");
# expand=False returns a Series, so a plain column is assigned.
df['year'] = df['filename'].str.extract(r'(\d{4})年', expand=False)

# Keep only the characters in the \u4e00-\u9fa5 range of each article.
han_char_pattern = re.compile(r'[\u4e00-\u9fa5]')
df['content'] = df['content'].apply(lambda text: ''.join(han_char_pattern.findall(text)))

df = df.drop(columns='filename')

# Keep the articles that mention "女". The explicit copy makes the result an
# independent frame, so later column assignments do not raise
# SettingWithCopyWarning or write into a temporary slice.
df = df[df['content'].str.contains('女', na=False)].copy()

In [None]:
# Stop words: one entry per line of the Baidu stop-word file.
with open('/Users/caojie/Desktop/MACSS/2024 fall/data mining/week3/rmrb-master/baidu_stopwords.txt', 'r', encoding='utf-8') as file:
    stopwords = {line.strip() for line in file}

# Matches any token containing a Chinese numeral character.
chinese_number_pattern = re.compile(r'[零一二两三四五六七八九十百千万亿]')


def _segment_to_csv(text):
    """Segment ``text`` with jieba and return the kept tokens joined by commas.

    A token is dropped when it is a stop word, is a single character, or
    contains a Chinese numeral character.
    """
    kept_tokens = []
    for token in jieba.cut(text):
        if token in stopwords or len(token) <= 1 or chinese_number_pattern.search(token):
            continue
        kept_tokens.append(token)
    return ','.join(kept_tokens)


df['content'] = df['content'].apply(_segment_to_csv)

In [None]:
# Fit TF-IDF on the comma-joined token strings (vocabulary capped with
# max_features=5000) and store the matrix as its raw CSR components.
documents = df['content'].tolist()
vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = vectorizer.fit_transform(documents)
sparse_matrix = csr_matrix(tfidf_matrix)

csr_components = {
    'data': sparse_matrix.data,
    'indices': sparse_matrix.indices,
    'indptr': sparse_matrix.indptr,
    'shape': sparse_matrix.shape,
}
np.savez_compressed("rmrb_march_tfidf_matrix.npz", **csr_components)