In [None]:
import os
import pandas as pd
from tqdm import tqdm
import networkx as nx
import pickle

In [None]:
# load the POS-tagged token counts
total_token_df = pd.read_csv('tokens_with_pos.csv')

# POS tags to keep: nouns, verbs and adjectives
valid_pos_tags = ["n", "nr", "ns", "nt", "nz",
                  "v", "vn",
                  "a", "ad", "an"]

# keep tokens that carry one of the tags above and are longer than one character
has_valid_pos = total_token_df['pos'].isin(valid_pos_tags)
is_multi_char = total_token_df['token'].str.len() > 1
filtered_df = total_token_df[has_valid_pos & is_multi_char]

# select candidate women-related words: tokens containing any of these keywords
female_keyword_pattern = "妇|母|娘|女|妈|婆|奶|姥|姐|妹|姑|姨|嫂|婶|妻|巾帼|太太"
female_df = filtered_df[filtered_df['token'].str.contains(female_keyword_pattern, na=False)]

# order by total count (most frequent first), save the full list, preview the top 10
female_sorted_df = female_df.sort_values(by='total_count', ascending=False)
female_sorted_df.to_csv('total_female_words.csv', index=False)

female_sorted_df.head(10)

Unnamed: 0,token,total_count,pos
3844,妇女,890167,n
3917,女子,252311,n
12243,母亲,234396,n
4700,男女,190554,n
8989,姑娘,167737,n
4921,子女,158636,n
11822,女儿,153111,n
2326,父母,141805,n
6271,妻子,141064,n
20502,妈妈,118755,n


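Manually filter the candidate list (`total_female_words.csv`) by removing:

1. words related to animals;

2. words not related to gender;

3. words that also refer to male figures (e.g. 男女, 父母).

The manually curated list is expected to be saved as `female_words.csv`, which a later cell reads.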

In [None]:
# keep only the top 0.1% most frequent words, i.e. those whose total_count is
# at or above the 99.9th percentile
count_col = filtered_df["total_count"]
threshold = count_col.quantile(0.999)

top_df = filtered_df.loc[count_col >= threshold]

top_df.to_csv('top_words.csv', index=False)

top_df.sort_values(by='total_count', ascending=False)

Unnamed: 0,token,total_count,pos
282,人民,11254859,n
542,中国,9702048,ns
1613,工作,8709047,vn
731,发展,8126300,vn
460,国家,6759309,n
...,...,...,...
16114,极端,110693,n
18459,支出,110576,v
6542,秘书,110393,n
8934,饭店,110337,n


In [None]:
# merge the two word lists into a single de-duplicated research dictionary
female_df = pd.read_csv('female_words.csv')
top_df = pd.read_csv('top_words.csv')

# stack both token columns, drop repeats (first occurrence wins), renumber from 0
token_columns = [female_df['token'], top_df['token']]
unique_tokens = pd.concat(token_columns).drop_duplicates().reset_index(drop=True)

unique_tokens_df = unique_tokens.to_frame(name='token')
unique_tokens_df.to_csv('research_dictionary.csv', index=False, encoding='utf-8')

In [None]:
# reload the research dictionary and turn its tokens into a set for fast membership tests
research_dictionary_path = 'research_dictionary.csv'
unique_tokens_df = pd.read_csv(research_dictionary_path)
unique_tokens_set = set(unique_tokens_df.loc[:, 'token'])

In [None]:
def process_text_with_sliding_window(G, word_list, target_set, window):
    '''
    Add co-occurrence edges to the graph G from one tokenized text.

    A window of `window` consecutive tokens slides over `word_list` one token
    at a time. The window also covers the partial positions at both ends of the
    text (equivalent to padding the list with `window - 1` blanks on each side).
    For every window position, each unordered pair of distinct words from
    `target_set` found in that window gets its edge weight increased by 1, so
    the final weight of an edge is the number of window positions that contain
    both words.

    Parameters
    ----------
    G : graph object, modified in place. Must provide `has_edge(u, v)`,
        `add_edge(u, v, weight=...)` and `G[u][v]["weight"]` (as networkx
        graphs do); `add_edge` is relied upon to create missing nodes.
    word_list : list of tokens in reading order.
    target_set : container of the words of interest (a set gives fast lookups).
    window : window width in tokens.

    Returns
    -------
    None
    '''
    # A window narrower than two tokens can never hold a pair, and a negative
    # width would make the slices below wrap around and create bogus edges.
    if window < 2:
        return

    # Start offsets run from -(window - 1) to len(word_list) - 1: the same
    # window positions as sliding over a list padded on both sides.
    for start in range(1 - window, len(word_list)):
        window_words = word_list[max(start, 0):start + window]

        # De-duplicate while keeping first-seen order, so nodes and edges are
        # inserted in a reproducible order (a plain set depends on hashing).
        valid_words = list(dict.fromkeys(
            word for word in window_words if word in target_set
        ))

        if len(valid_words) < 2:
            continue

        # add edges between the valid words and update the weight
        for j in range(len(valid_words)):
            for k in range(j + 1, len(valid_words)):
                w1, w2 = valid_words[j], valid_words[k]
                if G.has_edge(w1, w2):
                    G[w1][w2]["weight"] += 1
                else:
                    # add_edge also creates w1 / w2 as nodes when missing
                    G.add_edge(w1, w2, weight=1)

In [None]:
# Patch pandas with tqdm's `progress_apply` family of methods so that
# apply-style operations on Series/DataFrames can display a progress bar.
tqdm.pandas()

In [None]:
# Build one co-occurrence graph per CSV file (one file per year) and save each
# graph to its own pickle file.
target_set = unique_tokens_set
window = 5
folder_path = "rmrb_csv_files"
output_folder = "yearly_graphs_new"

os.makedirs(output_folder, exist_ok=True)

# os.listdir returns entries in arbitrary order; sort so every run processes
# the files in the same sequence.
csv_files = sorted(f for f in os.listdir(folder_path) if f.endswith(".csv"))

# Only one progress bar (over files) is shown: nesting a second console bar per
# file garbles the notebook output.
for file_name in tqdm(csv_files, desc="Processing files"):
    file_path = os.path.join(folder_path, file_name)

    # The year is the text between the first underscore and the next dot,
    # i.e. this assumes names shaped like "<prefix>_<year>.csv".
    year = file_name.split('_')[1].split('.')[0]

    G = nx.Graph()

    # Only the tokenized text is used here, so skip loading the other columns.
    df = pd.read_csv(file_path, usecols=["content_tokenized"])
    df["content_tokenized"] = df["content_tokenized"].fillna("").astype(str)

    # Each row is a comma-separated token string; feed it into the shared graph.
    # A plain loop is used because the call is made only for its side effect on G.
    for tokenized_text in df["content_tokenized"]:
        process_text_with_sliding_window(G, tokenized_text.split(','), target_set, window)

    output_file = os.path.join(output_folder, f"graph_{year}.pkl")
    with open(output_file, "wb") as f:
        pickle.dump(G, f)

print(f"All yearly graphs have been saved to the folder: {output_folder}")

100%|██████████| 222859/222859 [01:58<00:00, 1877.58it/s]
100%|██████████| 178139/178139 [01:46<00:00, 1676.45it/s]26.01s/it]
100%|██████████| 159671/159671 [02:37<00:00, 1010.71it/s]18.22s/it]
100%|██████████| 172305/172305 [02:17<00:00, 1255.48it/s]40.24s/it]
100%|██████████| 188542/188542 [01:44<00:00, 1806.69it/s]41.87s/it]
100%|██████████| 240929/240929 [02:16<00:00, 1766.82it/s]30.62s/it]
100%|██████████| 201955/201955 [01:46<00:00, 1897.63it/s]35.36s/it]
100%|██████████| 217369/217369 [02:21<00:00, 1535.26it/s]28.03s/it]
100%|██████████| 88754/88754 [01:57<00:00, 753.49it/s], 134.96s/it]
100%|██████████| 58694/58694 [01:20<00:00, 725.44it/s], 131.29s/it]
100%|██████████| 66293/66293 [01:22<00:00, 805.23it/s]8, 116.85s/it]
100%|██████████| 100607/100607 [01:56<00:00, 861.49it/s] 107.34s/it]
100%|██████████| 239622/239622 [02:18<00:00, 1726.93it/s]111.83s/it]
100%|██████████| 187373/187373 [01:45<00:00, 1768.52it/s]122.41s/it]
100%|██████████| 217119/217119 [11:15<00:00, 321.22it/

All yearly graphs have been saved to the folder: yearly_graphs_new





In [None]:
# Build one co-occurrence graph per calendar month and save each graph to its
# own pickle file.
target_set = unique_tokens_set
window = 5
folder_path = "rmrb_csv_files"
output_folder = "monthly_graphs_new"

os.makedirs(output_folder, exist_ok=True)

# os.listdir returns entries in arbitrary order; sort so every run processes
# the files in the same sequence.
csv_files = sorted(f for f in os.listdir(folder_path) if f.endswith(".csv"))

# Only one progress bar (over files) is shown: stacking per-month and per-row
# console bars inside it floods the notebook output with control characters.
for file_name in tqdm(csv_files, desc="Processing files"):
    file_path = os.path.join(folder_path, file_name)

    # Only the date and the tokenized text are used, so skip the other columns.
    df = pd.read_csv(file_path, usecols=["date", "content_tokenized"])
    df["content_tokenized"] = df["content_tokenized"].fillna("").astype(str)

    df["date"] = pd.to_datetime(df["date"])

    # group by month
    df["year_month"] = df["date"].dt.to_period("M")
    monthly_groups = df.groupby("year_month")

    # iterate through each month, building a fresh graph for each one
    for year_month, group in monthly_groups:
        G = nx.Graph()

        # Each row is a comma-separated token string. A plain loop is used
        # because the call is made only for its side effect on G.
        for tokenized_text in group["content_tokenized"]:
            process_text_with_sliding_window(G, tokenized_text.split(','), target_set, window)

        output_file = os.path.join(output_folder, f"graph_{year_month}.pkl")
        with open(output_file, "wb") as f:
            pickle.dump(G, f)

print(f"All monthly graphs have been saved to the folder: {output_folder}")


Processing files:   0%|          | 0/58 [00:00<?, ?it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
100%|██████████| 8838/8838 [00:04<00:00, 2079.74it/s]

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
100%|██████████| 10264/10264 [00:04<00:00, 2200.36it/s]

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
100%|██████████| 12770/12770 [00:07<00:00, 1628.80it/s]

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[

All monthly graphs have been saved to the folder: monthly_graphs_new



