In [42]:
# create a word cloud from the review comments
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from matplotlib.font_manager import FontProperties
import matplotlib.ticker as mtick
import string
import re
import os
import pandas as pd
import numpy as np
font = FontProperties()
font.set_family('serif')
font.set_name('Times New Roman')

In [43]:
colormap ={
    # 'Male only':"rgb(20, 30, 70)",
    'Male only':"rgb(9,78,190)",
    'withFemale':"rgb(199, 0, 57)",
}

In [44]:
# load all data
exportfolder = "/group/geog_pyloo/XY/01_gender/_data/wordmerged"
varils = ['living', 'people', 'mobility', 'economy', 'governance', 'city','ai','environment','all']
graphicfolder = '/group/geog_pyloo/XY/01_gender/_graphics'


In [55]:
import pandas as pd
import os
import json

# 加载 dropwords
with open('dropwords.json') as json_file:
    dropwords = json.load(json_file)

# 定义更多需要排除的单词
morestopwords = [
    'go', 'got', 'even', 'really', 'around', 'much', 'definitely', 'Iam', 'place', 'first', 'time'
]
sciencestopwords = [
    'artificial', 'intelligence', 'data', 'science', 'machine', 'learning', 'deep', 'neural', 'network', 'nlp', 'ai',
    'computing', 'future', 'ml', 'ML', 'technology', 'technologies', 'technology', 'technological', 'technologically',
    'gpu', 'cpu' , 'review', 'researcher', 'literature', 'research', 'paper', 'study', 'studies', 'authors',
    'author', 'resnet', 'method', 'internet', 'introduction', 'conclusion', 'abstract', 'result', 'results', 'discussion', 'discussions',
    'show', 'present', 'proof', 'evidence', 'prove', 'shown', 'case','eg','ie','analysis','org','reveal','propose',
    'application', 'technique', 'techniques', 'approach', 'approaches', 'methodology', 'methodologies', 'model', 'models',
    'framework', 'published', 'recent', 'year', 'early', 'new', 'current', 'state', 'art', 'stateoftheart', 'stateofthearts', 'reserved', 'right', 'development', 'rapid', 'rapidly', 'fast', 'faster', 'slow', 'slower', 'slowly', 'quickly',
    'use', 'based', 'using', 'support vector machine', 'support vector machines', 'license', 'tech', 'important', 'enable', 'italy', 
    'support', 'vector', 'random forest', 'specifity', 'sensitivity', 'random', 'robustness', 'robust', 'random',
    'algorithm', 'algorithms', 'algorithmic', 'approach', 'approaches', 'method', 'methods', 'methodology', 'methodologies',
    'natural language processing'
]

# 合并所有需要排除的单词和双词
all_dropwords = set(dropwords + morestopwords + sciencestopwords)

# 加载和过滤数据
def load_data(keyword, exportfolder, dropna=False):
    df = pd.read_csv(os.path.join(exportfolder, f'gendercomp_{keyword}_big.csv'))
    
    # 过滤掉包含指定单词的双词
    def should_drop(text):
        words = text.split()
        return any(word.lower() in all_dropwords for word in words)
    
    df_update = df[~df['text'].apply(should_drop)].reset_index(drop=True)
    
    if dropna:
        df_update = df_update.dropna(subset=['freq_male', 'freq_female'])
    else:
        df_update['freq_male'] = df_update['freq_male'].fillna(0)
        df_update['freq_female'] = df_update['freq_female'].fillna(0)
    
    d_male = dict(zip(df_update['text'], df_update['freq_male']))
    d_female = dict(zip(df_update['text'], df_update['freq_female']))
    
    return d_male, d_female

In [47]:
def light_blue_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    return colormap['Male only']

def light_red_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    return colormap['withFemale']

def gen_wordcloud(keyword, taskdict, gender):
        wordcloud = WordCloud(width=2000, height=1200, margin=20, max_font_size=200, min_font_size=20,
                        background_color="white",prefer_horizontal=1, color_func=taskdict[gender][1]
                        )
        wordcloud.fit_words(taskdict[gender][0])
        
        plt.figure()
        plt.imshow(wordcloud, interpolation="bilinear")
        plt.axis("off")
        plt.show()
                # savefig
        plt.savefig(os.path.join(graphicfolder, f'wordcloud_delta_{keyword}_{gender}.png'), 
                dpi=300, bbox_inches='tight')

In [48]:
def revise_rank(gendercomp):
    gendercomp['freqxPMI_male_rank'] = gendercomp['freqxPMI_male'].rank(ascending = True, method = 'dense')
    gendercomp['freqxPMI_female_rank'] = gendercomp['freqxPMI_female'].rank(ascending = True, method = 'dense')
    # normalize the rank to 1 - 100
    gendercomp['freqxPMI_male_rank_score'] = (gendercomp['freqxPMI_male_rank'] 
                                              - gendercomp['freqxPMI_male_rank'].min())/(gendercomp['freqxPMI_male_rank'].max() - gendercomp['freqxPMI_male_rank'].min())*100
    gendercomp['freqxPMI_female_rank_score'] = (gendercomp['freqxPMI_female_rank'] 
                                                - gendercomp['freqxPMI_female_rank'].min())/(gendercomp['freqxPMI_female_rank'].max() 
                                                                                             - gendercomp['freqxPMI_female_rank'].min())*100
    gendercomp['rank_female_male_score'] = gendercomp.apply(lambda row:
    row['freqxPMI_female_rank_score'] - row['freqxPMI_male_rank_score'], axis = 1)
    gendercomp['abs_rank_female_male_score'] = gendercomp.apply(lambda row:
        abs(row['freqxPMI_female_rank_score'] - row['freqxPMI_male_rank_score']), axis = 1)
    gendercomp['total_rank_score'] = gendercomp.apply(lambda row:
        row['freqxPMI_female_rank_score'] + row['freqxPMI_male_rank_score'], axis = 1)
    return gendercomp

In [49]:
import gc
gc.collect()

23580

In [50]:
nodropwords = []
dfdict = {}
DFall = []
for keyword in varils:
    df = pd.read_csv(os.path.join(exportfolder, 
                                    'gendercomp_{}_big.csv'.format(keyword)))
    df['keyword'] = keyword
    df = revise_rank(df)
    DFall.append(df)
    # plotfig(df,dropwords, keyword)
    print(keyword, df['abs_rank_female_male'].max(), df['abs_rank_female_male'].min())
    dfdict[keyword] = df
DFall = pd.concat(DFall).reset_index(drop = True)
print("Unique text: ", DFall['text'].nunique())

living 1419.0 0.0
people 1978.0 0.0
mobility 1321.0 0.0
economy 1015.0 0.0
governance 438.0 0.0
city 2146.0 0.0
ai 16452.0 0.0
environment 2696.0 0.0
all 28285.0 0.0
Unique text:  31274


In [51]:
def plot_revise_wordcloud(df, dropwords, exportfolder, keyword):
        # df = pd.read_csv(os.path.join(exportfolder, 
        #                                 'gendercomp_{}_big.csv'.format(keyword)))
        df = df[df['text'].isin(dropwords)==False].reset_index(drop=True)
        df['rank_male_female'] = df['rank_female_male']*-1
        df_male = df.sort_values(['rank_female_male'], 
                                ascending = True).head(100).sort_values('freqxPMI_male_rank', ascending = False).reset_index(drop = True)
        df_male = df_male[df_male['rank_male_female']>10].reset_index(drop = True)
        df_female = df.sort_values(['rank_female_male'], 
                                ascending = False).head(100).sort_values('freqxPMI_female_rank', ascending = False).reset_index(drop = True)
        df_female = df_female[df_female['rank_female_male']>10].reset_index(drop = True)
        d_male = dict(zip(df_male ['text'], df_male ['freqxPMI_male_rank']))
        d_female = dict(zip(df_female['text'], df_female['freqxPMI_female_rank']))

        taskdict = {
                        'male':[d_male,light_blue_color_func],
                        'female':[d_female,light_red_color_func]
                        
                }
        print(keyword)
        for gender in ['male', 'female']:
                
                gen_wordcloud(keyword, taskdict, gender)

In [None]:
for keyword in varils:
    df = pd.read_csv(os.path.join(exportfolder, 
                                        'gendercomp_{}_big.csv'.format(keyword)))
    df = df[df['text'].isin(dropwords)==False].reset_index(drop=True)
    plot_revise_wordcloud(df,dropwords, exportfolder, keyword)

In [1]:
import pandas as pd
import os
import plotly.graph_objects as go
import numpy as np

graphicfolder = '/group/geog_pyloo/XY/01_gender/_graphics'
exportfolder = "/group/geog_pyloo/XY/01_gender/_data/wordmerged"
varils = ['living', 'people', 'mobility', 'economy', 'governance', 'city', 'ai', 'environment','all']
stopwords = []  # Replace with actual stopwords

def revise_rank(gendercomp):
    gendercomp['freqxPMI_male_rank'] = gendercomp['freqxPMI_male'].rank(ascending=True, method='dense')
    gendercomp['freqxPMI_female_rank'] = gendercomp['freqxPMI_female'].rank(ascending=True, method='dense')
    gendercomp['freqxPMI_male_rank_score'] = (gendercomp['freqxPMI_male_rank'] - gendercomp['freqxPMI_male_rank'].min()) / (gendercomp['freqxPMI_male_rank'].max() - gendercomp['freqxPMI_male_rank'].min()) * 100
    gendercomp['freqxPMI_female_rank_score'] = (gendercomp['freqxPMI_female_rank'] - gendercomp['freqxPMI_female_rank'].min()) / (gendercomp['freqxPMI_female_rank'].max() - gendercomp['freqxPMI_female_rank'].min()) * 100
    gendercomp['rank_female_male_score'] = gendercomp.apply(lambda row: row['freqxPMI_female_rank_score'] - row['freqxPMI_male_rank_score'], axis=1)
    gendercomp['abs_rank_female_male_score'] = gendercomp.apply(lambda row: abs(row['freqxPMI_female_rank_score'] - row['freqxPMI_male_rank_score']), axis=1)
    gendercomp['total_rank_score'] = gendercomp.apply(lambda row: row['freqxPMI_female_rank_score'] + row['freqxPMI_male_rank_score'], axis=1)
    return gendercomp

# Load and process data
DFall = []
for keyword in varils:
    filepath = os.path.join(exportfolder, f'gendercomp_{keyword}_big.csv')
    if os.path.exists(filepath):
        df = pd.read_csv(filepath)
        df['keyword'] = keyword
        df = revise_rank(df)
        DFall.append(df)
    else:
        print(f"File not found: {filepath}")

if DFall:
    DFall = pd.concat(DFall).reset_index(drop=True)
    print("Unique text: ", DFall['text'].nunique())
else:
    print("No data files found.")

# Function to interpolate color
def interpolate_color(val, color1, color2):
    return [int(v1 + val * (v2 - v1)) for v1, v2 in zip(color1, color2)]

# Function to interpolate size
def interpolate_size(val, size1, size2):
    return int((size1 + np.log(val+1) * (size2 - size1))**1.9 / 25)

# Initialize color and size ranges
color_start = [16, 113, 229]
color_mid = [177, 214, 240]
color_end = [199, 0, 57]
sizerange = [10, 30]

# Function to plot the figure
def plotfig(df, stopwords, keyword):
    df = df[df['text'].isin(stopwords) == False].reset_index(drop=True)
    
    fig = go.Figure()
    df = df.sort_values('abs_rank_female_male_score').reset_index(drop=True)
    
    min_rank = df['rank_female_male_score'].min()
    max_rank = df['rank_female_male_score'].max()
    median_rank = (min_rank + max_rank) / 2
    min_abs_rank = df['abs_rank_female_male_score'].min()
    max_abs_rank = df['abs_rank_female_male_score'].max()

    N = 10  # Number of points to label
    label_points = df.nsmallest(N, 'freqxPMI_male_rank').index.union(
        df.nsmallest(N, 'freqxPMI_female_rank').index)

    annotations = []

    for index, row in df.iterrows():
        x = row['freqxPMI_male_rank_score']
        y = row['freqxPMI_female_rank_score']
        text = row['text'] if index in label_points else ""
        
        abs_rank = row['abs_rank_female_male_score']
        normalized_rank = (abs_rank - min_abs_rank) / (max_abs_rank - min_abs_rank)
        fontsize = interpolate_size(normalized_rank, sizerange[0], sizerange[1])
        
        rank = row['rank_female_male_score']
        if rank <= median_rank:
            normalized_rank = (rank - min_rank) / (median_rank - min_rank)
            color = interpolate_color(normalized_rank, color_start, color_mid)
        else:
            normalized_rank = (rank - median_rank) / (max_rank - median_rank)
            color = interpolate_color(normalized_rank, color_mid, color_end)

        color_str = f'rgb({color[0]}, {color[1]}, {color[2]})'

        fig.add_trace(go.Scatter(
            x=[x],
            y=[y],
            text=[text],
            mode='markers+text',
            textposition='top center',
            marker=dict(
                size=fontsize,
                color=color_str,
                opacity=0.6
            ),
            textfont=dict(size=max(fontsize // 2, 1), color=color_str)  # Ensure font size is at least 1
        ))

    fig.update_layout(
        title=f'PMIxFrequency Score: {keyword}',
        xaxis_title='Male Only',
        yaxis_title='At least one Female',
        xaxis=dict(range=[-5, 100]),
        yaxis=dict(range=[-5, 100]),
        width=600,
        height=600,
        plot_bgcolor='white',
        paper_bgcolor='white',
        hovermode="closest",
        showlegend=False  # Hide legend
    )

    # Save figure to PNG 
    fig.write_image(os.path.join(graphicfolder, f'wordcloud_{keyword}_rank.png'))
    fig.show()

# Call the plotting function for each keyword
for keyword in varils:
    df = DFall[DFall['keyword'] == keyword]
    if not df.empty:
        plotfig(df, stopwords, keyword)
    else:
        print(f"No data found for keyword: {keyword}")


Unique text:  31274


ValueError: 
Image export using the "kaleido" engine requires the kaleido package,
which can be installed using pip:
    $ pip install -U kaleido


小提琴和箱线图

2+3 2025/02/24

In [7]:
import pandas as pd
import os
import plotly.graph_objects as go
import numpy as np

graphicfolder = '/group/geog_pyloo/XY/01_gender/_graphics'
exportfolder_bigram = "/group/geog_pyloo/XY/01_gender/_data/wordmerged"
exportfolder_trigram = "/group/geog_pyloo/XY/01_gender/trigram/data/wordmerged"

varils = ['living', 'people', 'mobility', 'economy', 'governance', 'city', 'ai', 'environment']
stopwords = []  # Replace with actual stopwords

def revise_rank(gendercomp):
    gendercomp['freqxPMI_male_rank'] = gendercomp['freqxPMI_male'].rank(ascending=True, method='dense')
    gendercomp['freqxPMI_female_rank'] = gendercomp['freqxPMI_female'].rank(ascending=True, method='dense')
    gendercomp['freqxPMI_male_rank_score'] = (gendercomp['freqxPMI_male_rank'] - gendercomp['freqxPMI_male_rank'].min()) / (gendercomp['freqxPMI_male_rank'].max() - gendercomp['freqxPMI_male_rank'].min()) * 100
    gendercomp['freqxPMI_female_rank_score'] = (gendercomp['freqxPMI_female_rank'] - gendercomp['freqxPMI_female_rank'].min()) / (gendercomp['freqxPMI_female_rank'].max() - gendercomp['freqxPMI_female_rank'].min()) * 100
    gendercomp['rank_female_male_score'] = gendercomp['freqxPMI_female_rank_score'] - gendercomp['freqxPMI_male_rank_score']
    gendercomp['abs_rank_female_male_score'] = abs(gendercomp['rank_female_male_score'])
    gendercomp['total_rank_score'] = gendercomp['freqxPMI_female_rank_score'] + gendercomp['freqxPMI_male_rank_score']
    return gendercomp

# Load and process data
DFall = []
for keyword in varils:
    df_list = []
    for folder, suffix in [(exportfolder_bigram, "big"), (exportfolder_trigram, "tri")]:
        filepath = os.path.join(folder, f'gendercomp_{keyword}_{suffix}.csv')
        if os.path.exists(filepath):
            df = pd.read_csv(filepath)
            df['keyword'] = keyword
            df['source'] = suffix  # Mark source as 'big' or 'tri'
            df = revise_rank(df)
            df_list.append(df)
        else:
            print(f"File not found: {filepath}")
    
    if df_list:
        df_merged = pd.concat(df_list)
        df_merged['freqxPMI_male_rank_score'] -= df_merged['freqxPMI_male_rank_score'].min()
        df_merged['freqxPMI_female_rank_score'] -= df_merged['freqxPMI_female_rank_score'].min()
        DFall.append(df_merged)

if DFall:
    DFall = pd.concat(DFall).reset_index(drop=True)
    print("Unique text: ", DFall['text'].nunique())
else:
    print("No data files found.")

# Function to interpolate color
def interpolate_color(val, color1, color2):
    return [int(v1 + val * (v2 - v1)) for v1, v2 in zip(color1, color2)]

# Function to interpolate size
def interpolate_size(val, size1, size2):
    return int((size1 + np.log(val+1) * (size2 - size1))**1.9 / 25)

# Initialize color and size ranges
color_start = [16, 113, 229]
color_mid = [177, 214, 240]
color_end = [199, 0, 57]
sizerange = [10, 30]

# Function to plot the figure
def plotfig(df, keyword):
    df = df[df['text'].isin(stopwords) == False].reset_index(drop=True)
    
    fig = go.Figure()
    df = df.sort_values('abs_rank_female_male_score').reset_index(drop=True)
    
    min_rank = df['rank_female_male_score'].min()
    max_rank = df['rank_female_male_score'].max()
    median_rank = (min_rank + max_rank) / 2
    min_abs_rank = df['abs_rank_female_male_score'].min()
    max_abs_rank = df['abs_rank_female_male_score'].max()

    for _, row in df.iterrows():
        x = row['freqxPMI_male_rank_score']
        y = row['freqxPMI_female_rank_score']
        
        abs_rank = row['abs_rank_female_male_score']
        normalized_rank = (abs_rank - min_abs_rank) / (max_abs_rank - min_abs_rank)
        fontsize = interpolate_size(normalized_rank, sizerange[0], sizerange[1])
        
        rank = row['rank_female_male_score']
        if rank <= median_rank:
            normalized_rank = (rank - min_rank) / (median_rank - min_rank)
            color = interpolate_color(normalized_rank, color_start, color_mid)
        else:
            normalized_rank = (rank - median_rank) / (max_rank - median_rank)
            color = interpolate_color(normalized_rank, color_mid, color_end)

        color_str = f'rgb({color[0]}, {color[1]}, {color[2]})'

        fig.add_trace(go.Scatter(
            x=[x],
            y=[y],
            mode='markers',  # No text on plot
            marker=dict(
                size=fontsize,
                color=color_str,
                opacity=0.6
            )
        ))

    fig.update_layout(
        title=f'PMIxFrequency Score: {keyword}',
        xaxis_title='Male Only',
        yaxis_title='At least one Female',
        xaxis=dict(range=[-5, 105], tickvals=[0, 20, 40, 60, 80, 100]),
        yaxis=dict(range=[-5, 105], tickvals=[0, 20, 40, 60, 80, 100]),
        width=600,
        height=600,
        plot_bgcolor='white',
        paper_bgcolor='white',
        hovermode=False,
        showlegend=False  # Hide legend
    )
    
    fig.write_image(os.path.join(graphicfolder, f'wordcloud_{keyword}_rank.png'))
    fig.show()


Unique text:  26121


2024/02/04 BI+TRIGRAM

In [55]:
import pandas as pd
import os
import plotly.graph_objects as go
import numpy as np

# 文件夹路径设置
graphicfolder = '/group/geog_pyloo/XY/01_gender/_graphics'
bigram_folder = "/group/geog_pyloo/XY/01_gender/_data/wordmerged"
trigram_folder = "/group/geog_pyloo/XY/01_gender/trigram/data/wordmerged"

# 关键词列表（注意'all'目前已注释掉）
varils = ['all',
    #'living', 'people', 'mobility', 'economy', 'governance', 'city', 'ai', 'environment'
    ]
stopwords = []  # 替换为实际停用词

def revise_rank(gendercomp):
    gendercomp['freqxPMI_male_rank'] = gendercomp['freqxPMI_male'].rank(ascending=True, method='dense')
    gendercomp['freqxPMI_female_rank'] = gendercomp['freqxPMI_female'].rank(ascending=True, method='dense')
    gendercomp['freqxPMI_male_rank_score'] = (gendercomp['freqxPMI_male_rank'] - gendercomp['freqxPMI_male_rank'].min()) / \
        (gendercomp['freqxPMI_male_rank'].max() - gendercomp['freqxPMI_male_rank'].min()) * 100
    gendercomp['freqxPMI_female_rank_score'] = (gendercomp['freqxPMI_female_rank'] - gendercomp['freqxPMI_female_rank'].min()) / \
        (gendercomp['freqxPMI_female_rank'].max() - gendercomp['freqxPMI_female_rank'].min()) * 100
    gendercomp['rank_female_male_score'] = gendercomp['freqxPMI_female_rank_score'] - gendercomp['freqxPMI_male_rank_score']
    gendercomp['abs_rank_female_male_score'] = gendercomp['rank_female_male_score'].abs()
    gendercomp['total_rank_score'] = gendercomp['freqxPMI_female_rank_score'] + gendercomp['freqxPMI_male_rank_score']
    return gendercomp

# 颜色和点大小的插值函数
def interpolate_color(val, color1, color2):
    return [int(v1 + val * (v2 - v1)) for v1, v2 in zip(color1, color2)]

def interpolate_size(val, size1, size2):
    return int((size1 + np.log(val + 1) * (size2 - size1))**1.9 / 25)

# 定义颜色范围和大小范围
color_start = [16, 113, 229]
color_mid = [177, 214, 240]
color_end = [199, 0, 57]
sizerange = [10, 30]

def process_and_merge(keyword):
    # 加载bigram数据
    bigram_path = os.path.join(bigram_folder, f'gendercomp_{keyword}_big.csv')
    df_big = None
    if os.path.exists(bigram_path):
        df_big = pd.read_csv(bigram_path)
        df_big['keyword'] = keyword
        df_big['gram_type'] = 'bigram'
        df_big = revise_rank(df_big)
    else:
        print(f"Bigram file not found: {bigram_path}")
    
    # 加载trigram数据
    trigram_path = os.path.join(trigram_folder, f'gendercomp_{keyword}_tri.csv')
    df_tri = None
    if os.path.exists(trigram_path):
        df_tri = pd.read_csv(trigram_path)
        df_tri['keyword'] = keyword
        df_tri['gram_type'] = 'trigram'
        df_tri = revise_rank(df_tri)
    else:
        print(f"Trigram file not found: {trigram_path}")
    
    if df_big is None and df_tri is None:
        return None
    
    # 若两个数据集均存在，则计算偏移量，使trigram的低值点对齐bigram
    if df_big is not None and df_tri is not None:
        # X轴调整：取 freqxPMI_male_rank_score 低5%的中位数
        quantile_bigram_x = df_big['freqxPMI_male_rank_score'].quantile(0.05)
        low_bigram_x = df_big[df_big['freqxPMI_male_rank_score'] <= quantile_bigram_x]
        median_bigram_x = low_bigram_x['freqxPMI_male_rank_score'].median()
        
        quantile_tri_x = df_tri['freqxPMI_male_rank_score'].quantile(0.05)
        low_tri_x = df_tri[df_tri['freqxPMI_male_rank_score'] <= quantile_tri_x]
        median_tri_x = low_tri_x['freqxPMI_male_rank_score'].median()
        
        offset_x = median_bigram_x - median_tri_x
        
        # Y轴调整：取 freqxPMI_female_rank_score 低5%的中位数
        quantile_bigram_y = df_big['freqxPMI_female_rank_score'].quantile(0.05)
        low_bigram_y = df_big[df_big['freqxPMI_female_rank_score'] <= quantile_bigram_y]
        median_bigram_y = low_bigram_y['freqxPMI_female_rank_score'].median()
        
        quantile_tri_y = df_tri['freqxPMI_female_rank_score'].quantile(0.05)
        low_tri_y = df_tri[df_tri['freqxPMI_female_rank_score'] <= quantile_tri_y]
        median_tri_y = low_tri_y['freqxPMI_female_rank_score'].median()
        
        offset_y = median_bigram_y - median_tri_y
        
        # 针对ai关键词，增大偏移量
        if keyword == 'all':
            offset_x *= 2.6
            offset_y *= 1.2
        
        # 对trigram数据进行偏移调整
        df_tri['freqxPMI_male_rank_score'] = df_tri['freqxPMI_male_rank_score'] + offset_x
        df_tri['freqxPMI_female_rank_score'] = df_tri['freqxPMI_female_rank_score'] + offset_y

    # 合并两个数据集（若只有一个存在，则直接返回该数据集）
    if df_big is not None and df_tri is not None:
        df_combined = pd.concat([df_big, df_tri], ignore_index=True)
    elif df_big is not None:
        df_combined = df_big
    else:
        df_combined = df_tri
        
    return df_combined

def plot_combined(df, keyword):
    # 过滤掉停用词（图中不显示文本）
    df = df[~df['text'].isin(stopwords)].copy()
    
    # 计算全局的颜色和大小插值参数
    min_rank = df['rank_female_male_score'].min()
    max_rank = df['rank_female_male_score'].max()
    median_rank = (min_rank + max_rank) / 2
    min_abs_rank = df['abs_rank_female_male_score'].min()
    max_abs_rank = df['abs_rank_female_male_score'].max()
    
    # 针对每个点计算 marker 大小和颜色
    def compute_marker(row):
        # 计算 marker 大小（依据绝对排名差归一化后再通过对数函数处理）
        abs_rank = row['abs_rank_female_male_score']
        norm_abs = (abs_rank - min_abs_rank) / (max_abs_rank - min_abs_rank) if max_abs_rank != min_abs_rank else 0
        size = interpolate_size(norm_abs, sizerange[0], sizerange[1])
        
        # 计算 marker 颜色（依据排名差与中位数的关系进行分段线性插值）
        rank_val = row['rank_female_male_score']
        if rank_val <= median_rank:
            norm_rank = (rank_val - min_rank) / (median_rank - min_rank) if median_rank != min_rank else 0
            col = interpolate_color(norm_rank, color_start, color_mid)
        else:
            norm_rank = (rank_val - median_rank) / (max_rank - median_rank) if max_rank != median_rank else 0
            col = interpolate_color(norm_rank, color_mid, color_end)
        color_str = f'rgb({col[0]}, {col[1]}, {col[2]})'
        return pd.Series({'marker_size': size, 'marker_color': color_str})
    
    marker_props = df.apply(compute_marker, axis=1)
    df = pd.concat([df, marker_props], axis=1)
    
    # 绘制图形，按 gram_type 分组，分别显示 bigram 与 trigram
    fig = go.Figure()
    for gram in df['gram_type'].unique():
        df_subset = df[df['gram_type'] == gram]
        # 根据 gram_type 决定 marker 符号，trigram 使用 square，其它使用 circle
        symbol = 'square' if gram == 'trigram' else 'circle'
        fig.add_trace(go.Scatter(
            x=df_subset['freqxPMI_male_rank_score'],
            y=df_subset['freqxPMI_female_rank_score'],
            mode='markers',
            marker=dict(
                size=df_subset['marker_size'],
                color=df_subset['marker_color'],
                opacity=0.6,
                symbol=symbol,
                line=dict(width=0)
            ),
            name=gram,
            hoverinfo='skip'
        ))
    
    fig.update_layout(
        title=f'PMIxFrequency Score: {keyword}',
        xaxis_title='Male Only',
        yaxis_title='At least one Female',
        xaxis=dict(
            range=[-5, 105],
            tickmode='array',
            tickvals=[0, 20, 40, 60, 80, 100]
        ),
        yaxis=dict(
            range=[-5, 105],
            tickmode='array',
            tickvals=[0, 20, 40, 60, 80, 100]
        ),
        width=600,
        height=600,
        plot_bgcolor='white',
        paper_bgcolor='white',
        hovermode=False,
        showlegend=True
    )
    
    # 直接显示图形，不调用 kaleido 或 nbformat
    fig.show()

# 针对每个关键词加载数据、合并并绘图
for keyword in varils:
    df_merged = process_and_merge(keyword)
    if df_merged is not None and not df_merged.empty:
        plot_combined(df_merged, keyword)
    else:
        print(f"No data found for keyword: {keyword}")
