In [1]:
import pandas as pd
from collections import Counter
import numpy as np
import statsmodels.api as sm
from statsmodels.iolib.summary2 import summary_col
from statsmodels.miscmodels.ordinal_model import OrderedModel
from doubleml import DoubleMLPLIV, DoubleMLData, DoubleMLPLR
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from linearmodels.iv import IV2SLS
import statsmodels.api as sm
from scipy.stats import norm
import statsmodels.formula.api as smf

In [2]:
# import pic data
all_pic_temp = pd.read_excel("data/input/open_pic_new.xlsx")
res_pic_temp = pd.read_excel("data/input/restaurant_new.xlsx")
drink_pic_temp = pd.read_excel("data/input/drink_new.xlsx")
# import business data 
business_feature_df = pd.read_csv("data/input/business_feature.csv")

In [3]:
print(round(all_pic_temp["memory_score"].mean(),3),all_pic_temp["memory_score"].min(),all_pic_temp["memory_score"].max())
print(round(res_pic_temp["memory_score"].mean(),3),res_pic_temp["memory_score"].min(),res_pic_temp["memory_score"].max())
print(round(drink_pic_temp["memory_score"].mean(),3),drink_pic_temp["memory_score"].min(),drink_pic_temp["memory_score"].max())

0.752 0.412 0.993
0.749 0.412 0.993
0.74 0.412 0.991


In [4]:
# data shape
print(all_pic_temp.shape,res_pic_temp.shape,drink_pic_temp.shape,business_feature_df.shape)

(164839, 31) (139385, 31) (73268, 33) (150346, 12)


In [5]:
def process_data(df_open):
  # filter yolo nan data
    var = []
    person_other_count = []
    person_total_count = []
    for index,row in df_open.iterrows():
        if len(str(row['objects_content']))==1 or len(str(row['objects_content']))==0:
            temp = ''
        else:
            # 'object_content' columns str to list
            temp = str(row['objects_content']).split(",")
            temp = [s.strip() for s in temp]
        person_count = temp.count('person')
        other_count = len(temp)-person_count
        temp_res = abs(person_count-other_count)
        try:
            var_temp = temp_res/(person_count+other_count)
        except:
            var_temp = 0
        var.append(var_temp)
        person_total_count.append(len(temp))
    df_open['var'] = var
    df_open['person_total_count'] = person_total_count
    df_open['person_total_count_cluster'] = np.where(df_open['person_total_count'] != 0, 1, df_open['person_total_count'])
    # print(df_open['person_total_count_cluster'].value_counts())
    filter_data  = df_open[df_open['person_total_count']!=0]
    print("pre process data len:",len(df_open),"post process data len: ",len(filter_data))
    return df_open[df_open['person_total_count']!=0]

In [6]:
# pic process
all_df_pic = process_data(all_pic_temp)
print("business",all_df_pic[['label','memory_score']].groupby('label').mean().reset_index())
res_df_pic = process_data(res_pic_temp)
print("res_df",res_df_pic[['label','memory_score']].groupby('label').mean().reset_index())
drink_df_pic = process_data(drink_pic_temp)
print("drink_df",drink_df_pic[['label','memory_score']].groupby('label').mean().reset_index())


pre process data len: 164839 post process data len:  154945
business      label  memory_score
0    drink      0.830912
1     food      0.788275
2   inside      0.668729
3     menu      0.826144
4  outside      0.690957
pre process data len: 139385 post process data len:  131624
res_df      label  memory_score
0    drink      0.826183
1     food      0.783535
2   inside      0.659904
3     menu      0.827708
4  outside      0.688542
pre process data len: 73268 post process data len:  69548
drink_df      label  memory_score
0    drink      0.831727
1     food      0.782448
2   inside      0.662736
3     menu      0.824719
4  outside      0.676933


In [7]:
def calculate_bubble_data(df, group_name):
    """计算气泡图需要的统计量"""
    stats = df.groupby('label')['memory_score'].agg(
        memory_score='mean',      # 平均值
        sd='std',                 # 标准差
        n='count'                 # 样本量
    ).reset_index()
   
    # 计算百分比
    total_n = stats['n'].sum()
    stats['percentage'] = (stats['n'] / total_n * 100)
    
    # 计算标准误和95% CI
    stats['se'] = stats['sd'] / np.sqrt(stats['n'])
    stats['ci_lower'] = stats['memory_score'] - 1.96 * stats['se']
    stats['ci_upper'] = stats['memory_score'] + 1.96 * stats['se']
    
    # 添加分组标签
    stats['group'] = group_name
    
    return stats

# ====================== 计算三组数据 ======================
all_business_stats = calculate_bubble_data(all_df_pic, "All businesses")
restaurant_stats = calculate_bubble_data(res_df_pic, "Restaurants")
drink_stats = calculate_bubble_data(drink_df_pic, "Drinks")

# ====================== 合并并保存 ======================
# 合并三个数据框
bubble_data = pd.concat([all_business_stats, restaurant_stats, drink_stats], ignore_index=True)

# 调整列顺序
bubble_data = bubble_data[['group', 'label', 'memory_score', 'sd', 'n', 'percentage', 'se', 'ci_lower', 'ci_upper']]

# # 查看数据
# print(bubble_data)

# 保存为CSV文件
bubble_data.to_excel("data/output/bubble_plot_data.xlsx", index=False)

print("\n文件已保存至: data/output/bubble_plot_data.xlsx")


文件已保存至: data/output/bubble_plot_data.xlsx


In [8]:
print(round(all_df_pic["memory_score"].mean(),3),all_df_pic["memory_score"].min(),all_df_pic["memory_score"].max())
print(round(res_df_pic["memory_score"].mean(),3),res_df_pic["memory_score"].min(),res_df_pic["memory_score"].max())
print(round(drink_df_pic["memory_score"].mean(),3),drink_df_pic["memory_score"].min(),drink_df_pic["memory_score"].max())

0.751 0.412 0.993
0.748 0.412 0.993
0.74 0.412 0.991


In [9]:
# business_feature data process
business_df =  business_feature_df[business_feature_df['business_id'].isin(list(all_df_pic['business_id'].unique()))]
business_res = business_feature_df[business_feature_df['business_id'].isin(list(res_df_pic['business_id'].unique()))]
business_drink = business_feature_df[business_feature_df['business_id'].isin(list(drink_df_pic['business_id'].unique()))]
# 22397 18203 8192

# pic 数据的转换
# print(len(business_df),len(business_res),len(business_drink))

business_df = pd.merge(business_df,all_df_pic[["business_id","memory_score","average_hue","average_saturation",
                                               "average_value","sharpness_measure","person_count","beauty_score",'person_total_count']].groupby("business_id").mean().reset_index(),
                       how='left',on='business_id')
business_res = pd.merge(business_res,res_df_pic[["business_id","memory_score","average_hue","average_saturation",
                                               "average_value","sharpness_measure","person_count","beauty_score",'person_total_count']].groupby("business_id").mean().reset_index(),
                       how='left',on='business_id')
business_drink = pd.merge(business_drink,drink_df_pic[["business_id","memory_score","average_hue","average_saturation",
                                               "average_value","sharpness_measure","person_count","beauty_score",'person_total_count']].groupby("business_id").mean().reset_index(),
                       how='left',on='business_id')
print(len(business_df),len(business_res),len(business_drink))

27075 21279 9853


In [10]:
business_df = business_df[(business_df['star_std']!=0)].reset_index(drop=True)
business_res = business_res[(business_res['star_std']!=0)].reset_index(drop=True)
business_drink = business_drink[(business_drink['star_std']!=0)].reset_index(drop=True)
print(len(business_df),len(business_res),len(business_drink))

# print(len(all_bus), len(all_res), len(all_drink))
# print(len(business_df), len(business_res), len(business_drink))
# print(len(all_bus)-len(business_df), len(all_res)-len(business_res), len(all_drink)-len(business_drink))
# print(round((len(all_bus)-len(business_df))/len(all_bus),3),
#        round((len(all_bus)-len(business_df))/len(all_res),3), 
#        round((len(all_bus)-len(business_df))/len(all_drink),3))

# 27075 21279 9853
# 26769 21150 9751
# 306 129 102
# 0.011 0.014 0.031

26769 21150 9751


In [11]:
print(business_df['review_count'].sum())
print(business_res['review_count'].sum())
print(business_drink['review_count'].sum())

3456545
3155433
1481608


# study 1 :image feature ---- memorability

In [12]:
# pic 均值和标准差的统计
def result_mean(data):
    res = [round(data['memory_score'].mean(),3),
    round(len(data[data['label']=='food'])/len(data),3),
    round(len(data[data['label']=='menu'])/len(data),3),
    round(len(data[data['label']=='inside'])/len(data),3),
    round(len(data[data['label']=='outside'])/len(data),3),
    round(len(data[data['label']=='drink'])/len(data),3),
    round(data['average_hue'].mean(),3),
    round(data['average_saturation'].mean(),3),
    round(data['average_value'].mean(),3),
    round(len(data[data['person_exist']==1])/len(data),3),
    # round(data['person_count'].mean(),3),
    round(data['var'].mean(),3),
    round(data['person_total_count'].mean(),3),
    round(data['sharpness_measure'].mean(),3),
    round(data['beauty_score'].mean(),3)
    ]
    return res

def result_std(data):
    res = [
    round(data['memory_score'].std(),3),
    '-','-','-','-','-',
    round(data['average_hue'].std(),3),
    round(data['average_saturation'].std(),3),
    round(data['average_value'].std(),3),
    '-',
    # round(data['person_count'].std(),3),
    round(data['var'].std(),3),
    round(data['person_total_count'].std(),3),
    round(data['sharpness_measure'].std(),3),
    round(data['beauty_score'].std(),3)
    ]
    return res

In [13]:
# pic 均值和标准差的统计结果
data = {
    "name":["memory_score","food","menu","inside",
            "outside","drink","average_hue","average_saturation",
            "average_value","person_exist","var","person_total_count","sharpness_measure","beauty_score"],
    'All Busi mean': result_mean(all_df_pic),
    'All Busi std': result_std(all_df_pic),
    'All Res mean': result_mean(res_df_pic),
    'All Res std': result_std(res_df_pic),
    'Drink mean': result_mean(drink_df_pic),
    'Drink std': result_std(drink_df_pic)
    }

# 创建DataFrame
df = pd.DataFrame(data)

# 显示DataFrame
print(df)
df.head()
df.to_excel("data/output/study1_picture_data_summary_yolo11.xlsx",index=False)
print("数据保存成功")

                  name  All Busi mean All Busi std  All Res mean All Res std  \
0         memory_score          0.751        0.106         0.748       0.105   
1                 food          0.547            -         0.591           -   
2                 menu          0.006            -         0.007           -   
3               inside          0.277            -         0.254           -   
4              outside          0.083            -         0.077           -   
5                drink          0.086            -         0.071           -   
6          average_hue         40.808       22.614        39.745      21.768   
7   average_saturation        101.648        39.34       103.231      38.646   
8        average_value        141.041       41.988       141.740      40.902   
9         person_exist          0.332            -         0.316           -   
10                 var          0.824        0.307         0.830       0.302   
11  person_total_count          5.605   

In [14]:
def create_ols_model(data):
    df_dummies = pd.get_dummies(data['label'])
    df_dummies = df_dummies[['food','menu','inside','drink']].astype(int)
    df = pd.concat([data, df_dummies], axis=1)
    # use StandardScaler
    scaler = StandardScaler()
    df['average_hue_s'] = scaler.fit_transform(df[['average_hue']])
    df['average_saturation_s'] = scaler.fit_transform(df[['average_saturation']])
    df['average_value_s'] = scaler.fit_transform(df[['average_value']])
    df['var'] = df[['var']]
    # df['pic_beauty_s'] = scaler.fit_transform(df[['beauty_score']])
    df['sharpness_measure_s'] = scaler.fit_transform(df[['sharpness_measure']])
    # df['var2'] = scaler.fit_transform(df[['var2']])
    df['people_exist_var'] = df['person_exist'] * df['var']
    X =  df[list(df_dummies.columns)+['average_hue_s','average_saturation_s','average_value_s',
                                      'person_exist',
                                    #   'person_count',
                                      'var',
                                      "person_total_count"
                                      # "people_exist_var"
                                      ,'beauty_score','sharpness_measure_s'
                                      ]]
    y = data['memory_score']
    X = sm.add_constant(X)
    g2 = data['business_id']
    model = sm.OLS(y, X).fit(cov_type='HC3')
    
    return model


In [15]:
# ols模型结果
print('--------------------------------------------------1.all--------------------------------------------------')
ols_business = create_ols_model(process_data(all_df_pic))
print(ols_business.summary())
print('--------------------------------------------------2.res_data--------------------------------------------------')
ols_res = create_ols_model(process_data(res_df_pic))
print(ols_res.summary())
print('--------------------------------------------------3.drink_data--------------------------------------------------')
ols_drink = create_ols_model(process_data(drink_df_pic))
print(ols_drink.summary())

--------------------------------------------------1.all--------------------------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


pre process data len: 154945 post process data len:  154945
                            OLS Regression Results                            
Dep. Variable:           memory_score   R-squared:                       0.365
Model:                            OLS   Adj. R-squared:                  0.365
Method:                 Least Squares   F-statistic:                     7580.
Date:                Tue, 10 Feb 2026   Prob (F-statistic):               0.00
Time:                        20:26:46   Log-Likelihood:             1.6271e+05
No. Observations:              154945   AIC:                        -3.254e+05
Df Residuals:                  154932   BIC:                        -3.253e+05
Df Model:                          12                                         
Covariance Type:                  HC3                                         
                           coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


pre process data len: 131624 post process data len:  131624
                            OLS Regression Results                            
Dep. Variable:           memory_score   R-squared:                       0.364
Model:                            OLS   Adj. R-squared:                  0.364
Method:                 Least Squares   F-statistic:                     6228.
Date:                Tue, 10 Feb 2026   Prob (F-statistic):               0.00
Time:                        20:26:52   Log-Likelihood:             1.4001e+05
No. Observations:              131624   AIC:                        -2.800e+05
Df Residuals:                  131611   BIC:                        -2.799e+05
Df Model:                          12                                         
Covariance Type:                  HC3                                         
                           coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


pre process data len: 69548 post process data len:  69548
                            OLS Regression Results                            
Dep. Variable:           memory_score   R-squared:                       0.407
Model:                            OLS   Adj. R-squared:                  0.407
Method:                 Least Squares   F-statistic:                     4465.
Date:                Tue, 10 Feb 2026   Prob (F-statistic):               0.00
Time:                        20:26:55   Log-Likelihood:                 72733.
No. Observations:               69548   AIC:                        -1.454e+05
Df Residuals:                   69535   BIC:                        -1.453e+05
Df Model:                          12                                         
Covariance Type:                  HC3                                         
                           coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------

In [16]:
dfoutput = summary_col([ols_business,ols_res,ols_drink],stars=True)
print(dfoutput)

beginningtex = """\\documentclass{report}
\\usepackage{booktabs}
\\begin{document}"""
endtex = "\end{document}"

f = open('data/output/study1_ols.tex', 'w')
f.write(beginningtex)
f.write(dfoutput.as_latex())
f.write(endtex)
f.close()


                     memory_score I memory_score II memory_score III
--------------------------------------------------------------------
const                0.6813***      0.6924***       0.6563***       
                     (0.0037)       (0.0040)        (0.0056)        
food                 0.0854***      0.0826***       0.0907***       
                     (0.0010)       (0.0012)        (0.0016)        
menu                 0.1191***      0.1207***       0.1355***       
                     (0.0025)       (0.0026)        (0.0038)        
inside               -0.0007        -0.0063***      0.0061***       
                     (0.0010)       (0.0012)        (0.0015)        
drink                0.1360***      0.1332***       0.1472***       
                     (0.0012)       (0.0013)        (0.0016)        
average_hue_s        0.0035***      0.0023***       0.0045***       
                     (0.0002)       (0.0003)        (0.0003)        
average_saturation_s -0.0026***  

# study 2: Memorability- business ratings

In [17]:
# doubleml data process
def contains_one(lst):
    return 1 if 1 in lst else 0


def bus_pic_data_process(business_df,pic_df):
    scaler = StandardScaler()
    business_id_unique = business_df['business_id'].unique().tolist()
    count = 0
    pic_bus_df = []
    for business_id in business_id_unique[:]:
        count+=1
        single_bus = pic_df[pic_df['business_id']==business_id]
        food_rate = round(len(single_bus[single_bus['label']=='food'])/len(single_bus),3)
        drink_rate = round(len(single_bus[single_bus['label']=='drink'])/len(single_bus),3)
        menu_rate = round(len(single_bus[single_bus['label']=='menu'])/len(single_bus),3)
        inside_rate = round(len(single_bus[single_bus['label']=='inside'])/len(single_bus),3)
        outside_rate = round(len(single_bus[single_bus['label']=='outside'])/len(single_bus),3)
        person_exist = contains_one(single_bus['person_exist'].values.tolist())
        # person percentage
        try:
            person_percentage = single_bus['person_exist'].sum()/len(single_bus)
        except:
            person_percentage = 0
        avg_var = round(single_bus['var'].mean(),3)
        person_total_count = round(single_bus['person_total_count'].mean(),3)
        person_count = round(single_bus['person_count'].mean(),3)
        pic_bus_df.append([business_id,food_rate,drink_rate,menu_rate,inside_rate,outside_rate,avg_var,person_exist,person_percentage,person_total_count, person_count])
    pic_bus_df = pd.DataFrame(pic_bus_df,columns=['business_id','food','drink','menu','inside','outside','var','person_exist',"person_percentage",'person_total_count','person_count'])
    bus_data = business_df
    # # data merge
    res = pd.merge(bus_data,pic_bus_df,on='business_id')
    return res
business_df = bus_pic_data_process(business_df,all_df_pic)
business_res = bus_pic_data_process(business_res,res_df_pic)
business_drink = bus_pic_data_process(business_drink,drink_df_pic)


In [18]:
# business 均值和标准差的统计
def bus_result_mean(data):
    res = [
    round(len(data[data['stars']==1.0])/len(data),3),
    round(len(data[data['stars']==1.5])/len(data),3),
    round(len(data[data['stars']==2.0])/len(data),3),
    round(len(data[data['stars']==2.5])/len(data),3),
    round(len(data[data['stars']==3.0])/len(data),3),
    round(len(data[data['stars']==3.5])/len(data),3),
    round(len(data[data['stars']==4.0])/len(data),3),
    round(len(data[data['stars']==4.5])/len(data),3),
    round(len(data[data['stars']==5.0])/len(data),3),
    round(data['memory_score'].mean(),3),
    round(data['review_count'].mean(),3),
    round(data['categories_counts'].mean(),3),
    round(data['star_std'].mean(),3),
    round(data['contents_score_avg'].mean(),3),
    # round(data['useful_avg'].mean(),3),
    # round(data['funny_avg'].mean(),3),
    # round(data['cool_avg'].mean(),3),
    # round(data['person_count'].mean(), 3),
    round(data['beauty_score'].mean(),3),
    round(data['sharpness_measure'].mean(),3),
    round(data['average_hue'].mean(), 3),
    round(data['average_saturation'].mean(),3),
    round(data['average_value'].mean(),3),
    round(data['person_total_count_x'].mean(),3),
    round(len(data[data['person_exist']==1])/len(data),3),
    round(data['var'].mean(),3),
    round(data['food'].mean(),3),
    round(data['drink'].mean(),3),
    round(data['menu'].mean(),3),
    round(data['inside'].mean(),3),
    round(data['outside'].mean(),3)
    ]
    return res

def bus_result_std(data):
    res = [
    '-','-','-','-','-','-','-','-','-',
    round(data['memory_score'].std(),3),
    round(data['review_count'].std(),3),
    round(data['categories_counts'].std(),3),
    round(data['star_std'].std(),3),
    round(data['contents_score_avg'].std(),3),
    # round(data['useful_avg'].std(),3),
    # round(data['funny_avg'].std(),3),
    # round(data['cool_avg'].std(),3),
    # round(data['person_count'].std(), 3),
    round(data['beauty_score'].std(),3),
    round(data['sharpness_measure'].std(),3),
    round(data['average_hue'].std(), 3),
    round(data['average_saturation'].std(),3),
    round(data['average_value'].std(),3),
    round(data['person_total_count_x'].std(),3),
    '-',
    round(data['var'].std(),3),
    round(data['food'].std(),3),
    round(data['drink'].std(),3),
    round(data['menu'].std(),3),
    round(data['inside'].std(),3),
    round(data['outside'].std(),3)
    ]
    return res

In [19]:
# business 均值和标准差的统计结果
data = {
    "name":["1","1.5","2","2.5","3","3.5","4","4.5","5",
            "memory_score","review_count",
    "categories_counts","star_std","contents_score_avg",
    # "useful_avg","funny_avg","cool_avg","person_count",
    "beauty_score","sharpness_measure","h","s","v",
    'person_total_count','person_exist','var',
    'food','drink','menu','inside','outside'
    ],
    'All Busi mean': bus_result_mean(business_df),
    'All Busi std': bus_result_std(business_df),
    'All Res mean': bus_result_mean(business_res),
    'All Res std': bus_result_std(business_res),
    'Drink mean': bus_result_mean(business_drink),
    'Drink std': bus_result_std(business_drink)
}
# 创建DataFrame
df = pd.DataFrame(data)

# 显示DataFrame
df.head()

df.to_excel("data/output/study2_business_data_summary_yolo11.xlsx",index=False)
print("数据保存成功")

数据保存成功


In [20]:
# 模型前对数据进行标准化
scaler = StandardScaler()
business_df[['average_hue','average_saturation','average_value','sharpness_measure']] =  scaler.fit_transform(business_df[['average_hue','average_saturation','average_value','sharpness_measure']])
business_res[['average_hue','average_saturation','average_value','sharpness_measure']] =  scaler.fit_transform(business_res[['average_hue','average_saturation','average_value','sharpness_measure']])
business_drink[['average_hue','average_saturation','average_value','sharpness_measure']] =  scaler.fit_transform(business_drink[['average_hue','average_saturation','average_value','sharpness_measure']])

In [21]:
# # 保存数据用于做ivtest
business_df['review_group'] = pd.qcut(business_df['review_count'], q=[0, 2/3, 1], labels=['low', 'high'])
business_df.to_excel("data/output/study1_2_business_data.xlsx",index=False)
business_res['review_group'] = pd.qcut(business_res['review_count'], q=[0, 2/3, 1], labels=['low', 'high'])
business_res.to_excel("data/output/study1_2_res_data.xlsx",index=False) 
business_drink['review_group'] = pd.qcut(business_drink['review_count'], q=[0, 2/3, 1], labels=['low', 'high']) 
business_drink.to_excel("data/output/study1_2_drink_data.xlsx",index=False)


In [22]:
def w2sls_model_bus_res(data: pd.DataFrame):
    data['review_group'] = pd.qcut(data['review_count'], q=[0, 2/3, 1], labels=['low', 'high'])
    data['review_high'] = (data['review_group'] == 'high').astype(int)
    data["memory_score_review_high"]= data['memory_score']*data['review_high']
    data["sharpness_measure_review_high"]= data['sharpness_measure']*data['review_high']
    data['log_review_count'] = np.log(data['review_count'])
    w2sls_model = IV2SLS(
    data["star_avg"], 
    sm.add_constant(data[['categories_counts', 
    'average_hue', 'average_saturation', 'average_value', 
    'food', 'drink', 'menu', 'inside', 
    'var', 'person_exist', 'beauty_score', 
    'review_high'
    ,'log_review_count'
    ]]), 
    data[["memory_score","memory_score_review_high"]], 
    data[["sharpness_measure","sharpness_measure_review_high","person_total_count_x"]]
    # weights=data["review_count"]
    ).fit(cov_type='robust')
    return w2sls_model


In [23]:
# order模型结果
print('--------------------------------------------------1.all_business-----------------------------------------')
order_df = w2sls_model_bus_res(business_df)
print(order_df)
print('--------------------------------------------------2.restaurant--------------------------------------------------')
order_res = w2sls_model_bus_res(business_res)
print(order_res)


--------------------------------------------------1.all_business-----------------------------------------
                          IV-2SLS Estimation Summary                          
Dep. Variable:               star_avg   R-squared:                      0.1214
Estimator:                    IV-2SLS   Adj. R-squared:                 0.1209
No. Observations:               26769   F-statistic:                    3908.3
Date:                Tue, Feb 10 2026   P-value (F-stat)                0.0000
Time:                        20:41:35   Distribution:                 chi2(15)
Cov. Estimator:                robust                                         
                                                                              
                                    Parameter Estimates                                     
                          Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
----------------------------------------------------------------------------

In [24]:
# 模型预测结果对比图数据加工
from linearmodels.iv import IV2SLS
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt


def fit_model_for_plot_2sls(df):
    """All Business / Restaurant 模型"""
    df2 = df.copy()
    df2['review_group'] = pd.qcut(df2['review_count'], q=[0, 2/3, 1], labels=['low', 'high'])
    df2['review_high'] = (df2['review_group'] == 'high').astype(int)
    df2['memory_score_review_high'] = df2['memory_score'] * df2['review_high']
    df2['sharpness_measure_review_high'] = df2['sharpness_measure'] * df2['review_high']
    df2['log_review_count'] = np.log(df2['review_count'])  #  加log变换

    exog = sm.add_constant(df2[['categories_counts',
                                'average_hue', 'average_saturation', 'average_value',
                                'food', 'drink', 'menu', 'inside',
                                'var', 'person_exist', 'beauty_score',
                                'review_high', 
                                'log_review_count']])  # 改为log_review_count
    endog = df2[['memory_score', 'memory_score_review_high']]
    instruments = df2[['sharpness_measure', 'sharpness_measure_review_high', 'person_total_count_x']]

    model = IV2SLS(df2['star_avg'], exog, endog, instruments).fit(cov_type='robust')
    return model, df2


def fit_model_for_plot_2sls_drink(df):
    """Drink 模型"""
    df2 = df.copy()
    df2['review_group'] = pd.qcut(df2['review_count'], q=[0, 2/3, 1], labels=['low', 'high'])
    df2['review_high'] = (df2['review_group'] == 'high').astype(int)
    df2['memory_score_review_high'] = df2['memory_score'] * df2['review_high']
    df2['sharpness_measure_review_high'] = df2['sharpness_measure'] * df2['review_high']
    df2['person_total_count_review_high'] = df2['person_total_count_x'] * df2['review_high']
    df2['log_review_count'] = np.log(df2['review_count'])  #添加log变换

    exog = sm.add_constant(df2[['categories_counts', 'sharpness_measure',
                                'average_saturation', 'average_value',
                                'food', 'drink', 'menu', 'inside',
                                'var', 'person_exist', 'beauty_score',  
                                'review_high',
                                'log_review_count']])  # 改为log_review_count
    endog = df2[['memory_score', 'memory_score_review_high']]
    instruments = df2[['person_total_count_review_high', 'person_total_count_x', 'average_hue']]

    model = IV2SLS(df2['star_avg'], exog, endog, instruments).fit(cov_type='robust')
    return model, df2


def make_pred_df_for_moderation(df, model, sample_label):
    """
    All Business / Restaurant 预测函数
    df: 对应样本的数据
    model: fit 出来的回归对象
    """
    dfc = df.copy()

    # memory 范围：5% ~ 95% 分位
    mem_min = dfc['memory_score'].quantile(0.05)
    mem_max = dfc['memory_score'].quantile(0.95)
    mem_grid = np.linspace(mem_min, mem_max, 200)

    control_vars = [
        'categories_counts',
        'average_hue', 'average_saturation', 'average_value',
        'food', 'drink', 'menu', 'inside',
        'var', 'person_exist', 'beauty_score'
    ]
    control_means = dfc[control_vars].mean()

    rows = []
    for rh in [0, 1]:  # 0 = emerging, 1 = established
        group_mean_rc = dfc.loc[dfc['review_high'] == rh, 'review_count'].mean()
        for m in mem_grid:
            row = {}
            for v in control_vars:
                row[v] = control_means[v]
            row['review_high'] = rh
            row['log_review_count'] = np.log(group_mean_rc)  # 改为log_review_count
            row['memory_score'] = m
            row['memory_score_review_high'] = m * rh
            row['sample'] = sample_label
            rows.append(row)

    newdata = pd.DataFrame(rows)

    # 用模型系数计算预测值和 CI
    param_names = model.params.index.tolist()
    newdata = newdata.assign(const=1.0)
    for name in param_names:
        if name not in newdata.columns:
            newdata[name] = 0.0

    X = newdata[param_names].values
    beta = model.params.values
    cov = model.cov.loc[param_names, param_names].values

    y_hat = X @ beta
    var_hat = np.einsum('ij,jk,ik->i', X, cov, X)
    se_hat = np.sqrt(np.maximum(var_hat, 0))

    newdata['pred'] = y_hat
    newdata['ci_low'] = y_hat - 1.96 * se_hat
    newdata['ci_high'] = y_hat + 1.96 * se_hat

    newdata['memory_score'] = np.tile(mem_grid, 2)
    newdata['review_high'] = np.repeat([0, 1], len(mem_grid))
    newdata['status'] = np.where(newdata['review_high'] == 1,
                                 'Established (high visibility)',
                                 'Emerging (low visibility)')
    return newdata


def make_pred_df_for_moderation_drink(df, model, sample_label):
    """
    Drink 专用预测函数（control_vars 不同）
    """
    dfc = df.copy()

    mem_min = dfc['memory_score'].quantile(0.05)
    mem_max = dfc['memory_score'].quantile(0.95)
    mem_grid = np.linspace(mem_min, mem_max, 200)

    # Drink模型的control_vars不同
    control_vars = [
        'categories_counts', 
        'sharpness_measure',  # Drink用sharpness_measure替代average_hue
        'average_saturation', 'average_value',
        'food', 'drink', 'menu', 'inside',
        'var', 'person_exist', 'beauty_score'
    ]
    control_means = dfc[control_vars].mean()

    rows = []
    for rh in [0, 1]:
        group_mean_rc = dfc.loc[dfc['review_high'] == rh, 'review_count'].mean()
        for m in mem_grid:
            row = {}
            for v in control_vars:
                row[v] = control_means[v]
            row['review_high'] = rh
            row['log_review_count'] = np.log(group_mean_rc)  # 改为log_review_count
            row['memory_score'] = m
            row['memory_score_review_high'] = m * rh
            row['sample'] = sample_label
            rows.append(row)

    newdata = pd.DataFrame(rows)

    param_names = model.params.index.tolist()
    newdata = newdata.assign(const=1.0)
    for name in param_names:
        if name not in newdata.columns:
            newdata[name] = 0.0

    X = newdata[param_names].values
    beta = model.params.values
    cov = model.cov.loc[param_names, param_names].values

    y_hat = X @ beta
    var_hat = np.einsum('ij,jk,ik->i', X, cov, X)
    se_hat = np.sqrt(np.maximum(var_hat, 0))

    newdata['pred'] = y_hat
    newdata['ci_low'] = y_hat - 1.96 * se_hat
    newdata['ci_high'] = y_hat + 1.96 * se_hat

    newdata['memory_score'] = np.tile(mem_grid, 2)
    newdata['review_high'] = np.repeat([0, 1], len(mem_grid))
    newdata['status'] = np.where(newdata['review_high'] == 1,
                                 'Established (high visibility)',
                                 'Emerging (low visibility)')
    return newdata


def plot_moderation(pred_df, title):
    fig, ax = plt.subplots(figsize=(6, 4))

    for rh, label in [(0, 'Emerging (low visibility)'),
                      (1, 'Established (high visibility)')]:
        tmp = pred_df[pred_df['review_high'] == rh].sort_values('memory_score')

        x = tmp['memory_score'].astype(float).values
        y = tmp['pred'].astype(float).values
        y1 = tmp['ci_low'].astype(float).values
        y2 = tmp['ci_high'].astype(float).values

        ax.plot(x, y, label=label)
        ax.fill_between(x, y1, y2, alpha=0.2)

    ax.set_xlabel('Image memorability (business level)')
    ax.set_ylabel('Predicted average rating')
    ax.set_title(title)
    ax.legend(title='Business status')
    fig.tight_layout()
    return fig


# ============== 调用代码 ==============
# 1）All businesses
model_bus, bus_clean = fit_model_for_plot_2sls(business_df)
bus_pred = make_pred_df_for_moderation(bus_clean, model_bus, "All businesses")

# 2）Restaurants
model_res, res_clean = fit_model_for_plot_2sls(business_res)
res_pred = make_pred_df_for_moderation(res_clean, model_res, "Restaurants")

# 3）Drinks-使用专用函数
model_drink, drink_clean = fit_model_for_plot_2sls_drink(business_drink)
drink_pred = make_pred_df_for_moderation_drink(drink_clean, model_drink, "Drinks")

# ============== 保存数据为 CSV ==============
all_pred = pd.concat([bus_pred, res_pred, drink_pred], ignore_index=True)
all_pred.to_csv("data/output/plot_data_moderation_all.csv", index=False)

print("数据已保存！")
print(all_pred[['sample', 'memory_score', 'review_high', 'status', 'pred', 'ci_low', 'ci_high']].head(10))

数据已保存！
           sample  memory_score  review_high                     status  \
0  All businesses      0.618850            0  Emerging (low visibility)   
1  All businesses      0.620182            0  Emerging (low visibility)   
2  All businesses      0.621515            0  Emerging (low visibility)   
3  All businesses      0.622847            0  Emerging (low visibility)   
4  All businesses      0.624180            0  Emerging (low visibility)   
5  All businesses      0.625512            0  Emerging (low visibility)   
6  All businesses      0.626844            0  Emerging (low visibility)   
7  All businesses      0.628177            0  Emerging (low visibility)   
8  All businesses      0.629509            0  Emerging (low visibility)   
9  All businesses      0.630842            0  Emerging (low visibility)   

       pred    ci_low   ci_high  
0  3.643660  3.512063  3.775256  
1  3.643062  3.512655  3.773469  
2  3.642465  3.513247  3.771682  
3  3.641867  3.513840  3.769895

In [25]:
def w2sls_model_drink(data: pd.DataFrame):
    data['review_group'] = pd.qcut(data['review_count'], q=[0, 2/3, 1], labels=['low', 'high'])
    data['review_high'] = (data['review_group'] == 'high').astype(int)
    data["memory_score_review_high"]= data['memory_score']*data['review_high']
    data["sharpness_measure_review_high"]= data['sharpness_measure']*data['review_high']
    data["person_total_count_review_high"]= data['person_total_count_x']*data['review_high']
    data['log_review_count'] = np.log(data['review_count'])
    w2sls_model = IV2SLS(
    data["star_avg"],
    sm.add_constant(data[['categories_counts', "sharpness_measure",
    'average_saturation', 'average_value', 
    'food', 'drink', 'menu', 'inside', 
    'var', 'person_exist', 'beauty_score', 
    'review_high'
    ,'log_review_count']]), 
    data[["memory_score","memory_score_review_high"]], 
    data[["person_total_count_review_high","person_total_count_x",'average_hue']]
    # weights=data["review_count"]
    ).fit(cov_type='robust')
    return w2sls_model


print('--------------------------------------------------3.drink--------------------------------------------------')
order_drink = w2sls_model_drink(business_drink)
print(order_drink)

--------------------------------------------------3.drink--------------------------------------------------
                          IV-2SLS Estimation Summary                          
Dep. Variable:               star_avg   R-squared:                      0.1252
Estimator:                    IV-2SLS   Adj. R-squared:                 0.1238
No. Observations:                9751   F-statistic:                    1172.3
Date:                Tue, Feb 10 2026   P-value (F-stat)                0.0000
Time:                        20:41:39   Distribution:                 chi2(15)
Cov. Estimator:                robust                                         
                                                                              
                                    Parameter Estimates                                     
                          Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
--------------------------------------------------------------------------

In [26]:
def data_split_new(df):
    df['review_group'] = pd.qcut(df['review_count'], q=[0, 2/3, 1], labels=['low', 'high'])
    df_high = df[df['review_group']=='high']
    df_low = df[df['review_group']=='low']
    return df_high,df_low

business_df_high,business_df_low = data_split_new(business_df)
business_res_high,business_res_low = data_split_new(business_res)
business_drink_high,business_drink_low = data_split_new(business_drink)


In [27]:
print("----------------------------高低评论数量 情感的倾向均值-------------------------")
# business
def sentiment_analysis(business_df_high,business_df_low,file_name):
    from scipy import stats
    print("high",round(business_df_high['contents_score_avg'].mean(),3),
    "low",round(business_df_low['contents_score_avg'].mean(),3))
    print("memory score")
    print("high",round(business_df_high['memory_score'].mean(),3),
    "low",round(business_df_low['memory_score'].mean(),3))
    # ANOVA
    t_stat, p_val = stats.f_oneway(business_df_high['contents_score_avg'],
                                    business_df_low['contents_score_avg'])
    print(f"ANOVA结果: F = {t_stat:.3f}, p = {p_val:.4f}")
    t_stat, p_val = stats.f_oneway(business_df_high['memory_score'],
                                    business_df_low['memory_score'])
    print(f"ANOVA结果: F = {t_stat:.3f}, p = {p_val:.4f}")
    # =====ANOVA结果显著，可以做事后检验（Tukey HSD）=====
    import statsmodels.api as sm
    from statsmodels.stats.multicomp import pairwise_tukeyhsd

    # 拼接成一个DataFrame
    df = pd.DataFrame({'sentiment_score': pd.concat([business_df_high['contents_score_avg'], 
                                        business_df_low['contents_score_avg']]),
        "group": (["High"]*len(business_df_high) +
                ["Low"]*len(business_df_low))
    })

    tukey = pairwise_tukeyhsd(endog=df['sentiment_score'], groups=df["group"], alpha=0.05)
    print(tukey)

    df.to_excel("data/output/{}.xlsx".format(file_name),index=False)

    # 拼接成一个DataFrame
    df = pd.DataFrame({'sentiment_score': pd.concat([business_df_high['memory_score'], 
                                        business_df_low['memory_score']]),
        "group": (["High"]*len(business_df_high) +
                ["Low"]*len(business_df_low))
    })
    tukey = pairwise_tukeyhsd(endog=df['sentiment_score'], groups=df["group"], alpha=0.05)
    print(tukey)


print("sentiment analysis")
print("--------------------------------------------------1.all_business--------------------------------------------------")
sentiment_analysis(business_df_high,business_df_low,"all_business_sentiment")
print("--------------------------------------------------2.restaurant--------------------------------------------------")
sentiment_analysis(business_res_high,business_res_low,"restaurant_sentiment")
print("--------------------------------------------------3.drink--------------------------------------------------")
sentiment_analysis(business_drink_high,business_drink_low,"drink_sentiment")


----------------------------高低评论数量 情感的倾向均值-------------------------
sentiment analysis
--------------------------------------------------1.all_business--------------------------------------------------
high 0.702 low 0.536
memory score
high 0.736 low 0.771
ANOVA结果: F = 2321.246, p = 0.0000
ANOVA结果: F = 1164.557, p = 0.0000
Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj lower   upper  reject
--------------------------------------------------
  High    Low  -0.1662  -0.0 -0.173 -0.1595   True
--------------------------------------------------
Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj lower  upper  reject
-------------------------------------------------
  High    Low   0.0352  -0.0 0.0332 0.0372   True
-------------------------------------------------
--------------------------------------------------2.restaurant--------------------------------------------------
high 0.707 low 0.537
memory score
high 0.734 low 0.765