In [None]:
import torch
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image
import os
import pandas as pd

In [None]:

def extract_features(image_path, model, preprocess):
    """Run one image file through `model` and return its squeezed output.

    Args:
        image_path: Path of the image file to open with PIL.
        model: Callable applied to the batched tensor (e.g. a torch module).
        preprocess: Callable that turns a PIL image into a tensor.

    Returns:
        The model output with every size-1 dimension removed.
    """
    # Open inside a context manager so the file handle is released right away
    # (this function is called once per image over a very large folder).
    # Convert to RGB so grayscale / RGBA / palette images still produce the
    # three channels that a 3-channel normalisation step expects.
    with Image.open(image_path) as img:
        img_t = preprocess(img.convert('RGB'))

    # Add the leading batch dimension the model is called with.
    img_t = img_t.unsqueeze(0)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        features = model(img_t)

    # Drop the singleton dimensions (batch and any 1-sized trailing dims).
    return features.squeeze()

In [None]:

# Build the feature extractor: a pretrained ResNet-50 with its last child
# module removed, wrapped in a Sequential and switched to eval mode.
backbone = models.resnet50(pretrained=True)
backbone_layers = list(backbone.children())
model = torch.nn.Sequential(*backbone_layers[:-1])
model.eval()

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:01<00:00, 58.3MB/s]


Sequential(
  (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): ReLU(inplace=True)
  (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (4): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)


In [None]:
# Preprocessing pipeline applied to every image before it reaches the model:
# resize, centre-crop to 224, convert to a tensor, then normalise per channel.
channel_mean = [0.485, 0.456, 0.406]
channel_std = [0.229, 0.224, 0.225]
preprocess_steps = [
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=channel_mean, std=channel_std),
]
preprocess = transforms.Compose(preprocess_steps)


In [None]:

from google.colab import drive
import os

drive.mount('/content/drive')

# Folder that holds the images.
folder_path = '/content/drive/My Drive/Colab Notebooks/Data/place-pulse-2.0/images/'
files = []

if os.path.exists(folder_path):
    # Stream the directory with os.scandir() instead of listing it in one go.
    try:
        with os.scandir(folder_path) as dir_entries:
            for dir_entry in dir_entries:
                if not dir_entry.is_file():
                    continue
                # Only the bare file name is stored, not the full path.
                files.append(dir_entry.name)
                # Report progress once every 1000 files collected.
                if len(files) % 1000 == 0:
                    print(f"已处理{len(files)}个文件...")
    except OSError as scan_error:
        print(f"遇到错误：{scan_error}")
else:
    print(f"指定的路径不存在：{folder_path}")

# Finally, print the total number of files found as a confirmation.
print(f"总共找到了{len(files)}个文件。")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
已处理1000个文件...
已处理2000个文件...
已处理3000个文件...
已处理4000个文件...
已处理5000个文件...
已处理6000个文件...
已处理7000个文件...
已处理8000个文件...
已处理9000个文件...
已处理10000个文件...
已处理11000个文件...
已处理12000个文件...
已处理13000个文件...
已处理14000个文件...
已处理15000个文件...
已处理16000个文件...
已处理17000个文件...
已处理18000个文件...
已处理19000个文件...
已处理20000个文件...
已处理21000个文件...
已处理22000个文件...
已处理23000个文件...
已处理24000个文件...
已处理25000个文件...
已处理26000个文件...
已处理27000个文件...
已处理28000个文件...
已处理29000个文件...
已处理30000个文件...
已处理31000个文件...
已处理32000个文件...
已处理33000个文件...
已处理34000个文件...
已处理35000个文件...
已处理36000个文件...
已处理37000个文件...
已处理38000个文件...
已处理39000个文件...
已处理40000个文件...
已处理41000个文件...
已处理42000个文件...
已处理43000个文件...
已处理44000个文件...
已处理45000个文件...
已处理46000个文件...
已处理47000个文件...
已处理48000个文件...
已处理49000个文件...
已处理50000个文件...
已处理51000个文件...
已处理52000个文件...
已处理53000个文件...
已处理54000个文件...
已处理55000个文件...
已处理56000个文件...
已处理57000个文件...
已处理58000个文件...
已处理59000个文

In [None]:
import random
# Use a 5000-file subset for testing.
# A dedicated, seeded generator makes the subset identical on every
# "Restart & Run All"; the size is capped so a folder with fewer files
# does not make random.sample raise ValueError.
SAMPLE_SIZE = 5000
SAMPLE_SEED = 1
sampled_files = random.Random(SAMPLE_SEED).sample(files, min(SAMPLE_SIZE, len(files)))

In [None]:
# Load Qscores.csv from Drive.
qscores_csv = '/content/drive/My Drive/Colab Notebooks/Data/Qscores.csv'
df = pd.read_csv(qscores_csv)

# Keep only the rows whose study question is 'wealthier' and show them.
is_wealthier = df['study_question'] == 'wealthier'
filtered_df = df[is_wealthier]
filtered_df

Unnamed: 0,image_id,pi,ni,ei,total,study_question,P,N,Q
0,50e5f7d4d7c3df413b00056a,1.0,2.0,0.0,3.0,wealthier,0.333333,0.666667,3.333333
1,50e5f7d4d7c3df413b00056b,1.0,2.0,1.0,4.0,wealthier,0.250000,0.500000,3.333333
2,50e5f7d4d7c3df413b00056c,0.0,0.0,1.0,1.0,wealthier,0.000000,0.000000,14.444444
3,50e5f7d4d7c3df413b00056d,4.0,0.0,1.0,5.0,wealthier,0.800000,0.000000,28.703704
4,50e5f7d4d7c3df413b00056e,2.0,1.0,1.0,4.0,wealthier,0.500000,0.250000,12.592593
...,...,...,...,...,...,...,...,...,...
578832,5185d51bfdc9f03fd500147a,1.0,1.0,0.0,2.0,wealthier,0.500000,0.500000,8.888889
578833,5185d52bfdc9f03fd5001486,1.0,0.0,0.0,1.0,wealthier,1.000000,0.000000,20.000000
578834,5185d531fdc9f03fd500148c,0.0,0.0,1.0,1.0,wealthier,0.000000,0.000000,14.444444
578835,5185d534fdc9f03fd500148f,0.0,1.0,0.0,1.0,wealthier,0.000000,1.000000,3.333333


In [None]:
# Reload Qscores.csv from Drive.
qscores_csv = '/content/drive/My Drive/Colab Notebooks/Data/Qscores.csv'
df = pd.read_csv(qscores_csv)

# Restrict to the 'wealthier' rows, then draw a fixed-seed sample of 5000.
is_wealthier = df['study_question'] == 'wealthier'
filtered_df = df[is_wealthier]
sampled_df = filtered_df.sample(n=5000, random_state=1)
sampled_df

Unnamed: 0,image_id,pi,ni,ei,total,study_question,P,N,Q
38677,513d799dfdc9f0358700656c,1.0,0.0,0.0,1.0,wealthier,1.000000,0.000000,20.000000
77776,51414283fdc9f049260062db,2.0,1.0,0.0,3.0,wealthier,0.666667,0.333333,12.592593
50618,513e23abfdc9f0358700a6d4,1.0,0.0,1.0,2.0,wealthier,0.500000,0.000000,20.000000
33275,513d5ebdfdc9f03587003adf,2.0,1.0,0.0,3.0,wealthier,0.666667,0.333333,12.592593
565327,513ce09bfdc9f03587002164,1.0,1.0,0.0,2.0,wealthier,0.500000,0.500000,8.888889
...,...,...,...,...,...,...,...,...,...
46806,513e1b26fdc9f035870093c2,3.0,0.0,0.0,3.0,wealthier,1.000000,0.000000,26.481481
54144,513e6b83fdc9f0358700c03b,1.0,4.0,2.0,7.0,wealthier,0.142857,0.571429,-3.148148
36706,513d6c35fdc9f03587004ef4,2.0,2.0,0.0,4.0,wealthier,0.500000,0.500000,7.037037
566830,513d69a9fdc9f035870047ec,0.0,2.0,1.0,3.0,wealthier,0.000000,0.666667,-2.222222


In [None]:
# Map image_id -> filename for every sampled file.
id_to_filename = {}
folder_path = '/content/drive/My Drive/Colab Notebooks/Data/place-pulse-2.0/images/'
for file in sampled_files:
    parts = file.split('_')
    # The image ID is read from the third underscore-separated field; names
    # with fewer fields are skipped instead of raising IndexError.
    if len(parts) < 3:
        print(f"Skipping file with unexpected name: {file}")
        continue
    # Drop the extension if it is attached to the ID field.
    image_id = parts[2].split('.')[0]
    id_to_filename[image_id] = file

# Map image_id -> feature vector for every row of filtered_df that has a file.
features_dict = {}
# Only a subset of the files was sampled, so most rows have no file. They are
# counted and reported once, rather than printed one line per row.
missing_count = 0
# Only the image_id column is needed, so iterate it directly (no iterrows()).
for image_id in filtered_df['image_id'].astype(str):
    filename = id_to_filename.get(image_id)
    if not filename:
        missing_count += 1
        continue

    img_path = os.path.join(folder_path, filename.strip())
    if os.path.exists(img_path):
        features = extract_features(img_path, model, preprocess)
        features_dict[image_id] = features
    else:
        print(f"files not find: {img_path}")

print(f"Extracted features for {len(features_dict)} images; "
      f"{missing_count} image IDs had no corresponding file.")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Image ID 5140b70ffdc9f049260018ca has no corresponding file.
Image ID 5140b70ffdc9f049260018cc has no corresponding file.
Image ID 5140b711fdc9f049260018d1 has no corresponding file.
Image ID 5140b716fdc9f049260018df has no corresponding file.
Image ID 5140b718fdc9f049260018e6 has no corresponding file.
Image ID 5140b719fdc9f049260018ea has no corresponding file.
Image ID 5140b71bfdc9f049260018ef has no corresponding file.
Image ID 5140b71bfdc9f049260018f0 has no corresponding file.
Image ID 5140b71dfdc9f049260018f4 has no corresponding file.
Image ID 5140b71efdc9f049260018f5 has no corresponding file.
Image ID 5140b720fdc9f049260018fc has no corresponding file.
Image ID 5140b721fdc9f04926001900 has no corresponding file.
Image ID 5140b723fdc9f04926001903 has no corresponding file.
Image ID 5140b725fdc9f0492600190c has no corresponding file.
Image ID 5140b727fdc9f04926001910 has no corresponding file.
Image ID 5140b728fdc

In [None]:
# Collect (image_id, feature-vector-as-list) pairs from features_dict.
data_list = []
for img_id, feature_tensor in features_dict.items():
    data_list.append((img_id, feature_tensor.tolist()))

# Two-column frame: the image ID and its list of feature values.
df_features = pd.DataFrame(data_list, columns=['image_id', 'features'])

# Write the frame to Drive as CSV, leaving out the row index.
features_csv = '/content/drive/My Drive/Colab Notebooks/Data/place-pulse-2.0/sample_features.csv'
df_features.to_csv(features_csv, index=False)

In [None]:
# Join the filtered score rows with the extracted features on image_id,
# drop any row without a feature value, and preview the result.
df_result = pd.merge(left=filtered_df, right=df_features, on='image_id')
has_features = df_result['features'].notna()
df_cleaned = df_result[has_features]
df_cleaned.head()

Unnamed: 0,image_id,pi,ni,ei,total,study_question,P,N,Q,features
0,50e5f7d9d7c3df413b0005a5,2.0,1.0,1.0,4.0,wealthier,0.5,0.25,12.592593,"[0.29630786180496216, 0.46996334195137024, 0.0..."
1,50e5faead7c3df413b000732,2.0,4.0,1.0,7.0,wealthier,0.285714,0.571429,0.555556,"[0.5029924511909485, 0.8509480953216553, 0.106..."
2,50e5faebd7c3df413b000752,1.0,2.0,0.0,3.0,wealthier,0.333333,0.666667,3.333333,"[0.2735321521759033, 0.896479070186615, 0.0138..."
3,50e5faecd7c3df413b000756,0.0,1.0,1.0,2.0,wealthier,0.0,0.5,3.333333,"[0.08667921274900436, 0.4759170711040497, 0.10..."
4,50e5faedd7c3df413b00076f,5.0,1.0,0.0,6.0,wealthier,0.833333,0.166667,19.444444,"[0.2994638979434967, 0.2625936269760132, 0.146..."


In [None]:
# Display the merged scores-plus-features frame (rendered truncated by the notebook).
df_cleaned

Unnamed: 0,image_id,pi,ni,ei,total,study_question,P,N,Q,features
0,50e5f7d9d7c3df413b0005a5,2.0,1.0,1.0,4.0,wealthier,0.500000,0.250000,12.592593,"[0.29630786180496216, 0.46996334195137024, 0.0..."
1,50e5faead7c3df413b000732,2.0,4.0,1.0,7.0,wealthier,0.285714,0.571429,0.555556,"[0.5029924511909485, 0.8509480953216553, 0.106..."
2,50e5faebd7c3df413b000752,1.0,2.0,0.0,3.0,wealthier,0.333333,0.666667,3.333333,"[0.2735321521759033, 0.896479070186615, 0.0138..."
3,50e5faecd7c3df413b000756,0.0,1.0,1.0,2.0,wealthier,0.000000,0.500000,3.333333,"[0.08667921274900436, 0.4759170711040497, 0.10..."
4,50e5faedd7c3df413b00076f,5.0,1.0,0.0,6.0,wealthier,0.833333,0.166667,19.444444,"[0.2994638979434967, 0.2625936269760132, 0.146..."
...,...,...,...,...,...,...,...,...,...,...
4809,5185d2abfdc9f03fd5001211,0.0,1.0,0.0,1.0,wealthier,0.000000,1.000000,3.333333,"[0.27880772948265076, 0.5953271985054016, 0.03..."
4810,5185d32dfdc9f03fd5001287,1.0,2.0,0.0,3.0,wealthier,0.333333,0.666667,3.333333,"[0.37488314509391785, 0.5322535037994385, 0.10..."
4811,5185d39bfdc9f03fd50012f8,1.0,0.0,0.0,1.0,wealthier,1.000000,0.000000,20.000000,"[0.07558345049619675, 0.538662314414978, 0.069..."
4812,5185d3b6fdc9f03fd5001316,1.0,0.0,0.0,1.0,wealthier,1.000000,0.000000,20.000000,"[0.07611650973558426, 0.8130600452423096, 0.02..."


In [None]:
# List the distinct study questions present in the score table.
study_question_column = df['study_question']
study_question_column.unique()

array(['wealthier', 'more beautiful', 'livelier', 'more depressing',
       'safer', 'more boring'], dtype=object)

In [None]:
# Keep only the rows for the 'more depressing' study question.
filtered_df = df[df['study_question'] == 'more depressing']

# Map image_id -> feature vector for every row that has a sampled file.
features_dict = {}
# Rows without a sampled file are counted and reported once, instead of
# printing one line per row (which floods and truncates the cell output).
missing_count = 0
# Only the image_id column is needed, so iterate it directly (no iterrows()).
for image_id in filtered_df['image_id'].astype(str):
    filename = id_to_filename.get(image_id)
    if not filename:
        missing_count += 1
        continue

    img_path = os.path.join(folder_path, filename.strip())
    if os.path.exists(img_path):
        features = extract_features(img_path, model, preprocess)
        features_dict[image_id] = features
    else:
        print(f"files not find: {img_path}")

print(f"Extracted features for {len(features_dict)} images; "
      f"{missing_count} image IDs had no corresponding file.")

# (image_id, feature-vector-as-list) pairs.
data_list = [(key, value.tolist()) for key, value in features_dict.items()]

# Two-column frame: the image ID and its list of feature values.
df_features = pd.DataFrame(data_list, columns=['image_id', 'features'])

# Save the frame to a CSV file without the row index.
df_features.to_csv('/content/drive/My Drive/Colab Notebooks/Data/place-pulse-2.0/sample_features_depressing.csv', index=False)

# Join scores with features on image_id; dropna guards against rows that
# somehow lack a feature value.
df_result = pd.merge(filtered_df, df_features, on='image_id')
df_cleaned_depress = df_result.dropna(subset=['features'])
df_cleaned_depress.head()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Image ID 5140cafdfdc9f049260029f2 has no corresponding file.
Image ID 5140cafdfdc9f049260029f4 has no corresponding file.
Image ID 5140cb01fdc9f049260029fe has no corresponding file.
Image ID 5140cb03fdc9f04926002a06 has no corresponding file.
Image ID 5140cb05fdc9f04926002a0c has no corresponding file.
Image ID 5140cb06fdc9f04926002a0f has no corresponding file.
Image ID 5140cb06fdc9f04926002a11 has no corresponding file.
Image ID 5140cb07fdc9f04926002a13 has no corresponding file.
Image ID 5140cb07fdc9f04926002a15 has no corresponding file.
Image ID 5140cb08fdc9f04926002a16 has no corresponding file.
Image ID 5140cb08fdc9f04926002a19 has no corresponding file.
Image ID 5140cb09fdc9f04926002a1a has no corresponding file.
Image ID 5140cb0afdc9f04926002a20 has no corresponding file.
Image ID 5140cb0cfdc9f04926002a26 has no corresponding file.
Image ID 5140cb10fdc9f04926002a30 has no corresponding file.
Image ID 5140cb10fdc

Unnamed: 0,image_id,pi,ni,ei,total,study_question,P,N,Q,features
0,50e5f7d8d7c3df413b00059f,3.0,2.0,1.0,6.0,more depressing,0.5,0.333333,9.814815,"[0.295329213142395, 1.0742920637130737, 0.0, 0..."
1,50e5f7d9d7c3df413b0005a5,2.0,1.0,0.0,3.0,more depressing,0.666667,0.333333,12.592593,"[0.29630786180496216, 0.46996334195137024, 0.0..."
2,50e5faead7c3df413b000732,1.0,2.0,0.0,3.0,more depressing,0.333333,0.666667,3.333333,"[0.5029924511909485, 0.8509480953216553, 0.106..."
3,50e5faecd7c3df413b000756,0.0,1.0,0.0,1.0,more depressing,0.0,1.0,3.333333,"[0.08667921274900436, 0.4759170711040497, 0.10..."
4,50e5faecd7c3df413b00075a,1.0,1.0,0.0,2.0,more depressing,0.5,0.5,8.888889,"[0.37665531039237976, 0.5113226175308228, 0.13..."


In [None]:
# Keep only the rows for the 'more boring' study question.
filtered_df = df[df['study_question'] == 'more boring']

# Map image_id -> feature vector for every row that has a sampled file.
features_dict = {}
# Rows without a sampled file are counted and reported once, instead of
# printing one line per row (which floods and truncates the cell output).
missing_count = 0
# Only the image_id column is needed, so iterate it directly (no iterrows()).
for image_id in filtered_df['image_id'].astype(str):
    filename = id_to_filename.get(image_id)
    if not filename:
        missing_count += 1
        continue

    img_path = os.path.join(folder_path, filename.strip())
    if os.path.exists(img_path):
        features = extract_features(img_path, model, preprocess)
        features_dict[image_id] = features
    else:
        print(f"files not find: {img_path}")

print(f"Extracted features for {len(features_dict)} images; "
      f"{missing_count} image IDs had no corresponding file.")

# (image_id, feature-vector-as-list) pairs.
data_list = [(key, value.tolist()) for key, value in features_dict.items()]

# Two-column frame: the image ID and its list of feature values.
df_features = pd.DataFrame(data_list, columns=['image_id', 'features'])

# Save the frame to a CSV file without the row index.
df_features.to_csv('/content/drive/My Drive/Colab Notebooks/Data/place-pulse-2.0/sample_features_boring.csv', index=False)

# Join scores with features on image_id; dropna guards against rows that
# somehow lack a feature value.
df_result = pd.merge(filtered_df, df_features, on='image_id')
df_cleaned_boring = df_result.dropna(subset=['features'])
df_cleaned_boring.head()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Image ID 5140cc47fdc9f04926002d6a has no corresponding file.
Image ID 5140cc49fdc9f04926002d70 has no corresponding file.
Image ID 5140cc4afdc9f04926002d71 has no corresponding file.
Image ID 5140cc4cfdc9f04926002d77 has no corresponding file.
Image ID 5140cc4dfdc9f04926002d7b has no corresponding file.
Image ID 5140cc4efdc9f04926002d7c has no corresponding file.
Image ID 5140cc4efdc9f04926002d7d has no corresponding file.
Image ID 5140cc53fdc9f04926002d89 has no corresponding file.
Image ID 5140cc54fdc9f04926002d8f has no corresponding file.
Image ID 5140cc57fdc9f04926002d96 has no corresponding file.
Image ID 5140cc57fdc9f04926002d97 has no corresponding file.
Image ID 5140cc58fdc9f04926002d9a has no corresponding file.
Image ID 5140cc5afdc9f04926002d9f has no corresponding file.
Image ID 5140cc5bfdc9f04926002da1 has no corresponding file.
Image ID 5140cc5bfdc9f04926002da3 has no corresponding file.
Image ID 5140cc6cfdc

Unnamed: 0,image_id,pi,ni,ei,total,study_question,P,N,Q,features
0,50e5f7d8d7c3df413b00059f,4.0,0.0,0.0,4.0,more boring,1.0,0.0,28.703704,"[0.295329213142395, 1.0742920637130737, 0.0, 0..."
1,50e5f7d9d7c3df413b0005a5,1.0,0.0,0.0,1.0,more boring,1.0,0.0,20.0,"[0.29630786180496216, 0.46996334195137024, 0.0..."
2,50e5faead7c3df413b000732,1.0,0.0,1.0,2.0,more boring,0.5,0.0,20.0,"[0.5029924511909485, 0.8509480953216553, 0.106..."
3,50e5faecd7c3df413b000756,3.0,0.0,0.0,3.0,more boring,1.0,0.0,26.481481,"[0.08667921274900436, 0.4759170711040497, 0.10..."
4,50e5faecd7c3df413b00075a,2.0,0.0,1.0,3.0,more boring,0.666667,0.0,23.703704,"[0.37665531039237976, 0.5113226175308228, 0.13..."


In [None]:
# Keep only the rows for the 'safer' study question.
filtered_df = df[df['study_question'] == 'safer']

# Map image_id -> feature vector for every row that has a sampled file.
features_dict = {}
# Rows without a sampled file are counted and reported once, instead of
# printing one line per row (which floods and truncates the cell output).
missing_count = 0
# Only the image_id column is needed, so iterate it directly (no iterrows()).
for image_id in filtered_df['image_id'].astype(str):
    filename = id_to_filename.get(image_id)
    if not filename:
        missing_count += 1
        continue

    img_path = os.path.join(folder_path, filename.strip())
    if os.path.exists(img_path):
        features = extract_features(img_path, model, preprocess)
        features_dict[image_id] = features
    else:
        print(f"files not find: {img_path}")

print(f"Extracted features for {len(features_dict)} images; "
      f"{missing_count} image IDs had no corresponding file.")

# (image_id, feature-vector-as-list) pairs.
data_list = [(key, value.tolist()) for key, value in features_dict.items()]

# Two-column frame: the image ID and its list of feature values.
df_features = pd.DataFrame(data_list, columns=['image_id', 'features'])

# Save the frame to a CSV file without the row index.
df_features.to_csv('/content/drive/My Drive/Colab Notebooks/Data/place-pulse-2.0/sample_features_safer.csv', index=False)

# Join scores with features on image_id; dropna guards against rows that
# somehow lack a feature value.
df_result = pd.merge(filtered_df, df_features, on='image_id')
df_cleaned_safer = df_result.dropna(subset=['features'])
df_cleaned_safer.head()

In [None]:
# Keep only the rows for the 'livelier' study question.
filtered_df = df[df['study_question'] == 'livelier']

# Map image_id -> feature vector for every row that has a sampled file.
features_dict = {}
# Rows without a sampled file are counted and reported once, instead of
# printing one line per row (which floods and truncates the cell output).
missing_count = 0
# Only the image_id column is needed, so iterate it directly (no iterrows()).
for image_id in filtered_df['image_id'].astype(str):
    filename = id_to_filename.get(image_id)
    if not filename:
        missing_count += 1
        continue

    img_path = os.path.join(folder_path, filename.strip())
    if os.path.exists(img_path):
        features = extract_features(img_path, model, preprocess)
        features_dict[image_id] = features
    else:
        print(f"files not find: {img_path}")

print(f"Extracted features for {len(features_dict)} images; "
      f"{missing_count} image IDs had no corresponding file.")

# (image_id, feature-vector-as-list) pairs.
data_list = [(key, value.tolist()) for key, value in features_dict.items()]

# Two-column frame: the image ID and its list of feature values.
df_features = pd.DataFrame(data_list, columns=['image_id', 'features'])

# Save the frame to a CSV file without the row index.
df_features.to_csv('/content/drive/My Drive/Colab Notebooks/Data/place-pulse-2.0/sample_features_livelier.csv', index=False)

# Join scores with features on image_id; dropna guards against rows that
# somehow lack a feature value.
df_result = pd.merge(filtered_df, df_features, on='image_id')
df_cleaned_livelier = df_result.dropna(subset=['features'])
# NOTE: more code follows in this cell, so this preview is not the cell's
# last expression and is not rendered in the output.
df_cleaned_livelier.head()

# Keep only the rows for the 'more beautiful' study question.
filtered_df = df[df['study_question'] == 'more beautiful']

# Map image_id -> feature vector for every row that has a sampled file.
features_dict = {}
# Rows without a sampled file are counted and reported once, instead of
# printing one line per row (which floods and truncates the cell output).
missing_count = 0
# Only the image_id column is needed, so iterate it directly (no iterrows()).
for image_id in filtered_df['image_id'].astype(str):
    filename = id_to_filename.get(image_id)
    if not filename:
        missing_count += 1
        continue

    img_path = os.path.join(folder_path, filename.strip())
    if os.path.exists(img_path):
        features = extract_features(img_path, model, preprocess)
        features_dict[image_id] = features
    else:
        print(f"files not find: {img_path}")

print(f"Extracted features for {len(features_dict)} images; "
      f"{missing_count} image IDs had no corresponding file.")

# (image_id, feature-vector-as-list) pairs.
data_list = [(key, value.tolist()) for key, value in features_dict.items()]

# Two-column frame: the image ID and its list of feature values.
df_features = pd.DataFrame(data_list, columns=['image_id', 'features'])

# Save the frame to a CSV file without the row index.
df_features.to_csv('/content/drive/My Drive/Colab Notebooks/Data/place-pulse-2.0/sample_features_beautiful.csv', index=False)

# Join scores with features on image_id; dropna guards against rows that
# somehow lack a feature value.
df_result = pd.merge(filtered_df, df_features, on='image_id')
df_cleaned_beautiful = df_result.dropna(subset=['features'])
df_cleaned_beautiful.head()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Image ID 513e6b49fdc9f0358700bfeb has no corresponding file.
Image ID 513e6b4cfdc9f0358700bfef has no corresponding file.
Image ID 513e6b51fdc9f0358700bff4 has no corresponding file.
Image ID 513e6b58fdc9f0358700bfff has no corresponding file.
Image ID 513e6b60fdc9f0358700c008 has no corresponding file.
Image ID 513e6b62fdc9f0358700c00a has no corresponding file.
Image ID 513e6b64fdc9f0358700c00f has no corresponding file.
Image ID 513e6b6bfdc9f0358700c017 has no corresponding file.
Image ID 513e6b6ffdc9f0358700c01e has no corresponding file.
Image ID 513e6b70fdc9f0358700c020 has no corresponding file.
Image ID 513e6b7ffdc9f0358700c034 has no corresponding file.
Image ID 513e6b8efdc9f0358700c04b has no corresponding file.
Image ID 513e6b99fdc9f0358700c056 has no corresponding file.
Image ID 513e6b9dfdc9f0358700c05d has no corresponding file.
Image ID 513e6bb5fdc9f0358700c080 has no corresponding file.
Image ID 513e6bb5fdc

Unnamed: 0,image_id,pi,ni,ei,total,study_question,P,N,Q,features
0,50e5f7d8d7c3df413b00059f,5.0,2.0,0.0,7.0,more beautiful,0.714286,0.285714,13.888889,"[0.295329213142395, 1.0742920637130737, 0.0, 0..."
1,50e5f7d9d7c3df413b0005a5,0.0,2.0,0.0,2.0,more beautiful,0.0,1.0,-2.222222,"[0.29630786180496216, 0.46996334195137024, 0.0..."
2,50e5faead7c3df413b000732,7.0,2.0,0.0,9.0,more beautiful,0.777778,0.222222,16.865079,"[0.5029924511909485, 0.8509480953216553, 0.106..."
3,50e5faebd7c3df413b000752,1.0,2.0,1.0,4.0,more beautiful,0.25,0.5,3.333333,"[0.2735321521759033, 0.896479070186615, 0.0138..."
4,50e5faecd7c3df413b000756,0.0,1.0,1.0,2.0,more beautiful,0.0,0.5,3.333333,"[0.08667921274900436, 0.4759170711040497, 0.10..."


In [None]:
# Verify the image folder is reachable and, if so, list and count its entries.
if not os.path.exists(folder_path):
    print(f"Path does not exist: {folder_path}")
else:
    print(f"Path exists: {folder_path}")
    # os.listdir returns every entry name in the folder.
    files = os.listdir(folder_path)
    print(f"Number of files: {len(files)}")

# Load the score table, but only when the CSV is actually present.
df_path = '/content/drive/My Drive/Colab Notebooks/Data/Qscores.csv'
if not os.path.exists(df_path):
    print(f"Incorrect path for DataFrame: {df_path}")
else:
    df = pd.read_csv(df_path)
    print(f"DataFrame loaded, number of rows: {len(df)}")


Path exists: /content/drive/My Drive/Colab Notebooks/Data/place-pulse-2.0/images/
Number of files: 78880
DataFrame loaded, number of rows: 653339


In [None]:
# Spot-check the first rows of df (df.head()): report IDs that are absent from
# id_to_filename and collect IDs whose mapped file is not on disk.
missing_files = []

for row_label, record in df.head().iterrows():
    image_id = record['image_id']
    if image_id not in id_to_filename:
        print(f"在id_to_filename字典中未找到Image ID: {image_id}")
        continue

    filename = id_to_filename[image_id]
    img_path = os.path.join(folder_path, filename)
    if os.path.exists(img_path):
        continue

    # The ID is mapped to a filename, but that file does not exist.
    missing_files.append(image_id)
    print(f"文件未找到: {img_path}")

print(f"缺失的文件总数: {len(missing_files)}")


在id_to_filename字典中未找到Image ID: 50e5f7d4d7c3df413b00056a
在id_to_filename字典中未找到Image ID: 50e5f7d4d7c3df413b00056b
缺失的文件总数: 0
