#### Large RAM is required to load the larger models. Running on GPU can optimize inference speed.

In [None]:
import sys
!pip3 install salesforce-lavis

In [None]:
import torch
from PIL import Image
import requests
from lavis.models import load_model_and_preprocess

#### Load an example image

In [None]:
img_url = 'https://storage.googleapis.com/sfr-vision-language-research/LAVIS/assets/merlion.png'
# Bound the download with a timeout and fail loudly on an HTTP error status;
# otherwise a stalled connection hangs the cell and an error page is handed to
# PIL, which then reports a confusing "cannot identify image file".
response = requests.get(img_url, stream=True, timeout=30)
response.raise_for_status()
raw_image = Image.open(response.raw).convert('RGB')
# Show a down-scaled preview in the notebook.
display(raw_image.resize((596, 437)))

In [None]:
# setup device to use
# Prefer GPU 7 (the index this notebook was written for), but fall back to the
# first GPU when the machine exposes fewer devices, and to the CPU when CUDA is
# not available. `device` is always a torch.device, never a bare string.
preferred_gpu_index = 7
if torch.cuda.is_available():
    gpu_index = preferred_gpu_index if preferred_gpu_index < torch.cuda.device_count() else 0
    device = torch.device(f"cuda:{gpu_index}")
else:
    device = torch.device("cpu")

#### Load pretrained/finetuned BLIP2 captioning model

In [None]:
# Load the model together with its matching preprocessors so that inference
# inputs are prepared the way this checkpoint expects. The third return value
# is not needed in this notebook.
model, vis_processors, _ = load_model_and_preprocess(
    name="blip2_t5",
    model_type="pretrain_flant5xl",
    is_eval=True,
    device=device,
)
# Display the names of the available visual preprocessors.
vis_processors.keys()

#### prepare the image as model input using the associated processors

In [None]:
# Run the "eval" preprocessor on the PIL image, add a leading (batch)
# dimension and move the result to the device the model was loaded on.
eval_transform = vis_processors["eval"]
image = eval_transform(raw_image).unsqueeze(0).to(device)

#### generate caption using beam search

In [None]:
# Caption the image with the model's default generation settings.
beam_inputs = {"image": image}
model.generate(beam_inputs)

#### generate multiple captions using nucleus sampling

In [None]:
# Nucleus sampling is non-deterministic, so the three captions may differ
# from one run to the next.
model.generate(
    {"image": image},
    use_nucleus_sampling=True,
    num_captions=3,
)

#### instructed zero-shot vision-to-language generation

In [None]:
# Ask a question about the image by passing a text prompt alongside it.
city_prompt = "Question: which city is this? Answer:"
model.generate({"image": image, "prompt": city_prompt})

In [None]:
# Follow-up question: the previous question and its answer are carried along
# inside the prompt text.
follow_up_prompt = "Question: which city is this? Answer: singapore. Question: why?"
model.generate({"image": image, "prompt": follow_up_prompt})

In [None]:
# Earlier (question, answer) turns of the conversation, oldest first.
context = [
    ("which city is this?", "singapore"),
    ("why?", "it has a statue of a merlion"),
]
# The new question to ask, and the template used to render one finished turn.
question = "where is the name merlion coming from?"
template = "Question: {} Answer: {}."

# Render the finished turns, then append the open question with an empty
# answer slot for the model to complete.
history = " ".join(template.format(asked, answered) for asked, answered in context)
prompt = f"{history} Question: {question} Answer:"

print(prompt)

In [None]:
# Answer the multi-turn prompt built above, with nucleus sampling explicitly
# switched off.
model.generate({"image": image, "prompt": prompt}, use_nucleus_sampling=False)

In [None]:
def get_files_sorted_by_number(folder_path):
    """Return the regular files in ``folder_path`` sorted by their embedded number.

    The sort key is the integer found right after the first underscore of the
    file name, up to the next dot: ``frame_2.jpg`` sorts before
    ``frame_10.jpg``. Sub-directories are ignored.

    Args:
        folder_path: Directory to list.

    Returns:
        A list of file names (not full paths) in ascending numeric order.

    Raises:
        IndexError: If a file name contains no underscore.
        ValueError: If the text after the first underscore is not an integer.
    """
    # Imported locally so the function also works when this cell is run
    # before the cell that imports `os` at notebook level.
    import os

    def embedded_number(file_name):
        return int(file_name.split('_')[1].split('.')[0])

    file_names = [
        entry
        for entry in os.listdir(folder_path)
        if os.path.isfile(os.path.join(folder_path, entry))
    ]
    return sorted(file_names, key=embedded_number)

In [None]:
import os

def get_subfolders(folder_path):
    """Return the names of the directories located directly in ``folder_path``.

    Only the immediate children are inspected; the names are returned without
    the parent path, in the order reported by ``os.listdir``.
    """
    directory_names = []
    for entry in os.listdir(folder_path):
        if os.path.isdir(os.path.join(folder_path, entry)):
            directory_names.append(entry)
    return directory_names

In [None]:
# Root directory to scan. It is left empty in this notebook and has to be
# filled in before the cell can run.
folder_path = ''
subfolders_list = get_subfolders(folder_path)
# Print the sub-folder names, then how many there are.
print(subfolders_list, len(subfolders_list), sep='\n')

In [None]:

import csv
# Root directory holding one sub-folder of frames per video, and the CSV file
# to write. Both are left empty in this notebook and have to be filled in.
base_path = ''
csv_file_path = ''

# Header row. Every data row appended below carries three fields, so the
# header needs three names as well; with only two names a reader that looks up
# the title of each column fails on the third one.
# NOTE: 'descirbe_list_1' (sic) is kept as is because the filter step further
# down looks the column up by exactly this name.
data_list = [
    ['video_name', 'descirbe_list_1', 'describe_set_1']
]

# Instruction given to the model for every frame.
caption_prompt = "Question: Please provide a detailed description of the content in the picture, taking into consideration the scene information and the actions and behavior of the people depicted in the image."

for subfolder in subfolders_list:
    print(subfolder)
    subfolder_path = os.path.join(base_path, subfolder)
    files = get_files_sorted_by_number(subfolder_path)
    print(files)

    # One caption per file, in the order returned by get_files_sorted_by_number.
    describe_list_1 = []

    for f in files:
        image_path = os.path.join(subfolder_path, f)

        raw_image = Image.open(image_path).convert("RGB")
        image1 = vis_processors["eval"](raw_image).unsqueeze(0).to(device)
        data = model.generate({"image": image1, "prompt": caption_prompt})
        # Only the first generated entry is kept for each frame.
        describe_list_1.append(data[0])

    unique_descriptions = set(describe_list_1)
    print(describe_list_1)
    print(unique_descriptions)
    # The list and the set are written to the CSV as their string form.
    data_list.append([subfolder, describe_list_1, unique_descriptions])

with open(csv_file_path, 'w', newline='', encoding='utf-8') as csv_file:
    csv.writer(csv_file).writerows(data_list)

print("数据已成功写入CSV文件。")

In [None]:
""" =========================== Filter =========================          """

In [None]:
from collections import OrderedDict

def remove_duplicates_and_preserve_order(input_list):
    """Return the items of ``input_list`` without repeats, in first-seen order.

    The items are used as dictionary keys, so they must be hashable.
    """
    # Building the keys of an ordered mapping keeps only the first occurrence
    # of each item while retaining the original ordering.
    return list(OrderedDict.fromkeys(input_list))

In [None]:
# Path placeholders, to be filled in before running the cells below.
# csv_path is the CSV opened for reading by the next cell; filter_csv_path is
# not referenced by any cell visible in this notebook.
csv_path, filter_csv_path = '', ''

In [None]:
import ast
import csv
import sys

# Each caption list is stored as one, potentially very long, CSV field, so the
# parser's field size limit is raised. sys.maxsize does not fit into a C long
# on every platform, in which case csv raises OverflowError; fall back to the
# largest 32-bit value there.
try:
    csv.field_size_limit(sys.maxsize)
except OverflowError:
    csv.field_size_limit(2 ** 31 - 1)


def _parse_description_cell(cell_text):
    """Recover the caption strings from a CSV cell holding ``str(list_of_str)``.

    The cell is parsed as a Python literal, so captions that contain
    apostrophes or quotes come back intact. Text that is not a valid list of
    strings falls back to splitting on single quotes. In both cases entries of
    three characters or fewer are dropped; with the fallback this removes the
    ``[``, ``, `` and ``]`` fragments left between captions.
    """
    try:
        parsed = ast.literal_eval(cell_text)
    except (ValueError, SyntaxError):
        parsed = None
    if isinstance(parsed, (list, tuple)) and all(isinstance(item, str) for item in parsed):
        candidates = parsed
    else:
        candidates = cell_text.split('\'')
    return [text for text in candidates if len(text) > 3]


# Open the CSV file.
with open(csv_path, 'r', newline='', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile)

    # The first row holds the column titles.
    header = next(reader)

    # Zero-based indices of the columns to extract, restricted to the columns
    # that actually have a title so that a header shorter than the data rows
    # does not raise IndexError.
    column_indices = [i for i in (0, 1, 2) if i < len(header)]

    # Column title -> list of that column's values, in row order.
    extracted_data = {header[i]: [] for i in column_indices}

    # Walk over the data rows and collect the selected columns.
    for row in reader:
        for i in column_indices:
            extracted_data[header[i]].append(row[i])

# Number of data rows that were read.
print(len(extracted_data['video_name']))
video_names = []
idx1_s = []
idx2_s = []  # never filled in this cell; kept in case other cells reference it
for video_name, idx1 in zip(extracted_data['video_name'], extracted_data['descirbe_list_1']):
    # Parse the stored list, then drop repeated captions but keep their order.
    n_idx1 = remove_duplicates_and_preserve_order(_parse_description_cell(idx1))

    video_names.append(video_name)
    idx1_s.append(n_idx1)
print(video_names)
# Spot-check the third entry when there is one.
if len(idx1_s) > 2:
    print(idx1_s[2])
print(len(idx1_s))

In [None]:

# Write one line per video: the video name, a space, then all of its captions,
# each followed by a period.
# NOTE: the output path is left empty in this notebook and has to be filled in.
# The encoding is given explicitly so the file matches the UTF-8 CSV files
# written elsewhere in this notebook regardless of the platform default.
with open('', 'w', encoding='utf-8') as ta:
    for name, des_list in zip(video_names, idx1_s):
        total_des = ''.join(des + '.' for des in des_list)
        print(total_des)
        ta.write(name + ' ' + total_des + '\n')

In [None]:
# Build two parallel lists: the video names, and for each video a single
# string made of all its captions, each followed by a period.
names_list = []
total_des_list = []
for position, name in enumerate(video_names):
    joined_description = ''.join(des + '.' for des in idx1_s[position])
    names_list.append(name)
    total_des_list.append(joined_description)

print(names_list)
print(total_des_list)

In [None]:
# Expand every entry into five: "<name>_0" ... "<name>_4", all sharing the
# description of the original entry.
# NOTE(review): why exactly five copies are needed is not visible here — confirm.
new_names_list = []
new_total_des_list = []
for position, base_name in enumerate(names_list):
    shared_description = total_des_list[position]
    new_names_list.extend(base_name + '_' + str(suffix) for suffix in range(5))
    new_total_des_list.extend([shared_description] * 5)

print(new_names_list)
print(new_total_des_list)
print(len(new_names_list))
print(len(new_total_des_list))


In [None]:
import csv

# Shape of the two input lists, for illustration:
# new_names_list = ['name1', 'name2', 'name3']
# new_total_des_list = ['description1', 'description2', 'description3']

# The lists are paired element by element, so they must have the same length.
if len(new_names_list) != len(new_total_des_list):
    print("no equal")
else:
    # Map each name to its description.
    combined_dict = dict(zip(new_names_list, new_total_des_list))

    # Name of the CSV file to write (left empty here; fill in before running).
    csv_file = ""

    # Open the file for writing: a title row, then one row per pair.
    with open(csv_file, mode='w', newline='', encoding='utf-8') as out_file:
        row_writer = csv.writer(out_file)
        row_writer.writerow(['Name', 'Description'])
        row_writer.writerows(combined_dict.items())
    print(f"file '{csv_file}' success。")
