# 智能驾驶汽车虚拟仿真视频数据理解赛道Baseline

赛题链接+数据集链接：https://tianchi.aliyun.com/competition/entrance/532155/introduction

下面针对本赛事，介绍下解题思路及代码，希望大家都能拿到好成绩。

## 解题思路

- 基础思路：使用文本与图像进行匹配
- 进阶思路：
  - 使用图像进行视觉问答
  - 使用时序视频进行视频问答
  - 使用多模态大模型进行问答

## CLIP模型介绍

本Baseline基于CLIP模型实现，下面先介绍该模型的一些理论知识。

CLIP的全称是Contrastive Language-Image Pre-Training，中文是对比语言-图像预训练，是一个预训练模型，简称为CLIP。

该模型是 OpenAI 在 2021 年发布的，最初用于匹配图像和文本的预训练神经网络模型，这个任务在多模态领域比较常见，可以用于文本图像检索，CLIP是近年来在多模态研究领域的经典之作。该模型使用大量成对的互联网图文数据进行预训练，在很多任务上达到了当时的最佳表现（SOTA）。



CLIP的思想非常简单，只需要看懂这幅图就可以了，左边是训练的原理，CLIP一共有两个模态，一个是文本模态，一个是视觉模态，分别对应了Text Encoder和Image Encoder。

- Text Encoder用于对文本进行编码，获得其Embedding；
- Image Encoder用于对图片编码，获得其Embedding。
- 两个Embedding均为一定长度的单一向量。


CLIP模型能够实现文本和图像之间的跨模态学习，这意味着它可以理解和关联文本和图像这两种不同的数据类型。通过对文本和图像进行联合学习，CLIP可以更好地理解图像内容，并判断图像与文本描述是否相符。

由于CLIP模型在预训练阶段已经学习了大量的文本和图像知识，因此它可以在没有见过的新类别上**实现零样本学习**。这意味着CLIP模型可以处理那些在训练时没有见过的新的文本和图像，具有很强的适应能力。



## 代码实现

In [1]:
# IPython magic: set the HF_ENDPOINT environment variable for this kernel so that
# Hugging Face downloads go through the hf-mirror.com mirror.
# NOTE(review): presumably this has to run before transformers is imported — confirm.
%env HF_ENDPOINT=https://hf-mirror.com

env: HF_ENDPOINT=https://hf-mirror.com


In [2]:
import glob, json, os
import cv2
from PIL import Image
from tqdm import tqdm_notebook
import numpy as np
from sklearn.preprocessing import normalize
import matplotlib.pyplot as plt

from PIL import Image
import requests
from transformers import CLIPProcessor, CLIPModel

# Load the pretrained CLIP model and its matching processor from the same
# checkpoint; naming the checkpoint once keeps the two from drifting apart.
CLIP_CHECKPOINT = "openai/clip-vit-large-patch14"
model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)
processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)

config.json:   0%|          | 0.00/988 [00:00<?, ?B/s]

`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


model.safetensors:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/298 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

In [None]:
import torch  # only torch.no_grad() is needed here; the "pt" tensors below already require torch

# Sanity check: score one sample image against two candidate captions.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors instead of feeding an error page to PIL.
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)

inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True)

# Inference only: skip building the autograd graph.
with torch.no_grad():
    outputs = model(**inputs)
logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities

# Chinese label vocabulary: field name -> list of candidate answers.
# It parallels en_match_words; none of the code visible in this notebook reads it.
# NOTE(review): "雉桶" under "一般障碍物" looks like a typo for "锥桶" (traffic cone) —
# confirm against the official label list before changing it.
cn_match_words = {
    "工况描述": ["高速/城市快速路", "城区", "郊区", "隧道", "停车场", "加油站/充电站", "未知"],
    "天气": ["晴天", "雨天", "多云", "雾天", "下雪", "未知"],
    "时间": ["白天", "夜晚", "拂晓/日暮", "未知"],
    "道路结构": ["十字路口", "丁字路口", "上下匝道", "车道汇入", "进出停车场", "环岛", "正常车道", "未知"],
    "一般障碍物": ["雉桶", "水马", "碎石/石块", "井盖", "减速带", "没有"],
    "道路异常情况": ["油污/水渍", "积水", "龟裂", "起伏不平", "没有", "未知"],
    "自车行为": ["直行", "左转", "右转", "停止", "掉头", "加速", "减速", "变道", "其它"],
    "最近的交通参与者": ["行人", "小型汽车", "卡车", "交警", "没有", "未知", "其它"],
    "最近的交通参与者行为": ["直行", "左转", "右转", "停止", "掉头", "加速", "减速", "变道", "其它"],
}

# English label vocabulary: result field -> candidate captions that CLIP scores
# against a video frame. The keys double as the field names written into each
# result entry, so they are kept exactly as spelled (including "scerario").
en_match_words = {
    "scerario": ["cityroad", "urban area", "suburb", "tunnel", "parking lot", "gas station/charging station"],
    "weather": ["sunny", "rainy", "cloudy", "foggy", "snowy"],
    "period": ["daytime", "night", "dawn/dusk"],
    "road_structure": ["intersection", "T-junction", "on-ramp/off-ramp", "merge lane", "enter/exit parking lot", "roundabout", "regular lane", "unknown"],
    # The first entry used to be a second "speed bump"; in the parallel Chinese
    # list that slot is the traffic-cone class, so the duplicate hid one label.
    "general_obstacle": ["traffic cone", "water horse", "gravel/stones", "manhole cover", "speed bump", "none"],
    "abnormal_condition": ["oil stains/water stains", "water accumulation", "cracks", "uneven surface", "none", "unknown"],
    "ego_car_behavior": ["straight", "turning left", "turning right", "stop", "U-turn", "accelerate", "decelerate", "change lanes", "other"],
    "closest_participants_type": ["pedestrian", "small car", "truck", "traffic police", "none", "unknown", "other"],
    "closest_participants_behavior": ["go straight", "turn left", "turn right", "stop", "make a U-turn", "accelerate", "decelerate", "change lanes", "other"],
}

# Preview: grab the first frame of one test video and show it as a PIL image.
cap = cv2.VideoCapture('./初赛测试视频/41.avi')
ok, img = cap.read()
cap.release()  # the capture is not needed once the first frame has been read
if not ok:
    # Fail with a clear message instead of an opaque cvtColor error on a missing frame.
    raise RuntimeError("could not read the first frame of ./初赛测试视频/41.avi")
# OpenCV delivers BGR channel order; PIL expects RGB.
image = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
image = Image.fromarray(image)

image.resize((600, 300))


# Submission skeleton with Chinese keys. It is overwritten by the assignment
# right below, so only the English-key version is ever filled in.
submit_json = {
    "作者" : "阿水" ,
    "时间" : "231011",
    "模型名字" : "model_name",
    "测试结果" : []
}

# Submission skeleton with English keys; the per-video results are appended to
# "test_results" by the inference loop.
submit_json = {
    "author" : "abc" ,
    "time" : "231011",
    "model" : "model_name",
    "test_results" : []
}

import torch  # only torch.no_grad() is needed here; the "pt" tensors below already require torch

# Every entry in the test-video folder, in a stable (sorted) order.
paths = glob.glob('./初赛测试视频/*')
paths.sort()

for video_path in paths:
    print(video_path)

    # os.path.basename follows the platform's separator rules, so this also works
    # where glob returns backslash paths. The id keeps the file extension
    # (e.g. "41.avi"); strip it with os.path.splitext if the bare id is required.
    clip_id = os.path.basename(video_path)

    # Only the first frame of each clip is used.
    cap = cv2.VideoCapture(video_path)
    ok, img = cap.read()
    cap.release()

    # Default answers; only the fields predicted below are overwritten.
    single_video_result = {
        "clip_id": clip_id,
        "scerario" : "cityroad",
        "weather":"unknown",
        "period":"night",
        "road_structure":"ramp",
        "general_obstacle":"nothing",
        "abnormal_condition":"nothing",
        "ego_car_behavior":"turning right",
        "closest_participants_type":"passenger car",
        "closest_participants_behavior":"braking"
    }

    if not ok:
        # Keep one entry per path, but fall back to the defaults rather than
        # crashing in cvtColor on a missing frame.
        print(f"warning: could not read a frame from {video_path}; keeping default answers")
        submit_json["test_results"].append(single_video_result)
        continue

    # OpenCV delivers BGR channel order; PIL expects RGB.
    image = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    image = Image.fromarray(image)

    for keyword in en_match_words.keys():
        # Only these three fields are predicted with CLIP.
        if keyword not in ["weather", "road_structure", 'scerario']:
            continue

        texts = list(en_match_words[keyword])
        inputs = processor(text=texts, images=image, return_tensors="pt", padding=True)
        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            outputs = model(**inputs)
        logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        probs = logits_per_image.softmax(dim=1)  # one row per image, one column per candidate text

        # Pick the candidate text with the highest probability for the single image.
        single_video_result[keyword] = texts[probs[0].argmax().item()]

    submit_json["test_results"].append(single_video_result)

# Number of paths matched by the glob.
len(paths)

# Write the assembled submission as UTF-8 JSON; ensure_ascii=False keeps
# non-ASCII characters as-is instead of \u escapes.
with open('coggle_result.json', mode='w', encoding='utf-8') as result_file:
    json.dump(submit_json, fp=result_file, ensure_ascii=False)

# Reference only: excerpt of the Chinese-key submission layout. The keys are
# kept verbatim, and the excerpt stops after the first result entry.
# The file written above uses the English keys instead.
# "作者" : "abc" ,
# "时间" : "YYMMDD",
# "模型名字" : "model_name",
# "测试结果" :[
# {
# "视频ID" : "xxxx_1",
# "工况描述" : "城市道路",
# "天气":"未知",
# "时间":"夜晚",
# "道路结构":"匝道",
# "一般障碍物":"无",
# "道路异常情况":"无",
# "自车行为":"右转",
# "最近的交通参与者":"小轿车",
# "最近的交通参与者行为":"制动"
# },

# Notebook display of the assembled submission dict.
submit_json

In [None]:
参考资料：https://blog.csdn.net/manongtuzi/article/details/135589689
该链接给出了更多详细的实现。