# Import Libraries

In [78]:
import pandas as pd

import os
import ast
import json
import shutil

from utils import *
from modules.lingo_json_file_creator import LingoJsonFileCreator
from modules.drivegpt4_bddx_json_file_creator import DriveGPT4BDDXJsonFileCreator

# Important Paths

In [79]:
# BDD-X Paths
BDD_X_DATASET_PATH = './datasets/bdd_x_dataset'
BDD_X_TRAINING_VIDEOS_PATH = os.path.join(BDD_X_DATASET_PATH, 'train/videos')

# LingoQA Paths
LINGO_DATASET_PATH = './datasets/lingoqa_dataset'
LINGO_ACTION_PATH = os.path.join(LINGO_DATASET_PATH, 'action')
LINGO_SCENERY_PATH = os.path.join(LINGO_DATASET_PATH, 'scenery')
LINGO_EVAL_PATH = os.path.join(LINGO_DATASET_PATH, 'evaluation')
LINGO_IMAGES_PATH = os.path.join(LINGO_DATASET_PATH, 'images')
LINGO_TRAIN_PATH = os.path.join(LINGO_DATASET_PATH, 'train')
LINGO_VAL_PATH = os.path.join(LINGO_DATASET_PATH, 'val')

# DriveGPT4 Paths
DRIVE_BDDX_DATASET_PATH = './datasets/drivegpt4_dataset'
DRIVE_BDDX_IMAGES_PATH = os.path.join(DRIVE_BDDX_DATASET_PATH, 'BDD_X_imgs_select')
DRIVE_BDDX_VIDEOS_PATH = os.path.join(DRIVE_BDDX_DATASET_PATH, 'videos')


# Our Paths
OUR_DATASET_Path = './our_datasets'
OUR_LINGO_DATASET_PATH = os.path.join(OUR_DATASET_Path, 'lingoqa_dataset')
OUR_DRIVE_BDDX_DATASET_PATH = os.path.join(OUR_DATASET_Path, 'drivegpt4_dataset')

# Preprocess Datasets

## Preprocess LingoQA Dataset

### Preprocess Action Data

In [8]:
df_action = pd.read_parquet(os.path.join(LINGO_ACTION_PATH, 'train.parquet'))
df_action.head(2)

Unnamed: 0,question_id,segment_id,images,question,answer
0,959d64df1f47dd115fb4ed14997106b3,a9f0e311b0c6f46a9cc7cb923234e60a,[images/train/a9f0e311b0c6f46a9cc7cb923234e60a...,What are you currently doing and why?,I am starting as the zebra crossing becomes cl...
1,6c6e29403bbfc6761b7d9eafbf6d3040,a9f0e311b0c6f46a9cc7cb923234e60a,[images/train/a9f0e311b0c6f46a9cc7cb923234e60a...,What are you paying attention to and why?,I am paying attention to the zebra crossing to...


#### Create JSON File

In [9]:
os.makedirs(OUR_LINGO_DATASET_PATH, exist_ok=True)

dataframe = df_action.groupby('segment_id')

lingo_json_creator = LingoJsonFileCreator()
json_data = lingo_json_creator.format_to_train(dataframe)
lingo_json_creator.save_json(json_data, 'lingoqa_action.json', OUR_LINGO_DATASET_PATH)

#### Test Code Above

In [10]:
json_path = os.path.join(OUR_LINGO_DATASET_PATH, 'lingoqa_action.json')

questions_lst = df_action['question'].tolist()
compare_num_questions(json_path, questions_lst)

answers_lst = df_action['answer'].tolist()
compare_num_answers(json_path, answers_lst)

videos_lst = [video.replace('\n', ',') for video in df_action['images'].astype(str).unique().tolist()]
videos_lst = [ast.literal_eval(video)[0] for video in videos_lst]
compare_num_videos(json_path, videos_lst)

Number of questions in the dataset: 265323
Number of questions in the JSON file: 265323
They have the same questions.

Number of answers in the dataset: 265323
Number of answers in the JSON file: 265323
They have the same answers.

Number of videos in the dataset: 24491
Number of videos in the JSON file: 24491
They have the same videos.



### Preprocess Scenery Data

In [11]:
df_scenery = pd.read_parquet(os.path.join(LINGO_SCENERY_PATH, 'train.parquet'))
df_scenery.head(2)

Unnamed: 0,question_id,segment_id,images,question,answer
0,e0c57fef6e10d9d2a813b14cae5c9517,355a614a2263d30aac9cd1505852c4dd,[images/train/355a614a2263d30aac9cd1505852c4dd...,What action are you currently taking as the dr...,I am turning left as the road weaves in that d...
1,088a0fcd840c8e7e545adfc550aa1d9b,355a614a2263d30aac9cd1505852c4dd,[images/train/355a614a2263d30aac9cd1505852c4dd...,Why are you taking this action?,It’s necessary to follow the curvature of the ...


#### Create JSON File

In [12]:
os.makedirs(OUR_LINGO_DATASET_PATH, exist_ok=True)

dataframe = df_scenery.groupby('segment_id')

lingo_json_creator = LingoJsonFileCreator()
json_data = lingo_json_creator.format_to_train(dataframe)
lingo_json_creator.save_json(json_data, 'lingoqa_scenery.json', OUR_LINGO_DATASET_PATH)

#### Test the Code Above

In [13]:
json_path = os.path.join(OUR_LINGO_DATASET_PATH, 'lingoqa_scenery.json')

questions_lst = df_scenery['question'].tolist()
compare_num_questions(json_path, questions_lst)

answers_lst = df_scenery['answer'].tolist()
compare_num_answers(json_path, answers_lst)

videos_lst = [video.replace('\n', ',') for video in df_scenery['images'].astype(str).unique().tolist()]
videos_lst = [ast.literal_eval(video)[0] for video in videos_lst]
compare_num_videos(json_path, videos_lst)

Number of questions in the dataset: 148506
Number of questions in the JSON file: 148506
They have the same questions.

Number of answers in the dataset: 148506
Number of answers in the JSON file: 148506
They have the same answers.

Number of videos in the dataset: 3508
Number of videos in the JSON file: 3508
They have the same videos.



### Preprocess Evaluation Data

In [14]:
df_eval = pd.read_parquet(os.path.join(LINGO_EVAL_PATH, 'val.parquet'))
df_eval.head(2)

Unnamed: 0,question_id,segment_id,images,question,answer
0,1a938d25604410ccd63b60285919eaec,f2e4286e94457b8605069190e29f955a,[images/val/f2e4286e94457b8605069190e29f955a/0...,"Is there a traffic light? If yes, what color i...","Yes, green."
1,1a938d25604410ccd63b60285919eaec,f2e4286e94457b8605069190e29f955a,[images/val/f2e4286e94457b8605069190e29f955a/0...,"Is there a traffic light? If yes, what color i...","Yes, a temporary traffic light. It is showing ..."


#### Create JSON File

In [15]:
os.makedirs(OUR_LINGO_DATASET_PATH, exist_ok=True)

lingo_json_creator = LingoJsonFileCreator()
json_data = lingo_json_creator.format_to_evaluate(df_eval)
lingo_json_creator.save_json(json_data, 'lingoqa_eval.json', OUR_LINGO_DATASET_PATH)

#### Test the Code Above

In [16]:
json_path = os.path.join(OUR_LINGO_DATASET_PATH, 'lingoqa_eval.json')

questions_lst = df_eval['question'].tolist()
compare_num_questions(json_path, questions_lst, True)

answers_lst = df_eval['answer'].tolist()
compare_num_answers(json_path, answers_lst, True)

videos_lst = [video.replace('\n', ',') for video in df_eval['images'].astype(str).unique().tolist()]
videos_lst = [ast.literal_eval(video)[0] for video in videos_lst]
compare_num_videos(json_path, videos_lst, True)

Number of questions in the dataset: 1000
Number of questions in the JSON file: 1000
They have the same questions.

Number of answers in the dataset: 1000
Number of answers in the JSON file: 1000
They have the same answers.



TypeError: list indices must be integers or slices, not str

## Preprocess BDD-X DriveGPT4 Dataset

### Preprocess Training Data

In [3]:
with open(os.path.join(DRIVE_BDDX_DATASET_PATH, 'BDD_X_training_label.json'), 'r') as f:
    drive_bddx_train_data = json.load(f)

#### Create JSON File

In [4]:
os.makedirs(OUR_DRIVE_BDDX_DATASET_PATH, exist_ok=True)

drive_json_creator = DriveGPT4BDDXJsonFileCreator()
json_data = drive_json_creator.format_to_train(drive_bddx_train_data)
drive_json_creator.save_json(json_data, 'drivegpt_bddx_training.json', OUR_DRIVE_BDDX_DATASET_PATH)

#### Test the Code Above

In [7]:
json_path = os.path.join(OUR_DRIVE_BDDX_DATASET_PATH, 'drivegpt_bddx_training.json')

questions_lst = []
answers_lst = []
for data in drive_bddx_train_data:
    for conv in data['conversations']:
        if conv['from'] == 'human':
            questions_lst.append(conv['value'])
        else:
            answers_lst.append(conv['value'])

compare_num_questions(json_path, questions_lst)
compare_num_answers(json_path, answers_lst)

# video_lst = df_action['images'].astype(str).unique().tolist()
videos_original = set()
for data in drive_bddx_train_data:
    videos_original.add(data['id'])

compare_num_videos(json_path, list(videos_original), replace_this='_0.png')

Number of questions in the dataset: 114434
Number of questions in the JSON file: 114434
They have the same questions.

Number of answers in the dataset: 114434
Number of answers in the JSON file: 114434
They have the same answers.

Number of videos in the dataset: 14229
Number of videos in the JSON file: 14229
They have the same videos.



### Preprocess Testing Data

In [3]:
with open(os.path.join(DRIVE_BDDX_DATASET_PATH, 'BDD_X_testing_label.json'), 'r') as f:
    drive_bddx_test_data = json.load(f)

#### Create JSON File

In [4]:
os.makedirs(OUR_DRIVE_BDDX_DATASET_PATH, exist_ok=True)

drive_json_creator = DriveGPT4BDDXJsonFileCreator()
json_data = drive_json_creator.format_to_evaluate(drive_bddx_test_data)
drive_json_creator.save_json(json_data, 'drivegpt_bddx_testing.json', OUR_DRIVE_BDDX_DATASET_PATH)

#### Test the Code Above

In [None]:
json_path = os.path.join(OUR_DRIVE_BDDX_DATASET_PATH, 'drivegpt_bddx_testing.json')

questions_lst = []
answers_lst = []
for data in drive_bddx_test_data:
    for conv in data['conversations']:
        if conv['from'] == 'human':
            questions_lst.append(conv['value'])
        else:
            answers_lst.append(conv['value'])

compare_num_questions(json_path, questions_lst, True)
compare_num_answers(json_path, answers_lst, True)


videos_original = set()
for data in drive_bddx_test_data:
    videos_original.add(data['id'])

compare_num_videos(json_path, list(videos_original), '_0.png', True)

Number of questions in the dataset: 7244
Number of questions in the JSON file: 7244
They have the same questions.

Number of answers in the dataset: 7244
Number of answers in the JSON file: 7244
They have the same answers.

Number of videos in the dataset: 1811
Number of videos in the JSON file: 1811
They have the same videos.



## Preprocess BDD-X Original Dataset

### Preprocess Training Data

In [80]:
with open(os.path.join(BDD_X_DATASET_PATH, 'train.txt'), 'r') as f:
    bdd_x_training_data = f.read().splitlines()
    bdd_x_training_data = [line.split('_')[-1] for line in bdd_x_training_data]

downloaded_videos = os.listdir(BDD_X_TRAINING_VIDEOS_PATH)
downloaded_videos = [video.replace('.mov', '') for video in downloaded_videos]
len(set(bdd_x_training_data))

4590

In [81]:
len(set(downloaded_videos).difference(set(bdd_x_training_data)))

1059

In [82]:
with open(os.path.join(BDD_X_DATASET_PATH, 'val.txt'), 'r') as f:
    bdd_x_val_data = f.read().splitlines()
    bdd_x_val_data = [line.split('_')[-1] for line in bdd_x_val_data]
len(set(bdd_x_val_data))

698

In [83]:
len(set(bdd_x_val_data).intersection(set(downloaded_videos)))

536

In [84]:
with open(os.path.join(BDD_X_DATASET_PATH, 'test.txt'), 'r') as f:
    bdd_x_test_data = f.read().splitlines()
    bdd_x_test_data = [line.split('_')[-1] for line in bdd_x_test_data]

In [85]:
len(set(bdd_x_test_data).intersection(set(downloaded_videos)))

523

### Move videos to DriveGPT4 Dataset

In [86]:
with open(os.path.join(DRIVE_BDDX_DATASET_PATH, 'BDD_X_training_label.json'), 'r') as f:
    drive_bddx_train_data = json.load(f)
    
with open(os.path.join(DRIVE_BDDX_DATASET_PATH, 'BDD_X_testing_label.json'), 'r') as f:
    drive_bddx_test_data = json.load(f)

In [87]:
videos_train_lst = set()
for data in drive_bddx_train_data:
    videos_train_lst.add(data['id'].split('_')[1])

videos_test_lst = set()
for data in drive_bddx_test_data:
    videos_test_lst.add(data['id'].split('_')[1])
    

print(f'Number of videos in the training dataset: {len(videos_train_lst)}')
print(f'Number of videos in the testing dataset: {len(videos_test_lst)}')
print(f'Number of the intersected videos between downloaded videos and videos in training dataset: {len(set(downloaded_videos).intersection(videos_train_lst))}')
print(f'Number of the intersection videos between downloaded videos and videos in testing dataset: {len(set(downloaded_videos).intersection(videos_test_lst))}')

Number of videos in the training dataset: 3438
Number of videos in the testing dataset: 502
Number of the intersected videos between downloaded videos and videos in training dataset: 3438
Number of the intersection videos between downloaded videos and videos in testing dataset: 502


In [88]:
videos_to_move = list(set(downloaded_videos).intersection(videos_train_lst).union(set(downloaded_videos).intersection(videos_test_lst)))
print(f'Length of the videos to move: {len(videos_to_move)}')

os.makedirs(DRIVE_BDDX_VIDEOS_PATH, exist_ok=True)

for idx, video in enumerate(videos_to_move):
    if idx % 100 == 0:
        print(f'Moved {idx} video')
    video_src_path = os.path.join(BDD_X_TRAINING_VIDEOS_PATH, video+'.mov')
    video_dst_path = os.path.join(DRIVE_BDDX_VIDEOS_PATH, video+'.mov')
    shutil.copy2(video_src_path, video_dst_path)


Length of the videos to move: 3940
Moved 0 video
Moved 100 video
Moved 200 video
Moved 300 video
Moved 400 video
Moved 500 video
Moved 600 video
Moved 700 video
Moved 800 video
Moved 900 video
Moved 1000 video
Moved 1100 video
Moved 1200 video
Moved 1300 video
Moved 1400 video
Moved 1500 video
Moved 1600 video
Moved 1700 video
Moved 1800 video
Moved 1900 video
Moved 2000 video
Moved 2100 video
Moved 2200 video
Moved 2300 video
Moved 2400 video
Moved 2500 video
Moved 2600 video
Moved 2700 video
Moved 2800 video
Moved 2900 video
Moved 3000 video
Moved 3100 video
Moved 3200 video
Moved 3300 video
Moved 3400 video
Moved 3500 video
Moved 3600 video
Moved 3700 video
Moved 3800 video
Moved 3900 video


In [89]:
len(os.listdir(DRIVE_BDDX_VIDEOS_PATH))

3940