## Imports

In [1]:
import pandas as pd

In [2]:
# Relative location of the MSVD-QA question/answer JSON that the next cell
# loads (the training split, judging by the file name).
path = '../data/msvd/MSVD-QA/train_qa.json'

In [3]:
# Parse the JSON file at `path` into a DataFrame; the bare `df` on the last
# line renders it as the cell output.
df = pd.read_json(path)
df

Unnamed: 0,answer,id,question,video_id
0,animal,0,what is chewing on a nut?,1
1,nut,1,what is a small animal chewing on?,1
2,squirrel,2,what is eating a whole peanut?,1
3,chipmunk,3,what is eating a nut?,1
4,chipmunk,4,what is eating a peanut?,1
...,...,...,...,...
30928,pack,30928,what is a man doing?,1200
30929,put,30929,what is a man doing?,1200
30930,put,30930,what is someone doing?,1200
30931,marinate,30931,what is someone doing?,1200


## Create Answer Set

In [4]:
# Tally how many times each distinct answer occurs. value_counts() returns the
# tallies sorted from most to least frequent, which later cells rely on.
answer_column = df['answer']
answer_freq = answer_column.value_counts()

# Shown as a one-column frame purely for nicer rendering.
answer_freq.to_frame()

Unnamed: 0,answer
man,4209
woman,2280
person,934
someone,656
two,608
...,...
jellyroll,1
tupperware,1
barbeque,1
pop,1


### Take the 1000 most frequent answers
https://github.com/xudejing/video-question-answering/blob/462f6e599fb02cee77c0b3d86901593efeb11c88/preprocess_msvdqa.py#L64

In [5]:
# Keep the labels of the 1000 most frequent answers (answer_freq is already
# ordered by descending count, so the first 1000 index entries are the top 1000).
#
# The labels are read directly from the index instead of going through
# reset_index()['index']: the column that reset_index() creates is only called
# 'index' on pandas < 2.0. From pandas 2.0 on, value_counts() names the index
# after the counted column, so looking up 'index' raises KeyError.
# The result is still a Series named 'index' with a 0..N-1 integer index.
top_1k = pd.Series(answer_freq.index[:1000].to_numpy(), name='index')
top_1k

0            man
1          woman
2         person
3        someone
4            two
         ...    
995    cardboard
996      lecture
997      stuffed
998         doll
999      handbag
Name: index, Length: 1000, dtype: object

## Create Vocabulary
https://github.com/xudejing/video-question-answering/blob/462f6e599fb02cee77c0b3d86901593efeb11c88/preprocess_msvdqa.py#L77

In [8]:
# Token -> occurrence count; filled in by the token-extraction cell.
vocab = {}

# Keep only the rows whose *answer* is one of the selected top answers.
has_top_answer = df['answer'].isin(top_1k)
filtered_df = df[has_top_answer]

# Question strings of the surviving rows, as an array.
questions = filtered_df['question'].values

# Report how many rows the filter removed.
print(f'Number of original rows: {len(df)}')
print(f'Number of rows after filtering: {len(filtered_df)}')

Number of original rows: 30933
Number of rows after filtering: 29883


### Extract tokens

In [10]:
# Count tokens into a dict that is created inside this cell. Accumulating into
# the shared `vocab` made the cell non-idempotent: after one run `vocab` is a
# Series holding the totals, so running the cell again added every count on
# top of the previous ones (e.g. doubling all frequencies).
token_counts = {}

for question in questions:
    # Drop trailing question marks, then split on whitespace.
    for word in question.rstrip('?').split():
        # Single-character tokens are not counted.
        if len(word) >= 2:
            token_counts[word] = token_counts.get(word, 0) + 1

# Token frequencies, most frequent first. The explicit dtype keeps the result
# an integer Series even when no token was counted.
vocab = pd.Series(token_counts, dtype='int64').sort_values(ascending=False)
vocab

what        37234
is          37120
who         20662
the         14374
doing       11884
            ...  
dawn            2
turle           2
rae             2
knee            2
response        2
Length: 4729, dtype: int64

In [11]:
# Keep the first 3999 entries of `vocab` (expected to be ordered by descending
# frequency at this point) and turn them into a one-column frame.
most_frequent = vocab.iloc[:3999]
vocab = pd.DataFrame(most_frequent)

# Append an '<UNK>' row with a count of 0 after the kept entries.
vocab.loc['<UNK>'] = [0]
vocab

Unnamed: 0,0
what,37234
is,37120
who,20662
the,14374
doing,11884
...,...
jati,2
rhe,2
precision,2
imperial,2


In [13]:
# Write the vocabulary to disk. With an empty `columns` selection and no header
# row, only the index (the tokens themselves) is meant to end up in the file,
# one token per line.
vocab_file = 'msvd_vocab'
vocab.to_csv(vocab_file, header=False, columns=[])