In [1]:
import re
import pandas as pd


def _preprocess_captions(df):
    def clean_caption(text):
        text = text.lower()
        text = re.sub(r"[^a-z ]", "", text)
        text = " ".join(text.split())
        return text

    df["clean_captions"] = df["caption"].apply(clean_caption)
    df["clean_captions"] = "<sos> " + df["clean_captions"] + " <eos>"

    return df


# Read the Flickr8k captions file and attach the cleaned caption column
# in a single chained expression.
df = pd.read_csv("../data/flickr8k/captions.txt").pipe(_preprocess_captions)

df.head()


Unnamed: 0,image,caption,clean_captions
0,1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set o...,<sos> a child in a pink dress is climbing up a...
1,1000268201_693b08cb0e.jpg,A girl going into a wooden building .,<sos> a girl going into a wooden building <eos>
2,1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .,<sos> a little girl climbing into a wooden pla...
3,1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playh...,<sos> a little girl climbing the stairs to her...
4,1000268201_693b08cb0e.jpg,A little girl in a pink dress going into a woo...,<sos> a little girl in a pink dress going into...


In [2]:
# Distinct image file names, in order of first appearance in `df`.
unique_images = pd.unique(df["image"])

unique_images

array(['1000268201_693b08cb0e.jpg', '1001773457_577c3a7d70.jpg',
       '1002674143_1b742ab4b8.jpg', ..., '99679241_adc853a5c0.jpg',
       '997338199_7343367d7f.jpg', '997722733_0cb5439472.jpg'],
      dtype=object)

In [None]:
from sklearn.model_selection import train_test_split

# Split on image names rather than caption rows: hold out 20% of the images,
# then cut that hold-out in half to obtain the validation and test sets.
# The fixed random_state makes both splits repeatable.
train_imgs, temp_imgs = train_test_split(unique_images, test_size=0.2, random_state=42)
val_imgs, test_imgs = train_test_split(temp_imgs, random_state=42, test_size=0.5)

# Number of images in each split.
tuple(map(len, (train_imgs, val_imgs, test_imgs)))

(6472, 809, 810)

In [6]:
# Keep the caption rows whose image belongs to each split.
# reset_index(drop=True) renumbers the rows from 0 and discards the old index
# rather than keeping it as an extra column.
train_df, val_df, test_df = (
    df[df["image"].isin(split_imgs)].reset_index(drop=True)
    for split_imgs in (train_imgs, val_imgs, test_imgs)
)

train_df

Unnamed: 0,image,caption,clean_captions
0,1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set o...,<sos> a child in a pink dress is climbing up a...
1,1000268201_693b08cb0e.jpg,A girl going into a wooden building .,<sos> a girl going into a wooden building <eos>
2,1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .,<sos> a little girl climbing into a wooden pla...
3,1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playh...,<sos> a little girl climbing the stairs to her...
4,1000268201_693b08cb0e.jpg,A little girl in a pink dress going into a woo...,<sos> a little girl in a pink dress going into...
...,...,...,...
32355,99679241_adc853a5c0.jpg,A grey bird stands majestically on a beach whi...,<sos> a grey bird stands majestically on a bea...
32356,99679241_adc853a5c0.jpg,A large bird stands in the water on the beach .,<sos> a large bird stands in the water on the ...
32357,99679241_adc853a5c0.jpg,A tall bird is standing on the sand beside the...,<sos> a tall bird is standing on the sand besi...
32358,99679241_adc853a5c0.jpg,A water bird standing at the ocean 's edge .,<sos> a water bird standing at the ocean s edg...
