In [1]:
import sys
sys.path.append("..")

In [2]:
import pandas as pd
from llavart.utils.dirutils import get_data_dir
from PIL import Image

In [3]:
df = pd.read_csv(get_data_dir() / "texts" / "wiki_img_paragraph336_clean.csv")
df

Unnamed: 0,file_name,paragraph,similarity
0,33734529_0.jpg,Architecture is the process and the product of...,0.260930
1,33734529_1.jpg,Paintings of human figures can be found in the...,0.296242
2,33734529_2.jpg,Drawing and painting go back tens of thousands...,0.335946
3,33734529_3.jpg,The 17th century witnessed the emergence of th...,0.289089
4,33734529_4.jpg,"Towards the end of the 19th century, several y...",0.298388
...,...,...,...
506064,25646039_0.jpg,The Boston Journal of Natural History (1834-18...,0.327892
506065,1494135_0.jpg,"In 1903, under the direction of Ben E. Rich, t...",0.373264
506066,1494135_1.jpg,"In June 1907, the Elders' Journal was merged w...",0.355782
506067,12281645_1.jpg,Botaniska Notiser was a Swedish scientific per...,0.261090


In [4]:
import gradio as gr

df_filtered = df
last_query = ""
slider = gr.Slider(maximum=len(df_filtered), step=1, interactive=True)

def show_sample(slider):
    row = df_filtered.iloc[slider]
    image = Image.open(get_data_dir() / "wikipedia_images336" / row["file_name"])
    return image, row["paragraph"] + f"\n\nSimilarity: {row['similarity']:.2f}"

def make_query(query):
    global df_filtered
    df_filtered = df[df["paragraph"].str.contains(query)]

    if len(df_filtered) == 0:
        df_filtered = df

    df_filtered = df_filtered.reset_index(drop=True)
    slider = gr.Slider(maximum=len(df_filtered) - 1, value=0, step=1)
    image, paragraph = show_sample(0)
    return slider, image, paragraph

with gr.Blocks() as demo:
    gr.Markdown(
    """
    # The Wiki Visual Arts Image-Paragraph Dataset
    This is a dataset of images and paragraphs from Wikipedia articles on visual arts. The dataset
    can be used to train models for vision-language tasks such as **retrieval** and **captioning**.

    ## Demo Instructions
    Use the slider to select an example. Write a query to filter examples where the paragraph contains the query.
    """)
    image, paragraph = show_sample(0)
    
    with gr.Row():
        with gr.Column():
            slider = gr.Slider(maximum=len(df_filtered) - 1, step=1)
            query = gr.Textbox(lines=1, label="Search query")
        with gr.Column():
            image = gr.Image(label="Image", height=400, width=400, value=image)
            paragraph = gr.Textbox(lines=5, label="Paragraph", value=paragraph)
    query.change(fn=make_query, inputs=[query], outputs=[slider, image, paragraph])
    slider.change(fn=show_sample, inputs=[slider], outputs=[image, paragraph])

    gr.Markdown(
    """
    ## Pages Collection
    The dataset was collected by scraping Wikipedia articles on visual arts. Specifically,
    we collect all pages connected to the Wikipedia Category **Visual Arts** and, recursively,
    to all subcategories of **Visual Arts** and their subcategories up to a depth of 5.

    The collected pages amounted to a total of more than 500,000 pages.

    ## Image-Paragraph Pairs
    From the collected pages, we extracted image-paragraph pairs. We extract all images and paragraphs
    in a page and then match each image to the paragraph that is most similar to it. We use the
    cosine similarity between the image and paragraph CLIP embeddings to match them. The collected
    pairs were more than 2,000,000 pairs.

    ## Data Cleaning
    We clean the dataset by removing image-paragraph pairs that are not relevant to visual arts.

    Firstly, we match images against the following prompts to remove bad examples:

    - *A low-quality icon*
    - *A low-quality object*
    - *A broken image*
    - *A single color image*
    - *A badly cut-out icon*
    - *A gray image with a black square*
    - *A white image with black bands*
    - *An icon of a country flag*

    We then remove pairs where the image matches one of the prompts with a cosine similarity higher than 0.25.

    Then, we remove image-paragraphs containg photos that are not related to the visual arts,
    by defining the following list of prompts:

    - *A photo*
    - *An artistic photo*
    - *A painting*
    - *A sculpture*
    - *A print*
    - *A comic*
    - *An architecture*
    - *A design*
    - *A handicraft*
    - *A drawing*
    - *An illustration*
    - *A ceramic*

    We remove pairs where the image matches the prompt *A photo* with a cosine similarity higher than that obtained for tha other prompts.
    """)

demo.launch(share=True)

  from .autonotebook import tqdm as notebook_tqdm


Running on local URL:  http://127.0.0.1:7860
Running on public URL: https://cc6772ae7b7845aa15.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


