In [1]:
import pandas as pd
import csv
import os
import requests
import time
from tqdm import tqdm

In [55]:
URI = f'https://afc3-171-7-53-34.ngrok-free.app/api/v1/generate'

In [20]:
MAX_RETRIES = 5
DELAY_SECONDS = 1

def run(prompt):
    request = {
        'prompt': prompt,
        'max_new_tokens': 200,

        # Generation params. If 'preset' is set to different than 'None', the values
        # in presets/preset-name.yaml are used instead of the individual numbers.
        'preset': 'None',  
        'do_sample': True,
        'temperature': 0.7,
        'top_p': 0.1,
        'typical_p': 1,
        'epsilon_cutoff': 0,  # In units of 1e-4
        'eta_cutoff': 0,  # In units of 1e-4
        'tfs': 1,
        'top_a': 0,
        'repetition_penalty': 1.18,
        'top_k': 40,
        'min_length': 0,
        'no_repeat_ngram_size': 0,
        'num_beams': 1,
        'penalty_alpha': 0,
        'length_penalty': 1,
        'early_stopping': False,
        'mirostat_mode': 0,
        'mirostat_tau': 5,
        'mirostat_eta': 0.1,

        'seed': -1,
        'add_bos_token': True,
        'truncation_length': 2048,
        'ban_eos_token': False,
        'skip_special_tokens': True,
        'stopping_strings': []
    }

    for attempt in range(MAX_RETRIES):
        try:
            response = requests.post(URI, json=request)
            if response.status_code == 200:
                result = response.json()['results'][0]['text']
                return result
        except requests.exceptions.RequestException as e:
            print(f"Error occurred during request: {e}")
        
        # Wait for a moment before retrying
        time.sleep(DELAY_SECONDS)
        
    # If all attempts failed, return None
    return None

In [21]:
prompt_format = open(r'data/prompts/fewshot.txt','r').read()

In [5]:
with open('data/dictionaries/tag_translation.txt', 'r') as file:
    csv_reader = csv.reader(file, delimiter='%')
    tag_dictionary = {row[0]: row[1] for row in csv_reader if len(row) == 2}

In [6]:
df = pd.read_json('data/datasets/full_dataset.json',lines=True)
df

In [8]:
old_df = df[pd.notna(df['caption_string'])]
new_df = df[pd.isna(df['caption_string'])]
print(len(old_df))
print(len(new_df))

14754
15246


In [26]:
def make_prompt(tags):
    tag_string = " ".join([tag_dictionary.get(x, x) for x in tags.split()])
    prompt = prompt_format.format(tags=tag_string)
    return prompt

In [27]:
new_df['prompt_string'] = new_df['filtered_tag_string'].apply(make_prompt)
new_df['prompt_string']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['prompt_string'] = new_df['filtered_tag_string'].apply(make_prompt)


0        A chat between a user and an assistant. The as...
1        A chat between a user and an assistant. The as...
2        A chat between a user and an assistant. The as...
3        A chat between a user and an assistant. The as...
4        A chat between a user and an assistant. The as...
                               ...                        
15241    A chat between a user and an assistant. The as...
15242    A chat between a user and an assistant. The as...
15243    A chat between a user and an assistant. The as...
15244    A chat between a user and an assistant. The as...
15245    A chat between a user and an assistant. The as...
Name: prompt_string, Length: 15246, dtype: object

In [61]:
prompts = new_df.prompt_string.to_list()
prompts

['A chat between a user and an assistant. The assistant replies with a concise image caption based on the given keywords without inferring. Some keywords may imply one another. For instance, a pink hairband implies that a hair ornament is present. In that case, only include the most specific keyword\n\nUSER: 1girl 2boys apple apron comic food fruit glasses left-to-right_manga monochrome multiple_boys pot sepia stairs\nASSISTANT: A girl and two boys depicted in a sepia monochrome left to right manga. The girl is wearing glasses, an apron and holding an apple while a pot is put to a boil and the boys are standing on a staircase\nUSER: 1boy ahoge brown_eyes brown_hair full_body male_focus necktie school_uniform solo transparent_background\nASSISTANT: A young man with brown eyes and hair wearing a school uniform with a necktie is standing against a transparent background.\nUSER: 1girl blonde_hair breasts cleavage closed_eyes hair_ornament hairband long_hair open_mouth pink_hairband small_b

In [62]:
prompts = prompts[12710:]
prompts[0]

'A chat between a user and an assistant. The assistant replies with a concise image caption based on the given keywords without inferring. Some keywords may imply one another. For instance, a pink hairband implies that a hair ornament is present. In that case, only include the most specific keyword\n\nUSER: 1girl 2boys apple apron comic food fruit glasses left-to-right_manga monochrome multiple_boys pot sepia stairs\nASSISTANT: A girl and two boys depicted in a sepia monochrome left to right manga. The girl is wearing glasses, an apron and holding an apple while a pot is put to a boil and the boys are standing on a staircase\nUSER: 1boy ahoge brown_eyes brown_hair full_body male_focus necktie school_uniform solo transparent_background\nASSISTANT: A young man with brown eyes and hair wearing a school uniform with a necktie is standing against a transparent background.\nUSER: 1girl blonde_hair breasts cleavage closed_eyes hair_ornament hairband long_hair open_mouth pink_hairband small_br

In [63]:
captions = []

batch_size = 10  # Adjust the batch size as per your requirements

# Split the data into batches
num_batches = (len(prompts) + batch_size - 1) // batch_size

for i in tqdm(range(num_batches), desc="Batch"):
    start_idx = i * batch_size
    end_idx = (i + 1) * batch_size
    batch_prompts = prompts[start_idx:end_idx]

    batch_captions = []

    for prompt in batch_prompts:
        caption = run(prompt).strip()
        batch_captions.append(caption)

    captions.extend(batch_captions)

Batch: 100%|██████████| 254/254 [2:11:53<00:00, 31.16s/it]  


In [64]:
len(captions)

2536

In [69]:
new_df['caption_string'].iloc[12710:] = pd.Series(captions)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['caption_string'].iloc[12710:] = pd.Series(captions)


In [76]:
full_df = pd.merge(new_df, old_df, how='outer').sample(frac=1).reset_index(drop=True)

In [77]:
with open('data/datasets/full_dataset.json', 'w') as f:
    f.write(full_df.to_json(orient='records', lines=True))

In [59]:
df = pd.read_json('data/datasets/full_dataset.json',lines=True)