In [2]:
# import libraries
from os.path import isfile, join
import json
import pandas as pd
import numpy
from os import listdir
import matplotlib.pyplot as plt
from langdetect import detect, LangDetectException

In [3]:
def is_english(text):
    try:
        if pd.isna(text) or str(text).strip() == "":
            return False
        return detect(str(text)) == 'en'
    except LangDetectException:
        return False

In [6]:
midjourney_dataset_path = "E:\\Diploma\\model_training\\datasets\\MidjourneyPrompts"
files_list = [f for f in listdir(midjourney_dataset_path) if isfile(join(midjourney_dataset_path, f))]

midjourney_dataset = []

# Collect text content from JSON files
for json_file in files_list:
    with open(join(midjourney_dataset_path, json_file), encoding="utf-8") as current_file:
        temp_json_representation = json.load(current_file)
        for message_list in temp_json_representation.get("messages", []):
            for message in message_list:
                midjourney_dataset.append(message.get("content"))

# Convert list of strings into a DataFrame
df = pd.DataFrame(data=midjourney_dataset, columns=["content"])

# Extract text between < and >; create a new column
df["extracted"] = df["content"].str.extract(r">(.*?)<")

# Keep only rows where we found a match (non-NaN)
df.dropna(subset=["extracted"], inplace=True)

# Remove exact duplicates before regex extraction
df.drop_duplicates(inplace=True)

# Export to CSV
df.to_csv("datasets/midjourney_dataset.csv", index=False)

In [7]:
# Download missing datasets
df_SDXL_prompts = pd.read_parquet("hf://datasets/Falah/image_generation_prompts_SDXL/data/train-00000-of-00001-423acaf31a7beff6.parquet")
df_SDXL_prompts.to_csv("E:\Diploma\model_training\datasets\SDXL_prompts.csv")

df_midjourney_prompts_2 = pd.read_parquet("hf://datasets/Geonmo/midjourney-prompts-only/data/train-00000-of-00001-61a1e80026db4b04.parquet")
df_midjourney_prompts_2.to_csv("E:\Diploma\model_training\datasets\midjourney_prompts_2.csv")

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# Load datasets
#https://www.kaggle.com/datasets/tanreinama/900k-diffusion-prompts-dataset
df_sd_prompts = pd.read_csv('E:\Diploma\model_training\datasets\SDPrompts\diffusion_prompts.csv')

#https://www.kaggle.com/datasets/succinctlyai/midjourney-texttoimage
df_midjourney_prompts = pd.read_csv('E:\Diploma\model_training\datasets\midjourney_dataset.csv')

df_chat_gpt_prompts = pd.read_csv('E:\Diploma\model_training\datasets\chatgpt_prompts.csv')

df_chat_gpt_4_prompts = pd.read_csv("E:\Diploma\model_training\datasets\chatgpt_dataset.csv")

df_SDXL_prompts = pd.read_csv("E:\Diploma\model_training\datasets\SDXL_prompts.csv")

df_midjourney_prompts_2 = pd.read_csv("E:\Diploma\model_training\datasets\midjourney_prompts_2.csv")

ParserError: Error tokenizing data. C error: out of memory

In [None]:
# Add columns to dataset and choose columns
df_sd_formatted = df_sd_prompts[["prompt"]]
df_midjourney_prompts_formatted = df_midjourney_prompts.rename(columns={"extracted": "prompt"})[["prompt"]]
df_chat_gpt_prompts_formatted = df_chat_gpt_prompts[["prompt"]]
df_chat_gpt_4_prompts_formatted = df_chat_gpt_4_prompts.rename(columns={"chatml_prompt": "prompt"})[["prompt"]]
df_SDXL_prompts_formatted = df_SDXL_prompts.rename(columns={"prompts": "prompt"})
df_midjourney_prompts_2_formatted = df_midjourney_prompts_2.rename(columns={'text': "prompt"})


In [None]:
# Choose data in dataframes
print(df_sd_prompts.head(), df_midjourney_prompts.head(), df_chat_gpt_prompts.head())

   Unnamed: 0                                    id  \
0           0  00000d0e-45cb-47b6-9f72-6a481e940d78   
1           1  00001a8f-993f-4d69-8fd2-f7d69dc1e8ef   
2           2  00002cfc-8170-4a93-a1f8-aa5681cb5f71   
3           3  00004467-fdef-41bc-bc73-20c68444a024   
4           4  000044ca-a4d7-46a2-81da-7ef3bf4cbeeb   

                                              prompt  \
0  man waking up, dark and still room, cinematic ...   
1  Yate con familia feliz navegando por el mar ca...   
2  Many friendly alien race individuals. fantasy,...   
3  theo james as cyclops, cyberpunk futuristic ne...   
4  Portrait of a beautiful woman with long hair o...   

                                                 url  width  height  \
0  https://krea-prod-v1-generations.s3.us-east-1....    512     512   
1  https://image.lexica.art/full_jpg/00001a8f-993...    640     640   
2  https://image.lexica.art/full_jpg/00002cfc-817...    512     768   
3  https://image.lexica.art/full_jpg/00004467-fd

In [None]:
# Combining datasets
combining_datasets = pd.concat([df_sd_formatted, df_midjourney_prompts_formatted,
                                df_chat_gpt_prompts_formatted, df_chat_gpt_4_prompts_formatted,
                                df_SDXL_prompts_formatted, df_midjourney_prompts_2_formatted], ignore_index=True)
combining_datasets = combining_datasets.sample(frac=1, random_state=4214).reset_index(drop=True)
print(combining_datasets.head())

                                              prompt  Unnamed: 0
0  photographic portrait of a stunningly beautifu...         NaN
1                                       blue house's         NaN
2                        malaga beach with big waves   3417089.0
3  Swirls  fog  phantom  ghost  human  glowing re...   1748148.0
4  pastel flowers, Sad cat in river, half cat bod...    346191.0


In [None]:
# Check dataset
print(len(df_sd_formatted), len(df_midjourney_prompts_formatted), len(df_chat_gpt_prompts_formatted), len(df_chat_gpt_4_prompts_formatted), len(combining_datasets))

907953 98161 153 40000 5538811


In [4]:
combining_datasets = pd.read_csv('./datasets/full_dataset.csv')
combining_datasets = combining_datasets.drop_duplicates()
combining_datasets_chunked = numpy.array_split(combining_datasets, 5)
combining_datasets = None
for  i, chunk in enumerate(combining_datasets_chunked, 1):
    processed_chunk = chunk[chunk['prompt'].str.split().str.len() >= 30]
    processed_chunk = processed_chunk[processed_chunk['prompt'].apply(is_english)]
    processed_chunk.to_csv(f'processed_chunk_{i}.csv', index=False)


  return bound(*args, **kwds)


In [None]:
# save clean dataset
sum([len(length) for length in combining_datasets_chunked])

4760963