In [1]:
from shared_code.utility.schemas.spark_table_schema import image_table_schema,image_table_schema_with_caption_tokens
from shared_code.utility.spark.set_environ import *
from shared_code.utility.storage.blob import BlobAdapter

from torch.utils.data import Dataset, random_split
from simpletransformers.language_modeling import LanguageModelingModel
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Model
from transformers import TrainingArguments, Trainer
import os
import pandas
import torch
import gc
import logging
import random


os.environ["WANDB_DISABLED"] = "true"
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

set_azure_env()

spark = get_session('nature-city-ai')

In [18]:
from pyspark.sql.functions import count, desc
from pyspark import F

spark_frame = spark.read.csv("D:\\data\\processed\\manifest.csv", header=True)

print(f"== Reading {spark_frame.count()} rows from manifest.csv ==")

grouped_frame = spark_frame.groupby("sub").agg(count("name"))

foo = spark_frame.join(grouped_frame, on="sub", how="inner").select("sub", "count(name)").orderBy(desc("count(name)")).distinct()

print("== Distinct Subreddits ==")
display(foo.toPandas())

== Reading 4638 rows from manifest.csv ==
== Distinct Subreddits ==


Unnamed: 0,sub,count(name)
0,CityPorn,624
1,SFWRedheads,821
2,SelfiesGoneMild,2
3,selfies,76
4,sfwpetite,299
5,Amicute,83
6,SFWNextDoorGirls,846
7,Faces,368
8,HotGirlNextDoor,342
9,amihot,114


In [3]:
print("== Reading images from blob ==")
blob: BlobAdapter = BlobAdapter("unprocessed")

data_df = spark_frame.where("sub = 'EarthPorn' or sub = 'CityPorn'").select("name", "sub", "caption", "text")

print("== Images For Subreddits EarthPorn and CityPorn ==")
display(data_df.toPandas())

== Reading images from blob ==


Unnamed: 0,name,sub,caption,text
0,3rog9ik4v2ia1.jpg,EarthPorn,a tree stump in the middle of a forest,"Trees Holding Hands. Lyman, ME USA [1080×1920]..."
1,q61cqjmce4ia1.jpg,EarthPorn,a beach scene with the sun setting on the horizon,"Praia Grande - SP, Brazil (4160x3120) Where th..."
2,jr75kz7ir7ia1.jpg,CityPorn,a yellow fire hydrant on a city street,"Chicago, IL"
3,ijebuyn8k6ia1.jpg,CityPorn,a large building with a clock on the side of it,"Forli, Italy"
4,xzxj3tu948ia1.jpg,CityPorn,a statue of an american flag on top of a city ...,"Piazza San Marco, Venezia, Italy"
...,...,...,...,...
1244,j20bptuca2ja1.jpg,CityPorn,an old photo of a town with lots of buildings,"Boston, MA (1860)"
1245,zq595awts3ja1.jpg,CityPorn,a tall building with a clock on top of it,W HOTEL Building
1246,lqvtbf9y24ja1.jpg,EarthPorn,a small group of wildflowers on a hillside,California poppies along a trail in Eagle Peak...
1247,0vdw42iio2ja1.jpg,EarthPorn,a herd of wildflowers in a field,Afternoon Crocus bloom in the Pacific Northwes...


In [4]:
!mkdir -p images

A subdirectory or file images already exists.
Error occurred while processing: images.


In [5]:
import shutil

df_1 = spark.createDataFrame(data_df.withColumnRenamed("name", "file_name").collect()).select("file_name", "text").toPandas().to_json(orient='records', lines=True)

foo = data_df.select("sub", "name").rdd.map(lambda x: f"D:\\data\\images\\{x.sub}\\thumbnail\\{x.name}").collect()

for elem in foo:
    if os.path.exists(elem):
        print(f"Copying {elem}")
        shutil.copy(elem, "images")
    else:
        print(f"File {elem} does not exist")

with open('metadata.jsonl', 'w', encoding='utf-8') as f:
    f.write(df_1)

shutil.copy("metadata.jsonl", "images")

Copying D:\data\images\EarthPorn\thumbnail\3rog9ik4v2ia1.jpg
Copying D:\data\images\EarthPorn\thumbnail\q61cqjmce4ia1.jpg
Copying D:\data\images\CityPorn\thumbnail\jr75kz7ir7ia1.jpg
Copying D:\data\images\CityPorn\thumbnail\ijebuyn8k6ia1.jpg
Copying D:\data\images\CityPorn\thumbnail\xzxj3tu948ia1.jpg
Copying D:\data\images\CityPorn\thumbnail\oztl2k17t6ia1.jpg
Copying D:\data\images\CityPorn\thumbnail\ptbcvl87u6ia1.jpg
Copying D:\data\images\CityPorn\thumbnail\u34q7hf0e8ia1.jpg
Copying D:\data\images\CityPorn\thumbnail\f5jwju9qf8ia1.jpg
Copying D:\data\images\CityPorn\thumbnail\i5tt4i43k8ia1.jpg
Copying D:\data\images\CityPorn\thumbnail\xwme1arla7ia1.jpg
Copying D:\data\images\CityPorn\thumbnail\sa2loj1ad7ia1.jpg
Copying D:\data\images\EarthPorn\thumbnail\foa3qj6f09ia1.jpg
Copying D:\data\images\EarthPorn\thumbnail\cabitja3j7ia1.jpg
Copying D:\data\images\EarthPorn\thumbnail\gubgu1guj7ia1.jpg
Copying D:\data\images\EarthPorn\thumbnail\9abctoyl49ia1.jpg
Copying D:\data\images\EarthPorn\t

'images\\metadata.jsonl'

In [6]:
!tar -a -c -f out.zip images

In [7]:
text_lines = list(data_df.select("text").rdd.map(lambda x: x.text).collect())

with open("training.txt", "wb") as f:
    for text_line in text_lines:
        line = "<|startoftext|>" + f"{text_line}" + "<|endoftext|>" + "\n"
        f.write(line.encode("utf-8"))

In [None]:
model_type = ""

model_name = f"nature-prompt-bot{model_type}"

parent_directory = "/content/MyDrive/RawData/model_base"

model_output_dir = f"{parent_directory}/{model_name}"

tokenizer_path = f"{model_output_dir}"

In [None]:
class CustomDataset(Dataset):
	_input_id: str = 'input_ids'
	_attention_mask: str = 'attention_mask'

	def __init__(self, text_list, tokenizer, max_length, truncation=False):
		self.input_ids = []
		self.attention_mask = []
		self.labels = []
		for text in text_list:
			encodings_dict = tokenizer(text, truncation=truncation, max_length=max_length)
			self.input_ids.append(torch.tensor(encodings_dict[self._input_id]))
			self.attention_mask.append(torch.tensor(encodings_dict[self._attention_mask]))

	def __len__(self):
		return len(self.input_ids)

	def __getitem__(self, index):
		return self.input_ids[index], self.attention_mask[index]

In [None]:
data_lines = []
with open('training.txt', 'r', encoding="UTF-8") as f:
	lines = f.readlines()
	for line in lines:
		foo = line
		print(foo)
		data_lines.append(foo)

random.shuffle(data_lines)

In [None]:
tokenizer = GPT2Tokenizer.from_pretrained(f"gpt2{model_type}")

model = GPT2LMHeadModel.from_pretrained(f"gpt2{model_type}")

special_tokens_dict = {
    "bos_token": "<|startoftext|>",
    "eos_token": "<|endoftext|>",
    "additional_special_tokens": [
        "<|endoftext|>",
        "<|startoftext|>"
    ]
}

print(tokenizer.eos_token)

num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

print("We have added", num_added_toks, "tokens")

print(tokenizer.eos_token)

# Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e., the length of the tokenizer.
model.resize_token_embeddings(len(tokenizer))

model.save_pretrained(model_output_dir)

tokenizer.save_pretrained(tokenizer_path)

model = GPT2LMHeadModel.from_pretrained(model_output_dir)

tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)

model.cuda()

In [None]:
generator = torch.Generator()

generator.manual_seed(0)

print(f":: Total Number Of Samples {len(data_lines)}")

max_length = max([len(tokenizer.encode(prompt)) for prompt in data_lines])

print(f":: Max Length Of Sample {max_length}")

dataset = CustomDataset(data_lines, tokenizer, max_length=max_length)

train_size = int(0.9 * len(dataset))

train_dataset, eval_dataset = random_split(dataset, [train_size, len(dataset) - train_size], generator=generator)

In [None]:
training_args = TrainingArguments(output_dir=model_output_dir)
training_args.num_train_epochs = 5
training_args.per_device_train_batch_size = 1
training_args.per_device_eval_batch_size = 1
training_args.logging_steps=50
training_args.save_steps=1000
training_args.weight_decay=0.0
training_args.logging_dir='./logs'
training_args.fp16=True
training_args.auto_find_batch_size=True
training_args.gradient_accumulation_steps=50
training_args.learning_rate=1e-4

In [None]:
trainer: Trainer = Trainer(
		model=model,
		args=training_args,
		train_dataset=train_dataset,
		eval_dataset=eval_dataset,
		data_collator=lambda data: {
			'input_ids': torch.stack([f[0] for f in data]),
			'attention_mask': torch.stack([f[1] for f in data]),
			'labels': torch.stack([f[0] for f in data])
		}
	)

In [None]:
trainer.train()

trainer.save_model(model_output_dir)

In [None]:
tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)

model = GPT2LMHeadModel.from_pretrained(model_output_dir)

In [None]:
question = "<|startoftext|>"

prompt = f"{question}"

device = torch.device(f"cuda" if torch.cuda.is_available() else "cpu")

generation_prompt = tokenizer(prompt, add_special_tokens=False, return_tensors="pt")

model.to(device)

generation_prompt.to(device)

inputs = generation_prompt.input_ids

attention_mask = generation_prompt['attention_mask']

sample_outputs = model.generate(inputs=inputs,
                                attention_mask=attention_mask,
                                do_sample=True,
                                top_k=0,
                                top_p=0.95,
                                max_length=1024,
                                num_return_sequences=5,
                                repetition_penalty=1.1)

result = ""
for i, sample_output in enumerate(sample_outputs):
    result = tokenizer.decode(sample_output, skip_special_tokens=True)
    print(result)