In [1]:
import os.path

import pandas as pd


# Execute the shared "0-0-common" notebook so that whatever it defines is
# available to the cells below.
# NOTE(review): `sources` and `schema` are referenced further down but are not
# assigned in any visible cell -- presumably they come from this notebook; confirm.
# NOTE(review): this is an absolute, machine-specific Windows path, so the
# notebook will not run from another checkout location or on another machine.
%run D:\workspaces\General\notebooks\image-processing\0-0-common.ipynb

In [2]:
%%time

from shared_code.utility.storage.azure_file_storage import AzureFileStorageAdapter
from adlfs import AzureBlobFileSystem

# Build the project's storage adapter for 'data' and ask it for a filesystem
# object. `file_system` is what the later cells hand to pandas for parquet I/O
# and use directly for listing and uploading remote files.
# NOTE(review): 'data' is presumably the container/share name -- confirm against
# AzureFileStorageAdapter.
storage_adapter: AzureFileStorageAdapter = AzureFileStorageAdapter('data')
file_system: AzureBlobFileSystem = storage_adapter.get_file_storage()


CPU times: total: 0 ns
Wall time: 0 ns


In [3]:
%%time

# Load the processed raw-data parquet file through the remote filesystem and
# report its (rows, columns) shape.
local_df = pd.read_parquet(
    'data/processed_raw_data.parquet',
    engine='pyarrow',
    filesystem=file_system,
)
print(local_df.shape)

# Collapse rows that share an 'id', keeping the first occurrence of each, and
# report the shape again so the number of dropped rows is visible.
de_duplicated = local_df.drop_duplicates(subset='id', keep='first')
print(de_duplicated.shape)

(41505, 13)
(38051, 13)
CPU times: total: 781 ms
Wall time: 3.14 s


In [4]:
%%time

# Promote 'id' to the index, then add it back as a regular column (appended as
# the last column) so the value is still present in each per-row record.
df_pre_index = (
    de_duplicated
    .set_index('id')
    .assign(id=lambda indexed: indexed.index)
)

print(df_pre_index.shape)

(38051, 13)
CPU times: total: 31.2 ms
Wall time: 13.2 ms


In [5]:
# Bare file names (directory part stripped) of everything currently listed
# under 'data/image/' on the remote filesystem.
loaded_files = list(map(os.path.basename, file_system.ls('data/image/')))

In [17]:
from tqdm import tqdm

# Flatten the de-duplicated frame into one dict per row so that every row can
# be checked against the local disk and rewritten individually.
records = df_pre_index.to_dict(orient='records')

# Pass 1: keep only the records whose image file is present on the local disk.
extant_records = []

for record in tqdm(records, total=len(records), desc='Checking Local records'):
    try:
        if os.path.exists(record['path']):
            extant_records.append(record)
    except (KeyError, TypeError, ValueError):
        # Best effort: a record without a usable 'path' value (missing key,
        # None/NaN, malformed string) is skipped rather than aborting the scan.
        continue

# A set gives constant-time membership checks; testing against the original
# list would rescan every remote file name for every record.
already_uploaded = set(loaded_files)

# Pass 2: make sure each image is on the remote filesystem, then rewrite the
# record so that it points at the remote copy and carries curation defaults.
out = []

for record in tqdm(extant_records, total=len(extant_records), desc='Saving Extant Records'):
    image_name = f'{record["id"]}.jpg'
    remote_path = f'data/image/{image_name}'

    if image_name not in already_uploaded:
        file_system.upload(record['path'], remote_path, overwrite=True)
        already_uploaded.add(image_name)

    # Previously only records that were *already* uploaded reached `out`;
    # freshly uploaded ones were silently dropped until the cell was re-run.
    # Every extant record is now normalised and kept.
    record['path'] = remote_path
    record['image_name'] = image_name
    record['accept'] = False
    record['curated'] = False
    try:
        # Name of the first source whose 'data' contains this subreddit, or ""
        # when no source matches.
        # NOTE(review): `sources` is not assigned in any visible cell --
        # presumably defined by the common notebook run in the first cell.
        record['model'] = next(
            (source['name'] for source in sources if record['subreddit'] in source['data']),
            "",
        )
    except (KeyError, TypeError):
        # Malformed record/source entry: fall back to "no model". Unlike the
        # former bare `except`, an undefined `sources` is no longer hidden.
        record['model'] = ""
    record['tags'] = []
    out.append(record)

df = pd.DataFrame(data=out)

Checking Local records: 100%|██████████| 38051/38051 [00:01<00:00, 28538.62it/s]
Saving Extant Records: 100%|██████████| 33710/33710 [00:08<00:00, 4140.47it/s]


In [79]:
# Guarantee one row per 'id' (first occurrence wins) before 'id' becomes the index.
df = df.drop_duplicates(subset=['id'], keep='first')

# drop=False keeps 'id' available as an ordinary column as well as the index.
df_index = df.set_index(['id'], drop=False)

# The former bare `df_index.reindex()` statement was removed: reindex returns a
# new object instead of modifying the frame, and that result was discarded, so
# the call had no effect.
display(df_index)
print(df_index.shape)

Unnamed: 0_level_0,id,subreddit,author,title,caption,hash,permalink,original_url,image_name,path,model,exists,curated,accept,tags
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
100rn7k,100rn7k,AmIhotAF,veritynicole,"hey, hows your new year going (23F)",a woman in a white shirt and black pants is po...,4bd00c19fa0ff2ade855e6d364b0760b,/r/AmIhotAF/comments/100rn7k/hey_hows_your_new...,https://i.redd.it/n7r47s0gkh9a1.jpg,100rn7k.jpg,data/image/100rn7k.jpg,SexyDiffusion,True,True,False,[]
1013bdt,1013bdt,AmIhotAF,RaulDea9286,36F - ITALIAN,arafed image of a woman in a bikini top,7c0d158cba8654ef1c635cbc5471d597,/r/AmIhotAF/comments/1013bdt/36f_italian/,https://i.redd.it/bg0wwdlt5k9a1.jpg,1013bdt.jpg,data/image/1013bdt.jpg,SexyDiffusion,True,False,False,[]
105mekt,105mekt,AmIhotAF,lindaniz,interesting in good forward relationship (f24),a close up of a woman with red hair and a whit...,ba4a0962cca2266a741e1e1700589c04,/r/AmIhotAF/comments/105mekt/interesting_in_go...,https://i.redd.it/4avjshsz8naa1.jpg,105mekt.jpg,data/image/105mekt.jpg,SexyDiffusion,True,False,False,[]
105qvgl,105qvgl,AmIhotAF,CaitVLove11,Laughing is my favorite 😆,a woman in a blue tank top and shorts is smili...,27bfe82c37314a0bcf02ab72eaf3a9e5,/r/AmIhotAF/comments/105qvgl/laughing_is_my_fa...,https://i.redd.it/2pulzr0lxmaa1.jpg,105qvgl.jpg,data/image/105qvgl.jpg,SexyDiffusion,True,False,False,[]
105rpcj,105rpcj,AmIhotAF,Flashy-Desk1858,[f22] What do you think when you see me?,a woman in a blue bikini top and a blue bra top,329eb42b8267fa1cc2980da8e48bcef1,/r/AmIhotAF/comments/105rpcj/f22_what_do_you_t...,https://i.redd.it/rz68pf934naa1.jpg,105rpcj.jpg,data/image/105rpcj.jpg,SexyDiffusion,True,False,False,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10kzd5c,10kzd5c,wallstreetbets,pocfdept,ii illegal short selling… 🤔,a screenshote of a cell phone showing a messag...,d68443295aac515459b13ad786e9649f,/r/wallstreetbets/comments/10kzd5c/ii_illegal_...,https://i.redd.it/g6xzdy7mm8ea1.jpg,10kzd5c.jpg,data/image/10kzd5c.jpg,WallStreetDiffusion,True,False,False,[]
10kzbey,10kzbey,wallstreetbets,pocfdept,II illegal short selling 🤔,a screenshote of a cell phone showing a screen...,959579cc440ec31a1cbb228fa3e4a3af,/r/wallstreetbets/comments/10kzbey/ii_illegal_...,https://i.redd.it/cltd77i7m8ea1.jpg,10kzbey.jpg,data/image/10kzbey.jpg,WallStreetDiffusion,True,False,False,[]
10kyx5z,10kyx5z,wallstreetbets,Bg_KiLLeR,BAER expected to begin trading in Nasdaq on Ja...,a screenshote of a dashboard with a bunch of d...,07a6f67b65c0452c98aa59c1b57fc75d,/r/wallstreetbets/comments/10kyx5z/baer_expect...,https://i.redd.it/grm4vq4ui8ea1.jpg,10kyx5z.jpg,data/image/10kyx5z.jpg,WallStreetDiffusion,True,False,False,[]
10kywrx,10kywrx,wallstreetbets,lostin_dasauce,Thought of you guys,a black and white photo of a man in a suit and...,03c75f7d48f92b73c5fd14ad5ee46b83,/r/wallstreetbets/comments/10kywrx/thought_of_...,https://i.redd.it/78nm8qdqi8ea1.jpg,10kywrx.jpg,data/image/10kywrx.jpg,WallStreetDiffusion,True,False,False,[]


(33710, 15)


In [126]:
# Re-print the (rows, columns) shape of df_index as a quick sanity check.
print(df_index.shape)

(33710, 15)


In [102]:
# foo.to_parquet('data/parquet/', filesystem=file_system, engine='pyarrow', schema=schema, partition_cols=['subreddit'], index=['id'])

In [3]:
# import pandas

# foo = pandas.read_parquet('data/parquet/back.parquet', filesystem=file_system, engine='pyarrow', schema=schema)

# foo.set_index('id', inplace=True, drop=False)

# foo.drop_duplicates(subset=['id'], keep='last', inplace=True)

# print(foo.shape)

# display(foo)

# foo.to_parquet('data/parquet/back.parquet', filesystem=file_system, engine='pyarrow', schema=schema)
# df_index.to_parquet('data/parquet/', filesystem=file_system, engine='pyarrow', schema=schema)
# records = foo.to_dict(orient='records')
#
# thing = records[0]
#
# bar = pandas.DataFrame(data=[thing], index=['id'])
#
# display(bar)
#
# foo.update(bar, overwrite=True)
#
# print(foo.shape)
#
# display(foo)
#
# display(foo.loc[foo['id'] == '106mh03'])

(33710, 15)


Unnamed: 0,id,subreddit,author,title,caption,hash,permalink,original_url,image_name,path,model,exists,curated,accept,tags
0,100rn7k,AmIhotAF,veritynicole,"hey, hows your new year going (23F)",a woman in a white shirt and black pants is po...,4bd00c19fa0ff2ade855e6d364b0760b,/r/AmIhotAF/comments/100rn7k/hey_hows_your_new...,https://i.redd.it/n7r47s0gkh9a1.jpg,100rn7k.jpg,data/image/100rn7k.jpg,SexyDiffusion,True,True,False,[]
1,1013bdt,AmIhotAF,RaulDea9286,36F - ITALIAN,arafed image of a woman in a bikini top,7c0d158cba8654ef1c635cbc5471d597,/r/AmIhotAF/comments/1013bdt/36f_italian/,https://i.redd.it/bg0wwdlt5k9a1.jpg,1013bdt.jpg,data/image/1013bdt.jpg,SexyDiffusion,True,True,True,[]
2,105mekt,AmIhotAF,lindaniz,interesting in good forward relationship (f24),a close up of a woman with red hair and a whit...,ba4a0962cca2266a741e1e1700589c04,/r/AmIhotAF/comments/105mekt/interesting_in_go...,https://i.redd.it/4avjshsz8naa1.jpg,105mekt.jpg,data/image/105mekt.jpg,SexyDiffusion,True,True,True,[]
3,105qvgl,AmIhotAF,CaitVLove11,Laughing is my favorite 😆,a woman in a blue tank top and shorts is smili...,27bfe82c37314a0bcf02ab72eaf3a9e5,/r/AmIhotAF/comments/105qvgl/laughing_is_my_fa...,https://i.redd.it/2pulzr0lxmaa1.jpg,105qvgl.jpg,data/image/105qvgl.jpg,SexyDiffusion,True,True,True,[]
4,105rpcj,AmIhotAF,Flashy-Desk1858,[f22] What do you think when you see me?,a woman in a blue bikini top and a blue bra top,329eb42b8267fa1cc2980da8e48bcef1,/r/AmIhotAF/comments/105rpcj/f22_what_do_you_t...,https://i.redd.it/rz68pf934naa1.jpg,105rpcj.jpg,data/image/105rpcj.jpg,SexyDiffusion,True,True,True,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33705,10kzd5c,wallstreetbets,pocfdept,ii illegal short selling… 🤔,a screenshote of a cell phone showing a messag...,d68443295aac515459b13ad786e9649f,/r/wallstreetbets/comments/10kzd5c/ii_illegal_...,https://i.redd.it/g6xzdy7mm8ea1.jpg,10kzd5c.jpg,data/image/10kzd5c.jpg,WallStreetDiffusion,True,False,False,[]
33706,10kzbey,wallstreetbets,pocfdept,II illegal short selling 🤔,a screenshote of a cell phone showing a screen...,959579cc440ec31a1cbb228fa3e4a3af,/r/wallstreetbets/comments/10kzbey/ii_illegal_...,https://i.redd.it/cltd77i7m8ea1.jpg,10kzbey.jpg,data/image/10kzbey.jpg,WallStreetDiffusion,True,False,False,[]
33707,10kyx5z,wallstreetbets,Bg_KiLLeR,BAER expected to begin trading in Nasdaq on Ja...,a screenshote of a dashboard with a bunch of d...,07a6f67b65c0452c98aa59c1b57fc75d,/r/wallstreetbets/comments/10kyx5z/baer_expect...,https://i.redd.it/grm4vq4ui8ea1.jpg,10kyx5z.jpg,data/image/10kyx5z.jpg,WallStreetDiffusion,True,False,False,[]
33708,10kywrx,wallstreetbets,lostin_dasauce,Thought of you guys,a black and white photo of a man in a suit and...,03c75f7d48f92b73c5fd14ad5ee46b83,/r/wallstreetbets/comments/10kywrx/thought_of_...,https://i.redd.it/78nm8qdqi8ea1.jpg,10kywrx.jpg,data/image/10kywrx.jpg,WallStreetDiffusion,True,False,False,[]


Unnamed: 0,id,subreddit,author,title,caption,hash,permalink,original_url,image_name,path,model,exists,curated,accept,tags
12,106mh03,AmIhotAF,SleepHeavy3935,[F25] Who wants to go for a dip?,woman in bikini getting on a boat,e856ecb714dc9acb82548764684306c8,/r/AmIhotAF/comments/106mh03/f25_who_wants_to_...,https://i.redd.it/4pjc7v12duaa1.jpg,106mh03.jpg,data/image/106mh03.jpg,SexyDiffusion,True,True,True,[]


In [129]:
# temp = foo
# display(temp)

Unnamed: 0,id,subreddit,author,title,caption,hash,permalink,original_url,image_name,path,model,exists,curated,accept,tags
0,100rn7k,AmIhotAF,veritynicole,"hey, hows your new year going (23F)",a woman in a white shirt and black pants is po...,4bd00c19fa0ff2ade855e6d364b0760b,/r/AmIhotAF/comments/100rn7k/hey_hows_your_new...,https://i.redd.it/n7r47s0gkh9a1.jpg,100rn7k.jpg,data/image/100rn7k.jpg,SexyDiffusion,True,True,False,[]
1,1013bdt,AmIhotAF,RaulDea9286,36F - ITALIAN,arafed image of a woman in a bikini top,7c0d158cba8654ef1c635cbc5471d597,/r/AmIhotAF/comments/1013bdt/36f_italian/,https://i.redd.it/bg0wwdlt5k9a1.jpg,1013bdt.jpg,data/image/1013bdt.jpg,SexyDiffusion,True,False,False,[]
2,105mekt,AmIhotAF,lindaniz,interesting in good forward relationship (f24),a close up of a woman with red hair and a whit...,ba4a0962cca2266a741e1e1700589c04,/r/AmIhotAF/comments/105mekt/interesting_in_go...,https://i.redd.it/4avjshsz8naa1.jpg,105mekt.jpg,data/image/105mekt.jpg,SexyDiffusion,True,False,False,[]
3,105qvgl,AmIhotAF,CaitVLove11,Laughing is my favorite 😆,a woman in a blue tank top and shorts is smili...,27bfe82c37314a0bcf02ab72eaf3a9e5,/r/AmIhotAF/comments/105qvgl/laughing_is_my_fa...,https://i.redd.it/2pulzr0lxmaa1.jpg,105qvgl.jpg,data/image/105qvgl.jpg,SexyDiffusion,True,False,False,[]
4,105rpcj,AmIhotAF,Flashy-Desk1858,[f22] What do you think when you see me?,a woman in a blue bikini top and a blue bra top,329eb42b8267fa1cc2980da8e48bcef1,/r/AmIhotAF/comments/105rpcj/f22_what_do_you_t...,https://i.redd.it/rz68pf934naa1.jpg,105rpcj.jpg,data/image/105rpcj.jpg,SexyDiffusion,True,False,False,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33705,10kzd5c,wallstreetbets,pocfdept,ii illegal short selling… 🤔,a screenshote of a cell phone showing a messag...,d68443295aac515459b13ad786e9649f,/r/wallstreetbets/comments/10kzd5c/ii_illegal_...,https://i.redd.it/g6xzdy7mm8ea1.jpg,10kzd5c.jpg,data/image/10kzd5c.jpg,WallStreetDiffusion,True,False,False,[]
33706,10kzbey,wallstreetbets,pocfdept,II illegal short selling 🤔,a screenshote of a cell phone showing a screen...,959579cc440ec31a1cbb228fa3e4a3af,/r/wallstreetbets/comments/10kzbey/ii_illegal_...,https://i.redd.it/cltd77i7m8ea1.jpg,10kzbey.jpg,data/image/10kzbey.jpg,WallStreetDiffusion,True,False,False,[]
33707,10kyx5z,wallstreetbets,Bg_KiLLeR,BAER expected to begin trading in Nasdaq on Ja...,a screenshote of a dashboard with a bunch of d...,07a6f67b65c0452c98aa59c1b57fc75d,/r/wallstreetbets/comments/10kyx5z/baer_expect...,https://i.redd.it/grm4vq4ui8ea1.jpg,10kyx5z.jpg,data/image/10kyx5z.jpg,WallStreetDiffusion,True,False,False,[]
33708,10kywrx,wallstreetbets,lostin_dasauce,Thought of you guys,a black and white photo of a man in a suit and...,03c75f7d48f92b73c5fd14ad5ee46b83,/r/wallstreetbets/comments/10kywrx/thought_of_...,https://i.redd.it/78nm8qdqi8ea1.jpg,10kywrx.jpg,data/image/10kywrx.jpg,WallStreetDiffusion,True,False,False,[]


In [130]:
# Bind `temp` to the same object as `foo` (no copy is made) and render it.
# NOTE(review): `foo` is only assigned in commented-out code in this notebook,
# so this cell relies on leftover kernel state and would raise NameError after
# Restart & Run All -- confirm where `foo` is meant to come from.
temp = foo
display(temp)
# df_index.to_parquet('data/parquet/ready_to_curate.parquet', filesystem=file_system, engine='pyarrow', schema=schema)

Unnamed: 0,id,subreddit,author,title,caption,hash,permalink,original_url,image_name,path,model,exists,curated,accept,tags
0,100rn7k,AmIhotAF,veritynicole,"hey, hows your new year going (23F)",a woman in a white shirt and black pants is po...,4bd00c19fa0ff2ade855e6d364b0760b,/r/AmIhotAF/comments/100rn7k/hey_hows_your_new...,https://i.redd.it/n7r47s0gkh9a1.jpg,100rn7k.jpg,data/image/100rn7k.jpg,SexyDiffusion,True,True,False,[]
1,1013bdt,AmIhotAF,RaulDea9286,36F - ITALIAN,arafed image of a woman in a bikini top,7c0d158cba8654ef1c635cbc5471d597,/r/AmIhotAF/comments/1013bdt/36f_italian/,https://i.redd.it/bg0wwdlt5k9a1.jpg,1013bdt.jpg,data/image/1013bdt.jpg,SexyDiffusion,True,False,False,[]
2,105mekt,AmIhotAF,lindaniz,interesting in good forward relationship (f24),a close up of a woman with red hair and a whit...,ba4a0962cca2266a741e1e1700589c04,/r/AmIhotAF/comments/105mekt/interesting_in_go...,https://i.redd.it/4avjshsz8naa1.jpg,105mekt.jpg,data/image/105mekt.jpg,SexyDiffusion,True,False,False,[]
3,105qvgl,AmIhotAF,CaitVLove11,Laughing is my favorite 😆,a woman in a blue tank top and shorts is smili...,27bfe82c37314a0bcf02ab72eaf3a9e5,/r/AmIhotAF/comments/105qvgl/laughing_is_my_fa...,https://i.redd.it/2pulzr0lxmaa1.jpg,105qvgl.jpg,data/image/105qvgl.jpg,SexyDiffusion,True,False,False,[]
4,105rpcj,AmIhotAF,Flashy-Desk1858,[f22] What do you think when you see me?,a woman in a blue bikini top and a blue bra top,329eb42b8267fa1cc2980da8e48bcef1,/r/AmIhotAF/comments/105rpcj/f22_what_do_you_t...,https://i.redd.it/rz68pf934naa1.jpg,105rpcj.jpg,data/image/105rpcj.jpg,SexyDiffusion,True,False,False,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33705,10kzd5c,wallstreetbets,pocfdept,ii illegal short selling… 🤔,a screenshote of a cell phone showing a messag...,d68443295aac515459b13ad786e9649f,/r/wallstreetbets/comments/10kzd5c/ii_illegal_...,https://i.redd.it/g6xzdy7mm8ea1.jpg,10kzd5c.jpg,data/image/10kzd5c.jpg,WallStreetDiffusion,True,False,False,[]
33706,10kzbey,wallstreetbets,pocfdept,II illegal short selling 🤔,a screenshote of a cell phone showing a screen...,959579cc440ec31a1cbb228fa3e4a3af,/r/wallstreetbets/comments/10kzbey/ii_illegal_...,https://i.redd.it/cltd77i7m8ea1.jpg,10kzbey.jpg,data/image/10kzbey.jpg,WallStreetDiffusion,True,False,False,[]
33707,10kyx5z,wallstreetbets,Bg_KiLLeR,BAER expected to begin trading in Nasdaq on Ja...,a screenshote of a dashboard with a bunch of d...,07a6f67b65c0452c98aa59c1b57fc75d,/r/wallstreetbets/comments/10kyx5z/baer_expect...,https://i.redd.it/grm4vq4ui8ea1.jpg,10kyx5z.jpg,data/image/10kyx5z.jpg,WallStreetDiffusion,True,False,False,[]
33708,10kywrx,wallstreetbets,lostin_dasauce,Thought of you guys,a black and white photo of a man in a suit and...,03c75f7d48f92b73c5fd14ad5ee46b83,/r/wallstreetbets/comments/10kywrx/thought_of_...,https://i.redd.it/78nm8qdqi8ea1.jpg,10kywrx.jpg,data/image/10kywrx.jpg,WallStreetDiffusion,True,False,False,[]


In [57]:

# foo = pandas.DataFrame(data=[{
#     'id': '10kzbey',
#     'path': 'data/image/1.jpg',
#     'image_name': '1.jpg',
#     'accept': True,
#     'curated': True,
#     'model': 'test',
#     'tags': []
# }])
#
# temp = df

# display(df.loc[df['id'] == '105styc'])

# display(temp.loc[temp['id'] == '10kzbey'])

# display(temp.loc[temp['id'] == '10kzbey'])

Unnamed: 0,id,subreddit,author,title,caption,hash,permalink,original_url,image_name,path,model,exists,curated,accept,tags
5,105styc,AmIhotAF,Gizzygirl127,Low key… still bangable?,smiling woman sitting on couch with remote con...,6d555943be4fbc21ff92417c6f582298,/r/AmIhotAF/comments/105styc/low_key_still_ban...,https://i.redd.it/aiaxxoz9uoaa1.jpg,105styc.jpg,data/image/105styc.jpg,SexyDiffusion,True,False,False,[]
187,105styc,AmIhotAF,Gizzygirl127,Low key… still bangable?,A smiling woman sitting on couch with taking a...,6d555943be4fbc21ff92417c6f582298,/r/AmIhotAF/comments/105styc/low_key_still_ban...,https://i.redd.it/aiaxxoz9uoaa1.jpg,105styc.jpg,data/image/105styc.jpg,SexyDiffusion,True,True,True,[]
369,105styc,AmIhotAF,Gizzygirl127,Low key… still bangable?,smiling woman sitting on couch with remote con...,6d555943be4fbc21ff92417c6f582298,/r/AmIhotAF/comments/105styc/low_key_still_ban...,https://i.redd.it/aiaxxoz9uoaa1.jpg,105styc.jpg,data/image/105styc.jpg,SexyDiffusion,True,False,False,[]
551,105styc,AmIhotAF,Gizzygirl127,Low key… still bangable?,smiling woman sitting on couch with remote con...,6d555943be4fbc21ff92417c6f582298,/r/AmIhotAF/comments/105styc/low_key_still_ban...,https://i.redd.it/aiaxxoz9uoaa1.jpg,105styc.jpg,data/image/105styc.jpg,SexyDiffusion,True,False,False,[]
733,105styc,AmIhotAF,Gizzygirl127,Low key… still bangable?,smiling woman sitting on couch with remote con...,6d555943be4fbc21ff92417c6f582298,/r/AmIhotAF/comments/105styc/low_key_still_ban...,https://i.redd.it/aiaxxoz9uoaa1.jpg,105styc.jpg,data/image/105styc.jpg,SexyDiffusion,True,False,False,[]
