In [1]:
# Sync both S3 buckets into local folders under /home/ubuntu/data.
# Requires the AWS CLI and credentials with read access to the two buckets.
!aws s3 sync s3://visualneurons.com-logos-outputs /home/ubuntu/data/visualneurons.com-logos-outputs
!aws s3 sync s3://visualneurons.com-logos /home/ubuntu/data/visualneurons.com-logos

In [2]:
import json
import os
import shlex

import pandas as pd
from fastcore.all import Path

# Local root that holds the synced copies of both buckets.
data_path = Path("/home/ubuntu/data")
# Local copy of the `visualneurons.com-logos` bucket.
input_path = data_path / "visualneurons.com-logos"
# Local copy of the `visualneurons.com-logos-outputs` bucket.
output_path = data_path / "visualneurons.com-logos-outputs"

# Substrings that mark an entry for exclusion (used by is_one_of_us and is_ok_dir);
# presumably the team's own accounts — confirm.
filter_users_out = ["pochetti", "lgvaz", "vzz", "lucas"]
# Additional substrings that is_ok_dir rejects.
filter_dirs_out = ["zipped", "tests", "sagemaker"]

In [3]:
def is_one_of_us(user):
    """Tell whether `user` contains any of the `filter_users_out` substrings.

    Args:
        user: String to inspect (callers pass a stringified path).

    Returns:
        True when at least one entry of `filter_users_out` occurs inside
        `user`, otherwise False.
    """
    return any(marker in user for marker in filter_users_out)

def is_ok_dir(dir_name):
    """Tell whether `dir_name` is free of every excluded substring.

    Args:
        dir_name: String to inspect (callers pass a stringified path).

    Returns:
        False when any entry of `filter_dirs_out` or `filter_users_out`
        occurs inside `dir_name`, otherwise True.
    """
    blocked = filter_dirs_out + filter_users_out
    return not any(token in dir_name for token in blocked)

def list_files_recursive(directory):
    """Collect every file found anywhere beneath `directory`.

    Args:
        directory: Root folder handed to `os.walk`.

    Returns:
        List of `Path` objects, one per file, in the order `os.walk`
        yields them. Folders themselves are not included.
    """
    return [
        Path(os.path.join(root, name))
        for root, _subdirs, names in os.walk(directory)
        for name in names
    ]

In [4]:
# Entries of the upload bucket, minus those that match an excluded user substring.
total_users = list(filter(lambda entry: not is_one_of_us(str(entry)), input_path.ls()))
# Entries of the outputs bucket, minus excluded folders and excluded users.
user_activity = list(filter(lambda entry: is_ok_dir(str(entry)), output_path.ls()))

len(total_users), len(user_activity)

(33, 33)

In [5]:
# Build `input_ims`: one row per file found under the retained upload folders.
files = []
for user in total_users:
    files += list_files_recursive(user)

input_ims = pd.DataFrame(files, columns=["path"])
input_ims["fname"] = input_ims["path"].apply(lambda x: x.name)
# Text before the first "." of the file name; the same rule is applied to the
# output-bucket frame so the two can be joined on it.
input_ims["fname_id"] = input_ims["fname"].apply(lambda x: x.split(".")[0])
# S3 URI of the file: its path relative to the local sync root, behind "s3://".
# Derived from `data_path` so it stays correct if the sync root changes.
input_ims["img_id"] = input_ims["path"].apply(lambda x: "s3://" + x.relative_to(data_path).as_posix())
# The folder directly below `input_path` encodes the user's email with "_at_" for "@".
input_ims["email"] = input_ims["path"].apply(lambda x: x.relative_to(input_path).parts[0].replace("_at_", "@"))
input_ims.head()

Unnamed: 0,path,fname,fname_id,img_id,email
0,/home/ubuntu/data/visualneurons.com-logos/tom....,foo.png,foo,s3://visualneurons.com-logos/tom.godden_at_gma...,tom.godden@gmail.com
1,/home/ubuntu/data/visualneurons.com-logos/mmcd...,Carslogo.png,Carslogo,s3://visualneurons.com-logos/mmcdonald_at_cars...,mmcdonald@cars.com
2,/home/ubuntu/data/visualneurons.com-logos/just...,FB_IMG_1683855619095.jpg,FB_IMG_1683855619095,s3://visualneurons.com-logos/justinrmarks_at_g...,justinrmarks@gmail.com
3,/home/ubuntu/data/visualneurons.com-logos/chee...,Screenshot_2023_05_03_at_22.18.49.png,Screenshot_2023_05_03_at_22,s3://visualneurons.com-logos/cheema8raman_at_g...,cheema8raman@gmail.com
4,/home/ubuntu/data/visualneurons.com-logos/fran...,Entrega_AI_10.png,Entrega_AI_10,s3://visualneurons.com-logos/franco.medinna_at...,franco.medinna@gmail.com


In [6]:
def extract_time(s):
    """Parse the timestamp components embedded in a row's path.

    The path string is split on "/"; the components from index 6 up to (but
    excluding) the last two are read as year, month, day, hour, minute and
    optionally second.

    Args:
        s: Row-like object exposing a `path` attribute.

    Returns:
        Tuple of six ints `(year, month, day, hour, minute, second)`. The
        second is taken from the path only when exactly six components are
        present; otherwise it is 0.

    Raises:
        IndexError: Fewer than five components are available.
        ValueError: A component is not an integer literal.
    """
    parts = str(s.path).split("/")[6:-2]
    fields = [parts[position] for position in range(5)]
    fields.append(parts[5] if len(parts) == 6 else "0")
    return tuple(int(field) for field in fields)

def count_files(s):
    """Count the zip, jpeg and json files stored next to a row's file.

    Args:
        s: Row-like object whose `path.parent.ls()` lists the entries of the
            folder containing the row's file.

    Returns:
        Tuple `(n_zip_files, n_jpeg_files, n_json_files)`.
    """
    siblings = s.path.parent.ls()
    # Compare the actual extension rather than searching the whole path: a
    # substring test also matches folder names (e.g. a user folder such as
    # "john.zipper_at_...") and would then count every file in it as a zip.
    extensions = [os.path.splitext(str(f))[1] for f in siblings]
    return extensions.count(".zip"), extensions.count(".jpeg"), extensions.count(".json")

# Build `df`: one row per file found under the retained output-bucket folders.
files = []
for user in user_activity:
    files += list_files_recursive(user)

df = pd.DataFrame(files, columns=["path"])
df["fname"] = df["path"].apply(lambda x: x.name)
# Text before the first "." of the file name (so "a.b.png" yields "a").
df["fname_id"] = df["fname"].apply(lambda x: x.split(".")[0])
# NOTE(review): index 5 is the folder directly below output_path only while the
# sync root is three levels deep ("/home/ubuntu/data") — revisit if it moves.
df["email"] = df["path"].apply(lambda x: str(x).split("/")[5].replace("_at_", "@"))
# Timestamp components parsed from the folder names in the path (see extract_time).
df[["year", "month", "day", "hour", "minute", "second"]] = df.apply(extract_time, axis=1, result_type="expand")
df["extension"] = df["fname"].apply(lambda x: x.split(".")[-1])
# NOTE(review): substring test on the whole path — "input" anywhere in it
# (folder, user or file name) classifies the row as an input.
df["input_output"] = df["path"].apply(lambda x: "input" if "input" in str(x) else "output")
# zip / jpeg / json counts for the folder containing each file (see count_files).
df[["n_zip_files", "n_jpeg_files", "n_json_files"]] = df.apply(count_files, axis=1, result_type="expand")
# Parent folder with every "input" replaced by "output"; paths without "input"
# are left unchanged. Used later to pair input rows with output rows.
df["output_parent"] = df["path"].apply(lambda x: str(x.parent).replace("input", "output"))
# Keep rows whose month component is greater than 4.
# NOTE(review): the year is not considered by this filter — confirm that is intended.
df = df[df.month>4]

df.sample(6)

Unnamed: 0,path,fname,fname_id,email,year,month,day,hour,minute,second,extension,input_output,n_zip_files,n_jpeg_files,n_json_files,output_parent
932,/home/ubuntu/data/visualneurons.com-logos-outp...,94c928e69ca54da08fa0d34b38a4fa0c.jpeg,94c928e69ca54da08fa0d34b38a4fa0c,claus.verner@gmail.com,2023,5,2,7,59,0,jpeg,output,0,44,0,/home/ubuntu/data/visualneurons.com-logos-outp...
1189,/home/ubuntu/data/visualneurons.com-logos-outp...,1debc7d7e7f747fd8fb3ce3677c6bfad.jpeg,1debc7d7e7f747fd8fb3ce3677c6bfad,adrienlebret@gmail.com,2023,5,3,7,7,0,jpeg,output,1,60,0,/home/ubuntu/data/visualneurons.com-logos-outp...
1182,/home/ubuntu/data/visualneurons.com-logos-outp...,329b85dcf1cb423895a3e21a92338ce8.jpeg,329b85dcf1cb423895a3e21a92338ce8,adrienlebret@gmail.com,2023,5,3,7,7,0,jpeg,output,1,60,0,/home/ubuntu/data/visualneurons.com-logos-outp...
1052,/home/ubuntu/data/visualneurons.com-logos-outp...,d3c0db6ba3f547b8b7f37c2265a3d2af.jpeg,d3c0db6ba3f547b8b7f37c2265a3d2af,josephidziorek@gmail.com,2023,5,6,1,52,0,jpeg,output,1,60,0,/home/ubuntu/data/visualneurons.com-logos-outp...
1252,/home/ubuntu/data/visualneurons.com-logos-outp...,37f3c33ce47d48099edb303dded588d3.jpeg,37f3c33ce47d48099edb303dded588d3,adrienlebret@gmail.com,2023,5,3,8,55,0,jpeg,output,1,60,0,/home/ubuntu/data/visualneurons.com-logos-outp...
1201,/home/ubuntu/data/visualneurons.com-logos-outp...,e01561f6d6b148cb8efdbee5988b807a.jpeg,e01561f6d6b148cb8efdbee5988b807a,adrienlebret@gmail.com,2023,5,3,7,7,0,jpeg,output,1,60,0,/home/ubuntu/data/visualneurons.com-logos-outp...


In [7]:
# Input rows, enriched with the S3 id of the matching upload (same file-name id and email).
_in = df[df["input_output"] == "input"]
_in = _in.merge(input_ims[["fname_id", "email", "img_id"]], on=["fname_id", "email"], how="left")

# An input has output when at least one output row lives in its `output_parent` folder.
outputs = df[df["input_output"] == "output"].output_parent.unique()
_in["has_output"] = _in["output_parent"].isin(outputs)
# Guard against the left merge fanning out rows (several uploads sharing fname_id + email).
assert len(_in.path.unique()) == len(_in)

_out = df[df["input_output"] == "output"]
subs = _in.groupby("email").path.count().to_frame("n_submissions").reset_index()
print(_out.shape)
# Inner join: output rows for emails without any input row are dropped here.
_out = _out.merge(subs, on="email")
print(_out.shape)
# An output row counts as successful when its folder holds exactly one zip file.
_out["successful_output"] = _out["n_zip_files"] == 1

# Collapse to one flag per output folder; `min` is True only if every row in the folder is True.
inter = _out.groupby(["output_parent"])[["successful_output"]].min().reset_index()
_in = _in.merge(inter, on="output_parent", how="left")
# Inputs without any output folder come back as NaN from the left merge: treat
# them as unsuccessful. Assign the result back instead of calling
# `fillna(..., inplace=True)` on the column accessor, which acts on a temporary
# object under pandas Copy-on-Write and would leave the NaNs in place.
_in["successful_output"] = _in["successful_output"].fillna(False).astype(bool)

(1349, 16)
(1349, 17)


In [8]:
# Headline numbers: input rows, distinct emails among them, and how many rows are flagged successful.
print(f"{len(_in)} unique submissions from {len(_in.email.unique())} users, of which {_in.successful_output.sum()} are successful")

44 unique submissions from 32 users, of which 21 are successful


In [9]:
# Per-email tally of submissions and successful submissions, busiest users first,
# plus the share of each user's submissions that succeeded.
s = (
    _in.groupby("email")
    .agg(
        n_submissions=("path", "count"),
        n_successful_submissions=("successful_output", "sum"),
    )
    .sort_values("n_submissions", ascending=False)
    .assign(success_rate=lambda t: t["n_successful_submissions"] / t["n_submissions"])
)
s

Unnamed: 0_level_0,n_submissions,n_successful_submissions,success_rate
email,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
sparsamu@amazon.com,8,4,0.5
adrienlebret@gmail.com,4,4,1.0
mmcdonald@cars.com,2,0,0.0
khaliladib11@gmail.com,2,0,0.0
a.shiban@gmail.com,1,0,0.0
tony.walasik@gmail.com,1,1,1.0
tom.godden@gmail.com,1,0,0.0
thesarthakrastogi@gmail.com,1,0,0.0
theo@unifai.fr,1,1,1.0
sjc@gmail.com,1,1,1.0


In [10]:
#_in.loc[_in.successful_output==False, "path"].to_list()

In [36]:
# Build one re-run command per unsuccessful submission and save them to unhappy.csv.
# NaN flags (inputs with no output folder at all) are treated as unsuccessful, so
# those rows are kept even if the flag column was never filled upstream.
failed = ~_in["successful_output"].fillna(False).astype(bool)
# `.copy()` so the column assignments below act on an independent frame.
unhappy = _in.loc[failed, ["email", "fname_id", "img_id", "output_parent"]].copy()
# Target location: the output folder renamed to "output_corrected", relative to the outputs bucket.
local_prefix = f"{output_path}/"
unhappy["s3_location"] = unhappy["output_parent"].apply(lambda x: str(x).replace("/output", "/output_corrected").replace(local_prefix, ""))
print(unhappy.shape)
# A user may have submitted the same file several times; keep only the last occurrence.
unhappy = unhappy.drop_duplicates(subset=["email", "fname_id"], keep="last")
print(unhappy.shape)
# Serialise with json.dumps so quotes or backslashes in user-supplied names
# cannot produce invalid JSON. Values go through str() to keep the all-strings
# payload shape; ensure_ascii=False leaves non-ASCII names readable.
unhappy["payload"] = [
    json.dumps({"email": str(email), "s3_location": str(location), "img_id": str(img_id)}, ensure_ascii=False)
    for email, location, img_id in zip(unhappy["email"], unhappy["s3_location"], unhappy["img_id"])
]
# shlex.quote wraps the payload in single quotes and escapes embedded ones, so
# a file name containing "'" cannot break out of the shell command.
unhappy["command"] = unhappy["payload"].apply(lambda x: f"(echo -n {shlex.quote(x)}) | curl -H 'Content-Type: application/json' -d @-  http://172.17.0.2:8080/invocations")
unhappy.to_csv("unhappy.csv", index=False)

(23, 5)
(20, 5)
