In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("cleanup_thumb.csv")

In [6]:
df.head()

Unnamed: 0.1,Unnamed: 0,id,image_url
0,7,MTM3Mjk5LTgzMjE2MDIyX3R3ZWV0XzE3NDgz,https://media.allure.com/photos/5a0ca6ff949532...
1,16,MTM3MzI2LTgzMTQ4MDI3X3R3ZWV0XzE3NDgz,https://assets.vogue.com/photos/5a0b1d2aa29362...
2,242,MTM3MzI5LTgzMDAxNzA1X3R3ZWV0XzE3NDgz,https://media.wmagazine.com/photos/5a03e76c931...
3,247,MTM3MzI5LTgyOTcxMzI0X3R3ZWV0XzA,https://media.wmagazine.com/photos/59f8f095dc2...
4,252,MTM3MzI2LTgyOTQ5NTY0X3R3ZWV0XzA,https://assets.vogue.com/photos/5891eb6223f988...


In [3]:
import boto
import boto3
conn = boto.connect_s3()
import requests

In [4]:
# Uses the creds in ~/.aws/credentials
s3 = boto3.resource('s3')
bucket_name_to_upload_image_to = 'thumbcleanup'

In [5]:
# Do this as a quick and easy check to make sure your S3 access is OK
for bucket in s3.buckets.all():
    if bucket.name == bucket_name_to_upload_image_to:
        print('Good to go. Found the bucket to upload the image into.')
        good_to_go = True

if not good_to_go:
    print('Not seeing your s3 bucket, might want to double check permissions in IAM')

Good to go. Found the bucket to upload the image into.


In [7]:
mapping_dict ={}
for i, img_url in enumerate(df["image_url"]):
    img_name = "thumbcleanup_%05d" % (i,)
    mapping_dict[img_name] = img_url
    
    if (img_url == np.nan) | (str(img_url) == "nan"):
        continue
    else:
        # Uses the creds in ~/.aws/credentials
        s3_image_filename = img_name
        internet_image_url = img_url

        # Given an Internet-accessible URL, download the image and upload it to S3,
        # without needing to persist the image to disk locally
        req_for_image = requests.get(internet_image_url, stream=True)
        file_object_from_req = req_for_image.raw
        req_data = file_object_from_req.read()

        # Do the actual upload to s3
        s3.Bucket(bucket_name_to_upload_image_to).put_object(Key=s3_image_filename, Body=req_data)

In [8]:
md_01 = pd.DataFrame(mapping_dict, index = range(0,len(mapping_dict)))
md_02 = pd.DataFrame(md_01.T[0])
pd.DataFrame(md_02).to_csv('../assets/thumbnail_cleanup.csv', index = range(0,len(mapping_dict)))

In [9]:
bucket_name = 'thumbcleanup'
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)
images = [img.key for img in bucket.objects.all()]
client = boto3.client('rekognition')

results_wide = []
results_long = []

for img in images:
    img_dict_wide = {'img': img}
    #print(img)
    try:
        labels = client.detect_labels(Image={'S3Object':{'Bucket':bucket_name,'Name':img}},MinConfidence=75)
        if 'Labels' in labels:
            for l, label in enumerate(labels['Labels']):
                results_long.append({'img': img, 'type': 'Label', 'label': label['Name'], 
                                     'confidence': label['Confidence']})
                col = 'label_' + str(l)
                img_dict_wide[col] = label['Name']
                img_dict_wide[col + '_confidence'] = label['Confidence'] 
    except:
        continue
    try:        
        celebrities = client.recognize_celebrities(Image={'S3Object':{'Bucket':bucket_name,'Name':img}})
        if 'CelebrityFaces' in celebrities:
            for f, face in enumerate(celebrities['CelebrityFaces']):
                results_long.append({'img': img, 'type': 'Celebrity', 'label': face['Name'], 
                                     'confidence': face['Face']['Confidence']})
                col = 'celeb_' + str(f)
                img_dict_wide[col] = face['Name']
                img_dict_wide[col + '_confidence'] = face['Face']['Confidence']
    except:
        continue
    try:
        text_in_image = client.detect_text(Image={'S3Object':{'Bucket':bucket_name,'Name':img}})
        if "TextDetections" in text_in_image:
            for w, word in enumerate(text_in_image["TextDetections"]):
                results_long.append({'img': img, 'type': "Text", 'label': word["DetectedText"],
                                    'confidence': word["Confidence"]})
                col = 'word_' + str(w)
                img_dict_wide[col] = word["DetectedText"]
                img_dict_wide[col+ '_confidence'] = word["Confidence"]
    except:
        continue
            
    if 'Labels' not in labels and 'CelebrityFaces' not in celebrities and "TextDetections" not in text_in_image:
        results_long.append({'img': img, 'type': None, 'label': None, 'confidence': None})
        
    results_wide.append(img_dict_wide)
####
####
img_df_long = pd.DataFrame(results_long, columns=['img', 'type', 'label', 'confidence'])
img_df_wide = pd.DataFrame(results_wide)
cols = sorted(img_df_wide.columns)
cols.remove('img')
img_df_wide = img_df_wide[['img'] + cols]

In [10]:
md_01 = pd.DataFrame(mapping_dict, index = range(0,len(mapping_dict)))
md_02 = pd.DataFrame(md_01.T[0])
pd.DataFrame(md_02).to_csv('../assets/thumb_cleanup.csv', index = range(0,len(mapping_dict)))

In [11]:
img_df_long

Unnamed: 0,img,type,label,confidence
0,thumbcleanup_00046,Label,Finger,95.046822
1,thumbcleanup_00046,Label,Hand,76.756683
2,thumbcleanup_00047,Label,Road,87.269890
3,thumbcleanup_00047,Text,.y,47.015461
4,thumbcleanup_00047,Text,ACE,85.307434
5,thumbcleanup_00047,Text,.y,47.015461
6,thumbcleanup_00047,Text,ACE,85.307434
7,thumbcleanup_00048,Label,Human,99.211685
8,thumbcleanup_00048,Label,People,99.211685
9,thumbcleanup_00048,Label,Person,99.211685


In [12]:
# for-loop to drop all "Human", "People", "Person" Label rows where the image 
# contains a celebrity.
for img in img_df_long["img"]:
    if len(img_df_long.loc[(img_df_long["img"] == img) & (img_df_long["type"]=="Celebrity")])>0:
        img_df_long = img_df_long.loc[~((img_df_long['img'] == img) 
                          & (img_df_long['label'].isin(['Human', 'People', 'Person'])))]

In [13]:
len(img_df_long)

261

In [14]:
# for-loop to drop all "Label" rows below 90% confidence if there is a celebrity
for img in img_df_long["img"]:
    if len(img_df_long.loc[(img_df_long["img"] == img) & (img_df_long["type"]=="Celebrity")])>0:
        img_df_long = img_df_long.loc[~((img_df_long['img'] == img) 
                          & (img_df_long['type'].isin(['Label'])) & (img_df_long['confidence']<90))]

In [15]:
len(img_df_long)

240

In [16]:
# for loop to drop all "Label", "Sticker", "Text" label rows where image contains text.
for img in img_df_long["img"]:
    if len(img_df_long.loc[(img_df_long["img"] == img) & (img_df_long["type"]=="Text")])>0:
        img_df_long = img_df_long.loc[~((img_df_long['img'] == img) 
                          & (img_df_long['label'].isin(['Label', 'Sticker', 'Text'])))]

In [17]:
len(img_df_long)

240

In [18]:
media_new = []
for img in img_df_long['img'].unique():
    img_dict = {'img': img}
    if len(img_df_long[(img_df_long['img']==img) & (img_df_long['type']=='Label')])>0:
        img_dict['label'] = ' '.join(img_df_long.loc[(img_df_long['img']==img) & (img_df_long['type']=='Label'), 'label'].tolist())
    else:
        img_dict['label'] = None
    if len(img_df_long[(img_df_long['img']==img) & (img_df_long['type']=='Text')])>0:
        text = [str(detected_text) 
                for detected_text in img_df_long.loc[(img_df_long['img']==img) & (img_df_long['type']=='Text'), 'label'].tolist()]
        img_dict['text'] = ' '.join(text)
    else:
        img_dict['text'] = None
    img_dict['celebrity'] = len(img_df_long[(img_df_long['img']==img) & (img_df_long['type']=='Celebrity')])>0
    media_new.append(img_dict)
media_new_df = pd.DataFrame(media_new)

In [19]:
media_new_df

Unnamed: 0,celebrity,img,label,text
0,False,thumbcleanup_00046,Finger Hand,
1,False,thumbcleanup_00047,Road,.y ACE .y ACE
2,False,thumbcleanup_00048,Human People Person Female Girl Woman,
3,True,thumbcleanup_00049,,
4,False,thumbcleanup_00050,,Aol. Log In wl our Aol a ount Aol. our wl In L...
5,True,thumbcleanup_00051,Flora Jar Plant Potted Plant Pottery Vase,T T
6,True,thumbcleanup_00052,Bikini Clothing Swimwear,
7,False,thumbcleanup_00053,Knot Apartment Building Building City High Ris...,H I H I
8,False,thumbcleanup_00054,Human People Person,lt lt
9,True,thumbcleanup_00055,Clothing Maillot Swimwear Bikini,


In [20]:
media_new_df.to_csv("01_thumb_cleanup_text_data.csv")