In [56]:
import torch
import clip
from PIL import Image
import os
from os.path import join, isdir, expanduser
from tqdm import tqdm

# Root directory that holds the datasets used in this notebook.
# Adapt this (and the sub-paths below) to the local machine.
data_path = expanduser('~/datasets')

# Path components relative to data_path; they are unpacked into os.path.join
# wherever a full path is needed.
raw_image_path = ["share", "raw_images"]
construction_path = ["share", "construction"]
finished_path = ["share", "finished"]

# os.listdir returns entries in arbitrary order; sort them so that slices such
# as image_files[:N] select the same files on every run.
image_files = sorted(os.listdir(join(data_path, *raw_image_path)))


# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# clip.load returns the model together with its image preprocessing callable.
model, preprocess = clip.load("ViT-B/32", device=device)


In [None]:
# Test CLIP performance for keyword sentences with already correctly separated images

num_images = 1000
size = (256,256)  # NOTE(review): not used anywhere in this cell

classes=["building in construction", "finished building"]

# Load the inputs. Image.open is lazy and keeps the file open until the pixels
# are read, so opening everything up front would hold up to num_images file
# handles at once. Decode each image and close its file immediately instead.
raw_image_batch = []
for f in tqdm(image_files[:num_images], desc="Raw Images loading"):
    if not f.endswith('.jpg'):
        continue
    with Image.open(join(data_path, *raw_image_path, f)) as img:
        # copy() forces the pixel data to be read and returns an in-memory
        # image that stays valid after the file is closed.
        raw_image_batch.append(img.copy())

images = raw_image_batch

pp_images = [preprocess(img) for img in images]

# The prompts are identical for every image, so tokenize and encode them once
# instead of once per loop iteration.
text_inputs = torch.cat([clip.tokenize(f"a image of a {c}")
                         for c in classes]).to(device)
with torch.no_grad():
    text_features = model.encode_text(text_inputs)
    text_features /= text_features.norm(dim=-1, keepdim=True)

for img in pp_images:
    # Add the batch dimension expected by the image encoder.
    image_input = img.unsqueeze(0).to(device)

    # Calculate the normalised image features
    with torch.no_grad():
        image_features = model.encode_image(image_input)
        image_features /= image_features.norm(dim=-1, keepdim=True)

    # Rank both labels by their softmax-scaled cosine similarity to the image
    similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
    values, indices = similarity[0].topk(len(classes))

    # Print the result
    print("\nTop predictions:\n")
    for value, index in zip(values, indices):
        print(f"{classes[index]}: {100 * value.item():.2f}%")


In [None]:
import requests
import pandas as pd
from io import BytesIO
from PIL import UnidentifiedImageError

def save_file(path, filename, content=None):
    """Write bytes to ``path/filename`` without overwriting an existing file.

    If the target already exists, "0" is prepended to the file name until an
    unused name is found (``7.jpg`` -> ``07.jpg`` -> ``007.jpg`` ...).

    Args:
        path: Directory to write into.
        filename: Preferred file name inside ``path``.
        content: Bytes to write. When omitted, ``response.content`` of the
            module-level ``response`` object is used, which is what the
            two-argument call sites rely on.

    Raises:
        NameError: If ``content`` is omitted and no global ``response`` exists.
        OSError: If the file cannot be created or written.
    """
    if content is None:
        # Legacy behaviour: use the global `response` set by the calling cell.
        # Resolving it before opening the file avoids leaving an empty file
        # behind when `response` is missing.
        content = response.content

    # Prefix the name until it no longer collides with an existing entry.
    target = join(path, filename)
    while os.path.exists(target):
        filename = "0" + filename
        target = join(path, filename)

    # The context manager closes the file even if the write fails.
    with open(target, "wb") as file:
        file.write(content)


classes=["skyscraper in construction", "finished skyscraper", "construction site"]

# Alternative dataset (swap in to use it):
# data = pd.read_csv(join(data_path, "share", "buildingconstruction.tsv"),
#                    sep="\t", on_bad_lines="skip")
# data = data.rename(columns={
#                    'http://farm4.staticflickr.com/3055/2330466409_fc8133ec39.jpg': 'image_url'})

# on_bad_lines="skip" replaces the deprecated error_bad_lines=False
# (removed in pandas 2.x); malformed rows are still dropped.
data = pd.read_csv(join(data_path, "share", "skyscraper.tsv"),
                   sep="\t", on_bad_lines="skip")
# The header of the URL column is itself a URL; give it a usable name.
# NOTE(review): this suggests the file has no header row, in which case the
# first record is consumed as the header and never classified - confirm.
data = data.rename(columns={
                   'http://farm3.staticflickr.com/2384/3543591719_b5f2cf8c98.jpg': 'image_url'})
data = data['image_url']

# The prompts are identical for every image, so tokenize and encode them once
# instead of once per downloaded image.
text_inputs = torch.cat([clip.tokenize(f"a image of a {c}")
                         for c in classes]).to(device)
with torch.no_grad():
    text_features = model.encode_text(text_inputs)
    text_features /= text_features.norm(dim=-1, keepdim=True)

# Position of the label whose top-1 images are kept.
finished_index = classes.index("finished skyscraper")

for i, url in enumerate(data.tolist()[0:10000]):
    # Download the image. A timeout keeps one dead host from stalling the run,
    # and any request failure (bad URL, connection error, timeout, HTTP error
    # status) skips this URL instead of aborting the whole loop.
    # `response` stays a module-level name because save_file reads it.
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as err:
        print(err)
        continue

    # Decode and preprocess. Non-image payloads raise UnidentifiedImageError on
    # open; truncated files raise OSError once the pixels are actually read.
    try:
        img = Image.open(BytesIO(response.content))
        pp_image = preprocess(img)
    except (UnidentifiedImageError, OSError):
        continue

    image_input = pp_image.unsqueeze(0).to(device)

    # Calculate the normalised image features
    with torch.no_grad():
        image_features = model.encode_image(image_input)
        image_features /= image_features.norm(dim=-1, keepdim=True)

    # Pick the top 2 most similar labels for the image
    similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
    values, indices = similarity[0].topk(2)

    # save image if it is more likely to be finished
    if indices[0] == finished_index:
        # save_file(join(data_path, *construction_path), f"{i}.jpg")
        save_file(join(data_path, *finished_path), f"{i}.jpg")

        # Print the result
        print("\nTop predictions:\n")
        for value, index in zip(values, indices):
            print(f"{classes[index]}: {100 * value.item():.2f}%")




  data = pd.read_csv(join(data_path, "share", "skyscraper.tsv"),
  data = pd.read_csv(join(data_path, "share", "skyscraper.tsv"),



Top predictions:

finished skyscraper: 78.81%
skyscraper in construction: 20.89%

Top predictions:

finished skyscraper: 84.33%
skyscraper in construction: 15.36%

Top predictions:

finished skyscraper: 49.44%
skyscraper in construction: 49.44%

Top predictions:

finished skyscraper: 84.77%
skyscraper in construction: 15.20%

Top predictions:

finished skyscraper: 73.97%
skyscraper in construction: 25.17%

Top predictions:

finished skyscraper: 58.45%
skyscraper in construction: 41.43%

Top predictions:

finished skyscraper: 70.12%
skyscraper in construction: 29.69%


In [15]:
# Show the model names that clip provides (e.g. the "ViT-B/32" passed to clip.load).
clip.available_models()


['RN50',
 'RN101',
 'RN50x4',
 'RN50x16',
 'RN50x64',
 'ViT-B/32',
 'ViT-B/16',
 'ViT-L/14',
 'ViT-L/14@336px']

In [35]:
from PIL import Image
import requests
from io import BytesIO
# Download one sample image and store it as 1.jpg in the construction folder.
url = 'https://farm4.staticflickr.com/3055/2330466409_fc8133ec39.jpg'
# The timeout prevents the cell from hanging on an unresponsive host.
response = requests.get(url, timeout=10)
# Fail loudly on an HTTP error instead of saving an error page as a .jpg.
response.raise_for_status()
# The context manager closes the file even if the write fails.
with open(join(data_path, *construction_path, "1.jpg"), "wb") as file:
    file.write(response.content)

# img = Image.open(BytesIO(response.content))
# img

In [36]:
# Print every entry of the construction folder, one per line.
# NOTE: this rebinds `image_files`, which the setup cell fills with the raw-image
# listing; re-run the setup cell before classifying raw images again.
construction_dir = join(data_path, *construction_path)
image_files = os.listdir(construction_dir)
for entry in image_files:
    print(entry)

1.jpg
