In [0]:
import numpy as np
import pandas as pd
from sklearn.utils.multiclass import unique_labels
import urllib.request
from progressbar import *
import socket

# Reading data

[FaceScrub dataset link](http://vintage.winklerbros.net/facescrub.html "FaceScrub")   

<img align="left" width="580" height="200" src="http://vintage.winklerbros.net/Images/facescrub.jpg">   

In [0]:
base_dir = "/content/drive/My Drive/Colab/Roonyx/Face recognition"
folder_for_images = base_dir + "/faceScrub/images"
# !mkdir '$folder_for_images'

f = open(base_dir + "/faceScrub/facescrub_actors.txt", 'r')
actors = []
for line in f:
    actor_info = line.split('\t')
    actors.append([actor_info[0], actor_info[3]])
actors = np.array(actors)
f.close()

f = open(base_dir + "/faceScrub/facescrub_actresses.txt", 'r')
actresses = []
for line in f:
    actresses_info = line.split('\t')
    actresses.append([actresses_info[0], actresses_info[3]])
actresses = np.array(actresses)
f.close()

# Example of data

In [0]:
pd.DataFrame(data=actors[1:11], columns=actors[0])

Unnamed: 0,name,url
0,Aaron Eckhart,http://upload.wikimedia.org/wikipedia/commons/...
1,Aaron Eckhart,http://movies.dosthana.com/sites/default/files...
2,Aaron Eckhart,http://upload.wikimedia.org/wikipedia/commons/...
3,Aaron Eckhart,http://25.media.tumblr.com/nJ2vga5sae9o2ks4Flt...
4,Aaron Eckhart,http://upload.wikimedia.org/wikipedia/commons/...
5,Aaron Eckhart,http://media.zenfs.com/en_us/Movies/PhotoG/2nd...
6,Aaron Eckhart,http://img2.timeinc.net/people/i/2008/news/080...
7,Aaron Eckhart,http://latimesblogs.latimes.com/photos/uncateg...
8,Aaron Eckhart,http://collider.com/wp-content/uploads/Aaron-E...
9,Aaron Eckhart,http://movies.dosthana.com/sites/default/files...


# Dataset info

In [0]:
names_actors = unique_labels(actors[1:,0])
names_actresses = unique_labels(actresses[1:,0])
names_all = np.append(names_actors, names_actresses)
names_all = np.array(names_all)

np.savez(base_dir + "/faceScrub/all_names", names_all=names_all)

print("Num images: ", len(actors) + len(actresses) - 2)
print("Num images with actors: ", len(actors) - 1)
print("Num images with actresses: ", len(actresses) - 1)
print("Num actors:", len(names_actors))
print("Num actresses:", len(names_actresses))

Num images:  106863
Num images with actors:  55306
Num images with actresses:  51557
Num actors: 265
Num actresses: 265


# Download images

In [0]:
folder_for_metadata = base_dir + "/faceScrub/metadata"
# !mkdir '$folder_for_metadata'

def download(index_start, index_stop, sex='male'):

  def url_is_alive(url):
    try:
        urllib.request.urlopen(url)
        return True
    except Exception:
        return False

  if sex == "male":
    peoples = actors
  elif sex == "male":
    peoples = actresses
  else: 
    return None

  data_info = []
  error_indices = []
  pbar = ProgressBar(maxval=index_stop-index_start)
  pbar.start()
  socket.setdefaulttimeout(5)

  # Download actors
  progress = 0
  for image_id in range(index_start,index_stop,1):
    href = peoples[image_id,1]
    if(url_is_alive(href)):
      try:
        if sex == 'female':
          urllib.request.urlretrieve(href, folder_for_images+"/{}.jpg".format(image_id + len(actors)-1))
        else:  urllib.request.urlretrieve(href, folder_for_images+"/{}.jpg".format(image_id))
        people_id = np.where(names_all == peoples[image_id,0])[0][0]
        name = peoples[image_id,0]
        data_info.append([image_id, people_id, name, sex])
      except Exception:
        error_indices.append(image_id)
    else: error_indices.append(image_id)
    pbar.update(progress)
    progress+=1
        
  pbar.finish()

  data_info = np.array(data_info)
  error_indices = np.array(error_indices)
  np.savez(folder_for_metadata + "/metadata_{}_{}_{}".format(index_start, index_stop, sex), data_info=data_info)
  np.savez(folder_for_metadata + "/error_indices_{}_{}_{}".format(index_start, index_stop, sex), error_indices=error_indices)

In [0]:
download(1,10000,"male")

 58% (5823 of 9999) |###########         | Elapsed Time: 2:18:20 ETA:   1:32:41

In [0]:
download(10000,20000,"male")

100% (10000 of 10000) |##################| Elapsed Time: 1:42:45 Time:  1:42:45


In [0]:
download(20000,30000,"male")

100% (10000 of 10000) |##################| Elapsed Time: 1:47:03 Time:  1:47:03


In [0]:
download(30000,40000,"male")

100% (10000 of 10000) |##################| Elapsed Time: 1:43:21 Time:  1:43:21


In [0]:
download(40000,55306,"male")

 39% (6066 of 15306) |#######            | Elapsed Time: 1:53:26 ETA:   2:02:53

In [0]:
download(1,10000,"female")

In [0]:
download(10000,20000,"female")

In [0]:
download(20000,30000,"female")

In [0]:
download(30000,40000,"female")

In [0]:
download(40000,51557,"female")