In [0]:
import numpy as np
import pandas as pd
from sklearn.utils.multiclass import unique_labels
import urllib.request
from progressbar import *
import socket
import os

# Reading data

[FaceScrub dataset link](http://vintage.winklerbros.net/facescrub.html "FaceScrub")   

<img align="left" width="580" height="200" src="http://vintage.winklerbros.net/Images/facescrub.jpg">   

In [0]:
f = open(base_dir + "/faceScrub/facescrub_actors.txt", 'r')
actors = []
for line in f:
    actor_info = line.split('\t')
    actors.append([actor_info[0], actor_info[3]])
actors = np.array(actors)
f.close()

f = open(base_dir + "/faceScrub/facescrub_actresses.txt", 'r')
actresses = []
for line in f:
    actresses_info = line.split('\t')
    actresses.append([actresses_info[0], actresses_info[3]])
actresses = np.array(actresses)
f.close()

# Example of data

In [0]:
pd.DataFrame(data=actors[1:11], columns=actors[0])

Unnamed: 0,name,url
0,Aaron Eckhart,http://upload.wikimedia.org/wikipedia/commons/...
1,Aaron Eckhart,http://movies.dosthana.com/sites/default/files...
2,Aaron Eckhart,http://upload.wikimedia.org/wikipedia/commons/...
3,Aaron Eckhart,http://25.media.tumblr.com/nJ2vga5sae9o2ks4Flt...
4,Aaron Eckhart,http://upload.wikimedia.org/wikipedia/commons/...
5,Aaron Eckhart,http://media.zenfs.com/en_us/Movies/PhotoG/2nd...
6,Aaron Eckhart,http://img2.timeinc.net/people/i/2008/news/080...
7,Aaron Eckhart,http://latimesblogs.latimes.com/photos/uncateg...
8,Aaron Eckhart,http://collider.com/wp-content/uploads/Aaron-E...
9,Aaron Eckhart,http://movies.dosthana.com/sites/default/files...


# Dataset info

In [0]:
names_actors = unique_labels(actors[1:,0])
names_actresses = unique_labels(actresses[1:,0])
names_all = np.append(names_actors, names_actresses)
names_all = np.array(names_all)

np.savez(base_dir + "/faceScrub/all_names", names_all=names_all)

print("Num images: ", len(actors) + len(actresses) - 2)
print("Num images with actors: ", len(actors) - 1)
print("Num images with actresses: ", len(actresses) - 1)
print("Num actors:", len(names_actors))
print("Num actresses:", len(names_actresses))

Num images:  106863
Num images with actors:  55306
Num images with actresses:  51557
Num actors: 265
Num actresses: 265


# Download images

In [0]:
base_dir = "/content/drive/My Drive/Colab/Roonyx/Face recognition"
folder_for_actors = base_dir + "/faceScrub/actors_images"
!mkdir '$folder_for_actors'
folder_for_actresses = base_dir + "/faceScrub/actresses_images"
!mkdir '$folder_for_actresses'

In [0]:
def download(index_start, index_stop, sex='male'):

  def url_is_alive(url):
    try:
        urllib.request.urlopen(url)
        return True
    except Exception:
        return False

  if sex == "male":
    peoples = actors
  elif sex == "female":
    peoples = actresses
  else: 
    return None

  pbar = ProgressBar(maxval=index_stop-index_start)
  pbar.start()
  socket.setdefaulttimeout(5)

  # Download images
  progress = 0
  for image_id in range(index_start,index_stop,1):
    href = peoples[image_id,1]
    if(url_is_alive(href)):
      try:
        if sex == 'female':
          urllib.request.urlretrieve(href, folder_for_actresses+"/{}.jpg".format(image_id + len(actors)-1))
        else:  urllib.request.urlretrieve(href, folder_for_actors+"/{}.jpg".format(image_id))
      except Exception: pass
    pbar.update(progress)
    progress+=1
        
  pbar.finish()

In [5]:
download(1,10000,"male")

100% (9999 of 9999) |####################| Elapsed Time: 3:57:58 Time:  3:57:58


In [0]:
download(10000,20000,"male")

100% (10000 of 10000) |##################| Elapsed Time: 1:42:45 Time:  1:42:45


In [0]:
download(20000,30000,"male")

100% (10000 of 10000) |##################| Elapsed Time: 1:47:03 Time:  1:47:03


In [0]:
download(30000,40000,"male")

100% (10000 of 10000) |##################| Elapsed Time: 1:43:21 Time:  1:43:21


In [0]:
download(40000,55306,"male")

100% (15306 of 15306) |##################| Elapsed Time: 4:41:44 Time:  4:41:44


In [0]:
download(1,10000,"female")

100% (9999 of 9999) |####################| Elapsed Time: 2:13:33 Time:  2:13:33


In [0]:
download(10000,20000,"female")

100% (10000 of 10000) |##################| Elapsed Time: 1:58:13 Time:  1:58:13


In [0]:
download(20000,30000,"female")

100% (10000 of 10000) |##################| Elapsed Time: 1:38:27 Time:  1:38:27


In [0]:
download(30000,40000,"female")

 53% (5393 of 10000) |##########         | Elapsed Time: 2:05:20 ETA:   1:54:23

In [0]:
download(40000,51557,"female")

 84% (9716 of 11557) |###############    | Elapsed Time: 1:58:05 ETA:   0:31:06

# Save metadata

In [0]:
folder_for_metadata = base_dir + "/faceScrub/metadata"
!mkdir '$folder_for_metadata'

In [0]:
actors_files_names = os.listdir(folder_for_actors)
actresses_files_names = os.listdir(folder_for_actresses)

for i in range(0,len(actors_files_names),1):
  index_dot = actors_files_names[i].find('.')
  actors_files_names[i] = int(actors_files_names[i][:index_dot])

for i in range(0,len(actresses_files_names),1):
  index_dot = actresses_files_names[i].find('.')
  actresses_files_names[i] = int(actresses_files_names[i][:index_dot])

print(actors_files_names)
print(actresses_files_names)

In [74]:
count1 = len(os.listdir(folder_for_actors))
count2 = len(os.listdir(folder_for_actresses))
print("Num downloaded images:", count1+count2)

Num downloaded images: 53865
