In [1]:
# S3 prefix
prefix = 'facerecogproj'

import sagemaker
from sagemaker import get_execution_role

sagemaker_session = sagemaker.Session()

# Get a SageMaker-compatible role used by this Notebook Instance.
role = get_execution_role()

In [2]:
# Change this to Lifecycle Configurations
!pip install --upgrade pip
!pip install facenet_pytorch

Collecting pip
  Using cached https://files.pythonhosted.org/packages/54/0c/d01aa759fdc501a58f431eb594a17495f15b88da142ce14b5845662c13f3/pip-20.0.2-py2.py3-none-any.whl
[31mfastai 1.0.60 requires nvidia-ml-py3, which is not installed.[0m
Installing collected packages: pip
  Found existing installation: pip 10.0.1
    Uninstalling pip-10.0.1:
      Successfully uninstalled pip-10.0.1
Successfully installed pip-20.0.2
[33mYou are using pip version 20.0.2, however version 20.1b1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
Collecting facenet_pytorch
  Downloading facenet_pytorch-2.2.9-py3-none-any.whl (1.9 MB)
[K     |████████████████████████████████| 1.9 MB 11.4 MB/s eta 0:00:01
Installing collected packages: facenet-pytorch
Successfully installed facenet-pytorch-2.2.9


In [3]:
from facenet_pytorch import InceptionResnetV1, MTCNN

## Download the images from bucket

In [4]:
import boto3
import botocore

BUCKET_NAME = 'images-facerecogproj'
s3_res = boto3.resource('s3')

In [9]:
import re
import boto3
import os

def get_s3_keys(bucket):
    """Get a list of keys in an S3 bucket."""
    keys = []
    s3 = boto3.client('s3')
    resp = s3.list_objects_v2(Bucket= bucket)
    for obj in resp['Contents']:
        keys.append(obj['Key'])
    return keys

def download_keys(keys, file, bucket, verbose = False):
    """ Download all the keys in a list to file"""
    create_dirs(keys, file)
    s3 = boto3.client('s3')
    for i,key in enumerate(keys):
        if verbose:
            print(key)
        try:
            # download as local file
            s3.download_file(bucket, key, os.path.join(file,key))
            
        except botocore.exceptions.ClientError as e:
            if e.response['Error']['Code'] == "404":
                print("The object does not exist.")
            else:
                raise
    return print("{} files were downloaded!".format(i))

def create_dirs(keys, file):
    """ Given a set of keys will create the needed files"""
    if not os.path.exists(file):
        os.mkdir(file)
        
    folders = [re.split("/", key)[:-1] for key in keys]
    unique_folders = [list(x) for x in set(tuple(x) for x in folders)]
    success = 0
    for folders in unique_folders:
        path = os.path.join(file,"/".join(folders))
        if not os.path.exists(path):
            os.makedirs(path)
            success += 1
    return print("{} Folders were created".format(success))

In [6]:
import cv2

def crop(input_img):
    '''
      :return:  a cropped face in numpy array format
    '''
    img = cv2.imread(input_img)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    mtcnn = MTCNN(select_largest=False, post_process = False, margin = 50)
    img_cropped = mtcnn(img)
    #simple message notifying if a face was detected or not
    if img_cropped is None:
        print(f'Face not detected in file {input_img}')
        return
    img_cropped = img_cropped.permute(1, 2, 0).int().numpy() 
    return img_cropped
  
def plot_crop_face(img_cropped, output_img):
    '''
    :params:
    :return:
    '''
    plt.imshow(img_cropped)  
    plt.axis('off')
    plt.savefig(output_img)

def cropFace(input_img, output_img = None):
    '''
    :params:
    :return:
    '''
    if output_img is None:
        return crop(input_img)
    else:
        img_cropped = crop(input_img)
        plot_crop_face(img_cropped,output_img)

def cropAllAux(file_dir, output_dir, keys = None):
    """
    Crop all the functions given a directory and save it locally or in a bucket

    Args:
       file_dir (str): Name of the directory that contains all images
       output_dir (str): Name of the directory or bucket were the images should be saved
       keys (json): Json with credential keys


    Returns:
       Print statement
    """
     # It creates the folder if it does not exist
    if not keys:
        os.makedirs(output_dir, exist_ok= True)
        
    for file in os.listdir(file_dir):
        if ".jpg" in file:
            user = re.sub("_.*$","",file)
            img_cropped = crop(os.path.join(file_dir,file))
            if img_cropped is not None:
                if not keys:
                    os.makedirs(os.path.join(output_dir,user), exist_ok= True)
                    #print("Saving file {file} in directory {out}".format(file = file, out = output_dir))
                    cv2.imwrite(os.path.join(output_dir,user,file),img_cropped)
                else:
                    tmp_file = "tmp_"+file
                    tmp_path = os.path.join(os.getcwd(),tmp_file)
                    cv2.imwrite(tmp_file, img_cropped) 
                    try:
                        # Uploading to the bucket
                        print("Saving file {file} in bucket {out}".format(file = file, out = output_dir))
                        uploadBucket(tmp_path, output_dir, file, keys = keys)
                        os.remove(tmp_path) 
                    except: 
                        print("error")        
    return print("Done!")

def cropAll(file_dir, output_dir, keys = None):
    """
    Crop all the functions given a directory and save it locally or in a bucket

    Args:
       file_dir (str): Name of the directory that contains all images
       output_dir (str): Name of the directory or bucket were the images should be saved
       keys (json): Json with credential keys


    Returns:
       Print statement
    """
     # It creates the folder if it does not exist
    for roots,dirs,files in os.walk(file_dir):
        cropAllAux(roots, output_dir, keys = None)
    return print("Done!")

In [7]:
keys = get_s3_keys(BUCKET_NAME)

In [10]:
download_keys(keys,"Images","images-facerecogproj")

0 Folders were created
170 files were downloaded!


In [11]:
cropAll("Images", "Face", keys = None)

Done!
Face not detected in file Images/Azucena/Azucena_6.jpg
Face not detected in file Images/Azucena/Azucena_36.jpg
Face not detected in file Images/Azucena/Azucena_8.jpg
Face not detected in file Images/Azucena/Azucena_56.jpg
Face not detected in file Images/Azucena/Azucena_16.jpg
Face not detected in file Images/Azucena/Azucena_14.jpg
Face not detected in file Images/Azucena/Azucena_33.jpg
Face not detected in file Images/Azucena/Azucena_15.jpg
Face not detected in file Images/Azucena/Azucena_9.jpg
Face not detected in file Images/Azucena/Azucena_34.jpg
Face not detected in file Images/Azucena/Azucena_17.jpg
Face not detected in file Images/Azucena/Azucena_18.jpg
Face not detected in file Images/Azucena/Azucena_55.jpg
Face not detected in file Images/Azucena/Azucena_25.jpg
Face not detected in file Images/Azucena/Azucena_23.jpg
Face not detected in file Images/Azucena/Azucena_54.jpg
Face not detected in file Images/Azucena/Azucena_35.jpg
Face not detected in file Images/Azucena/Azuc

## Ready to have some fun!

In [12]:
from facenet_pytorch import InceptionResnetV1, MTCNN
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import os
import argparse
import cv2
import torch
from PIL import Image
import torchvision.transforms as transforms
import re
import random 

In [13]:
resnet = InceptionResnetV1(pretrained='vggface2').eval()

Downloading parameters (1/2)
Downloading parameters (2/2)


In [14]:
def readFaces(file,model,tensor = False):
    face_dict = {}
    for roots,dirs,files in os.walk(file):
        emb_list = []
        for file in files:
            if '.jpg' in file:
                print(file)
                path = os.path.join(roots,file)
                img_emb = embeddings(path,model)
                if not tensor:
                    img_emb = img_emb.detach().numpy()
                    emb_list.append(img_emb)
        face_dict[re.sub("_.*$","",file)] = emb_list
    train, label = [], []
    for key, values in face_dict.items():
        for val in values:
            train.append(val)
            label.append(key)
    return train, label
        

def embeddings(file, model):
    img = Image.open(file).convert('RGB')
    img_tensor = transforms.functional.to_tensor(img)
    embedding = model(img_tensor.unsqueeze(0))[0]
    return embedding

In [17]:
train, label = readFaces("./Face/", resnet)

Azucena_12.jpg
Azucena_52.jpg
Azucena_59.jpg
Azucena_47.jpg
Azucena_64.jpg
Azucena_61.jpg
Azucena_26.jpg
Azucena_60.jpg
Azucena_11.jpg
Azucena_58.jpg
Azucena_62.jpg
Azucena_20.jpg
Azucena_44.jpg
Azucena_45.jpg
Azucena_7.jpg
Azucena_4.jpg
Azucena_27.jpg
Azucena_1.jpg
Azucena_22.jpg
Azucena_38.jpg
Azucena_42.jpg
Azucena_13.jpg
Azucena_50.jpg
Azucena_49.jpg
Azucena_31.jpg
Azucena_57.jpg
Azucena_40.jpg
Azucena_39.jpg
Azucena_43.jpg
Azucena_10.jpg
Azucena_29.jpg
Azucena_2.jpg
Azucena_51.jpg
Azucena_46.jpg
Azucena_65.jpg
Azucena_21.jpg
Azucena_41.jpg
Azucena_3.jpg
Azucena_37.jpg
Azucena_24.jpg
Azucena_19.jpg
Azucena_63.jpg
Azucena_5.jpg
Azucena_48.jpg
Azucena_30.jpg
Azucena_28.jpg
Iuliia_4.jpg
Iuliia_28.jpg
Iuliia_24.jpg
Iuliia_31.jpg
Iuliia_21.jpg
Iuliia_13.jpg
Iuliia_12.jpg
Iuliia_26.jpg
Iuliia_36.jpg
Iuliia_22.jpg
Iuliia_5.jpg
Iuliia_2.jpg
Iuliia_33.jpg
Iuliia_29.jpg
Iuliia_34.jpg
Iuliia_17.jpg
Iuliia_23.jpg
Iuliia_16.jpg
Iuliia_11.jpg
Iuliia_20.jpg
Iuliia_25.jpg
Iuliia_19.jpg
Iuliia_18.j

In [18]:
# Shuffling the lists
temp = list(zip(train, label)) 
random.shuffle(temp) 
train_rnd, label_rnd = zip(*temp) 
train_rnd, label_rnd = list(train_rnd), list(label_rnd)

In [19]:
#Splitting training and test
test_num = 10
testX, testY = train_rnd[-test_num:],label_rnd[-test_num:]
trainX, trainY = train_rnd[:-test_num],label_rnd[:-test_num]

In [103]:
# Fitting an SVM model
model = SVC(kernel='linear', probability=True)
model.fit(trainX, trainY)
model.predict_proba(testX)

array([[0.03097119, 0.94691604, 0.02211277],
       [0.95348396, 0.02845077, 0.01806527],
       [0.01996723, 0.95584316, 0.02418961],
       [0.03232885, 0.94615404, 0.0215171 ],
       [0.01174646, 0.00870606, 0.97954748],
       [0.0322914 , 0.95446479, 0.01324381],
       [0.94395577, 0.03784279, 0.01820143],
       [0.01137761, 0.00949141, 0.97913098],
       [0.94805945, 0.03152467, 0.02041588],
       [0.01648286, 0.00990969, 0.97360745]])

In [104]:
model.predict(testX)

array(['Iuliia', 'Azucena', 'Iuliia', 'Iuliia', 'Joaquin', 'Iuliia',
       'Azucena', 'Joaquin', 'Azucena', 'Joaquin'], dtype='<U7')

In [105]:
testY

['Iuliia',
 'Azucena',
 'Iuliia',
 'Iuliia',
 'Joaquin',
 'Iuliia',
 'Azucena',
 'Joaquin',
 'Azucena',
 'Joaquin']

In [108]:
import json

with open("new1.txt") as json_data:
    test = json.load(json_data)

In [109]:
import ast 

res = ast.literal_eval(test['body']) 

In [110]:
import numpy as np
model.predict(np.asarray(res).reshape(1,-1))

array(['Joaquin'], dtype='<U7')

In [111]:
model.predict_proba(np.asarray(res).reshape(1,-1))

array([[0.00995952, 0.00784139, 0.98219909]])

In [33]:
!wget https://upload.wikimedia.org/wikipedia/commons/c/c1/Lionel_Messi_20180626.jpg

--2020-04-26 03:49:37--  https://upload.wikimedia.org/wikipedia/commons/c/c1/Lionel_Messi_20180626.jpg
Resolving upload.wikimedia.org (upload.wikimedia.org)... 208.80.154.240, 2620:0:861:ed1a::2:b
Connecting to upload.wikimedia.org (upload.wikimedia.org)|208.80.154.240|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 578583 (565K) [image/jpeg]
Saving to: ‘Lionel_Messi_20180626.jpg’


2020-04-26 03:49:37 (29.1 MB/s) - ‘Lionel_Messi_20180626.jpg’ saved [578583/578583]



In [34]:
mtcnn = MTCNN(select_largest=False, post_process = False, margin = 50)

In [35]:
img_test = cv2.imread("Lionel_Messi_20180626.jpg")
img_test = cv2.cvtColor(img_test, cv2.COLOR_BGR2RGB)
img_test_cropped = mtcnn(img_test)
test_emb = resnet(img_test_cropped.unsqueeze(0))[0]
test_emb = test_emb.detach().numpy()

In [107]:
model.predict_proba(test_emb.reshape(1,-1))

array([[0.20291716, 0.04471621, 0.75236663]])

## Upload the data

In [71]:
import numpy as np
import os
from sklearn import datasets
import pickle as pkl 

# Load data and join it in a dataframe
data = {'data': trainX, "label": trainY}

# Create directory and write csv
os.makedirs('./data', exist_ok=True)
#data.to_csv("./data/data.csv", index = False)

with open('./data/data.pickle', 'wb') as handle:
    pkl.dump(data, handle, protocol=pkl.HIGHEST_PROTOCOL)

In [72]:
with open(os.path.join("./data", "data.pickle"), 'rb') as handle:
    data = pkl.load(handle)

In [73]:
import argparse
import pandas as pd
import os
import pickle as pkl 

from sklearn.svm import SVC
from sklearn.externals import joblib


if __name__ == '__main__':
    
    parser = argparse.ArgumentParser()

    # Hyperparameters are described here. In this simple example we are just including one hyperparameter.
    parser.add_argument('--C', type=float, default= 1)
    parser.add_argument('--kernel', type=str, default='linear')
    parser.add_argument('--gamma', type= str, default='scale')
    parser.add_argument('--probability', type= bool, default= True)

    

    # Sagemaker specific arguments. Defaults are set in the environment variables.
    parser.add_argument('--output-data-dir', type=str, default=os.environ['SM_OUTPUT_DATA_DIR'])
    parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'])
    parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAIN'])

    args, unknown = parser.parse_known_args()

    # Take the set of files and read them all into a single pandas dataframe
    with open(os.path.join(args.train, "data.pickle"), 'rb') as handle:
        data = pkl.load(handle)

    # labels are in the first column
    train_y = data['label']
    train_X = data['data']

    # Now use scikit-learn's NN to train the model.
    model = model = SVC(C = args.C,
                        kernel= args.kernel,
                        gamma = args.gamma,
                        probability= args.probability
                       )


    
    model = model.fit(train_X, train_y)

    # Print the coefficients of the trained classifier, and save the coefficients
    joblib.dump(model, os.path.join(args.model_dir, "model.joblib"))
    
def model_fn(model_dir):
    """Deserialized and return fitted model

    Note that this should have the same name as the serialized model in the main method
    """
    model = joblib.load(os.path.join(model_dir, "model.joblib"))
    return model

KeyError: 'SM_OUTPUT_DATA_DIR'

In [78]:
from sagemaker.sklearn.estimator import SKLearn

script_path = './model/model.py'

sklearn = SKLearn(
    entry_point=script_path,
    train_instance_type="ml.m4.xlarge",
    role=role,
    sagemaker_session=sagemaker_session
    #hyperparameters={'max_leaf_nodes': 30}
)

In [79]:
import sagemaker
from sagemaker import get_execution_role

sagemaker_session = sagemaker.Session()
role=sagemaker.get_execution_role()
inputs = sagemaker_session.upload_data(path='data', key_prefix='data/data.pickle', bucket='video-facerecogproj')


In [80]:
sklearn.fit({'train': inputs})

2020-04-26 04:47:36 Starting - Starting the training job...
2020-04-26 04:47:38 Starting - Launching requested ML instances.........
2020-04-26 04:49:08 Starting - Preparing the instances for training...
2020-04-26 04:49:56 Downloading - Downloading input data...
2020-04-26 04:50:28 Training - Downloading the training image...
2020-04-26 04:51:01 Uploading - Uploading generated training model
2020-04-26 04:51:01 Completed - Training job completed
[34m2020-04-26 04:50:49,417 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2020-04-26 04:50:49,419 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-04-26 04:50:49,431 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2020-04-26 04:50:49,723 sagemaker-containers INFO     Module model does not provide a setup.py. [0m
[34mGenerating setup.py[0m
[34m2020-04-26 04:50:49,723 sagemaker-containers INFO     Generating setup

In [112]:
predictor = sklearn.deploy(initial_instance_count=1, instance_type="ml.m4.xlarge")

---------------!

In [13]:
import os
import pickle as pkl
with open(os.path.join("data/", "data.pickle"), 'rb') as handle:
    data = pkl.load(handle)

In [16]:
train = data['data']
label = data['label']

In [19]:
print(predictor.predict(train[:10]))

['Joaquin' 'Joaquin' 'Joaquin' 'Joaquin' 'Joaquin' 'Joaquin' 'Joaquin'
 'Joaquin' 'Joaquin' 'Joaquin']


In [20]:
print(label[:10])

['Joaquin', 'Joaquin', 'Joaquin', 'Joaquin', 'Joaquin', 'Joaquin', 'Joaquin', 'Joaquin', 'Joaquin', 'Joaquin']


In [115]:
sklearn.delete_endpoint()

In [114]:
train[0]

array([-2.57115811e-02,  1.98415630e-02, -1.46681396e-03, -4.20558453e-02,
        8.22106823e-02, -3.96466404e-02,  3.55508327e-02, -1.26171976e-01,
        5.03112376e-02,  1.39594767e-02, -1.01855956e-02,  4.95452899e-03,
        1.04509955e-02, -3.38419713e-02, -3.14687751e-02, -1.02371015e-02,
       -4.50850837e-02,  3.12295444e-02,  3.96762639e-02,  5.04005440e-02,
       -2.62199412e-03, -2.48780828e-02,  1.87106375e-02, -1.28021343e-02,
        1.49545973e-04, -2.64918851e-03, -3.29065509e-02, -1.30084723e-01,
       -1.30194187e-01, -1.94491241e-02,  2.65950337e-03,  1.55568793e-02,
       -5.17100170e-02,  8.39981716e-03,  2.02221964e-02, -6.34659268e-03,
        9.34613720e-02, -8.56327042e-02, -4.86232117e-02,  1.67191345e-02,
       -6.88029528e-02,  3.38283442e-02, -2.85061672e-02, -1.17968796e-02,
       -3.30465883e-02,  4.41443026e-02, -1.17959166e-02,  7.34066218e-02,
       -6.09000809e-02, -2.36181896e-02,  4.93767522e-02, -6.12642244e-02,
       -8.35187659e-02,  