In [1]:
PROJECT_FOLDER = "/data/students_home/amoscatelli/Desktop/actionAnalysis"
SCENE_FOLDER_PATH = PROJECT_FOLDER + "/scenes/"
SCENE_POSES_FOLDER_PATH = SCENE_FOLDER_PATH + "poses/"
SCENE_DATASET_FOLDER_PATH = SCENE_FOLDER_PATH + "dataset/"
INPUT_SCENE_FOLDER_PATH = SCENE_FOLDER_PATH + "input_video/"
OUTPUT_SCENE_FOLDER_PATH = SCENE_FOLDER_PATH + "output_video/"
INFERENCE_MODEL_FOLDER_PATH = SCENE_FOLDER_PATH + "model/"

In [2]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

In [3]:
from os.path import isfile, isdir, join, exists
import cv2
from PIL import Image

# extracting the frames of the passed video file name in output_folder
def extractFrames(filePath, output_folder, REQUESTED_FRAMES_PER_SECONDS = 9999999, rotate = 0):

    # Opens the Video file
    video = cv2.VideoCapture(filePath)

    fps = round(video.get(cv2.CAP_PROP_FPS))
    length = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
    width  = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))

    print("Original FPS: {}".format(int(fps)))
    print("Requested FPS: {}".format(int(REQUESTED_FRAMES_PER_SECONDS)))
    print("length: {} width: {} height: {}".format(length, width, height))
    frame_ratio_to_keep = max(round(fps/REQUESTED_FRAMES_PER_SECONDS),1)

    print("pick 1 frame every {}".format(int(frame_ratio_to_keep)))

    try:
        os.mkdir(output_folder)
    except:
        pass

    !rm $output_folder/* # cleaning output_folder

    i = -1
    while(video.isOpened()):
        i+=1
        ret, frame = video.read()
        if(i%frame_ratio_to_keep!=0):
            # print("DISCARDED")
            continue
        # print("PASSED")
        if ret == False:
            # print("FINISHED")
            break
        outputFramePath = join(output_folder,"frame{:04d}.jpg".format(i))
        cv2.imwrite(outputFramePath, frame)
        if rotate > 0:
            rotated = Image.open(outputFramePath).rotate(rotate)
            rotated.save(outputFramePath)

    video.release()
    cv2.destroyAllWindows()


In [3]:
# sceneFileList = [f for f in scandir(INPUT_SCENE_FOLDER_PATH)]
# temp_folder = SCENE_FOLDER_PATH + "temp_extraction_frames/"
# filePath = sceneFileList[5].path
# filePath
# # extractFrames(filePath, temp_folder)  
# # # extractFrames(sceneFile.path,temp_folder,rotate=180)

# # Opens the Video file
# video = cv2.VideoCapture(filePath)

# fps = round(video.get(cv2.CAP_PROP_FPS))
# length = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
# width  = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
# height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))

# print("Original FPS: {}".format(int(fps)))
# print("Requested FPS: {}".format(int(REQUESTED_FRAMES_PER_SECONDS)))
# print("length: {} width: {} height: {}".format(length, width, height))


In [26]:
# sceneFileList[5].path

# Extracting poses from scenes

In [5]:
import pickle
import torch, torchvision
print(torch.__version__)
# You may need to restart your runtime prior to this, to let your installation take effect
# Some basic setup
# Setup detectron2 logger
import detectron2
from detectron2.utils.logger import setup_logger
setup_logger()

# import some common libraries
import numpy as np
#from google.colab.patches import cv2_imshow

# import some common detectron2 utilities
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from detectron2.utils.visualizer import Visualizer
from detectron2.data import MetadataCatalog

1.4.0+cu100


In [10]:
# from os import listdir, scandir
# sceneFileList = [f for f in scandir(INPUT_SCENE_FOLDER_PATH)]

In [6]:
# sceneFile = sceneFileList[0]
# sceneFile.path

In [7]:
# sceneFileList[0].path

In [8]:
# temp_folder = SCENE_FOLDER_PATH + "temp_extraction_frames/"
# framePaths = [f.path for f in os.scandir(temp_folder) if f.is_file() and f.path.endswith('.jpg')]
# framePaths[0]

In [7]:
# import sys 
from os import listdir, scandir

sceneFileList = [f for f in scandir(INPUT_SCENE_FOLDER_PATH)]

#### PoseNet config #### 
poseNetModel = 101 #50, 75, 100, 101

#### Detectron2 config #### 
cfg = get_cfg()
detectron2ConfigName = "/detectron2_repo/configs/COCO-Keypoints/keypoint_rcnn_X_101_32x8d_FPN_3x.yaml"
modelWeightName = "detectron2://COCO-Keypoints/keypoint_rcnn_X_101_32x8d_FPN_3x/139686956/model_final_5ad38f.pkl"
# detectron2ConfigName = "/detectron2_repo/configs/COCO-Keypoints/keypoint_rcnn_R_50_FPN_3x.yaml"
# modelWeightsName = "detectron2://COCO-Keypoints/keypoint_rcnn_R_50_FPN_3x/137849621/model_final_a6e10b.pkl"
cfg.merge_from_file(PROJECT_FOLDER+detectron2ConfigName)
cfg.MODEL.WEIGHTS = modelWeightName
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.4  # set threshold for this model

predictor = DefaultPredictor(cfg) # Detectron2 predictor

for idx_scene,sceneFile in enumerate(sceneFileList):
    print("#### {} / {} - {} ###".format(idx_scene+1,len(sceneFileList),sceneFile.name))    
    video_points_folder_name = SCENE_POSES_FOLDER_PATH+sceneFile.name[:-4]
    
    if not exists(video_points_folder_name):
        os.mkdir(video_points_folder_name)
    else:
        continue
    
    
#     ######### Extracting frames #########
    temp_folder = SCENE_FOLDER_PATH + "temp_extraction_frames/"
    extractFrames(sceneFile.path,temp_folder,rotate=180) # extract the video frames in temp_folder
    framePaths = [f.path for f in os.scandir(temp_folder) if f.is_file() and f.path.endswith('.jpg')]
    
    ######### POSENET #########
    poseNetFolder = video_points_folder_name+"/PoseNet-"+str(poseNetModel)
    if not exists(poseNetFolder):
        os.mkdir(poseNetFolder)
    #         !export CUDA_VISIBLE_DEVICES=$GPU_CARD && 
        !cd $PROJECT_FOLDER/posenet-python && python keyPointsLogger.py \
            --model $poseNetModel \
            --image_dir $temp_folder \
            --output_dir $poseNetFolder
    else:
        print("######  video already parsed with PoseNet #####")

    print("######  PoseNet done  #####")


    ######### Detectron2 #########
    print("######  parsing with Detectron2.... #####")
    detectron2Model = detectron2ConfigName.split("/")[-1][:-5]
    detectron2Folder = video_points_folder_name+"/Detectron2-"+str(detectron2Model)
    if not exists(detectron2Folder):
        os.mkdir(detectron2Folder)
        for frame_path in framePaths:
            outputsFile = frame_path.split("/")[-1][:-4]+".pickle" # e.g.: ./temp/frame0010.jpg --> frame0010.pickle
            im = cv2.imread(frame_path)      
            outputs = predictor(im)
            with open(detectron2Folder+"/"+outputsFile, 'wb') as file_out:
                pickle.dump(outputs, file_out, protocol=pickle.HIGHEST_PROTOCOL)
    else:
        print("######  video already parsed with Detectron2 #####")

    print("######  Detectron2 done! #####")

#### 0 / 19 - 1-8.mp4 ###
#### 1 / 19 - 1-11.mp4 ###
#### 2 / 19 - 1-7.mp4 ###
#### 3 / 19 - 1-4.mp4 ###
#### 4 / 19 - 1-3.mp4 ###
#### 5 / 19 - 3-4.mov ###
#### 6 / 19 - 1-5.mp4 ###
#### 7 / 19 - 3-3.mov ###
#### 8 / 19 - 2-1.mov ###
#### 9 / 19 - 1-2.mp4 ###
#### 10 / 19 - 1-9.mp4 ###
#### 11 / 19 - 1-10.mp4 ###
#### 12 / 19 - 1-1.mp4 ###
#### 13 / 19 - 2-3.mov ###
#### 14 / 19 - 3-1.mov ###
#### 15 / 19 - 2-2.mov ###
#### 16 / 19 - 3-2.mov ###
#### 17 / 19 - 3-5.mov ###
#### 18 / 19 - 1-6.mp4 ###


# Creating Dataset

In [40]:
from scipy.spatial import distance

def extractBestPosesAlongTheVideo(pose_scores_video, keypoint_coords_video, numberOfPosesToExtract):
    numberOfDetectedPeople = max([np.count_nonzero(i) for i in pose_scores_video])
    alignedCoords = np.zeros((len(pose_scores_video), numberOfDetectedPeople, 17, 2))
    alignedPoseScores = np.zeros((len(pose_scores_video), numberOfDetectedPeople))
    #     peopleLastCoords = keypoint_coords_video[0][:numberOfDetectedPeople]
    peopleLastCoords = np.zeros((numberOfDetectedPeople,17,2))
    for frame_idx, frame in enumerate(keypoint_coords_video):
        coordDistances = [] #triplet list -> distance, id_new_coords, id_person
        for newCoords_idx, newCoords in enumerate(frame):
            if np.count_nonzero(newCoords) == 0: # skip all the coords which are just zeros
                continue
            for person_idx, personLastCoords in enumerate(peopleLastCoords):
                coordinatesDistance = np.mean([distance.euclidean(newCoords[i],personLastCoords[i]) 
                                           for i in range(len(newCoords))
                                           if np.count_nonzero(newCoords[i]) == 2 
                                           and np.count_nonzero(personLastCoords[i]) == 2])
                # in case of poses which are all 0's
                if np.isnan(coordinatesDistance): 
                    coordinatesDistance = float("inf")

                coordDistances.append((coordinatesDistance,newCoords_idx,person_idx))

        # sort distances from te closest to the farest
        coordDistances.sort(key=lambda x : x[0]) 

        idNewCoords = [None for i in range(numberOfDetectedPeople)]
        for distance_triplet in coordDistances:
            person_idx = distance_triplet[2]
            newCoords_idx = distance_triplet[1]
            # check that the person doesn't have already a new coord and that the new coords are not already used  
            if idNewCoords[person_idx] == None and newCoords_idx not in idNewCoords:
                # assign new closest coords to people
                idNewCoords[person_idx] = newCoords_idx
                alignedCoords[frame_idx][person_idx] = frame[newCoords_idx]
                peopleLastCoords[person_idx] = frame[newCoords_idx]
                alignedPoseScores[frame_idx][person_idx] = pose_scores_video[frame_idx][newCoords_idx]


        # for the people who didn't find a match in the new coordinates we add a bunch of zeros in the result
        # without updating their last coordinates
        for person_idx, value in enumerate(idNewCoords):
            if value == None:
                alignedCoords[frame_idx][person_idx] = np.zeros((17,2))
                alignedPoseScores[frame_idx][person_idx] = 0


    # select the best poses
    scorePosesRank = np.mean(alignedPoseScores, axis=0)
    posesIndexRank = scorePosesRank.argsort()[::-1]

    isSecondPersonDetected = True
    if numberOfPosesToExtract == 1:
        bestPoseKeypointCoords = alignedCoords[:,posesIndexRank[0]]
    elif numberOfPosesToExtract == 2:
        if len(posesIndexRank) == 1:
            isSecondPersonDetected = False
            bestPoseKeypointCoords = np.concatenate((
                                                    alignedCoords[:,posesIndexRank[0]], 
                                                    np.zeros( #concatenate it with a bunch of zeros whit the same shape
                                                        alignedCoords[:,posesIndexRank[0]].shape
                                                    )),axis=1)
        else:
            bestPoseKeypointCoords = np.concatenate((
                                                    alignedCoords[:,posesIndexRank[0]], 
                                                    alignedCoords[:,posesIndexRank[1]]
                                                    ),axis=1)
    else:
        raise Exception("So far, it's possible to extract maximum 2 people from the video.") 

    return bestPoseKeypointCoords, isSecondPersonDetected


## PoseNet

In [41]:
from os import scandir, listdir
from os.path import basename

def getPoseNetResult(inputFolder = "/data/students_home/amoscatelli/Desktop/actionAnalysis/scenes/poses/", poseNetModelName = "PoseNet-101"):
    print("retrieving PoseNet points from input folders...")
    videoFeaturesList = []
    videoNameList = []

    videoFoldersToAnalyse = [f.path for f in scandir(inputFolder) if isdir(join(inputFolder, f))]

    print("found {} folders for {}".format(len(videoFoldersToAnalyse), poseNetModelName))
    for i,avf in enumerate(videoFoldersToAnalyse):
        print("{}/{} - {}".format(i+1, len(videoFoldersToAnalyse),basename(avf)))

        poseNet_folder = avf + "/"+ poseNetModelName
        pose_scores_video = []
    #     keypoint_scores_video = [] # ignoring for now
        keypoint_coords_video = []
        framesInFolder = [f for f in listdir(poseNet_folder) if f.endswith(".pickle")]
        framesInFolder.sort() # must be sorted to have the frame in the correct order   
        for frame_file in framesInFolder:
            with open(poseNet_folder+"/"+frame_file,'rb') as file_in:
                pose_scores, keypoint_scores, keypoint_coords = pickle.load(file_in)
            pose_scores_video.append(pose_scores)
    #         keypoint_scores_video.append(keypoint_scores) # for now ignoring the keypoints Scores
            keypoint_coords_video.append(keypoint_coords)

#         numberOfPosesToExtract = 1 if target < 50 else 2
        numberOfPosesToExtract = 2

        bestPosesKeypointCoords, isSecondPersonDetected = extractBestPosesAlongTheVideo(pose_scores_video, 
                                                                keypoint_coords_video, 
                                                                numberOfPosesToExtract)

        if not isSecondPersonDetected:
            print("The second person is not recognized in the whole video:", basename(avf))


        videoFeaturesList.append(bestPosesKeypointCoords)
        videoNameList.append(basename(avf))


    # converting to ndarray
    videoFeatures = np.asarray(videoFeaturesList)
    videoNames = np.asarray(videoNameList)

    # final assertion (just to be sure)
    assert len(videoFeatures) == len(videoNames)

    return videoFeatures,videoNames

## Detectron

In [39]:
# inputFolder = "/data/students_home/amoscatelli/Desktop/actionAnalysis/scenes/poses/"
# [f.path for f in scandir(inputFolder) if isdir(join(inputFolder, f))]

'/data/students_home/amoscatelli/Desktop/actionAnalysis/scenes/poses/3-4'

In [42]:
def getDetectronResult(inputFolder = "/data/students_home/amoscatelli/Desktop/actionAnalysis/scenes/poses/", detectron2ModelName = "keypoint_rcnn_X_101_32x8d_FPN_3x"):
    print("retrieving detectron points from input folders...")
    videoFeaturesList = []
    videoNameList = []

    videoFoldersToAnalyse = [f.path for f in scandir(inputFolder) if isdir(join(inputFolder, f))]

    print("found {} folders for {}".format(len(videoFoldersToAnalyse), detectron2ModelName))

    for i, avf in enumerate(videoFoldersToAnalyse):
        print("{}/{} - {}".format(i+1, len(videoFoldersToAnalyse), basename(avf)))

        detectron2_folder = avf + "/Detectron2-"+ detectron2ModelName
        framesInFolder = [f for f in listdir(detectron2_folder) if f.endswith(".pickle")]
        framesInFolder.sort() # must be sorted to have the frame in the correct order   
        keypoint_coords_video = []
        pose_scores_video = []

        for fIdx,frame_file in enumerate(framesInFolder):
            with open(detectron2_folder+"/"+frame_file,'rb') as file_in:
                outputsRead = pickle.load(file_in)

            if len(outputsRead['instances']) == 0:
                # in case of no pose is detected we fill the features with zeros
                keypoints_frame = np.zeros((17,2))
                score_poses = [0]
                print(" for video {}-{} the frame {} is filled with zeros".format(i,basename(avf),fIdx))
            else:
                instancesDict = outputsRead['instances'].get_fields()
                keypoints_frame = np.array(instancesDict['pred_keypoints'].cpu()) 
                keypoints_frame = np.delete(keypoints_frame, 2, 2) #removing the third value of each keypoint (which I still don't understand what it means)
                score_poses = np.array(instancesDict['scores'].cpu()) 

            keypoint_coords_video.append(keypoints_frame)
            pose_scores_video.append(score_poses)


#         numberOfPosesToExtract = 1 if target < 50 else 2
        numberOfPosesToExtract = 2
        
        bestPoses, isSecondPersonDetected = extractBestPosesAlongTheVideo(
            pose_scores_video, keypoint_coords_video, numberOfPosesToExtract
        )
        
        if not isSecondPersonDetected:
            print("The second person is not recognized in the whole video:", basename(avf))
            

        videoFeaturesList.append(bestPoses)
        videoNameList.append(basename(avf))

    # converting to ndarray
    videoFeatures = np.asarray(videoFeaturesList)
    videoNames = np.asarray(videoNameList)

    #final assertion (just to be sure)
    assert len(videoFeatures) == len(videoNames)

    return videoFeatures,videoNames

In [50]:
# detectron2_folder = "/data/students_home/amoscatelli/Desktop/actionAnalysis/scenes/poses/"+ "3-3" + "/Detectron2-"+ "keypoint_rcnn_X_101_32x8d_FPN_3x"
# framesInFolder = [f for f in listdir(detectron2_folder) if f.endswith(".pickle")]
# framesInFolder

In [43]:
import pickle
import numpy as np

modelNames = ["keypoint_rcnn_X_101_32x8d_FPN_3x"
              ,"PoseNet-101"
             ]
for modelName in modelNames:
    if modelName == "PoseNet-101":
        videoFeatures,videoNames = getPoseNetResult()
    elif modelName == "keypoint_rcnn_X_101_32x8d_FPN_3x":
        videoFeatures,videoNames = getDetectronResult()
    else:
        raise Exception("model name {} unknown".format(modelName)) 
        
    datasetResultName = SCENE_DATASET_FOLDER_PATH+modelName+"-SCENES-dataset.pickle"
    print("Dumping the result in ",datasetResultName)
    with open(datasetResultName, 'wb') as handle:
        pickle.dump((videoFeatures,videoNames), handle, protocol=pickle.HIGHEST_PROTOCOL)    
print("DONE")

retrieving detectron points from input folders...
found 19 folders for keypoint_rcnn_X_101_32x8d_FPN_3x
1/19 - 3-3
2/19 - 3-1
3/19 - 2-1
4/19 - 1-5
5/19 - 2-3
6/19 - 1-10
7/19 - 3-2
8/19 - 1-9
9/19 - 2-2
10/19 - 1-8
11/19 - 1-3
12/19 - 3-5
13/19 - 1-1
14/19 - 1-4
15/19 - 1-11
16/19 - 1-2
17/19 - 3-4
18/19 - 1-7
19/19 - 1-6
Dumping the result in  /data/students_home/amoscatelli/Desktop/actionAnalysis/scenes/dataset/keypoint_rcnn_X_101_32x8d_FPN_3x-SCENES-dataset.pickle
retrieving PoseNet points from input folders...
found 19 folders for PoseNet-101
1/19 - 3-3
2/19 - 3-1
3/19 - 2-1
4/19 - 1-5
5/19 - 2-3
6/19 - 1-10
7/19 - 3-2
8/19 - 1-9
9/19 - 2-2
10/19 - 1-8
11/19 - 1-3
12/19 - 3-5
13/19 - 1-1
14/19 - 1-4
15/19 - 1-11
16/19 - 1-2
17/19 - 3-4
18/19 - 1-7
19/19 - 1-6
Dumping the result in  /data/students_home/amoscatelli/Desktop/actionAnalysis/scenes/dataset/PoseNet-101-SCENES-dataset.pickle
DONE


# Force first pose to be always filled

In [102]:
# with open(datasetToLoad,'rb') as file_in:
#     readFeatures, readNames = pickle.load(file_in)
    
# for features in readFeatures:
#     for frame in features:
#         if np.count_nonzero(frame[0:17]) == 0:
#             frame[0:17] = frame[17:34]
#             frame[17:34] = np.zeros((17,2))



In [55]:
# [f for f in scandir(INPUT_SCENE_FOLDER_PATH) if f.name == "1-9.mp4"]

[<DirEntry '1-9.mp4'>]

## See poses on video

In [57]:
# import numpy as np
# from os import scandir # listdir
import matplotlib.pyplot as plt
# from random import randrange
# import pickle
# import parse


file_name = "1-9.mp4"
ROTATE = 180

sceneFile = [f for f in scandir(INPUT_SCENE_FOLDER_PATH) if f.name == file_name][0]
# sceneFile = sceneFileList[0]

# modelToLoad = "keypoint_rcnn_X_101_32x8d_FPN_3x"
modelToLoad = "PoseNet-101"



datasetToLoad = SCENE_DATASET_FOLDER_PATH + modelToLoad + "-SCENES-dataset.pickle"
plt.rcParams["figure.figsize"] = (19,15)

print("Reading ", datasetToLoad)
with open(datasetToLoad,'rb') as file_in:
    readFeatures, readNames = pickle.load(file_in)


print("Retrieving features for ", file_name)    
idx = np.where(readNames == file_name[:-4])[0][0]
features = readFeatures[idx]

    
# # Force first pose to be always filled
# for frame in features:
#     if np.count_nonzero(frame[0:17]) == 0:
#         frame[0:17] = frame[17:34]
#         frame[17:34] = np.zeros((17,2))

# extracting the frames of the passed video 
tempFrameFolder = SCENE_FOLDER_PATH+"tempDataVisualizerFrames/"
extractFrames(sceneFile.path,tempFrameFolder,rotate=ROTATE) 
    
framesInFolder = [f.path for f in scandir(tempFrameFolder)]
framesInFolder.sort()

for i,framePath in enumerate(framesInFolder):
    if i % 30 == 0:
        print("{}/{}".format(i,len(framesInFolder)))
    plt.axis("off")
    im = plt.imread(framePath)
    implot = plt.imshow(im)
    
    #read dataset for that picture
    if "PoseNet" in datasetToLoad:
        x = [x[1] for x in readFeatures[idx][i]]
        y = [x[0] for x in readFeatures[idx][i]]
    else:
        x = [x[0] for x in readFeatures[idx][i]]
        y = [x[1] for x in readFeatures[idx][i]]
        
    
    categories = np.zeros(17).astype(int)
    if len(x) == 34:  
        categories = np.concatenate((categories,np.ones(17))).astype(int)

    colormap = np.array(['r', 'b'])    
    
    plt.scatter(x=x, y=y, 
                c=colormap[categories], 
                s = 40)
    
#     plt.plot([70, 70], [100, 250], 'k-', lw=2) # draw skeleton lines
  
    plt.savefig(framePath, bbox_inches='tight')
    plt.close()
print("Done!")

Reading  /data/students_home/amoscatelli/Desktop/actionAnalysis/scenes/dataset/PoseNet-101-SCENES-dataset.pickle
Retrieving features for  1-9.mp4
Original FPS: 20
Requested FPS: 9999999
length: 542 width: 1280 height: 720
pick 1 frame every 1
0/535
30/535
60/535
90/535
120/535
150/535
180/535
210/535
240/535
270/535
300/535
330/535
360/535
390/535
420/535
450/535
480/535
510/535
Done!


In [31]:
# [f for f in scandir(INPUT_SCENE_FOLDER_PATH)]

In [4]:
import glob
import cv2
from moviepy.editor import VideoFileClip
 
# tempFrameFolder = SCENE_FOLDER_PATH+"tempDataVisualizerFrames/"
img_array = []
frames = glob.glob(tempFrameFolder+"*.jpg")
frames.sort()
for filename in frames:
    img = cv2.imread(filename)
    height, width, layers = img.shape
    size = (width,height)
    img_array.append(img)
    
tempFilePath = tempFrameFolder+"tempVideo.mp4"
    
# out = cv2.VideoWriter('project.avi',cv2.VideoWriter_fourcc(*'DIVX'), 15, size)
out = cv2.VideoWriter(tempFilePath,cv2.VideoWriter_fourcc(*'DIVX'), 15, size)
 
for i in range(len(img_array)):
    out.write(img_array[i])
out.release()


VideoFileClip(tempFilePath).ipython_display(width=500)

# Build video with inference

In [4]:
from os import scandir
import pickle
import dill
import numpy as np
from keras.models import load_model 
from sklearn.preprocessing import LabelBinarizer
from keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt
from os.path import basename
from PIL import ImageFont 
import glob
import cv2
from moviepy.editor import VideoFileClip


# SLICING_WINDOW_SIZE = 80 # frames

modelToLoad = "keypoint_rcnn_X_101_32x8d_FPN_3x"
# modelToLoad = "PoseNet-101"

seconds_slice_values = [2,3,4]


#################### PROCESS SETTINGS ###################################
# label_order = ['1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', 
#                '2', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', 
#                '3', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', 
#                '4', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', 
#                '5', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', 
#                '6', '60', '7', '8', '9']
    
label_order = ['beve', 'applaude', 'legge', 'scrive', 'strappa un foglio', 'si veste', 'si spoglia', 'si infila le scarpe', 'si leva le scarpe', 'si mette gli occhiali', 'si leva gli occhiali', 
               'mangia', 'si mette il cappello', 'si leva il cappello', 'esulta', 'saluta', 'calcia qualcosa', 'mette qualcosa in tasca', 'salta su un piede', 'salta', 'telefona', 'gioca col telefono', 
               'si lava i denti', 'scrive sulla tastiera', 'indica qualcosa', 'si fa un selfie', 'controlla l\'orologio', 'si strofina le mani', 'si inchina', 'squote la testa', 'si strofina la faccia', 'fa il saluto militare', 'unisce i palmi delle mani', 
               'si spazzola i capelli', 'mette le braccia ad X', 'starnutisce', 'barcolla', 'cade', 'si tocca la testa', 'si tocca la pancia', 'si tocca la schiena', 'si tocca il collo', 'ha la nausea', 'si sventola', 
               'fa cadere qualcosa', 'colpisce una persona', 'calcia una persona', 'spinge una persona', 'da una pacca sulla spalla', 'indica una persona', 'si abbracciano', 'da qualcosa ad una persona', 'tocca la tasca di qualcuno', 'si danno la mano', 'si avvicinano', 
               'raccoglie qualcosa', 'si allontanano', 'lancia qualcosa', 'si siede', 'si alza']

modelLength = 300
ROTATE = 180
plt.rcParams["figure.figsize"] = (19,15)
actionFontSize = 40
font = ImageFont.truetype('Pillow/Tests/fonts/Arial.ttf', actionFontSize)
fontPlt = {'family' : 'Arial',
        'weight' : 'normal',
        'size'   : actionFontSize}

print("#### LOADING MODEL #############################")
modelToLoadPath = [f.path for f in scandir(INFERENCE_MODEL_FOLDER_PATH) if f.name.endswith(".h5")][0]
modelToLoadAccessoriesPath = [f.path for f in scandir(INFERENCE_MODEL_FOLDER_PATH) if f.name.endswith(".pickle")][0]

model = load_model(modelToLoadPath)
with open(modelToLoadAccessoriesPath,'rb') as file_in:
    accessories = pickle.load(file_in)

print("#### LOADING FUNCTIONS #############################")
envFunctions = [dill.loads(x) for x in accessories["env_fun_DILL"]]
specificFunctionsList = [dill.loads(x) for x in accessories["spec_fun_DILL"]]

one_hot_encoding = envFunctions[0]
normaliseBeforePadding = envFunctions[1]
paddingTrainValTest = envFunctions[2]
getClosestNonZeroCoordinate = envFunctions[3]
removeZerosFromVideo = envFunctions[4]
getZeroStatsForDataset = envFunctions[5]
preprocessData = envFunctions[6]
############################################

        

file_name_list = [f.name for f in scandir(INPUT_SCENE_FOLDER_PATH)]

numberOfVideos = len(file_name_list)*len(seconds_slice_values)
counter = 0
for file_name in file_name_list:    
    firstFrameExtraction = True    
    for secondsSlice in seconds_slice_values:
        print("#### {}/{} - {} - {} seconds".format(counter,numberOfVideos,file_name,secondsSlice))
        counter += 1
        
        outputAnalysedVideoPath = OUTPUT_SCENE_FOLDER_PATH+file_name[:-4]+"_"+str(secondsSlice)+"s"+file_name[-4:]
        if exists(outputAnalysedVideoPath):
            print("Already done!")
            continue
        
        
        if firstFrameExtraction:
            print("###################   EXTRACTING FRAMES  #######################")
            firstFrameExtraction = False
            sceneFile = [f for f in scandir(INPUT_SCENE_FOLDER_PATH) if f.name == file_name][0]
            # extracting the frames of the passed video 
            extractionFrameFolder = SCENE_FOLDER_PATH+"temp_extraction_frames/"
        #     print("#### NOT EXTRACTING FRAMES!!! #####")
            extractFrames(sceneFile.path,extractionFrameFolder,rotate=ROTATE) 
            sceneFrames = [f.path for f in scandir(extractionFrameFolder)]
            sceneFrames.sort()
        else:
            print("######  Frames already extracted  #######")   
        
        
        
        print("###################   LOADING DATASET  #######################")
        datasetToLoad = SCENE_DATASET_FOLDER_PATH + modelToLoad + "-SCENES-dataset.pickle"

        print("Loading ", datasetToLoad)
        with open(datasetToLoad,'rb') as file_in:
            readFeatures, readNames = pickle.load(file_in)

        # file_name = readNames[0]+".mp4"

        print("Retrieving features for ", file_name)    
        idx_dataset = np.where(readNames == file_name[:-4])[0][0]
        features = readFeatures[idx_dataset]

        # Force first pose to be always filled
        # print("FORCE first pose to be never ZERO")
        # for frame in features:
        #     if np.count_nonzero(frame[0:17]) == 0:
        #         frame[0:17] = frame[17:34]
        #         frame[17:34] = np.zeros((17,2))


        print("###################   INFERENCE  #######################")
        
        fps = 20 if file_name.endswith("mp4") else 29
        print("### FPS:", fps)
        SLICING_WINDOW_SIZE = fps*secondsSlice
        print("### SLICING_WINDOW_SIZE:", SLICING_WINDOW_SIZE)
        
        slicingSequences = list()

        for i in range(SLICING_WINDOW_SIZE-1,len(features)):
            slicingSequences.append(features[i+1-SLICING_WINDOW_SIZE:i])

        slicingSequences = np.asarray(slicingSequences)

        
        # specificFunctionsList = specificFunctionsList[1:] # remove 'remove-0'


        fakeTrainSet = [slicingSequences[0:2],[0]]
        fakeValSet = [slicingSequences[0:2],[0]]
        testSet = [slicingSequences,[0]]

        print("### predicting slices of video ###")
        X_train, y_train, X_val, y_val, X_test, y_test = preprocessData(fakeTrainSet, fakeValSet, testSet, specificFunctionsList)

        X_test = pad_sequences(X_test, maxlen=modelLength, dtype='float32', padding='post', truncating='post', value=0.0)

        y_test_pred = [label_order[i] for i in model.predict_classes(X_test)]
#         y_test_pred_proba = [model.predict_proba(i) for i in X_test]

        print("################## PREPARING FRAMES WITH INFERENCE #######################")

        # # Force first pose to be always filled
        # for frame in features:
        #     if np.count_nonzero(frame[0:17]) == 0:
        #         frame[0:17] = frame[17:34]
        #         frame[17:34] = np.zeros((17,2))


        # cleaning the new video frames
        newVideoFrameFolder = SCENE_FOLDER_PATH+"tempDataVisualizerFrames/"
        !rm $newVideoFrameFolder*


    #     singleFrameToAnalyse = 115
        # for i,framePath in enumerate(sceneFrames[singleFrameToAnalyse-1:singleFrameToAnalyse]):
        #     i=singleFrameToAnalyse
        for idx_sceneFrame, framePath in enumerate(sceneFrames):
            if idx_sceneFrame % 50 == 0:
                print("{}/{}".format(idx_sceneFrame,len(sceneFrames)))
            plt.axis("off")
            im = plt.imread(framePath)
            implot = plt.imshow(im)

            #read dataset for that picture
            if modelToLoad == "PoseNet-101":
                x = [x[1] for x in readFeatures[idx_dataset][idx_sceneFrame]]
                y = [x[0] for x in readFeatures[idx_dataset][idx_sceneFrame]]
            else:
                x = [x[0] for x in readFeatures[idx_dataset][idx_sceneFrame]]
                y = [x[1] for x in readFeatures[idx_dataset][idx_sceneFrame]]


            if np.count_nonzero(x[17:34]) == 0:
                x = x[:17]
                y = y[:17]

            # to avoid to plot outside the frame
            x = [max(p,7) for p in x]
            y = [max(p,7) for p in y]
            x = [min(p,im.shape[1]-7) for p in x]
            y = [min(p,im.shape[0]-7) for p in y]

            categories = np.zeros(17).astype(int)
            if len(x) == 34:  
                categories = np.concatenate((categories,np.ones(17))).astype(int)

            colormap = np.array(['r', 'b'])    

            plt.scatter(x=x, y=y, 
                        c=colormap[categories], 
                        s = 40)

            message = "         ...         "
            if idx_sceneFrame >= SLICING_WINDOW_SIZE-1:
                message = y_test_pred[idx_sceneFrame+1-SLICING_WINDOW_SIZE]    

            w, h = font.getsize(message)
            (text_x, text_y) = (im.shape[1]*0.5-(w/2), im.shape[0]*0.9)

            t =  plt.text(text_x, text_y, message, fontdict = fontPlt)
            t.set_bbox(dict(facecolor='white', alpha=0.5, edgecolor='black')) 
        #     plt.show()

        #     plt.plot([70, 70], [100, 250], 'k-', lw=2) # draw skeleton lines

            newFramePath = newVideoFrameFolder+basename(framePath)
            plt.savefig(newFramePath, bbox_inches='tight')
            plt.close()


        
        print("############### SAVING VIDEO WITH INFERENCE #####################")
        img_array = []
        frames = glob.glob(newVideoFrameFolder+"*.jpg")
        frames.sort()
        for idx_frame,filename in enumerate(frames):
            img = cv2.imread(filename)
            img_array.append(img)


        height, width, layers = img.shape
        size = (width,height)

        out = cv2.VideoWriter(outputAnalysedVideoPath,cv2.VideoWriter_fourcc(*'DIVX'), fps, size)

        for i in range(len(img_array)):
            out.write(img_array[i])
        out.release()


    #     VideoFileClip(tempFilePath).ipython_display(width=500)
print("Done!")


Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


pygame 1.9.6
Hello from the pygame community. https://www.pygame.org/contribute.html
#### LOADING MODEL #############################

#### LOADING FUNCTIONS #############################
#### 0/57 - 1-8.mp4 - 2 seconds
Already done!
#### 1/57 - 1-8.mp4 - 3 seconds
Already done!
#### 2/57 - 1-8.mp4 - 4 seconds
Already done!
#### 3/57 - 1-11.mp4 - 2 seconds
Already done!
#### 4/57 - 1-11.mp4 - 3 seconds
Already done!
#### 5/57 - 1-11.mp4 - 4 seconds
Already done!
#### 6/57 - 1-7.mp4 - 2 seconds
Already done!
#### 7/57 - 1-7.mp4 - 3 seconds
Already done!
#### 8/57 - 1-7.mp4 - 4 seconds
Already done!
#### 9/57 - 1-4.mp4 - 2 seconds
Already done!
#### 10/57 - 1-4.mp4 - 3 seconds
Already done!
#### 11/57 - 1-4.mp4 - 4 seconds
Already done!
#### 12/57 - 1-3.mp4 - 2 seconds
Already done!
#### 13/57 - 1-3.mp4 - 3 seconds
Already done!
#### 14/57 - 1-3.mp4 - 4 seconds
Already done!
#### 15/57 - 3-4.mov - 2 seconds
Already done!
#### 16/57 - 3-4.mov - 3 seconds
Already done!
#### 17/57 - 3-4.mov

Adapting the data to the TOP-MIDDLE-BOTTOM centers of each video
Adapting the data to the TOP-MIDDLE-BOTTOM centers of each video
Adapting the data to the TOP-MIDDLE-BOTTOM centers of each video
normalising EACH VIDEO, considering x and y TOGETHER
normalising EACH VIDEO, considering x and y TOGETHER
normalising EACH VIDEO, considering x and y TOGETHER
train set shape: (2, 39, 34, 2)
train set zero elements (after padding): 0 (0.00%)
val set shape: (2, 39, 34, 2)
val set zero elements (after padding): 0 (0.00%)
test set shape: (615, 39, 34, 2)
test set zero elements (after padding): 0 (0.00%)
################## PREPARING FRAMES WITH INFERENCE #######################
0/654
50/654
100/654
150/654
200/654
250/654
300/654
350/654
400/654
450/654
500/654
550/654
600/654
650/654
############### SAVING VIDEO WITH INFERENCE #####################
#### 28/57 - 1-2.mp4 - 3 seconds
######  Frames already extracted  #######
###################   LOADING DATASET  #######################
Loading  /dat

#### 34/57 - 1-10.mp4 - 3 seconds
######  Frames already extracted  #######
###################   LOADING DATASET  #######################
Loading  /data/students_home/amoscatelli/Desktop/actionAnalysis/scenes/dataset/keypoint_rcnn_X_101_32x8d_FPN_3x-SCENES-dataset.pickle
Retrieving features for  1-10.mp4
###################   INFERENCE  #######################
### FPS: 20
### SLICING_WINDOW_SIZE: 60
### predicting slices of video ###
classes order: ['0']
classes order: ['0']
classes order: ['0']
removing zeros from dataset
removing zeros from dataset
removing zeros from dataset
Adapting the data to the TOP-MIDDLE-BOTTOM centers of each video
Adapting the data to the TOP-MIDDLE-BOTTOM centers of each video
Adapting the data to the TOP-MIDDLE-BOTTOM centers of each video
normalising EACH VIDEO, considering x and y TOGETHER
normalising EACH VIDEO, considering x and y TOGETHER
normalising EACH VIDEO, considering x and y TOGETHER
train set shape: (2, 59, 34, 2)
train set zero elements (aft

train set shape: (2, 57, 34, 2)
train set zero elements (after padding): 0 (0.00%)
val set shape: (2, 57, 34, 2)
val set zero elements (after padding): 0 (0.00%)
test set shape: (894, 57, 34, 2)
test set zero elements (after padding): 1938 (0.06%)
################## PREPARING FRAMES WITH INFERENCE #######################
0/951
50/951
100/951
150/951
200/951
250/951
300/951
350/951
400/951
450/951
500/951
550/951
600/951
650/951
700/951
750/951
800/951
850/951
900/951
950/951
############### SAVING VIDEO WITH INFERENCE #####################
#### 40/57 - 2-3.mov - 3 seconds
######  Frames already extracted  #######
###################   LOADING DATASET  #######################
Loading  /data/students_home/amoscatelli/Desktop/actionAnalysis/scenes/dataset/keypoint_rcnn_X_101_32x8d_FPN_3x-SCENES-dataset.pickle
Retrieving features for  2-3.mov
###################   INFERENCE  #######################
### FPS: 29
### SLICING_WINDOW_SIZE: 87
### predicting slices of video ###
classes order: ['

OSError: [Errno 12] Cannot allocate memory

In [None]:
import glob
import cv2
from moviepy.editor import VideoFileClip

readyVideo = [f for f in scandir(OUTPUT_SCENE_FOLDER_PATH)]
videoPath = OUTPUT_SCENE_FOLDER_PATH+readyVideo[0].name
print("Loading", videoPath)
VideoFileClip(videoPath).ipython_display(width=500)

In [100]:
# y_test_pred[i+1-SLICING_WINDOW_SIZE]
i+1-SLICING_WINDOW_SIZE,len(y_test_pred),i

(673, 673, 692)

In [95]:
OUTPUT_SCENE_FOLDER_PATH+file_name[:-4]+"_"+str(secondsSlice)+"s"+file_name[-4:]

'/data/students_home/amoscatelli/Desktop/actionAnalysis/scenes/output_video/3-4_1s.mov'

In [93]:
secondsSlice = 1
OUTPUT_SCENE_FOLDER_PATH+file_name[:-4]+"_"+str(secondsSlice)+"s"+file_name[-4:]

'/data/students_home/amoscatelli/Desktop/actionAnalysis/scenes/output_video/3-4_1s.mov'

# Show inference on video

In [83]:
import matplotlib.pyplot as plt
from os.path import basename
from PIL import ImageFont

ROTATE = 180
plt.rcParams["figure.figsize"] = (19,15)

# # Force first pose to be always filled
# for frame in features:
#     if np.count_nonzero(frame[0:17]) == 0:
#         frame[0:17] = frame[17:34]
#         frame[17:34] = np.zeros((17,2))


# cleaning the new video frames
newVideoFrameFolder = SCENE_FOLDER_PATH+"tempDataVisualizerFrames/"
!rm $newVideoFrameFolder*


sceneFile = [f for f in scandir(INPUT_SCENE_FOLDER_PATH) if f.name == file_name][0]
# extracting the frames of the passed video 
extractionFrameFolder = SCENE_FOLDER_PATH+"temp_extraction_frames/"
print("#### NOT EXTRACTING FRAMES!!! #####")
# extractFrames(sceneFile.path,extractionFrameFolder,rotate=ROTATE) 
    
framesInFolder = [f.path for f in scandir(extractionFrameFolder)]
framesInFolder.sort()

actionFontSize = 40
font = ImageFont.truetype('Pillow/Tests/fonts/Arial.ttf', actionFontSize)
fontPlt = {'family' : 'Arial',
        'weight' : 'normal',
        'size'   : actionFontSize}

singleFrameToAnalyse = 115
# for i,framePath in enumerate(framesInFolder[singleFrameToAnalyse-1:singleFrameToAnalyse]):
#     i=singleFrameToAnalyse
for i,framePath in enumerate(framesInFolder):
    if i % 30 == 0:
        print("{}/{}".format(i,len(framesInFolder)))
    plt.axis("off")
    im = plt.imread(framePath)
    implot = plt.imshow(im)
    
    #read dataset for that picture
    if modelToLoad == "PoseNet-101":
        x = [x[1] for x in readFeatures[idx][i]]
        y = [x[0] for x in readFeatures[idx][i]]
    else:
        x = [x[0] for x in readFeatures[idx][i]]
        y = [x[1] for x in readFeatures[idx][i]]
        

    if np.count_nonzero(x[17:34]) == 0:
        x = x[:17]
        y = y[:17]
        
    # to avoid to plot outside the frame
    x = [max(p,7) for p in x]
    y = [max(p,7) for p in y]
    x = [min(p,im.shape[1]-7) for p in x]
    y = [min(p,im.shape[0]-7) for p in y]
    
    categories = np.zeros(17).astype(int)
    if len(x) == 34:  
        categories = np.concatenate((categories,np.ones(17))).astype(int)

    colormap = np.array(['r', 'b'])    
    
    plt.scatter(x=x, y=y, 
                c=colormap[categories], 
                s = 40)
    
    message = "         ...         "
    if i >= SLICING_WINDOW_SIZE-1:
        message = y_test_pred[i+1-SLICING_WINDOW_SIZE]    
    
    w, h = font.getsize(message)
    (text_x, text_y) = (im.shape[1]*0.5-(w/2), im.shape[0]*0.9)

    t =  plt.text(text_x, text_y, message, fontdict = fontPlt)
    t.set_bbox(dict(facecolor='white', alpha=0.5, edgecolor='black')) 
#     plt.show()
    
#     plt.plot([70, 70], [100, 250], 'k-', lw=2) # draw skeleton lines
    
    newFramePath = newVideoFrameFolder+basename(framePath)
    plt.savefig(newFramePath, bbox_inches='tight')
    plt.close()
print("Done!")

rm: cannot remove '/data/students_home/amoscatelli/Desktop/actionAnalysis/scenes/tempDataVisualizerFrames/*': No such file or directory
#### NOT EXTRACTING FRAMES!!! #####
0/941
30/941
60/941
90/941
120/941
150/941
180/941
210/941
240/941
270/941
300/941
330/941
360/941
390/941
420/941
450/941
480/941
510/941
540/941
570/941
600/941
630/941
660/941
690/941
720/941
750/941
780/941
810/941
840/941
870/941
900/941
930/941
Done!


In [1]:
# PROJECT_FOLDER = "/data/students_home/amoscatelli/Desktop/actionAnalysis"
# SCENE_FOLDER_PATH = PROJECT_FOLDER + "/scenes/"
# SCENE_POSES_FOLDER_PATH = SCENE_FOLDER_PATH + "poses/"
# SCENE_DATASET_FOLDER_PATH = SCENE_FOLDER_PATH + "dataset/"
# INPUT_SCENE_FOLDER_PATH = SCENE_FOLDER_PATH + "input_video/"
# OUTPUT_SCENE_FOLDER_PATH = SCENE_FOLDER_PATH + "output_video/"
# INFERENCE_MODEL_FOLDER_PATH = SCENE_FOLDER_PATH + "model/"
# newVideoFrameFolder = SCENE_FOLDER_PATH+"tempDataVisualizerFrames/"

In [5]:
import glob
import cv2
from moviepy.editor import VideoFileClip
 
    
img_array = []
frames = glob.glob(newVideoFrameFolder+"*.jpg")
frames.sort()
size = (0,0)
for idx_frame,filename in enumerate(frames):
    img = cv2.imread(filename)
    height, width, layers = img.shape
#     size = (max(width,size[0]),max(height,size[1]))
    newSize = (width,height)
    if size != newSize:
        print(idx_frame,newSize)
        size= newSize
    img_array.append(img)
    
tempFilePath = newVideoFrameFolder+"tempVideo.mp4"

size = (size[0]+1,size[1]+1)
    
out = cv2.VideoWriter(tempFilePath,cv2.VideoWriter_fourcc(*'DIVX'), 29, size)
 
for i in range(len(img_array)):
    out.write(img_array[i])
out.release()


VideoFileClip(tempFilePath).ipython_display(width=500)