//
//  ViewController.swift
//  mppb-ios-facegeometry
//
//  Created by minseopark on 2022/07/27.
//

import AVFoundation
import SceneKit
import UIKit

class ViewController: UIViewController {

  // let tracker = MPPBFaceGeometry()!
  let tracker = MPPBFaceGeometry(
    string: ViewController.FACE_GEOMETRY_WITH_TRANSFORM_CALCULATORS_SOURCE)!
  let metalDevice = MTLCreateSystemDefaultDevice()!
  var backgroundTextureCache: CVMetalTextureCache!

  let session = AVCaptureSession()
  let videoQueue = DispatchQueue(
    label: "com.mediapipe.prebuilt.example.videoQueue", qos: .userInitiated,
    attributes: [], autoreleaseFrequency: .workItem)
  let cameraFacing: AVCaptureDevice.Position = .front
  var aspectRatio: Float = 1.0

  let scene = SCNScene()
  let transformNode = SCNNode()
  let faceNode = SCNNode()

  override func viewDidLoad() {
    super.viewDidLoad()

    configureScene()
    configureCamera()

    tracker.startGraph()
    tracker.delegate = self

    // startRunning() blocks until capture starts, so call it off the main thread.
    videoQueue.async { [weak self] in
      self?.session.startRunning()
    }
  }

  func configureScene() {
    let camera = SCNCamera()
    camera.zNear = 1.0
    camera.zFar = 10000.0
    // Must match the perspective camera in the MediaPipe graph below
    // (vertical_fov_degrees: 63.0), or the rendered mesh will not line up
    // with the camera image.
    camera.fieldOfView = 63.0

    let cameraNode = SCNNode()
    cameraNode.camera = camera

    scene.rootNode.addChildNode(cameraNode)
    scene.rootNode.addChildNode(transformNode)
    transformNode.addChildNode(faceNode)

    let sceneView = SCNView()
    sceneView.scene = scene
    sceneView.frame = view.bounds
    sceneView.rendersContinuously = true
    sceneView.showsStatistics = true
    sceneView.debugOptions = [.showBoundingBoxes, .showWireframe]
    view.addSubview(sceneView)

    if CVMetalTextureCacheCreate(
      kCFAllocatorDefault, nil, metalDevice, nil, &backgroundTextureCache)
      != kCVReturnSuccess
    {
      assertionFailure("Unable to allocate texture cache")
    }
  }

  func configureCamera() {
    let camera = AVCaptureDevice.default(
      .builtInWideAngleCamera, for: .video, position: cameraFacing)!

    if camera.isFocusModeSupported(.locked) {
      try! camera.lockForConfiguration()
      camera.focusMode = .locked
      camera.unlockForConfiguration()
    }

    let cameraInput = try! AVCaptureDeviceInput(device: camera)
    session.sessionPreset = .vga640x480
    session.addInput(cameraInput)

    let videoOutput = AVCaptureVideoDataOutput()
    videoOutput.setSampleBufferDelegate(self, queue: videoQueue)
    videoOutput.videoSettings = [
      kCVPixelBufferPixelFormatTypeKey as String: kCVPixelFormatType_32BGRA
    ]
    videoOutput.alwaysDiscardsLateVideoFrames = true
    session.addOutput(videoOutput)

    let videoConnection = videoOutput.connection(with: .video)
    videoConnection?.videoOrientation = .portrait
    videoConnection?.isVideoMirrored = camera.position == .front

    let videoWidth =
      videoOutput.videoSettings[kCVPixelBufferWidthKey as String] as! Float
    let videoHeight =
      videoOutput.videoSettings[kCVPixelBufferHeightKey as String] as! Float
    let screenWidth = Float(UIScreen.main.bounds.width)
    let screenHeight = Float(UIScreen.main.bounds.height)

    // Aspect fit for the background texture
    aspectRatio = (screenHeight * videoWidth) / (screenWidth * videoHeight)
    let videoTransform =
      aspectRatio < 1.0
      ? SCNMatrix4MakeScale(1, aspectRatio, 1)
      : SCNMatrix4MakeScale(1 / aspectRatio, 1, 1)

    scene.background.contentsTransform = videoTransform
    scene.background.wrapS = .clampToBorder
    scene.background.wrapT = .clampToBorder
  }
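
  // Worked example for the aspect-fit math above, with assumed values that are
  // not from the original source: for 480x640 portrait video frames shown on a
  // 390x844-point screen, aspectRatio = (844 * 480) / (390 * 640) ≈ 1.62, so
  // the second branch scales the background by 1 / 1.62 ≈ 0.62 along X.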
}

extension ViewController: AVCaptureVideoDataOutputSampleBufferDelegate {
  func captureOutput(
    _ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer,
    from connection: AVCaptureConnection
  ) {
    autoreleasepool {
      guard let imageBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) else {
        return
      }

      let timestamp = CMSampleBufferGetOutputPresentationTimeStamp(sampleBuffer)
      tracker.processVideoFrame(imageBuffer, timestamp: timestamp)
    }
  }
}

extension ViewController: MPPBFaceGeometryDelegate {

  /*
   We get 468 vertices from the 468 face landmarks. A vertex is defined by
   5 coordinates: position (XYZ) + texture coordinate (UV), making the whole
   mesh an array of 2340 floats. [XYZUV(5) * 468]

   e.g.

   typedef struct {
     float x, y, z;  // position
     float u, v;     // texture coordinates
   } vertex;

   vertex vertices[468];  // or `float vertices[2340];`
   uint32_t indices[2694];
   */
  func tracker(
    _ tracker: MPPBFaceGeometry!, didOutputGeometry indices: [NSNumber]!,
    withVertices vertices: [NSNumber]!, withFace index: Int
  ) {
    // TODO: Use MTLBuffer (see the sketch at the end of this file)
    let vertexData = vertices.map { $0.floatValue }.withUnsafeBufferPointer {
      Data(buffer: $0)
    }

    // See https://developer.apple.com/documentation/scenekit/scngeometrysource
    let vertexSource = SCNGeometrySource(
      data: vertexData, semantic: .vertex, vectorCount: vertices.count / 5,
      usesFloatComponents: true, componentsPerVector: 3,
      bytesPerComponent: MemoryLayout<Float>.size,
      dataOffset: MemoryLayout<Float>.stride * 0,
      dataStride: MemoryLayout<Float>.stride * 5)

    let uvSource = SCNGeometrySource(
      data: vertexData, semantic: .texcoord, vectorCount: vertices.count / 5,
      usesFloatComponents: true, componentsPerVector: 2,
      bytesPerComponent: MemoryLayout<Float>.size,
      dataOffset: MemoryLayout<Float>.stride * 3,
      dataStride: MemoryLayout<Float>.stride * 5)

    let indexData = indices.map { $0.uint32Value }.withUnsafeBufferPointer {
      Data(buffer: $0)
    }

    let indexSource = SCNGeometryElement(
      data: indexData, primitiveType: .triangles,
      primitiveCount: indices.count / 3, bytesPerIndex: MemoryLayout<UInt32>.size)

    let faceGeometry = SCNGeometry(
      sources: [vertexSource, uvSource], elements: [indexSource])

    // WARNING: You do not want to bind a texture every frame.
    faceGeometry.firstMaterial?.diffuse.contents = UIImage(named: "uvgrid.jpeg")
    faceNode.geometry = faceGeometry
  }
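
  // A minimal sketch for the WARNING above, assuming the texture never
  // changes: create the material once and reuse it for every frame's
  // geometry, e.g. `faceGeometry.materials = [Self.sharedFaceMaterial]`.
  // This property is illustrative and not part of the original sample.
  fileprivate static let sharedFaceMaterial: SCNMaterial = {
    let material = SCNMaterial()
    material.diffuse.contents = UIImage(named: "uvgrid.jpeg")
    return material
  }()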

  // Update transform
  func tracker(
    _ tracker: MPPBFaceGeometry!, didOutputTransform transform: simd_float4x4,
    withFace index: Int
  ) {
    transformNode.simdTransform = transform
  }

  // Update video texture
  func tracker(
    _ tracker: MPPBFaceGeometry!, didOutputPixelBuffer pixelBuffer: CVPixelBuffer
  ) {
    DispatchQueue.main.async { [unowned self] in
      scene.background.contents = processPixelBuffer(pixelBuffer: pixelBuffer)
    }
  }
}

extension ViewController {
  func processPixelBuffer(pixelBuffer: CVPixelBuffer) -> MTLTexture? {
    let bufferHeight = CVPixelBufferGetHeight(pixelBuffer)
    let bufferWidth = CVPixelBufferGetWidth(pixelBuffer)

    var textureRef: CVMetalTexture? = nil
    let _ = CVMetalTextureCacheCreateTextureFromImage(
      kCFAllocatorDefault, backgroundTextureCache, pixelBuffer, nil,
      .bgra8Unorm_srgb, bufferWidth, bufferHeight, 0, &textureRef)

    guard let concreteTextureRef = textureRef else { return nil }

    let texture = CVMetalTextureGetTexture(concreteTextureRef)
    return texture
  }

  fileprivate static let FACE_GEOMETRY_WITH_TRANSFORM_CALCULATORS_SOURCE = """
    # MediaPipe graph that extracts transformation data from detected faces
    # on a live video stream.
    # Used in the examples in mediapipe/examples/ios/prebuilt/facegeometry.

    # GPU image. (ImageFrame)
    input_stream: "input_video"

    # GPU image. (ImageFrame)
    output_stream: "output_video"
    output_stream: "MULTI_FACE_GEOMETRY:multi_face_geometry"

    # Throttles the images flowing downstream for flow control. It passes
    # through the very first incoming image unaltered, and waits for downstream
    # nodes (calculators and subgraphs) in the graph to finish their tasks
    # before it passes through another image. All images that come in while
    # waiting are dropped, limiting the number of in-flight images in most
    # parts of the graph to 1. This prevents the downstream nodes from queuing
    # up incoming images and data excessively, which leads to increased latency
    # and memory usage, unwanted in real-time mobile applications. It also
    # eliminates unnecessary computation, e.g., the output produced by a node
    # may get dropped downstream if the subsequent nodes are still busy
    # processing previous inputs.
    node {
      calculator: "FlowLimiterCalculator"
      input_stream: "input_video"
      input_stream: "FINISHED:multi_face_geometry"
      input_stream_info: {
        tag_index: "FINISHED"
        back_edge: true
      }
      output_stream: "throttled_input_video"
    }

    # Calculates the size of the image.
    node {
      calculator: "ImagePropertiesCalculator"
      input_stream: "IMAGE_GPU:throttled_input_video"
      output_stream: "SIZE:input_image_size"
    }

    # Defines how many faces to detect. The smoothing and geometry extraction
    # below handle a single face, so this should always be set to 1.
    node {
      calculator: "ConstantSidePacketCalculator"
      output_side_packet: "PACKET:0:num_faces"
      node_options: {
        [type.googleapis.com/mediapipe.ConstantSidePacketCalculatorOptions]: {
          packet { int_value: 1 }
        }
      }
    }

    # Detects faces and corresponding landmarks.
    node {
      calculator: "FaceLandmarkFrontGpu"
      input_stream: "IMAGE:throttled_input_video"
      input_side_packet: "NUM_FACES:num_faces"
      output_stream: "LANDMARKS:multi_face_landmarks"
      output_stream: "ROIS_FROM_LANDMARKS:face_rects_from_landmarks"
      output_stream: "DETECTIONS:face_detections"
      output_stream: "ROIS_FROM_DETECTIONS:face_rects_from_detections"
    }

    # Generates an environment that describes the current virtual scene.
    node {
      calculator: "FaceGeometryEnvGeneratorCalculator"
      output_side_packet: "ENVIRONMENT:environment"
      node_options: {
        [type.googleapis.com/mediapipe.FaceGeometryEnvGeneratorCalculatorOptions] {
          environment: {
            origin_point_location: TOP_LEFT_CORNER
            perspective_camera: {
              vertical_fov_degrees: 63.0  # 63 degrees
              near: 1.0  # 1cm
              far: 10000.0  # 100m
            }
          }
        }
      }
    }

    # Extracts a single set of face landmarks associated with the most
    # prominent face detected from a collection.
    node {
      calculator: "SplitNormalizedLandmarkListVectorCalculator"
      input_stream: "multi_face_landmarks"
      output_stream: "face_landmarks"
      node_options: {
        [type.googleapis.com/mediapipe.SplitVectorCalculatorOptions] {
          ranges: { begin: 0 end: 1 }
          element_only: true
        }
      }
    }

    # Applies smoothing to the single set of face landmarks.
    node {
      calculator: "FaceLandmarksSmoothing"
      input_stream: "NORM_LANDMARKS:face_landmarks"
      input_stream: "IMAGE_SIZE:input_image_size"
      output_stream: "NORM_FILTERED_LANDMARKS:smoothed_face_landmarks"
    }

    # Puts the single set of smoothed landmarks back into a collection to
    # simplify passing the result into the `FaceGeometryFromLandmarks` subgraph.
    node {
      calculator: "ConcatenateNormalizedLandmarkListVectorCalculator"
      input_stream: "smoothed_face_landmarks"
      output_stream: "multi_smoothed_face_landmarks"
    }

    # Subgraph that renders face-landmark annotation onto the input image.
    node {
      calculator: "FaceRendererGpu"
      input_stream: "IMAGE:throttled_input_video"
      input_stream: "LANDMARKS:multi_smoothed_face_landmarks"
      input_stream: "NORM_RECTS:face_rects_from_landmarks"
      input_stream: "DETECTIONS:face_detections"
      output_stream: "IMAGE:output_video"
    }

    # Computes face geometry from face landmarks for a single face.
    node {
      calculator: "FaceGeometryFromLandmarks"
      input_stream: "MULTI_FACE_LANDMARKS:multi_smoothed_face_landmarks"
      input_stream: "IMAGE_SIZE:input_image_size"
      input_side_packet: "ENVIRONMENT:environment"
      output_stream: "MULTI_FACE_GEOMETRY:multi_face_geometry"
    }
    """
}
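
// A minimal sketch for the `TODO: Use MTLBuffer` in the geometry callback,
// assuming the view renders with Metal. Buffer-backed geometry sources let
// per-frame updates write vertex floats into a reusable MTLBuffer instead of
// rebuilding `Data` from `[NSNumber]` each frame. The helper below is
// illustrative and not part of the original sample.
extension ViewController {
  fileprivate func makeVertexSources(
    from vertexBuffer: MTLBuffer, vertexCount: Int
  ) -> [SCNGeometrySource] {
    // Interleaved XYZUV layout, matching the tracker output described above.
    let stride = MemoryLayout<Float>.stride * 5
    let positionSource = SCNGeometrySource(
      buffer: vertexBuffer, vertexFormat: .float3, semantic: .vertex,
      vertexCount: vertexCount, dataOffset: 0, dataStride: stride)
    let uvSource = SCNGeometrySource(
      buffer: vertexBuffer, vertexFormat: .float2, semantic: .texcoord,
      vertexCount: vertexCount, dataOffset: MemoryLayout<Float>.stride * 3,
      dataStride: stride)
    return [positionSource, uvSource]
  }
}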