# Resnext & Xception Ensemble (Inference)

- This kernel outputs an ensemble of the predictions produced by two inference kernels: https://www.kaggle.com/khoongweihao/frames-per-video-viz and https://www.kaggle.com/greatgamedota/xception-binary-classifier-inference (not the original; the learning rate and number of epochs were modified).
- Frames per video is set to 64 (the best value found).

## Resnext Model

In [1]:
import os, sys, time
import cv2
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F

%matplotlib inline
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

In [17]:
!pip install '/kaggle/input/dlibpkg/dlib-19.19.0'

Processing /kaggle/input/dlibpkg/dlib-19.19.0
Building wheels for collected packages: dlib
  Building wheel for dlib (setup.py) ... [?25ldone
[?25h  Created wheel for dlib: filename=dlib-19.19.0-cp36-cp36m-linux_x86_64.whl size=4086381 sha256=ad8c29419f91d3127d4c6150d4d52ae3e7eaee362a6a03341153a8f53d343d21
  Stored in directory: /root/.cache/pip/wheels/f3/da/18/38136f7dd8e242c1cc2236d574a50f34463b8c59aab887dd79
Successfully built dlib
Installing collected packages: dlib
Successfully installed dlib-19.19.0


In [47]:
# Attempt to create a Python 3.10 conda environment and repoint the
# /opt/conda/bin python executables at it.
# NOTE(review): the recorded output of this cell shows every command failing
# ("mamba: not found", "source: not found", "sudo: not found"), so in the saved
# run none of these steps took effect.
!mamba create -n py310 -y
!source /opt/conda/bin/activate py310 && mamba install python=3.10 jupyter mamba -y

!sudo rm /opt/conda/bin/python3
!sudo ln -sf /opt/conda/envs/py310/bin/python3 /opt/conda/bin/python3
!sudo rm /opt/conda/bin/python3.7
!sudo ln -sf /opt/conda/envs/py310/bin/python3 /opt/conda/bin/python3.7
!sudo rm /opt/conda/bin/python
!sudo ln -sf /opt/conda/envs/py310/bin/python3 /opt/conda/bin/python

/bin/sh: 1: mamba: not found
/bin/sh: 1: source: not found
/bin/sh: 1: sudo: not found
/bin/sh: 1: sudo: not found
/bin/sh: 1: sudo: not found
/bin/sh: 1: sudo: not found
/bin/sh: 1: sudo: not found
/bin/sh: 1: sudo: not found


In [2]:
test_dir = "/kaggle/input/deepfake-detection-challenge/test_videos/"

# Sorted names of every entry in test_dir whose name ends in ".mp4".
test_videos = sorted(name for name in os.listdir(test_dir) if name.endswith(".mp4"))
frame_h = 5
frame_l = 5
len(test_videos)

400

In [3]:
# Report the PyTorch build and the CUDA / cuDNN versions it reports.
print("PyTorch version:", torch.__version__)
print("CUDA version:", torch.version.cuda)
print("cuDNN version:", torch.backends.cudnn.version())

PyTorch version: 1.3.0
CUDA version: 10.0.130
cuDNN version: 7603


In [4]:
# Use the first CUDA device when one is available, otherwise fall back to CPU.
gpu = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
gpu

device(type='cuda', index=0)

In [5]:
import sys
sys.path.insert(0, "/kaggle/input/blazeface-pytorch")
sys.path.insert(0, "/kaggle/input/deepfakes-inference-demo")

In [6]:
from blazeface import BlazeFace
# Create the BlazeFace detector on the selected device, then load its weights
# and anchors from the attached blazeface-pytorch dataset.
facedet = BlazeFace().to(gpu)
facedet.load_weights("/kaggle/input/blazeface-pytorch/blazeface.pth")
facedet.load_anchors("/kaggle/input/blazeface-pytorch/anchors.npy")
# train(False) presumably switches the detector to inference mode (nn.Module
# convention - confirm against blazeface.py); `_ =` hides the cell output.
_ = facedet.train(False)

In [7]:
import cv2

def frame_selection(video_path, k, omega):
    """Sample ``k`` evenly spaced bursts of consecutive frames from a video.

    The video is split into ``k`` equal intervals; at the start of each
    interval up to ``omega`` consecutive frames are read.

    Args:
        video_path: path of the video file to open with ``cv2.VideoCapture``.
        k: number of sampling points spread over the video.
        omega: number of consecutive frames to read at each sampling point.

    Returns:
        A list of the frames that were read successfully, in reading order
        (at most ``k * omega`` entries). The list is empty when ``k`` or
        ``omega`` is not positive or when nothing could be read.
    """
    # Nothing to sample. This also avoids the division by zero that
    # `total_frames // k` would raise for k == 0.
    if k <= 0 or omega <= 0:
        return []

    frames = []
    video = cv2.VideoCapture(video_path)
    try:
        # An unreadable file yields no frames; skip the seek/read loop entirely.
        if not video.isOpened():
            return frames

        total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))

        # Distance, in frames, between two consecutive sampling points.
        interval = total_frames // k

        for i in range(k):
            # Seek to the first frame of this sampling point.
            video.set(cv2.CAP_PROP_POS_FRAMES, i * interval)

            # Read up to omega frames from here, keeping only successful reads.
            for _ in range(omega):
                success, frame = video.read()
                if success:
                    frames.append(frame)
    finally:
        # Release the capture even if seeking or decoding raised.
        video.release()

    return frames


In [18]:
import dlib
import numpy as np
import cv2

# Landmark index ranges (within a 68-point annotation) used for the
# left eye, the right eye and the mouth.
LEFT_EYE_INDICES = np.arange(36, 42)
RIGHT_EYE_INDICES = np.arange(42, 48)
MOUTH_INDICES = np.arange(48, 60)

import numpy as np
import cv2
import dlib

def shape_to_np(shape, dtype="int"):
    """Convert a 68-point landmark object into a ``(68, 2)`` array.

    Args:
        shape: object exposing ``part(i)`` for ``i`` in ``0..67``, where each
            part has ``x`` and ``y`` attributes.
        dtype: dtype of the returned array.

    Returns:
        Array of shape ``(68, 2)`` whose row ``i`` is ``(x, y)`` of landmark ``i``.
    """
    # Gather every landmark as an (x, y) pair, then build the array in one go.
    points = [(shape.part(idx).x, shape.part(idx).y) for idx in range(68)]
    return np.array(points, dtype=dtype).reshape(68, 2)

def align_face(image, detector, predictor, desired_face_width=112, desired_face_height=112, desired_left_eye=(0.35, 0.35)):
    """Detect a face in ``image`` and warp it into an eye-levelled crop.

    The first detected face whose landmarks are usable is rotated so the line
    between landmarks 36 and 45 is horizontal, scaled so that line spans the
    requested fraction of the output width, and translated so the midpoint of
    the eyes lands at the requested height.

    Args:
        image: BGR image array (it is converted with ``cv2.COLOR_BGR2GRAY``).
        detector: callable ``detector(gray, upsample)`` returning face
            rectangles, e.g. ``dlib.get_frontal_face_detector()``.
        predictor: callable ``predictor(gray, rect)`` returning a landmark
            object accepted by ``shape_to_np``.
        desired_face_width: width of the output crop in pixels.
        desired_face_height: height of the output crop in pixels.
        desired_left_eye: fractional ``(x, y)`` position of one eye in the
            output; the other eye is mirrored horizontally.

    Returns:
        The aligned crop produced by ``cv2.warpAffine``, or ``None`` when no
        face is detected or no detection has usable landmarks.
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    for rect in detector(gray, 1):
        shape = shape_to_np(predictor(gray, rect))

        # Only a full 68-point annotation carries the eye corners used below.
        if len(shape) != 68:
            continue

        # Vector from landmark 36 to landmark 45 (the two outer eye corners in
        # the standard 68-point layout; 36 lies on the image's left side).
        dY = float(shape[45][1] - shape[36][1])
        dX = float(shape[45][0] - shape[36][0])

        dist = np.sqrt((dX ** 2) + (dY ** 2))
        if dist == 0:
            # Degenerate landmarks: no eye line to align to.
            continue

        # Tilt of the eye line. The vector points left-to-right in the image,
        # so an already level face yields 0 degrees and is not rotated.
        # Subtracting 180 is only correct for a right-to-left vector; here it
        # turned upright faces upside down.
        angle = np.degrees(np.arctan2(dY, dX))

        # Scale so the eye distance matches the requested fraction of the width.
        desired_right_eye_x = 1.0 - desired_left_eye[0]
        desired_dist = (desired_right_eye_x - desired_left_eye[0]) * desired_face_width
        scale = desired_dist / dist

        # Rotate/scale about the midpoint between the two eye corners.
        eyes_center = (
            float(shape[36][0] + shape[45][0]) / 2.0,
            float(shape[36][1] + shape[45][1]) / 2.0,
        )
        M = cv2.getRotationMatrix2D(eyes_center, angle, scale)

        # Shift the eye midpoint to the horizontal centre of the crop and to
        # the requested eye height.
        tX = desired_face_width * 0.5
        tY = desired_face_height * desired_left_eye[1]
        M[0, 2] += (tX - eyes_center[0])
        M[1, 2] += (tY - eyes_center[1])

        (w, h) = (desired_face_width, desired_face_height)
        return cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC)

    # No face (or no usable landmarks) found.
    return None




# Usage:
# detector = dlib.get_frontal_face_detector()
# predictor = dlib.shape_predictor("shape_predictor_68_face_landmarks.dat")
# aligned_face = align_face(image, detector, predictor)


In [29]:
import cv2
import dlib
import numpy as np
import torch
from torchvision import transforms
from efficientnet_pytorch import EfficientNet
from vit_pytorch import ViT
from PIL import Image

# Initialize the face detector and face aligner.
# NOTE(review): the recorded output of this cell is
# "ModuleNotFoundError: No module named 'vit_pytorch'", raised by the imports
# at the top of the cell, so nothing below was actually defined in the saved run.
face_detector = dlib.get_frontal_face_detector()
# NOTE(review): "/content/..." looks like a Colab path while the rest of the
# notebook reads from /kaggle/input - confirm where the landmark model lives.
predictor = dlib.shape_predictor("/content/shape_predictor_68_face_landmarks.dat")


# Specify the transformations.
# Each face array is converted to a PIL image, resized to 112x112 and then
# converted to a tensor.
transformations = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((112, 112)),
    transforms.ToTensor(),
])

def face_processing(frames):
    """Align the face found in each frame and stack the results into a batch.

    Args:
        frames: iterable of image arrays (e.g. the list returned by
            ``frame_selection``).

    Returns:
        A tensor made by ``torch.stack`` over the transformed aligned faces,
        one entry per frame in which a face could be aligned, or ``None`` when
        no frame yielded a face.
    """
    processed_frames = []
    for frame in frames:
        # align_face runs its own detection on the whole frame and returns at
        # most one face, so a separate detection pass here would only repeat
        # the work and append that same face once per detection.
        aligned_face = align_face(frame, face_detector, predictor)

        # No face in this frame: skip it instead of feeding None to the transform.
        if aligned_face is None:
            continue

        # NOTE(review): the aligned face comes from OpenCV frames (BGR order)
        # and is handed to ToPILImage unchanged - confirm whether a BGR->RGB
        # conversion is wanted before feeding a model.
        processed_frames.append(transformations(aligned_face))

    if not processed_frames:
        return None

    # Convert the list of processed faces to a single batch tensor.
    return torch.stack(processed_frames)





ModuleNotFoundError: No module named 'vit_pytorch'

In [19]:
import cupy as cp

def motion_magnification(aligned_faces, k, omega, t):
    """Amplify frame-to-frame phase changes of a face sequence on the GPU.

    For every frame after the first, the spatial FFT phase difference to the
    previous frame is multiplied by ``k`` and added back to the frame's phase,
    the spectrum is band-limited along its first axis, and the frame is
    transformed back. The first frame is copied unchanged. The whole result
    is finally min-max normalized to ``[0, 1]``.

    Args:
        aligned_faces: array of shape ``(num_frames, h, w, num_channels)``.
        k: factor applied to the phase difference between consecutive frames.
        omega: half-width, in rows of the shifted spectrum, of the band kept.
        t: unused; kept so existing callers keep working.

    Returns:
        A NumPy array with the same shape as ``aligned_faces``. It is all
        zeros when every output value is identical, and empty when there are
        no frames.
    """
    num_frames, h, w, num_channels = aligned_faces.shape

    # Move aligned_faces to GPU memory.
    aligned_faces_gpu = cp.asarray(aligned_faces)

    # Floating-point output buffer: with an integer input (e.g. uint8 frames) a
    # zeros_like buffer would truncate the magnified, real-valued frames.
    if aligned_faces_gpu.dtype.kind == "f":
        out_dtype = aligned_faces_gpu.dtype
    else:
        out_dtype = cp.float64
    magnified_faces_gpu = cp.zeros(aligned_faces_gpu.shape, dtype=out_dtype)

    if num_frames == 0:
        return cp.asnumpy(magnified_faces_gpu)

    # Each frame is (h, w, channels): transform over the two spatial axes.
    # fft2's default of the last two axes would mix width with channels.
    spatial_axes = (0, 1)

    prev_phase_gpu = None
    for i in range(num_frames):
        # Convert to the frequency domain; the phase is reused by the next
        # iteration so the previous frame is never transformed twice.
        f_img_gpu = cp.fft.fft2(aligned_faces_gpu[i], axes=spatial_axes)
        phase_gpu = cp.angle(f_img_gpu)

        if i > 0:
            # Magnify the phase change relative to the previous frame.
            phase_diff_magnified_gpu = (phase_gpu - prev_phase_gpu) * k

            # Original amplitude combined with the magnified phase.
            f_img_magnified_gpu = cp.abs(f_img_gpu) * cp.exp(1j * (phase_gpu + phase_diff_magnified_gpu))

            # Keep only a band of rows of the centred spectrum.
            # NOTE(review): the band is centred on num_frames // 2 although the
            # axis being filtered has length h - confirm the intended centre.
            f_img_magnified_gpu = cp.fft.fftshift(f_img_magnified_gpu, axes=spatial_axes)
            # Clamp at 0: a negative bound would be read as "from the end" and
            # wipe almost the whole spectrum.
            low = max(num_frames // 2 - omega, 0)
            high = num_frames // 2 + omega
            f_img_magnified_gpu[:low] = 0
            f_img_magnified_gpu[high:] = 0
            f_img_magnified_gpu = cp.fft.ifftshift(f_img_magnified_gpu, axes=spatial_axes)

            # Back to the spatial domain; keep the real part.
            magnified_faces_gpu[i] = cp.real(cp.fft.ifft2(f_img_magnified_gpu, axes=spatial_axes))
        else:
            # No previous frame to compare against.
            magnified_faces_gpu[i] = aligned_faces_gpu[i]

        prev_phase_gpu = phase_gpu

    # Normalize the result to 0-1, guarding against a constant output.
    lowest = magnified_faces_gpu.min()
    value_range = magnified_faces_gpu.max() - lowest
    if float(value_range) == 0.0:
        magnified_faces_gpu = cp.zeros_like(magnified_faces_gpu)
    else:
        magnified_faces_gpu = (magnified_faces_gpu - lowest) / value_range

    # Move the result back to CPU memory.
    return cp.asnumpy(magnified_faces_gpu)


In [20]:
# Smoke-test the frame selection + face processing pipeline on one video.

# Provide the path to your video file.
video_path = "/kaggle/input/deepfake-detection-challenge/test_videos/aassnaulhq.mp4"

# Set the parameters for testing.
k = 5  # Number of sampling points spread over the video
omega = 2  # Number of consecutive frames to read at each sampling point

# Call the frame_selection function to get the frames.
frames = frame_selection(video_path, k, omega)

# Process the frames to extract faces.
processed_batch = face_processing(frames)

# face_processing returns None when no face was found, so only report the
# batch shape when there is a batch.
if processed_batch is None:
    print("No faces found in", video_path)
else:
    print(processed_batch.shape)


NameError: name 'face_processing' is not defined

In [None]:
from helpers.read_video_1 import VideoReader
from helpers.face_extract_1 import FaceExtractor

# Frames sampled from each video (previously frame_h * frame_l).
frames_per_video = 64
video_reader = VideoReader()


def video_read_fn(x):
    """Read ``frames_per_video`` frames from ``x`` via the shared video reader."""
    return video_reader.read_frames(x, num_frames=frames_per_video)


face_extractor = FaceExtractor(video_read_fn, facedet)

In [None]:
# Side length, in pixels, of the square face images that predict_on_video
# builds for the model.
input_size = 224

In [None]:
from torchvision.transforms import Normalize

# Per-channel mean and standard deviation used to normalize face images after
# they are scaled to [0, 1] (these match the commonly used ImageNet statistics).
mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]
normalize_transform = Normalize(mean, std)

In [None]:
def isotropically_resize_image(img, size, resample=cv2.INTER_AREA):
    """Resize ``img`` so its longer side equals ``size``, keeping the aspect ratio.

    Args:
        img: image array whose first two dimensions are height and width.
        size: target length, in pixels, of the longer side.
        resample: OpenCV interpolation flag passed to ``cv2.resize``.

    Returns:
        The resized image. The shorter side is scaled proportionally (rounded
        down) but never below one pixel.

    Raises:
        ValueError: if ``img`` has zero height or zero width.
    """
    h, w = img.shape[:2]
    if h == 0 or w == 0:
        raise ValueError("cannot resize an empty image of shape %r" % (img.shape,))

    if w > h:
        # Landscape: width becomes `size`. Clamp the height so a very elongated
        # image cannot round down to an invalid 0-pixel side.
        h = max(1, h * size // w)
        w = size
    else:
        # Portrait or square: height becomes `size`.
        w = max(1, w * size // h)
        h = size

    return cv2.resize(img, (w, h), interpolation=resample)


def make_square_image(img):
    """Pad ``img`` with zeros on the bottom and right until it is square.

    The side length of the result is the larger of the image's height and
    width; the original pixels stay in the top-left corner.
    """
    height, width = img.shape[:2]
    side = max(height, width)
    pad_bottom = side - height
    pad_right = side - width
    # No padding on the top or left edges.
    return cv2.copyMakeBorder(img, 0, pad_bottom, 0, pad_right, cv2.BORDER_CONSTANT, value=0)

In [None]:
import torch.nn as nn
import torchvision.models as models

class MyResNeXt(models.resnet.ResNet):
    """torchvision ResNet configured with grouped Bottleneck blocks and one output.

    Uses Bottleneck blocks in a [3, 4, 6, 3] layout with 32 groups of width 4,
    and replaces the final fully connected layer with a 2048 -> 1 linear layer.
    """

    def __init__(self, training=True):
        # `training` is accepted for compatibility but is not used.
        backbone_config = dict(
            block=models.resnet.Bottleneck,
            layers=[3, 4, 6, 3],
            groups=32,
            width_per_group=4,
        )
        super().__init__(**backbone_config)
        self.fc = nn.Linear(2048, 1)

In [None]:
# Load the saved ResNeXt weights, mapping their storage onto the selected device.
checkpoint = torch.load("/kaggle/input/deepfakes-inference-demo/resnext.pth", map_location=gpu)

# Build the network on that device and restore the weights.
model = MyResNeXt().to(gpu)
model.load_state_dict(checkpoint)
# Switch to evaluation mode; `_ =` keeps the module repr out of the cell output.
_ = model.eval()

# The loaded checkpoint object is no longer needed once the model holds the weights.
del checkpoint

In [None]:
def predict_on_video(video_path, batch_size):
    """Score one video with the notebook's current ``model``.

    Faces are extracted with ``face_extractor``, reduced to one face per frame,
    resized/padded to ``input_size`` squares, normalized and scored; the
    per-face sigmoid outputs are averaged.

    Args:
        video_path: path of the video to score.
        batch_size: fixed number of slots in the input batch. At most this many
            faces are scored; unused slots stay zero-filled and are excluded
            from the average.

    Returns:
        The mean predicted probability over the scored faces, or ``0.5`` when
        no face was found or any step raised an exception.
    """
    try:
        # Find the faces for N frames in the video.
        faces = face_extractor.process_video(video_path)

        # Only look at one face per frame.
        face_extractor.keep_only_best_face(faces)

        if len(faces) > 0:
            # NOTE: When running on the CPU, the batch size must be fixed
            # or else memory usage will blow up. (Bug in PyTorch?)
            x = np.zeros((batch_size, input_size, input_size, 3), dtype=np.uint8)

            # If we found any faces, prepare them for the model.
            n = 0
            skipped = 0
            for frame_data in faces:
                for face in frame_data["faces"]:
                    if n >= batch_size:
                        # Batch is full: count the overflow, skip the resize.
                        skipped += 1
                        continue

                    # Resize to the model's required input size, keeping the
                    # aspect ratio intact and zero-padding to a square.
                    resized_face = isotropically_resize_image(face, input_size)
                    resized_face = make_square_image(resized_face)

                    x[n] = resized_face
                    n += 1

            if skipped:
                # Warn once, reporting the real number of faces found.
                print("WARNING: have %d faces but batch size is %d" % (n + skipped, batch_size))

            if n > 0:
                x = torch.tensor(x, device=gpu).float()

                # Preprocess the images: NHWC -> NCHW, scale to [0, 1], normalize.
                x = x.permute((0, 3, 1, 2))

                for i in range(len(x)):
                    x[i] = normalize_transform(x[i] / 255.)

                # Make a prediction, then take the average.
                with torch.no_grad():
                    y_pred = model(x)
                    # Flatten to 1-D explicitly: squeeze() would produce a 0-d
                    # tensor for batch_size == 1 and break the slice below.
                    y_pred = torch.sigmoid(y_pred.reshape(-1))
                    # Only the first n slots hold real faces.
                    return y_pred[:n].mean().item()

    except Exception as e:
        # Best effort: report the failure and fall back to the neutral score.
        print("Prediction error on video %s: %s" % (video_path, str(e)))

    return 0.5

In [None]:
from concurrent.futures import ThreadPoolExecutor

def predict_on_video_set(videos, num_workers):
    """Score every video in ``videos`` using a pool of worker threads.

    Args:
        videos: file names relative to ``test_dir``.
        num_workers: maximum number of worker threads.

    Returns:
        A list with one prediction per entry of ``videos``, in the same order.
    """
    def score_one(filename):
        full_path = os.path.join(test_dir, filename)
        return predict_on_video(full_path, batch_size=frames_per_video)

    with ThreadPoolExecutor(max_workers=num_workers) as pool:
        results = pool.map(score_one, videos)

    # Leaving the `with` block waits for all workers; results keep input order.
    return list(results)

In [None]:
# Switch for the optional timing run in the next cell.
speed_test = False  # you have to enable this manually

In [None]:
# Optional timing run: score the first five test videos with four workers and
# report the total and per-video wall-clock time. Skipped unless speed_test is set.
if speed_test:
    start_time = time.time()
    speedtest_videos = test_videos[:5]
    predictions = predict_on_video_set(speedtest_videos, num_workers=4)
    elapsed = time.time() - start_time
    print("Elapsed %f sec. Average per video: %f sec." % (elapsed, elapsed / len(speedtest_videos)))

In [None]:
# Score every test video with four worker threads; one prediction per entry
# of test_videos, in the same order.
predictions = predict_on_video_set(test_videos, num_workers=4)

In [None]:
# Pair each test video file name with its prediction and save the table
# (without the index column) as submission_resnext.csv.
submission_df_resnext = pd.DataFrame({"filename": test_videos, "label": predictions})
submission_df_resnext.to_csv("submission_resnext.csv", index=False)

## Xception Net

In [None]:
!pip install ../input/deepfake-xception-trained-model/pytorchcv-0.0.55-py2.py3-none-any.whl --quiet

In [None]:
test_dir = "/kaggle/input/deepfake-detection-challenge/test_videos/"

# Sorted names of every entry in test_dir whose name ends in ".mp4".
test_videos = sorted(name for name in os.listdir(test_dir) if name.endswith(".mp4"))
len(test_videos)

In [None]:
# Use the first CUDA device when one is available, otherwise fall back to CPU.
gpu = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
import sys
sys.path.insert(0, "/kaggle/input/blazeface-pytorch")
sys.path.insert(0, "/kaggle/input/deepfakes-inference-demo")

In [None]:
from blazeface import BlazeFace
# Create the BlazeFace detector on the selected device, then load its weights
# and anchors from the attached blazeface-pytorch dataset.
facedet = BlazeFace().to(gpu)
facedet.load_weights("/kaggle/input/blazeface-pytorch/blazeface.pth")
facedet.load_anchors("/kaggle/input/blazeface-pytorch/anchors.npy")
# train(False) presumably switches the detector to inference mode (nn.Module
# convention - confirm against blazeface.py); `_ =` hides the cell output.
_ = facedet.train(False)

In [None]:
from helpers.read_video_1 import VideoReader
from helpers.face_extract_1 import FaceExtractor

# Frames sampled from each video (originally 4).
frames_per_video = 64

video_reader = VideoReader()


def video_read_fn(x):
    """Read ``frames_per_video`` frames from ``x`` via the shared video reader."""
    return video_reader.read_frames(x, num_frames=frames_per_video)


face_extractor = FaceExtractor(video_read_fn, facedet)

In [None]:
# Side length, in pixels, of the square face images that predict_on_video
# builds for the model.
input_size = 150

In [None]:
from torchvision.transforms import Normalize

# Per-channel mean and standard deviation used to normalize face images after
# they are scaled to [0, 1] (these match the commonly used ImageNet statistics).
mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]
normalize_transform = Normalize(mean, std)

In [None]:
def isotropically_resize_image(img, size, resample=cv2.INTER_AREA):
    """Resize ``img`` so its longer side equals ``size``, keeping the aspect ratio.

    Args:
        img: image array whose first two dimensions are height and width.
        size: target length, in pixels, of the longer side.
        resample: OpenCV interpolation flag passed to ``cv2.resize``.

    Returns:
        The resized image. The shorter side is scaled proportionally (rounded
        down) but never below one pixel.

    Raises:
        ValueError: if ``img`` has zero height or zero width.
    """
    h, w = img.shape[:2]
    if h == 0 or w == 0:
        raise ValueError("cannot resize an empty image of shape %r" % (img.shape,))

    if w > h:
        # Landscape: width becomes `size`. Clamp the height so a very elongated
        # image cannot round down to an invalid 0-pixel side.
        h = max(1, h * size // w)
        w = size
    else:
        # Portrait or square: height becomes `size`.
        w = max(1, w * size // h)
        h = size

    return cv2.resize(img, (w, h), interpolation=resample)


def make_square_image(img):
    """Pad ``img`` with zeros on the bottom and right until it is square.

    The side length of the result is the larger of the image's height and
    width; the original pixels stay in the top-left corner.
    """
    height, width = img.shape[:2]
    side = max(height, width)
    pad_bottom = side - height
    pad_right = side - width
    # No padding on the top or left edges.
    return cv2.copyMakeBorder(img, 0, pad_bottom, 0, pad_right, cv2.BORDER_CONSTANT, value=0)

In [None]:
!ls ../input/deepfake-xception-trained-model

In [None]:
from pytorchcv.model_provider import get_model
# Build an Xception network from pytorchcv without downloading pretrained
# weights, then keep every child module except the last one as the backbone.
model = get_model("xception", pretrained=False)
model = nn.Sequential(*list(model.children())[:-1]) # Remove original output layer

class Pooling(nn.Module):
  """Mean of global average pooling and global max pooling (output is 1x1)."""

  def __init__(self):
    super().__init__()

    self.p1 = nn.AdaptiveAvgPool2d((1,1))
    self.p2 = nn.AdaptiveMaxPool2d((1,1))

  def forward(self, x):
    avg_pooled, max_pooled = self.p1(x), self.p2(x)
    return (avg_pooled + max_pooled) * 0.5

# Replace the backbone's final pooling layer with global average pooling to 1x1.
model[0].final_block.pool = nn.Sequential(nn.AdaptiveAvgPool2d((1,1)))

class Head(torch.nn.Module):
  """Classifier head applied to the backbone features.

  Pipeline: flatten -> BatchNorm -> dropout -> linear(in_f, 512) -> ReLU ->
  BatchNorm -> dropout -> linear(512, out_f). The same dropout module is used
  in both places.
  """

  def __init__(self, in_f, out_f):
    super().__init__()

    hidden_f = 512
    self.f = nn.Flatten()
    self.l = nn.Linear(in_f, hidden_f)
    self.d = nn.Dropout(0.5)
    self.o = nn.Linear(hidden_f, out_f)
    self.b1 = nn.BatchNorm1d(in_f)
    self.b2 = nn.BatchNorm1d(hidden_f)
    self.r = nn.ReLU()

  def forward(self, x):
    # Input stage: flatten, normalize, drop out.
    features = self.d(self.b1(self.f(x)))
    # Hidden stage: project, activate, normalize, drop out.
    hidden = self.d(self.b2(self.r(self.l(features))))
    return self.o(hidden)

class FCN(torch.nn.Module):
  """Backbone followed by a single-output ``Head``."""

  def __init__(self, base, in_f):
    super().__init__()
    self.base = base
    self.h1 = Head(in_f, 1)

  def forward(self, x):
    # Backbone features go straight into the classifier head.
    return self.h1(self.base(x))

net = []
# Wrap the backbone with the classifier head (2048 backbone features in).
model = FCN(model, 2048)
# Place the model on the notebook's selected device instead of hard-coding
# CUDA, and map the checkpoint onto that same device so loading also works
# when no GPU is available (same approach as the ResNeXt checkpoint).
model = model.to(gpu)
model.load_state_dict(torch.load('../input/deepfake-xception-trained-model/model.pth', map_location=gpu)) # new, updated
net.append(model)

## Prediction loop

In [None]:
def predict_on_video(video_path, batch_size):
    """Score one video with the notebook's current ``model``.

    Faces are extracted with ``face_extractor``, reduced to one face per frame,
    resized/padded to ``input_size`` squares, normalized and scored; the
    per-face sigmoid outputs are averaged.

    Args:
        video_path: path of the video to score.
        batch_size: fixed number of slots in the input batch. At most this many
            faces are scored; unused slots stay zero-filled and are excluded
            from the average.

    Returns:
        The mean predicted probability over the scored faces, or ``0.5`` when
        no face was found or any step raised an exception.
    """
    try:
        # Find the faces for N frames in the video.
        faces = face_extractor.process_video(video_path)

        # Only look at one face per frame.
        face_extractor.keep_only_best_face(faces)

        if len(faces) > 0:
            # NOTE: When running on the CPU, the batch size must be fixed
            # or else memory usage will blow up. (Bug in PyTorch?)
            x = np.zeros((batch_size, input_size, input_size, 3), dtype=np.uint8)

            # If we found any faces, prepare them for the model.
            n = 0
            skipped = 0
            for frame_data in faces:
                for face in frame_data["faces"]:
                    if n >= batch_size:
                        # Batch is full: count the overflow, skip the resize.
                        skipped += 1
                        continue

                    # Resize to the model's required input size, keeping the
                    # aspect ratio intact and zero-padding to a square.
                    resized_face = isotropically_resize_image(face, input_size)
                    resized_face = make_square_image(resized_face)

                    x[n] = resized_face
                    n += 1

            if skipped:
                # Warn once, reporting the real number of faces found.
                print("WARNING: have %d faces but batch size is %d" % (n + skipped, batch_size))

            if n > 0:
                x = torch.tensor(x, device=gpu).float()

                # Preprocess the images: NHWC -> NCHW, scale to [0, 1], normalize.
                x = x.permute((0, 3, 1, 2))

                for i in range(len(x)):
                    x[i] = normalize_transform(x[i] / 255.)

                # Make a prediction, then take the average.
                with torch.no_grad():
                    y_pred = model(x)
                    # Flatten to 1-D explicitly: squeeze() would produce a 0-d
                    # tensor for batch_size == 1 and break the slice below.
                    y_pred = torch.sigmoid(y_pred.reshape(-1))
                    # Only the first n slots hold real faces.
                    return y_pred[:n].mean().item()

    except Exception as e:
        # Best effort: report the failure and fall back to the neutral score.
        print("Prediction error on video %s: %s" % (video_path, str(e)))

    return 0.5

In [None]:
from concurrent.futures import ThreadPoolExecutor

def predict_on_video_set(videos, num_workers):
    """Score every video in ``videos`` using a pool of worker threads.

    Args:
        videos: file names relative to ``test_dir``.
        num_workers: maximum number of worker threads.

    Returns:
        A list with one prediction per entry of ``videos``, in the same order.
    """
    def score_one(filename):
        full_path = os.path.join(test_dir, filename)
        return predict_on_video(full_path, batch_size=frames_per_video)

    with ThreadPoolExecutor(max_workers=num_workers) as pool:
        results = pool.map(score_one, videos)

    # Leaving the `with` block waits for all workers; results keep input order.
    return list(results)

In [None]:
# Switch for the optional timing run in the next cell; set to True to enable it.
speed_test = False

In [None]:
# Optional timing run: score the first five test videos with four workers and
# report the total and per-video wall-clock time. Skipped unless speed_test is set.
if speed_test:
    start_time = time.time()
    speedtest_videos = test_videos[:5]
    predictions = predict_on_video_set(speedtest_videos, num_workers=4)
    elapsed = time.time() - start_time
    print("Elapsed %f sec. Average per video: %f sec." % (elapsed, elapsed / len(speedtest_videos)))

In [None]:
%%time
# Put the model in evaluation mode, then score every test video with four
# worker threads; one prediction per entry of test_videos, in the same order.
model.eval()
predictions = predict_on_video_set(test_videos, num_workers=4)

In [None]:
# Pair each test video file name with its prediction and save the table
# (without the index column) as submission_xception.csv.
submission_df_xception = pd.DataFrame({"filename": test_videos, "label": predictions})
submission_df_xception.to_csv("submission_xception.csv", index=False)

In [None]:
# Preview the first rows of the ResNeXt predictions.
submission_df_resnext.head()

In [None]:
# Preview the first rows of the Xception predictions.
submission_df_xception.head()

## Ensemble of Resnext and Xception

In [None]:
# Start the final submission table with one row per test video file name.
submission_df = pd.DataFrame({"filename": test_videos})

In [None]:
# Raw per-model figures used to derive the blend weights.
# NOTE(review): presumably the two models' individual scores - confirm their
# origin and which model each one belongs to.
r1, r2 = 0.46441, 0.52189
total = r1 + r2
# Normalize so the two weights sum to 1.
r11, r22 = r1 / total, r2 / total

In [None]:
# Weighted blend of the two models' labels. The weights are crossed on purpose
# or by accident: ResNeXt is multiplied by r22 and Xception by r11.
# NOTE(review): confirm this pairing is intended.
# NOTE(review): the columns are combined by DataFrame index, which presumably
# lines up because both frames were built from the same ordered test_videos list.
submission_df["label"] = r22*submission_df_resnext["label"] + r11*submission_df_xception["label"]

In [None]:
# Write the blended predictions, without the index column, to submission.csv.
submission_df.to_csv("submission.csv", index=False)