In [1]:
# @Author: Ibrahim Salihu Yusuf <Ibrahim>
# @Date:   2019-12-10T11:24:45+02:00
# @Email:  sibrahim1396@gmail.com
# @Project: Audio Classifier
# @Last modified by:   yusuf
# @Last modified time: 2019-12-19T11:25:08+02:00



import os
import glob
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torchaudio
import torch.nn as nn
from  keras import backend as K
from sklearn import preprocessing
import moviepy.editor as mp
import torchvision
from keras.models import load_model, Model
import torchvision.transforms as transforms
from PIL import Image


# Shared label encoder, fitted later on the metadata labels.
le = preprocessing.LabelEncoder()

# Run on the GPU whenever CUDA reports one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

class VideoDataset(Dataset):
    """
    A wrapper class for the Video dataset.

    Every item is built on the fly: the clip is decoded, each frame is pushed
    through ``frame_model`` and the audio track is turned into a mel
    spectrogram that is fed to ``audio_model``.
    """

    def __init__(self, video_paths, frame_model, audio_model, transform = None, limit=None, file_path=None, df=None):
        """
        Args:
            video_paths (string): directory that contains the video files
            frame_model: torch module applied to every frame; it is switched to
                eval mode here and each output is stored in a 2048-wide row
            audio_model: object whose ``predict`` method is called on the mel spectrogram
            transform (callable, optional): applied to each PIL frame before ``frame_model``
            limit (int, optional): number of frames to embed per clip; clips with
                fewer frames keep zero rows at the end
            file_path: unused, kept so existing callers keep working
            df (DataFrame): frame indexed by video file name with a ``label2`` column
        """
        self.video_file = df
        self.video_paths = video_paths
        self.frame_model = frame_model.eval()
        self.audio_model = audio_model
        self.transform = transform
        self.limit = limit

    def __len__(self):
        """Number of videos listed in the metadata frame."""
        return len(self.video_file)

    def __getitem__(self, idx):
        """Return ``(frame_embeddings, audio_embedding, label)`` tensors for row ``idx``."""
        video_file = self.video_file.index[idx]
        clip = mp.VideoFileClip(os.path.join(self.video_paths, video_file)) #pass in the video file path here
        try:
            no_frames = int(np.round(clip.duration)) * int(np.round(clip.fps))

            if self.limit:
                frame_embeddings = np.zeros((self.limit, 2048))
            else:
                frame_embeddings = np.zeros((no_frames, 2048))
            max_rows = frame_embeddings.shape[0]

            # The embeddings are only used as fixed features, so no autograd graph is needed.
            with torch.no_grad():
                for j, frame in enumerate(clip.iter_frames()):
                    # The second test guards the buffer: ``no_frames`` comes from rounded
                    # duration/fps and can be smaller than the real frame count.
                    if j == self.limit or j >= max_rows:
                        break
                    frame = Image.fromarray(frame)
                    if self.transform:
                        frame = self.transform(frame)
                    frame_embeddings[j, :] = (self.frame_model(frame.unsqueeze(0)).detach().numpy()).squeeze(0)

            # NOTE(review): assumes every clip has an audio track — confirm for silent videos.
            audio = clip.audio.to_soundarray()
        finally:
            # Release the underlying reader even when decoding fails.
            clip.close()

        # Average the channels into a single mono column before the spectrogram.
        audio = audio.mean(1, keepdims=True)
        mel_specgram = torchaudio.transforms.MelSpectrogram(sample_rate=44100, n_fft=2048, win_length=2000, hop_length=500, n_mels=240)(torch.tensor(audio).float().T)  # (channel, n_mels, time)
        # Swap the last two axes: (channel, n_mels, time) -> (channel, time, n_mels).
        mel_specgram = np.moveaxis(mel_specgram.detach().numpy(), 1, 2)
        audio_embedding = [self.audio_model.predict(mel_specgram).squeeze()]
        # Positional lookup: ``idx`` is a row number, while the frame is indexed by file name.
        label = self.video_file['label2'].iloc[idx]

        return torch.tensor(frame_embeddings), torch.tensor(audio_embedding), torch.tensor(label)
    
def init_weights(m):
    """Xavier-uniform initialise the weight of ``m`` when it is exactly a Conv1d or Linear layer.

    Meant to be passed to ``nn.Module.apply``; every other module type is left untouched.
    """
    if type(m) not in (nn.Conv1d, nn.Linear):
        return
    nn.init.xavier_uniform_(m.weight.data)

def train(model, train_loader, optimizer, criterion, verbose=False):
    """Run one optimisation pass over ``train_loader``.

    Args:
        model: module called as ``model(frame_data, audio_data)``.
        train_loader: iterable of ``(frame_data, audio_data, label)`` batches
            that also exposes ``len()`` and ``.dataset``.
        optimizer: optimiser stepped once per batch.
        criterion: loss called as ``criterion(out, label)``.
        verbose (bool): print the loss of every batch when True.

    Returns:
        tuple: (mean batch loss, fraction of correctly classified samples).
    """
    print("Training Model")
    model.train()
    running_loss = 0
    n_correct = 0
    for step, (frame_data, audio_data, label) in enumerate(train_loader):
        # Move the whole batch to the module-level device.
        frame_data = frame_data.to(device)
        audio_data = audio_data.to(device)
        label = label.to(device)

        optimizer.zero_grad()
        logits = model(frame_data, audio_data)

        # Accuracy bookkeeping: the argmax result is a fresh tensor, so the
        # in-place comparison does not touch the logits.
        predicted = torch.argmax(logits, dim=1)
        n_correct += predicted.eq_(label).sum().item()

        loss = criterion(logits, label)
        if verbose:
            print("Batch Loss for {}/{} is: {}".format(step, len(train_loader), loss.item()))
        running_loss += loss.item()

        loss.backward()
        optimizer.step()

    mean_loss = running_loss / len(train_loader)
    hit_rate = n_correct / len(train_loader.dataset)
    return mean_loss, hit_rate

def test(model, test_loader, criterion, verbose=False):
  print("Testing Model")
  with torch.no_grad():
    model.eval()
    test_correct = 0
    test_loss = 0
    i = 0
    for frame_data, audio_data, label in test_loader:
        frame_data = frame_data.to(device)
        label = label.to(device)
        audio_data = audio_data.to(device)
        out2 = model(frame_data, audio_data)
        loss2 = criterion(out2, label)
        if verbose:
            print("Batch Loss for {}/{} is: {}".format(i, len(test_loader), loss2.item()))
        test_loss += loss2.item()
        test_correct += (torch.argmax(out2, dim=1).eq_(label).sum()).item()
        i+=1
    avg_loss = test_loss/len(test_loader)
    accuracy = test_correct/len(test_loader.dataset)
  return avg_loss, accuracy


def customPooling(x):
    """Mean of ``x[1]`` over axis 1, counting only the steps that are not padding.

    Args:
        x: pair ``(inputs, target)``. A step of ``inputs`` counts as padding
           when every value along its last axis equals 0.

    Returns:
        The sum of the non-padded ``target`` steps over axis 1 divided by the
        number of non-padded steps of each sample.
    """
    inputs, target = x[0], x[1]
    pad_value = 0

    # True wherever a whole step of the model input is padding.
    is_padding = K.all(K.equal(inputs, pad_value), axis=-1, keepdims=True)

    # 1.0 for valid steps, 0.0 for padded ones.
    valid = 1 - K.cast(is_padding, K.floatx())

    # Number of valid steps per sample; used as the divisor of the mean.
    n_valid = K.sum(valid, axis=1, keepdims=False)

    # Padded steps are zeroed so that they add nothing to the sum.
    masked_total = K.sum(target * valid, axis=1, keepdims=False)

    return masked_total / n_valid

class Identity(nn.Module):
    """Pass-through layer: ``forward`` hands back its argument unchanged."""

    def __init__(self):
        super(Identity, self).__init__()

    def forward(self, x):
        """Return ``x`` itself."""
        return x

class EarlyStopping:
    """Early stops the training if validation loss doesn't improve after a given patience."""
    def __init__(self, patience=10, verbose=False, path='DeepFake_model_checkpoint1.tar'):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 10
            verbose (bool): If True, prints a message for each validation loss improvement.
                            Default: False
            path (str): File the checkpoint is written to.
                            Default: 'DeepFake_model_checkpoint1.tar'
        """
        self.patience = patience
        self.verbose = verbose
        self.path = path
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        # np.inf rather than the np.Inf alias, which NumPy 2.0 removed.
        self.val_loss_min = np.inf

    def __call__(self, val_loss, model, optimizer):
        """Record one validation result.

        Saves a checkpoint when ``val_loss`` is not worse than the best seen so
        far; otherwise increments the counter and sets ``early_stop`` once the
        counter reaches ``patience``.
        """
        # Higher score is better, so the loss is negated.
        score = -val_loss

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model, optimizer)
        elif score < self.best_score:
            self.counter += 1
            print('EarlyStopping counter: {} out of {}'.format(self.counter, self.patience))
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model, optimizer)
            self.counter = 0

    def save_checkpoint(self, val_loss, model, optimizer):
        '''Saves the model and optimizer state dicts to ``self.path`` and records ``val_loss`` as the new minimum.'''
        if self.verbose:
            print('Validation loss decreased ({} --> {}).  Saving model ...'.format(self.val_loss_min, val_loss))

        DeepFake_model_checkpoint={'model_state_dict':model.state_dict(),
                       'optimizer_state_dict': optimizer.state_dict(),
                        }

        torch.save(DeepFake_model_checkpoint, self.path)
        self.val_loss_min = val_loss


Using TensorFlow backend.


In [117]:
class DeepFake1(nn.Module):
    """Two stacked bidirectional LSTMs over ``x`` whose summary is fused with ``y`` for a 2-way output."""

    def __init__(self):
        super().__init__()
        # Both LSTMs are bidirectional with 512 hidden units, so each emits 1024 features per step.
        self.LSTM1 = nn.LSTM(2048, 512, bidirectional=True, batch_first=True)
        self.LSTM2 = nn.LSTM(1024, 512, bidirectional=True, batch_first=True)
        self.relu = nn.ReLU()
        # 1074 inputs: the 1024-wide LSTM summary concatenated with ``y``
        # (which therefore has to contribute 50 features).
        self.fc1 = nn.Linear(1074, 512)
        self.fc2 = nn.Linear(512, 2)

    def forward(self, x, y):
        """Return the 2-wide output of the head for sequence input ``x`` and side input ``y``.

        ``x`` is cast to float and read batch-first; ``y`` has its axis 1
        squeezed before being concatenated with the LSTM summary.
        """
        first_seq, _ = self.LSTM1(x.float())
        second_seq, _ = self.LSTM2(first_seq)
        # Residual-style sum of the last time step of both LSTM layers.
        summary = first_seq[:, -1, :] + second_seq[:, -1, :]
        fused = torch.cat((summary, y.squeeze(1)), 1)
        hidden = self.relu(self.fc1(fused))
        return self.fc2(hidden)
    
class DeepFake2(nn.Module):
    """Stacked bidirectional LSTMs with residual connections, fused with a side input ``y``.

    Args:
        embed_size (int): width of every step of ``x``. The forward pass adds the
            step-mean of ``x`` to the LSTM outputs, so it has to be compatible
            with ``2 * LSTM_UNITS``.
        LSTM_UNITS (int): hidden size of each LSTM direction.
        audio_embed_size (int): width of ``y`` after its axis 1 is squeezed.
            The default of 50 reproduces the original 562-wide head input
            (``1024 // 2 + 50``).
    """

    def __init__(self, embed_size=2048, LSTM_UNITS=1024, audio_embed_size=50):
        super(DeepFake2, self).__init__()

        self.lstm1 = nn.LSTM(embed_size, LSTM_UNITS, bidirectional=True, batch_first=True)
        self.lstm2 = nn.LSTM(LSTM_UNITS * 2, LSTM_UNITS, bidirectional=True, batch_first=True)

        self.linear1 = nn.Linear(LSTM_UNITS*2, LSTM_UNITS*2)
        self.linear2 = nn.Linear(LSTM_UNITS*2, LSTM_UNITS//2)

        # Head input = linear2 output concatenated with the side input.
        self.linear = nn.Linear(LSTM_UNITS//2 + audio_embed_size, 2)

    def forward(self, x, y):
        """Return the 2-wide output for sequence input ``x`` and side input ``y``.

        Both inputs are cast to float32 so the concatenation matches the dtype
        of the layer weights.
        """
        _input1 = x.float()
        _input2 = y.float()

        out_lstm1, _ = self.lstm1(_input1)
        out_lstm2, _ = self.lstm2(out_lstm1)

        out_linear1  = F.relu(self.linear1(out_lstm1[:, -1, :]))
        # Sum of: last step of both LSTMs, the transformed last step, and the mean over steps of the input.
        hidden = out_lstm1[:, -1, :] + out_lstm2[:, -1, :] + out_linear1 + _input1.mean(1)

        out_linear2  = F.relu(self.linear2(hidden))

        # Use the float-cast side input (the cast result was previously computed but ignored).
        combined = torch.cat((out_linear2, _input2.squeeze(1)), 1)

        output = self.linear(combined)

        return output
    
class DeepFake3(nn.Module):
    """Two stacked bidirectional LSTMs over a 562-wide sequence with a single-value output head."""

    def __init__(self):
        # Was ``super(DeepFake1, self)``, which raises TypeError because a
        # DeepFake3 instance is not a DeepFake1.
        super().__init__()
        self.LSTM1 = nn.LSTM(562, 256, bidirectional=True, batch_first=True)
        self.LSTM2 = nn.LSTM(512, 256, bidirectional=True, batch_first=True)
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(512, 256)
        self.fc2 = nn.Linear(256, 1)

    def forward(self, x):
        """Return one value per sample for the batch-first sequence ``x`` (cast to float)."""
        out1, _ = self.LSTM1(x.float())
        out2, _ = self.LSTM2(out1)
        # Residual-style sum of the last time step of both LSTM layers.
        out = out1[:, -1, :] + out2[:, -1, :]
        x = self.relu(self.fc1(out))
        x = self.fc2(x)
        return x
    

In [254]:
# @Author: Ibrahim Salihu Yusuf <Ibrahim>
# @Date:   2019-12-10T12:28:39+02:00
# @Email:  sibrahim1396@gmail.com
# @Project: Audio Classifier
# @Last modified by:   yusuf
# @Last modified time: 2019-12-19T11:09:51+02:00

import os
import glob
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import torchaudio
import torch.nn as nn
from torch.utils.tensorboard import SummaryWriter
import moviepy.editor as mp
import torchvision
from  keras import backend as K
from keras.models import load_model, Model
import torchvision.transforms as transforms
from PIL import Image
from sklearn.model_selection import train_test_split

# from models2 import *
# from utils2 import *
import json, time

# --- Paths and reproducibility ------------------------------------------------
train_file_path = "train_sample_videos/metadata.json"
# Defined up front: the file-existence filter below reads it.
video_paths = "train_sample_videos"
# Fixed seed so the train/test split is identical on every run.
SPLIT_SEED = 42

# --- Metadata -----------------------------------------------------------------
# After transpose + reset_index the former row labels live in the 'index' column;
# they are later joined onto ``video_paths`` as file names.
main_df = pd.read_json(train_file_path).transpose().reset_index()
main_df['label2'] = le.fit_transform(main_df.label.values)

df1 = pd.DataFrame((main_df['index'], main_df['label2'])).T
df2 = pd.DataFrame((main_df['original']))
# NOTE(review): the videos named in 'original' get the hard-coded class id 1;
# presumably that is the id the encoder assigns to the non-fake label — confirm.
df2["label2"] = 1
df2 = df2.dropna()
# Rename in place of ``df2.set_axis([...], 1)``, whose result was discarded on
# pandas >= 1.0 and whose positional axis argument newer pandas rejects.
df2.columns = ["index", "label2"]
# NOTE(review): the same file name can appear several times after this concat
# (one 'original' may be listed by many rows) — confirm duplicates are intended.
new_df = pd.concat((df1, df2))
new_df = new_df.set_index("index")

#Drop files not present
z = lambda x: os.path.exists(os.path.join(video_paths, x))
new_df['exist'] = new_df.index.map(z)
pure_df = new_df[new_df.exist==True]

train_df, test_df = train_test_split(pure_df, test_size=0.2, random_state=SPLIT_SEED)

# --- Feature extractors ---------------------------------------------------------
# Audio: reuse the saved Keras model up to its third-from-last layer as an embedder.
model = load_model("saved_model_240_8_32_0.05_1_50_0_0.0001_100_156_2_True_True_fitted_objects.h5", custom_objects={'customPooling': customPooling})
audio_model = Model(inputs = model.input, outputs = model.layers[-3].output)

# Frames: pretrained Inception v3 with its final fc layer replaced by a pass-through.
frame_model = torch.hub.load('pytorch/vision:v0.4.2', 'inception_v3', pretrained=True)
frame_model.eval()
frame_model.fc = Identity()

# NOTE(review): Inception v3 is usually fed 299x299 images; 229 may be a typo — confirm.
img_transforms = transforms.Compose([transforms.Resize((229, 229)),
                                 transforms.ToTensor(),
                                transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Module-level containers; ``overall_result`` is filled by main() and dumped afterwards.
avg_loss = []
overall_result = {}

def main():
    """Train ``DeepFake1`` on the prepared split, evaluate every epoch and store the metrics.

    Reads the module-level ``train_df``, ``test_df``, ``video_paths``,
    ``frame_model``, ``audio_model``, ``img_transforms`` and ``device``.
    Per-epoch losses and accuracies are stored in ``overall_result`` and
    written to ``/results/<model_name>_result.json`` on a best-effort basis.
    """
    model = DeepFake1().to(device).apply(init_weights)
    model_name = "DeepFake_model"

    inter_result = []

    train_dataset = VideoDataset(df=train_df, video_paths=video_paths, frame_model=frame_model, audio_model=audio_model, transform=img_transforms, limit=150)
    test_dataset = VideoDataset(df=test_df, video_paths=video_paths, frame_model=frame_model, audio_model=audio_model, transform=img_transforms, limit=150)

    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=8)

    optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0001)
    criterion = nn.CrossEntropyLoss()
    # NOTE(review): gamma=0.007645 multiplies the learning rate by ~0.0076 after
    # every epoch, i.e. it is almost zero after two epochs — confirm this is intended.
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, 0.007645)

    early_stopping = EarlyStopping(patience=5, verbose=True)

    # NOTE(review): EarlyStopping saves to 'DeepFake_model_checkpoint1.tar' by default,
    # while training resumes from 'DeepFake_model_checkpoint.tar' — confirm the
    # different names are deliberate.
    if os.path.isfile('DeepFake_model_checkpoint.tar'):
        print('DeepFake_model_checkpoint.tar found..')
        print('Loading checkpoint..')
        # map_location lets a checkpoint saved on another device load on this one.
        checkpoint = torch.load('DeepFake_model_checkpoint.tar', map_location=device)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        print('Finished loading checkpoint..')

    epochs = 10
    print_every = 1

    train_losses = []
    test_losses = []
    train_accuracy = []
    test_accuracy = []

    for i in range(epochs):
        print("Start of Training epoch {}".format(i+1))
        t0 = time.time()
        train_loss, train_acc = train(model, train_loader, optimizer, criterion, verbose=True)
        t1 = time.time()
        print("Train Loss at end of Epoch {}/{} is {} | Train accuracy:{} | Time:{}".format(i+1, epochs, train_loss, train_acc, t1-t0))

        train_losses.append(train_loss)
        train_accuracy.append(train_acc)

        if (i+1)%print_every == 0:
            print("Start of testing epoch {}".format(i+1))
            t0 = time.time()
            test_loss, test_acc = test(model, test_loader, criterion, verbose=True)
            t1 = time.time()
            print("Test Loss at end of Epoch{}/{} is {} | Test accuracy:{} | Time:{}".format(i+1, epochs, test_loss, test_acc, t1-t0))

            test_losses.append(test_loss)
            test_accuracy.append(test_acc)

            # Checkpointing stays best-effort, but failures are reported and
            # KeyboardInterrupt is no longer swallowed.
            try:
                early_stopping(test_loss, model, optimizer)
            except Exception as exc:
                print("Early stopping / checkpointing failed: {}".format(exc))
            if early_stopping.early_stop:
                print("Early stopping")
                break
        scheduler.step()

    inter_result.append({'train_loss':train_losses, 'train_accuracy':train_accuracy, 'test_loss':test_losses, 'test_accuracy':test_accuracy})
    overall_result["{}".format(model_name)] = inter_result
    out_filename = '/results/{}_result.json'.format(model_name)
    # The former ``writer.close()`` call was removed: no SummaryWriter is created
    # in this function, so it raised NameError before the results were written.
    try:
        with open(out_filename, 'w') as f:
            json.dump(inter_result, f)
    except Exception as exc:
        print("Could not write {}: {}".format(out_filename, exc))
# Entry point: time the whole run, then persist the collected metrics.
if __name__ == "__main__":
    t0 = time.time()
    main()
    t1 = time.time()
    print("Total Training time is: {}".format(t1 - t0))
    print("\n\nEnd of training\nLogging results..")
    # ``overall_result`` is the module-level dict that main() fills in.
    with open('/data/overall_results.json', 'w') as f:
        f.write(json.dumps(overall_result))


KeyboardInterrupt: 

In [10]:
# Compute the mel spectrogram of one sample file with the same transform settings as VideoDataset.
# NOTE(review): the sample rate returned by the loader is not checked against
# the 44100 passed to the transform — confirm the file matches.
audio, sample_rate = torchaudio.load("TMaRdy00.wav")
audio = audio.T#.mean(1, keepdims=True)
# ``audio`` is already a tensor, so it is transposed back and cast directly;
# wrapping it in torch.tensor() made a needless copy and emitted a UserWarning.
mel_specgram = torchaudio.transforms.MelSpectrogram(sample_rate=44100, n_fft=2048, win_length=2000, hop_length=500, n_mels=240)(audio.T.float())  # (channel, n_mels, time)


In [11]:
# Show the spectrogram dimensions; the transform call is annotated as (channel, n_mels, time).
mel_specgram.shape

torch.Size([1, 240, 76])

Started training at 19:50