In [4]:
import torch
import torchvision
from torchvision import transforms, models
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import cv2, os, json
import time, random
import matplotlib.pyplot as plt

#### 1. Creating the clips

* Firstly, the timestamps are manually marked and kept in a text-file, with a corresponding url of the yt video. The video is then downloaded locally.
* The downloaded video is loaded, then a clip of 3 seconds, 2s before the timestamp and 1s after, is created and stored in `data/`

In [None]:
from pytubefix import YouTube
from pytubefix.cli import on_progress


# download the yt videos

video1_url = 'www.youtube.com/watch?v=P22HqM9w500'
video2_url = "https://www.youtube.com/watch?v=GsLhzD72yYA"
video3_url = 'https://www.youtube.com/watch?v=k4uLbGkm6Ls'
video4_url = "https://www.youtube.com/watch?v=xjy-7ZCohG8"

yt = YouTube(video4_url, on_progress_callback=on_progress).streams[0]


print(yt)
yt.download('video4.mp4')

In [None]:
# load the timestamps and convert them from MM:SS formats to only seconds
with open("video4_kill_timestamps.txt") as f:
    lines = f.readlines()

lines = [t[:-1] for t in lines]
lines = [t.split(':') for t in lines]
lines = [int(t[0])*60+int(t[1]) for t in lines]

lines

In [None]:
import moviepy as mp

#load the downloaded video
video = mp.VideoFileClip('video4.mp4')

#clip out the t-2 to t+1 section for each timestamp t, and save it
for t in lines:
    t = int(t)
    subclip = video[t-2:t+1]
    subclip.write_videofile(f"data/video4_{t-2}_{t+1}.mp4", codec='libx264')

#### function for loading the clips as frames

In [5]:
def load_video_as_frames(path:str):

    video = cv2.VideoCapture(path)
    success, img = video.read()

    
    frame_count = int(video.get(cv2.CAP_PROP_FRAME_COUNT))

    frames = torch.zeros((frame_count, *img.shape))

    i= 0
    while success:

        frames[i] = torch.Tensor(img)
        i+=1
        success, img = video.read()

    
    frames = torch.permute(frames, (0, 3, 1, 2))
    
    return frames

    # torch.Tensor(frames[0])
    # print(frames.size())

### 2. The architechture(or pipeline, or whatver)

* Firstly, each clip is loaded, then it is batched and a feature representation is obtained for each individual frame. A pretrained model, (such as a resnet) is used for this, while optionally unfreezing the last layer 
* The features are then passed sequentially into an LSTM one after another, a single vector is finally recieved, encapsulating the sequential form of the data.
* now it is passed into a (Variational) autoencoder, and trained. 


In [123]:
from torchvision.models import resnet50, ResNet50_Weights

class myResnet(nn.Module):

    def __init__(self):
        super().__init__()

        self.model = resnet50(weights=ResNet50_Weights.IMAGENET1K_V2)
        self.transforms = ResNet50_Weights.IMAGENET1K_V2.transforms()

        self.model.fc = nn.Identity()

        for p in self.model.parameters():
            p.requires_grad_(False)
        
    def preprocess(self, X):
        return self.transforms(X)
    
    def forward(self, X):
        return self.model(X)
    
class myLSTM(nn.Module):

    def __init__(self, input_size, hidden_size, num_layers):
        super().__init__()

        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        # self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, X):

        out, (hn, cn) = self.lstm(X)

        #out.shape = (batch_size, seq_length, hidden_size)
        
        #take the last element from the sequence
        if out.ndim == 3: #batched
            out = out[:, -1, :] 
        elif out.ndim == 2: #unbatched
            out = out[-1, :]


        return out
    
class AE(nn.Module):

    def __init__(self, input_size, latent_size, hidden_dims:list=[256, 128]):
        
        super().__init__()
        layers = []

        hidden_dims.append(latent_size)
        hidden_dims.insert(0, input_size)

        for i in range(len(hidden_dims)-1):
            layers.append(nn.Linear(hidden_dims[i], hidden_dims[i+1]))
            layers.append(nn.ReLU())
        
        self.encoder = nn.Sequential(*layers)

        layers = []
        hidden_dims.reverse()
        for i in range(len(hidden_dims)-1):
            layers.append(nn.Linear(hidden_dims[i], hidden_dims[i+1]))
            layers.append(nn.ReLU())
        
        self.decoder = nn.Sequential(*layers)

    def forward(self, X):
        out = self.encoder(X)
        out = self.decoder(out)
        return out
    
    def get_loss(self, input, output):
        return F.mse_loss(input, output)

class VAE(nn.Module):

    def __init__(self, input_dim, latent_dim, hidden_dims=[256, 128]):
        super().__init__()

        layers = []

        k=input_dim

        for dim in hidden_dims:
            layers.append(nn.Linear(k, dim))
            layers.append(nn.LeakyReLU())

            k = dim

        self.encoder = nn.Sequential(*layers)

        self.fc_mu = nn.Linear(k, latent_dim)
        self.fc_log_var = nn.Linear(k, latent_dim)

        layers = []
        hidden_dims.reverse()
        k = latent_dim

        for dim in hidden_dims:
            layers.append(nn.Linear(k, dim))
            layers.append(nn.LeakyReLU())
            k = dim
        
        layers.append(nn.Linear(k, input_dim))

        self.decoder = nn.Sequential(*layers)
    
    def forward(self, X):

        out = self.encoder(X)

        #get mean and log(var)
        mean = self.fc_mu(out)
        log_var = self.fc_log_var(out)

        #reparametrization
        epsilon = torch.randn_like(mean)
        z = mean + epsilon*log_var

        #decode
        X_recon = self.decoder(z)

        return X_recon, mean, log_var
    
    def get_loss(self, X, target):

        X_recon, mean, log_var = target

        normal_loss = F.mse_loss(X, X_recon, reduce='sum') / X.shape[0]

        kld_loss = -0.5 * torch.mean(1+log_var-mean**2-log_var.exp()) / X.shape[0]

        return kld_loss + normal_loss


class myGRU(nn.Module):

    def __init__(self, input_size, hidden_size, num_layers):
        super().__init__()
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)

    def forward(self, X):

        out, h_n = self.gru(X)

        #take last element from the sequence. 
        #out.shape = (batch_size, seq_len, hidden_size)

        if out.ndim == 3: #batched
            out = out[:, -1, :] 
        elif out.ndim == 2: #unbatched
            out = out[-1, :]

        return out

        

class MyModel(nn.Module):

    def __init__(self, denoising_std=0.):
        super().__init__()

        ## model for getting feature encodings
        self.cnn_model = myResnet()
        
        ## LSTM (can also use attention based here if needed)
        

        self.lstm = myLSTM(2048, 1024, 1)
        
        ## VAE
        # self.lstm_outputs = []

        self.denoising_std = None
        if (denoising_std > 0.):
            self.denoising_std = denoising_std
        elif denoising_std < 0.:
            raise ValueError(f"Denoising Standard deviation must be > 0, but got {denoising_std}")

        self.ae = VAE(1024, 128, [256, 128])


    def forward(self, X):
        
        # assumes that X is an input of shape (seq_length, channels, height, width)

        encodings = self.cnn_model(X)
        #encoding has shape (seq_length, 2048), note that this is unbatched

        lstm_encoding = self.lstm(encodings)

        # self.lstm_outputs.append(lstm_encoding)
        #this output has shape (1024, )

        
        if self.denoising_std is not None and self.training:
            out = self.ae(lstm_encoding + torch.randn_like(lstm_encoding) * self.denoising_std)
        else:
            out = self.ae(lstm_encoding)

     
        return lstm_encoding, out #both are necessary for loss
    

    def preprocess(self, X):
        return self.cnn_model.preprocess(X)
    
    def get_loss(self, lstm_encoding, output):

        return self.ae.get_loss(lstm_encoding, output)





In [124]:
sum([p.numel() for p in MyModel().parameters() if p.requires_grad])

13232128

In [125]:
with open("train_test_clips_without4.json") as f:
    train_test_clips = json.load(f)

train_clips = train_test_clips['train']
test_clips = train_test_clips['test']



In [127]:
model = MyModel(0.001)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)
optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-6)


### Training the Model

In [None]:
random.seed(0)

iteration_losses = []
epoch_losses = []



if True:
    for epoch in range(5):

        total_loss = 0.
        start = time.time()

        model.train()

        
        random.shuffle(train_clips)

        iteration = 0
        for clip_path in train_clips:
            iteration += 1
            clip = load_video_as_frames(clip_path)

            print(clip_path, end=' ')
            clip = model.preprocess(clip).to(device)

            ae_in, ae_out = model(clip)
            loss = model.get_loss(ae_in, ae_out)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            print(loss.item())
            total_loss += loss
            if (iteration+1)%20==0:
                print(f"loss for iteration{iteration+1} = {loss.item()}")
                iteration_losses.append(loss.item())


        epoch_losses.append(total_loss/len(train_clips))
        print(f"Epoch[{epoch}], avg_loss={total_loss/len(train_clips)}, time-taken={time.time()-start}")




In [122]:
torch.save(model.state_dict(), "mymodel_weights_lstm_denoised_vae_with4.pth")


In [None]:
with torch.no_grad():

    losses = []
    model.eval()

    for clip_path in test_clips:
        # print(clip_path)
        clip = load_video_as_frames(clip_path)
        clip = model.preprocess(clip).to(device)

        ae_in, ae_out = model(clip)
        loss = model.get_loss(ae_in, ae_out)

        losses.append(loss.item())
        print(loss.item())

    print(losses)

    print(sum(losses)/len(losses))

test_losses = torch.Tensor(losses)

In [120]:
test_losses*1e7

tensor([ 9.2393,  9.4072,  8.0120, 10.9386,  7.1337,  8.3697, 66.1797,  8.6581,
         9.1574,  8.5859,  7.3321, 10.7241,  9.9988, 10.0097, 12.3649,  7.9547,
        10.8286,  9.9233, 10.3480,  7.6679,  7.5420,  8.9507,  6.8597, 10.4211,
         9.0404, 12.9463,  9.1019,  9.3718,  6.9494,  6.6684])

In [121]:
aimbot_clips = os.listdir("./aimbot_clips")

with torch.no_grad():

    losses = []

    for clip_path in aimbot_clips:
        # print(clip_path, end=" ")
        clip = load_video_as_frames("./aimbot_clips/"+clip_path)
        clip = model.preprocess(clip).to(device)

        ae_in, ae_out = model(clip)
        loss = model.get_loss(ae_in, ae_out)

        print(loss.item())
        losses.append(loss.item())

    # print(losses)

    print(sum(losses)/len(losses))

aimbot_losses = torch.Tensor(losses)

9.692652156445547e-07
1.0705133490773733e-06
1.101264615499531e-06
8.667104793858016e-07
1.0351291166443843e-06
7.906993459982914e-07
7.346360462179291e-07
7.865038469390129e-07
1.1838942555186804e-06
1.0110279617947526e-06
8.699598765815608e-07
9.446653166378383e-07
1.1511541515574208e-06
1.8588846160128014e-06
1.2381334499877994e-06
9.606399089534534e-07
9.186260285787284e-07
7.501408845200785e-07
1.098136408472783e-06
1.1053630259993952e-06
1.0113816415469046e-06
9.682303243607748e-07
1.225103801516525e-06
7.976198617143382e-07
1.1416185543566826e-06
1.01122668638709e-06
1.063196123141097e-06
7.830982440282241e-07
6.714049618494755e-07
8.663701578370819e-07
9.994866085586788e-07


In [None]:
aimbot_clips

In [111]:
aimbot_losses*1e7

tensor([ 9.6945, 10.6996, 11.0151,  8.6881, 10.3331,  7.8960,  7.3499,  7.8663,
        11.8214, 10.1039,  8.7024,  9.4551, 11.5155, 18.5857, 12.3759,  9.6060,
         9.1683,  7.5026, 10.9939, 11.0458, 10.1139,  9.6736, 12.2503,  7.9749,
        11.4198, 10.1142, 10.6243,  7.8296,  6.7145,  8.6664])

In [115]:
testing = {
    "test_correct":[],
    "aimbot_correct":[],
    "threshold": []
}

In [116]:
to_check = [7e-7 + i*1e-8 for i in range(-15, 26)]

for i in to_check:
    testing['threshold'].append(i*1e7)
    testing['aimbot_correct'].append(torch.sum(aimbot_losses > i).item())
    testing['test_correct'].append(torch.sum(test_losses < i).item())


In [117]:
import pandas as pd

df = pd.DataFrame(testing)

df['sum'] = df['test_correct'] + df['aimbot_correct']

print(df)

    test_correct  aimbot_correct  threshold  sum
0              0              30        5.5   30
1              0              30        5.6   30
2              0              30        5.7   30
3              0              30        5.8   30
4              0              30        5.9   30
5              0              30        6.0   30
6              0              30        6.1   30
7              0              30        6.2   30
8              0              30        6.3   30
9              0              30        6.4   30
10             0              30        6.5   30
11             0              30        6.6   30
12             1              30        6.7   31
13             1              29        6.8   30
14             2              29        6.9   31
15             3              29        7.0   32
16             3              29        7.1   32
17             4              29        7.2   33
18             4              29        7.3   33
19             5    