In [1]:
import torch
import torchvision
from torchvision import transforms, models
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import cv2, os, json
import time, random
import matplotlib.pyplot as plt

#### 1. Creating the clips

* Firstly, the timestamps are manually marked and kept in a text-file, with a corresponding url of the yt video. The video is then downloaded locally.
* The downloaded video is loaded, then a clip of 3 seconds, 2s before the timestamp and 1s after, is created and stored in `data/`

In [8]:
from pytubefix import YouTube
from pytubefix.cli import on_progress


# download the yt videos

video1_url = 'www.youtube.com/watch?v=P22HqM9w500'
video2_url = "https://www.youtube.com/watch?v=GsLhzD72yYA"
video3_url = 'https://www.youtube.com/watch?v=k4uLbGkm6Ls'
video4_url = "https://www.youtube.com/watch?v=xjy-7ZCohG8"

yt = YouTube(video4_url, on_progress_callback=on_progress).streams[0]


print(yt)
yt.download('video4.mp4')

<Stream: itag="18" mime_type="video/mp4" res="360p" fps="30fps" vcodec="avc1.42001E" acodec="mp4a.40.2" progressive="True" sabr="False" type="video">
 ↳ |████████████████████████████████████████████| 100.0%

'c:\\Users\\SUDARSHAN\\Desktop\\Coding Related files\\CSOC\\CSGO_model\\video4.mp4\\CSGO - 57 Kills In One Game! (Full Competitive Gameplay).mp4'

In [None]:
# load the timestamps and convert them from MM:SS formats to only seconds
with open("video4_kill_timestamps.txt") as f:
    lines = f.readlines()

lines = [t[:-1] for t in lines]
lines = [t.split(':') for t in lines]
lines = [int(t[0])*60+int(t[1]) for t in lines]

lines

In [None]:
import moviepy as mp

#load the downloaded video
video = mp.VideoFileClip('aim_bot_video.mp4')

#clip out the t-2 to t+1 section for each timestamp t, and save it
for t in lines:
    t = int(t)
    subclip = video[t-2:t+1]
    subclip.write_videofile(f"aimbot_clips/video_{t-2}_{t+1}.mp4", codec='libx264')

#### function for loading the clips as frames

In [2]:
def load_video_as_frames(path:str):

    video = cv2.VideoCapture(path)
    success, img = video.read()

    
    frame_count = int(video.get(cv2.CAP_PROP_FRAME_COUNT))

    frames = torch.zeros((frame_count, *img.shape))

    i= 0
    while success:

        frames[i] = torch.Tensor(img)
        i+=1
        success, img = video.read()

    
    frames = torch.permute(frames, (0, 3, 1, 2))
    
    return frames

    # torch.Tensor(frames[0])
    # print(frames.size())

### 2. The architechture(or pipeline, or whatver)

* Firstly, each clip is loaded, then it is batched and a feature representation is obtained for each individual frame. A pretrained model, (such as a resnet) is used for this, while optionally unfreezing the last layer 
* The features are then passed sequentially into an LSTM one after another, a single vector is finally recieved, encapsulating the sequential form of the data.
* now it is passed into a (Variational) autoencoder, and trained. 


In [3]:
from torchvision.models import resnet50, ResNet50_Weights

class myResnet(nn.Module):

    def __init__(self):
        super().__init__()

        self.model = resnet50(weights=ResNet50_Weights.IMAGENET1K_V2)
        self.transforms = ResNet50_Weights.IMAGENET1K_V2.transforms()

        self.model.fc = nn.Identity()

        for p in self.model.parameters():
            p.requires_grad_(False)
        
    def preprocess(self, X):
        return self.transforms(X)
    
    def forward(self, X):
        return self.model(X)
    
class myLSTM(nn.Module):

    def __init__(self, input_size, hidden_size, num_layers):
        super().__init__()

        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        # self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, X):

        out, (hn, cn) = self.lstm(X)

        #out.shape = (batch_size, seq_length, hidden_size)
        
        #take the last element from the sequence
        if out.ndim == 3: #batched
            out = out[:, -1, :] 
        elif out.ndim == 2: #unbatched
            out = out[-1, :]


        return out
    
class AE(nn.Module):

    def __init__(self, input_size, latent_size, hidden_dims:list=[256, 128]):
        
        super().__init__()
        layers = []

        hidden_dims.append(latent_size)
        hidden_dims.insert(0, input_size)

        for i in range(len(hidden_dims)-1):
            layers.append(nn.Linear(hidden_dims[i], hidden_dims[i+1]))
            layers.append(nn.ReLU())
        
        self.encoder = nn.Sequential(*layers)

        layers = []
        hidden_dims.reverse()
        for i in range(len(hidden_dims)-1):
            layers.append(nn.Linear(hidden_dims[i], hidden_dims[i+1]))
            layers.append(nn.ReLU())
        
        self.decoder = nn.Sequential(*layers)

    def forward(self, X):
        out = self.encoder(X)
        out = self.decoder(out)
        return out
    
    def get_loss(self, input, output):
        return F.mse_loss(input, output)

class VAE(nn.Module):

    def __init__(self, input_dim, latent_dim, hidden_dims=[256, 128]):
        super().__init__()

        layers = []

        k=input_dim

        for dim in hidden_dims:
            layers.append(nn.Linear(k, dim))
            layers.append(nn.LeakyReLU())

            k = dim

        self.encoder = nn.Sequential(*layers)

        self.fc_mu = nn.Linear(k, latent_dim)
        self.fc_log_var = nn.Linear(k, latent_dim)

        layers = []
        hidden_dims.reverse()
        k = latent_dim

        for dim in hidden_dims:
            layers.append(nn.Linear(k, dim))
            layers.append(nn.LeakyReLU())
            k = dim
        
        layers.append(nn.Linear(k, input_dim))

        self.decoder = nn.Sequential(*layers)
    
    def forward(self, X):

        out = self.encoder(X)

        #get mean and log(var)
        mean = self.fc_mu(out)
        log_var = self.fc_log_var(out)

        #reparametrization
        epsilon = torch.randn_like(mean)
        z = mean + epsilon*log_var

        #decode
        X_recon = self.decoder(z)

        return X_recon, mean, log_var
    
    def get_loss(self, X, target):

        X_recon, mean, log_var = target

        normal_loss = F.mse_loss(X, X_recon, reduce='sum') / X.shape[0]

        kld_loss = -0.5 * torch.mean(1+log_var-mean**2-log_var.exp()) / X.shape[0]

        return kld_loss + normal_loss




        

class MyModel(nn.Module):

    def __init__(self):
        super().__init__()

        ## model for getting feature encodings
        self.cnn_model = myResnet()
        
        ## LSTM (can also use attention based here if needed)
        

        self.lstm = myLSTM(2048, 1024, 1)
        
        ## VAE

        self.ae = VAE(1024, 128, [256, 128])


    def forward(self, X):
        
        # assumes that X is an input of shape (seq_length, channels, height, width)

        encodings = self.cnn_model(X)
        #encoding has shape (seq_length, 2048), note that this is unbatched

        lstm_encoding = self.lstm(encodings)
        #this output has shape (1024, )

        out = self.ae(lstm_encoding)
     
        return lstm_encoding, out #both are necessary for loss
    

    def preprocess(self, X):
        return self.cnn_model.preprocess(X)
    
    def get_loss(self, lstm_encoding, output):

        return self.ae.get_loss(lstm_encoding, output)



        





In [4]:
with open("train_test_clips.json") as f:
    train_test_clips = json.load(f)

train_clips = train_test_clips['train']
test_clips = train_test_clips['test']



In [5]:
model = MyModel()

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)
optimizer = optim.Adam(model.parameters())


In [6]:
random.seed(0)
for epoch in range(7):
    
    total_loss = 0.
    start = time.time()

    
    random.shuffle(train_clips)

    iteration = 0
    for clip_path in train_clips:
        iteration += 1
        # print(clip_path)
        clip = load_video_as_frames("./data/"+clip_path)

        # print(clip.shape)
        clip = model.preprocess(clip).to(device)

        ae_in, ae_out = model(clip)
        loss = model.get_loss(ae_in, ae_out)
        # print("waw", flush=True)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss
        # print("waw2", flush=True)
        if (iteration+1)%10==0:
            print(f"loss for iteration{iteration+1} = {loss.item()}")


    
    print(f"Epoch[{epoch}], avg_loss={total_loss/len(train_clips)}, time-taken={time.time()-start}")






loss for iteration10 = 3.7145946407690644e-06
loss for iteration20 = 2.4245659915322904e-06
loss for iteration30 = 1.6617902929283446e-06
loss for iteration40 = 1.3856213172402931e-06
loss for iteration50 = 1.4239824395190226e-06
loss for iteration60 = 1.1269304422967252e-06
loss for iteration70 = 1.039824837789638e-06
loss for iteration80 = 8.44008809508523e-07
Epoch[0], avg_loss=1.7932636637851829e-06, time-taken=336.9501762390137
loss for iteration10 = 3.897629312632489e-07
loss for iteration20 = 2.9117899202901754e-07
loss for iteration30 = 3.141724391753087e-07
loss for iteration40 = 2.879878024941718e-07
loss for iteration50 = 2.967209979942709e-07
loss for iteration60 = 2.864271664293483e-07
loss for iteration70 = 2.64931315996364e-07
loss for iteration80 = 2.3753804612169915e-07
Epoch[1], avg_loss=3.227281126783055e-07, time-taken=286.38115429878235
loss for iteration10 = 1.7574187438640365e-07
loss for iteration20 = 1.1474269001610082e-07
loss for iteration30 = 1.5601648328811

In [7]:
torch.save(model.state_dict(), "mymodel_weights.pth")

In [None]:
start = time.time()
# for clip_path in train_clips:
#         # print(clip_path)
#         clip = load_video_as_frames("./data/"+clip_path)

#         clip = model.preprocess(clip).to(device)

clip = load_video_as_frames("./data/video3_467_470.mp4").to(device)
clip = model.preprocess(clip).to(device)[0:]
print(clip.shape)
ae_in, ae_out = model(clip)
# print(ae_in.shape)
# print(ae_out.shape)


time.time()-start

torch.Size([90, 3, 224, 224])


8.781089544296265

In [None]:
with torch.no_grad():

    losses = []

    for clip_path in test_clips:
        print(clip_path)
        clip = load_video_as_frames("./data/"+clip_path)
        clip = model.preprocess(clip).to(device)

        ae_in, ae_out = model(clip)
        loss = model.get_loss(ae_in, ae_out)

        losses.append(loss.item())

    print(losses)

    print(sum(losses)/len(losses))

In [29]:
aimbot_clips = os.listdir("./aimbot_clips")

with torch.no_grad():

    losses = []

    for clip_path in aimbot_clips:
        print(clip_path, end=" ")
        clip = load_video_as_frames("./aimbot_clips/"+clip_path)
        clip = model.preprocess(clip).to(device)

        ae_in, ae_out = model(clip)
        loss = model.get_loss(ae_in, ae_out)

        print(loss.item())
        losses.append(loss.item())

    # print(losses)

    print(sum(losses)/len(losses))


video_106_109.mp4 



4.594298275151232e-07
video_130_133.mp4 4.143348633078858e-07
video_137_140.mp4 5.393862920755055e-07
video_164_167.mp4 4.3277805161778815e-07
video_203_206.mp4 4.974145895175752e-07
video_222_225.mp4 4.5344032173488813e-07
video_225_228.mp4 5.612148470390821e-07
video_228_231.mp4 4.7457589857913263e-07
video_266_269.mp4 3.993547466052405e-07
video_293_296.mp4 4.4689147671306273e-07
video_300_303.mp4 5.507839091478672e-07
video_309_312.mp4 5.013226314076746e-07
video_323_326.mp4 4.864784841629444e-07
video_329_332.mp4 8.82284325598448e-07
video_350_353.mp4 6.126661560301727e-07
video_353_356.mp4 5.344479632185539e-07
video_373_376.mp4 5.620280489893048e-07
video_377_380.mp4 5.106541038912837e-07
video_397_400.mp4 4.4440466240303067e-07
video_413_416.mp4 3.9110881289161625e-07
video_467_470.mp4 4.285246575363999e-07
video_471_474.mp4 4.191072946468921e-07
video_491_494.mp4 4.287560386728728e-07
video_513_516.mp4 3.077764461068e-07
video_519_522.mp4 4.6760052896388515e-07
video_525_528.m