# points ! code is in progress

In [1]:
import numpy as np
import torch 
import cv2 as cv
import os 
import matplotlib.pyplot as plt
from torchinfo import summary


In [2]:
from typing import Literal 

def capture_vide_frames(video_path:str  , video_name:str, number_of_frames:int  , class_label:Literal[0, 1] =0 ): 

    # capturing frames 
    video_path = os.path.join(video_path , video_name)
    cap = cv.VideoCapture(video_path)
    
    total_frames = cap.get(cv.CAP_PROP_FRAME_COUNT)
    frame_indices = np.random.uniform(1, total_frames , number_of_frames).astype('int')
    count_frames = 0 
    img_sample = []

    while cap : 
        isTrue, frame = cap.read()

        if not isTrue : 
            break

        count_frames+=1 

        if count_frames in frame_indices: 
            img_sample.append(frame)

        if cv.waitKey(10) & 0xff == ord('q'): 
            break
        
    cap.release()
    cv.destroyAllWindows()

    #optimizing the frames 
    resized_images = [cv.resize(img, (224, 224)) for img in img_sample]

    #saving the frames 
    for idx,  img in enumerate(resized_images): 

        if class_label : 
            path = os.path.join('model_data/fire', video_name)
            os.makedirs(path, exist_ok=True)
            fileName= f'{idx}.jpg'
            cv.imwrite(os.path.join(path, fileName) , img)

        else : 
            path = os.path.join('model_data/no_fire', video_name)
            os.makedirs(path , exist_ok=True)
            fileName = f'{idx}.jpg'
            cv.imwrite(os.path.join(path, fileName) , img)


In [3]:
fire_vidoes_list  = os.listdir(os.path.join('data/videos/fire'))
no_fire_video_list  = os.listdir(os.path.join('data/videos/noFire'))

In [None]:
for vid in fire_vidoes_list: 
    capture_vide_frames('data/videos/fire', vid, 40, 1)


for vid in no_fire_video_list:
    capture_vide_frames('data/videos/nofire' , vid , 100)

In [4]:
import glob 

fire_images = glob.glob('model_data/fire/**/*')
no_fire_images = glob.glob('model_data/no_fire/**/*')

print(len(fire_images), len(no_fire_images))

278 225


In [5]:
from torch.utils.data import Dataset, DataLoader , random_split
from PIL import Image
from torchvision import transforms 

In [18]:
class FireVideoDataset(Dataset):
    def __init__(self, root_dir, chunk_size=17):
        self.chunk_size = chunk_size
        self.samples = []
        self.transform = transforms.Compose([
            transforms.ToTensor(),  # Convert PIL image to min max scale (0-1)
            transforms.Normalize(
                    [0.485, 0.456, 0.406],
                    [0.229, 0.224, 0.225]
                    ), 
            # transforms.RandomHorizontalFlip(0.5), 
            # transforms.ColorJitter(0.2)
        ])

        for class_name, label in [('fire', 1), ('no_fire', 0)]:
            class_dir = os.path.join(root_dir, class_name)

            for chunk_folder in os.listdir(class_dir):
                chunk_path = os.path.join(class_dir, chunk_folder)
                
                if os.path.isdir(chunk_path) and len(os.listdir(chunk_path)) >= chunk_size:
                    self.samples.append((chunk_path, label))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        chunk_path, label = self.samples[idx]
        
        image_files = os.listdir(chunk_path)[:self.chunk_size] 
        frames = []

        for fname in image_files:
            img_path = os.path.join(chunk_path, fname)
            img = Image.open(img_path)

            if img is None:
                continue  
            frames.append(self.transform(img)) 
        np_frame = np.array(frames)  # its [B , C , H , W]
        np_frame = np.transpose(np_frame, (1 , 0, 2, 3))  # becomes [C , B , H , W]

        return torch.tensor(np_frame, dtype=torch.float32), torch.tensor(label, dtype=torch.float32)


In [7]:
dataset = FireVideoDataset(root_dir='model_data')

train_size = int(0.7 *len(dataset))
test_size = len(dataset) - train_size

train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

train_data_loader = DataLoader(train_dataset, batch_size=3 ,shuffle=True)
test_data_loader = DataLoader(test_dataset, batch_size=3, shuffle=True)

In [8]:
pos = 0 
neg = 0 
for i in dataset: 
    if  i[1] == 1 : 
        pos+=1 
    else :
       neg +=1 

print(pos , neg)

7 7


In [9]:
import torch.nn as nn

class FireDetector(nn.Module):
    def __init__(self, in_channels=3, num_classes=1):
        super().__init__()
        self.conv_block = nn.Sequential(
            nn.Conv3d(in_channels, 16, kernel_size=3, padding=1),
            nn.BatchNorm3d(16),
            nn.ReLU(),
            nn.MaxPool3d((1, 2, 2)),  # frame , hight , width (keep fram , max pool each frame)

            nn.Conv3d(16, 32, kernel_size=3, padding=1),
            nn.BatchNorm3d(32),
            nn.ReLU(),
            nn.MaxPool3d((2, 2, 2)), # pick two frame , maxpool each frame 

            nn.Conv3d(32, 64, kernel_size=3, padding=1),
            nn.BatchNorm3d(64),
            nn.ReLU(),
            nn.AdaptiveAvgPool3d((1, 1, 1))  # compress to (1,1,1) # pick one frame , one pixel(best one in hight and ..)
        )
        self.classifier = nn.Linear(64, num_classes) # [9, 64  ,  1 ,1 , 1] we resize to have channels giving to linear to pick 2 

    def forward(self, x):
        x = self.conv_block(x)
        x = x.squeeze() # flatten
        x = self.classifier(x)
        return x


In [10]:
fire_detector_model = FireDetector()
summary(fire_detector_model)

Layer (type:depth-idx)                   Param #
FireDetector                             --
├─Sequential: 1-1                        --
│    └─Conv3d: 2-1                       1,312
│    └─BatchNorm3d: 2-2                  32
│    └─ReLU: 2-3                         --
│    └─MaxPool3d: 2-4                    --
│    └─Conv3d: 2-5                       13,856
│    └─BatchNorm3d: 2-6                  64
│    └─ReLU: 2-7                         --
│    └─MaxPool3d: 2-8                    --
│    └─Conv3d: 2-9                       55,360
│    └─BatchNorm3d: 2-10                 128
│    └─ReLU: 2-11                        --
│    └─AdaptiveAvgPool3d: 2-12           --
├─Linear: 1-2                            65
Total params: 70,817
Trainable params: 70,817
Non-trainable params: 0

In [11]:
loss_fn = nn.BCEWithLogitsLoss()
def measure_acc(ypred , ytrue): 

    from torchmetrics import Accuracy , Recall
    acc_fn = Accuracy(task='binary')
    acc = acc_fn(ypred , ytrue)

    sig_ypred = (torch.sigmoid(ypred) > 0.5).int()
    recallfn = Recall(task='binary')
    recall = recallfn(sig_ypred , ytrue)
    return acc , recall

In [12]:
from typing import Tuple
from tqdm import tqdm 

def train_the_model(model , number_of_epochs:int ,
                     data_loader:DataLoader, loss_fn:torch.nn.Module , 
                     ) -> Tuple[float, float] :
    
    optimizer = torch.optim.Adam(params=model.parameters() , lr=1e-1, weight_decay=1e-3)
    
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)
    
    for epoch in tqdm(range(number_of_epochs)): 
        model.train()
        batch_loss = 0
        batch_acc = 0 
        batch_recall = 0

        for x , y in data_loader : 
            ypred = model(x)
            ypred = ypred.squeeze()
            loss = loss_fn(ypred, y)
            batch_loss += loss.item() 
            acc , recall = measure_acc(ypred, y) 
            batch_acc += acc.item()
            batch_recall += recall.item()
            loss.backward()
            optimizer.step()
        batch_loss /= len(data_loader)
        batch_acc /= len(data_loader)
        batch_recall /= len(data_loader)
    return batch_loss , batch_acc, batch_recall


def eval_the_model(model:torch.nn.Module ,data_loader:DataLoader, loss_fn:torch.nn.Module ):
    test_loss = 0 
    test_acc = 0 
    batch_recall = 0

    model.eval()
    with torch.inference_mode(): 
        for x , y in data_loader: 
            ypred = model(x)
            ypred = np.squeeze(ypred)
            loss = loss_fn(ypred , y)
            test_loss+=loss.item()
            acc , recall = measure_acc(ypred , y)
            test_acc += acc.item()
            batch_recall += recall.item()
        test_loss /= len(data_loader)
        test_acc /= len(data_loader)
        batch_recall /= len(data_loader)
    
    return test_loss, test_acc , batch_recall


In [13]:
train_loss , train_acc , recall= train_the_model(fire_detector_model , 10 , train_data_loader , loss_fn  )
print(f'train loss is {train_loss:2f}\n',
       f'train accuracy is {train_acc:.2f}\n', 
       f'recall is {recall:.2f}\n')


100%|██████████| 10/10 [00:36<00:00,  3.64s/it]

train loss is 1.308441
 train accuracy is 0.78
 recall is 1.00






In [14]:
test_loss , test_acc , recall = eval_the_model(fire_detector_model  , test_data_loader , loss_fn)
print(f'train loss is {train_loss:2f}\n',
       f'train accuracy is {train_acc:.2f}\n', 
       f'recall is {recall:.2f}\n')


train loss is 1.308441
 train accuracy is 0.78
 recall is 0.00



In [15]:
from torchvision.models.video import r3d_18 , R3D_18_Weights
resnet_model = r3d_18(weights=R3D_18_Weights.DEFAULT)
resnet_model.fc = nn.Linear(resnet_model.fc.in_features, 1)  # Binary classification

resnet_model.fc


Linear(in_features=512, out_features=1, bias=True)

In [16]:
train_loss , train_acc , recall= train_the_model(resnet_model , 1, train_data_loader , loss_fn  )
print(f'train loss is {train_loss:2f}\n',
       f'train accuracy is {train_acc:.2f}\n', 
       f'recall is {recall:.2f}\n')


100%|██████████| 1/1 [00:50<00:00, 50.23s/it]

train loss is 7.387102
 train accuracy is 0.44
 recall is 0.67






In [19]:
test_loss , test_acc , recall = eval_the_model(resnet_model ,  test_data_loader, loss_fn)
print(f'test loss is {test_loss}\n',
       f'test accuracy is {test_acc}\n', 
       f'recall is {recall}\n')

test loss is 7615515228897280.0
 test accuracy is 0.4166666716337204
 recall is 1.0



### trying conv2dand1D

In [20]:
class FireDetectorConv2D1D(nn.Module): 
    def __init__(self, in_channels = 3 , num_classes=1): 
        super().__init__() 
        self.spatial_extractor = nn.Sequential(
            nn.Conv2d(in_channels, 32, kernel_size = 3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(), 
            nn.MaxPool2d(2),

            nn.Conv2d(32 , 64, kernel_size = 3 , padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d(1),
        )

        self.temporal_extractor = nn.Sequential(
            nn.Conv1d(64, 128 , 3 ,padding = 1 ),
            nn.BatchNorm1d(128), 
            nn.ReLU(), 
            nn.AdaptiveAvgPool1d(1) 
        )

        self.clasifier = nn.Linear(128, num_classes)

    def forward(self , x): 
            B , C , T , H , W = x.size()
            x = x.permute(0, 2, 1, 3 ,4)  # (B, T, C, H, W)
            x = x.reshape(B * T , C , H , W)
            x = self.spatial_extractor(x) # (B*T, 64, 1, 1)
            x = x.view(B , T , 64)  # (B, T, 64)

            x = x.permute(0 , 2 , 1) # (B , 64, T)
            x = self.temporal_extractor(x) # (B , 128, 1)
            x = x.squeeze(-1) # (B , 128)

            return self.clasifier(x)

In [21]:
model2d_plus = FireDetectorConv2D1D()
summary(model2d_plus)

Layer (type:depth-idx)                   Param #
FireDetectorConv2D1D                     --
├─Sequential: 1-1                        --
│    └─Conv2d: 2-1                       896
│    └─BatchNorm2d: 2-2                  64
│    └─ReLU: 2-3                         --
│    └─MaxPool2d: 2-4                    --
│    └─Conv2d: 2-5                       18,496
│    └─BatchNorm2d: 2-6                  128
│    └─ReLU: 2-7                         --
│    └─AdaptiveAvgPool2d: 2-8            --
├─Sequential: 1-2                        --
│    └─Conv1d: 2-9                       24,704
│    └─BatchNorm1d: 2-10                 256
│    └─ReLU: 2-11                        --
│    └─AdaptiveAvgPool1d: 2-12           --
├─Linear: 1-3                            129
Total params: 44,673
Trainable params: 44,673
Non-trainable params: 0

In [22]:
train_loss , train_acc , recall = train_the_model(model2d_plus, 15 , train_data_loader , loss_fn )
print(f'train loss is {train_loss:2f}\n',
       f'train accuracy is {train_acc:.2f}\n', 
       f'recall is {recall:.2f}\n')

100%|██████████| 15/15 [00:52<00:00,  3.53s/it]

train loss is 1.114936
 train accuracy is 0.89
 recall is 0.89






In [23]:
test_loss , test_acc , recall = eval_the_model(model2d_plus, test_data_loader, loss_fn )
print(f'test loss is {test_loss:2f}\n',
       f'test accuracy is {test_acc:.2f}\n', 
       f'recall is {recall:.2f}\n')

test loss is 34.941568
 test accuracy is 0.00
 recall is 0.00



In [24]:
class FireDetectorMain3D(nn.Module): 
    def __init__(self, in_channels , out_channels, kernel_size, padding): 
        super().__init__()
        T, H, W = kernel_size

        self.spatial_conv = nn.Conv3d(
            in_channels,  out_channels , kernel_size=(1 , H, W), padding=(0, padding , padding),
        )

        self.relu = nn.ReLU()
        
        self.temporal_conv = nn.Conv3d(
            out_channels , out_channels , kernel_size=(T , 1, 1) , padding=(padding, 0 , 0)
        )

    def forward(self, x): 
        x = self.spatial_conv(x)
        x = self.relu(x) 
        x = self.temporal_conv(x) 
        return x 

class Residual(nn.Module): 
    def __init__(self, channels , kernel_size, padding =1):
        super().__init__()
        self.conv1 = FireDetectorMain3D(channels, channels, kernel_size , padding)
        self.norm1 = nn.LayerNorm(channels)
        self.conv2 = FireDetectorMain3D(channels ,channels, kernel_size , padding)
        self.norm2 = nn.LayerNorm(channels)
        self.relu = nn.ReLU()

    def forward(self, x) : 
        Residual = x 
        out = self.conv1(x) #(N, C, D, H, W)
        out = out.permute(0, 2 , 3 , 4 , 1)
        out = self.norm1(out)
        out= out.permute(0 , 4 , 1 , 2, 3)
        # out = self.relu(out)

        out = self.conv2(out)
        out = out.permute(0, 2 , 3 , 4 , 1)
        out = self.norm2(out)
        out= out.permute(0 , 4 , 1 , 2, 3)

        out += Residual
        out = self.relu(out)
        return out 
    
class FireDetectorWithResidual(nn.Module): 
        def __init__(self , in_channels =3 , num_clases =1) :
            super().__init__()
            self.initial_conv = FireDetectorMain3D(in_channels, 16 , kernel_size=(3, 7, 7) , padding=1)
            self.bn = nn.BatchNorm3d(16)
            self.relu = nn.ReLU() 
            self.pool1 = nn.MaxPool3d((1 , 2 ,2))
            self.res_block1 = Residual(16, kernel_size=(3 ,3 ,3))
            self.pool2 = nn.MaxPool3d((2 ,2 ,2))

            self.res_block2 = Residual(16, kernel_size=(3 ,3 ,3))
            self.adaptive_pool = nn.AdaptiveAvgPool3d((1 ,1,1))

            self.classifier = nn.Linear(16 , num_clases)

        def forward(self  , x): 
            x = self.initial_conv(x)
            x = self.bn(x)
            x = self.relu(x)
            x = self.pool1(x)

            x = self.res_block1(x)
            x = self.pool2(x)

            x = self.res_block2(x)
            x = self.adaptive_pool(x)

            x = x.flatten(1)
            x = self.classifier(x)

            return x

In [25]:
test_sample = next(iter(train_data_loader))[0]

In [26]:
test_sample.shape

torch.Size([3, 3, 17, 224, 224])

In [27]:

resnet2d1d = FireDetectorWithResidual()
summary(resnet2d1d, input_size=test_sample.shape)  # (batch, C, D, H, W)


Layer (type:depth-idx)                   Output Shape              Param #
FireDetectorWithResidual                 [3, 1]                    --
├─FireDetectorMain3D: 1-1                [3, 16, 17, 220, 220]     --
│    └─Conv3d: 2-1                       [3, 16, 17, 220, 220]     2,368
│    └─ReLU: 2-2                         [3, 16, 17, 220, 220]     --
│    └─Conv3d: 2-3                       [3, 16, 17, 220, 220]     784
├─BatchNorm3d: 1-2                       [3, 16, 17, 220, 220]     32
├─ReLU: 1-3                              [3, 16, 17, 220, 220]     --
├─MaxPool3d: 1-4                         [3, 16, 17, 110, 110]     --
├─Residual: 1-5                          [3, 16, 17, 110, 110]     --
│    └─FireDetectorMain3D: 2-4           [3, 16, 17, 110, 110]     --
│    │    └─Conv3d: 3-1                  [3, 16, 17, 110, 110]     2,320
│    │    └─ReLU: 3-2                    [3, 16, 17, 110, 110]     --
│    │    └─Conv3d: 3-3                  [3, 16, 17, 110, 110]     784
│    └─

In [28]:
train_loss , train_acc, recall = train_the_model(resnet2d1d , 50 , train_data_loader, loss_fn )
print(f'train loss is {train_loss}\n',
       f'train accuracy is {train_acc}\n', 
       f'recall is {recall}\n')

100%|██████████| 50/50 [02:37<00:00,  3.15s/it]

train loss is 1.4138781825701396
 train accuracy is 0.4444444576899211
 recall is 0.8333333333333334






In [29]:
test_loss, test_acc , test_recall = eval_the_model(resnet2d1d , test_data_loader, loss_fn)
print(f'test loss is {test_loss}\n',
       f'test accuracy is {test_acc}\n', 
       f'recall is {recall}\n')

test loss is 2.0982104539871216
 test accuracy is 0.3333333432674408
 recall is 0.8333333333333334



## slow fast model

In [30]:
import torch
# Choose the `slowfast_r50` model 
slow_fast_model = torch.hub.load('facebookresearch/pytorchvideo', 'slowfast_r50', pretrained=True)

Using cache found in C:\Users\alchemist/.cache\torch\hub\facebookresearch_pytorchvideo_main


In [31]:
summary(slow_fast_model)

Layer (type:depth-idx)                                       Param #
Net                                                          --
├─ModuleList: 1-1                                            --
│    └─MultiPathWayWithFuse: 2-1                             --
│    │    └─ModuleList: 3-1                                  15,432
│    │    └─FuseFastToSlow: 3-2                              928
│    └─MultiPathWayWithFuse: 2-2                             --
│    │    └─ModuleList: 3-3                                  225,760
│    │    └─FuseFastToSlow: 3-4                              14,464
│    └─MultiPathWayWithFuse: 2-3                             --
│    │    └─ModuleList: 3-5                                  1,287,552
│    │    └─FuseFastToSlow: 3-6                              57,600
│    └─MultiPathWayWithFuse: 2-4                             --
│    │    └─ModuleList: 3-7                                  10,369,536
│    │    └─FuseFastToSlow: 3-8                              229,8

In [32]:
from typing import Dict
import json
import urllib
from torchvision.transforms import Compose, Lambda
from torchvision.transforms._transforms_video import (
    CenterCropVideo,
    NormalizeVideo,
)
from pytorchvideo.data.encoded_video import EncodedVideo
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    ShortSideScale,
    UniformTemporalSubsample,
    UniformCropVideo
) 

  return [int(c) if c.isdigit() else c for c in re.split("(\d+)", text)]


In [71]:
device = "cpu"
slow_fast_model = slow_fast_model.eval()
slow_fast_model = slow_fast_model.to(device)

In [None]:
# train_loss , train_acc , recall = train_the_model(slow_fast_model,1 , train_data_loader , loss_fn , optimizer)
# print(train_acc)

In [None]:
# side_size = 256
# mean = [0.45, 0.45, 0.45]
# std = [0.225, 0.225, 0.225]
# crop_size = 256
# num_frames = 32
# sampling_rate = 2
# frames_per_second = 30
# slowfast_alpha = 4
# num_clips = 10
# num_crops = 3

# class PackPathway(torch.nn.Module):
#     """
#     Transform for converting video frames as a list of tensors. 
#     """
#     def __init__(self):
#         super().__init__()
        
#     def forward(self, frames: torch.Tensor):
#         fast_pathway = frames
#         # Perform temporal sampling from the fast pathway.
#         slow_pathway = torch.index_select(
#             frames,
#             1,
#             torch.linspace(
#                 0, frames.shape[1] - 1, frames.shape[1] // slowfast_alpha
#             ).long(),
#         )
#         frame_list = [slow_pathway, fast_pathway]
#         return frame_list

# transform =  ApplyTransformToKey(
#     key="video",
#     transform=Compose(
#         [
#             UniformTemporalSubsample(num_frames),
#             Lambda(lambda x: x/255.0),
#             NormalizeVideo(mean, std),
#             ShortSideScale(
#                 size=side_size
#             ),
#             CenterCropVideo(crop_size),
#             PackPathway()
#         ]
#     ),
# )

# # The duration of the input clip is also specific to the model.
# clip_duration = (num_frames * sampling_rate)/frames_per_second

In [34]:
from typing import Dict
import json
import urllib

from torchvision.transforms import Compose, Lambda, CenterCrop, Normalize
from pytorchvideo.data.encoded_video import EncodedVideo
# Optional: if needed
# from pytorchvideo.transforms import (
#     ApplyTransformToKey,
#     ShortSideScale,
#     UniformTemporalSubsample,
#     UniformCropVideo
# )


In [None]:
video = EncodedVideo.from_path('data/videos/fire/fire_2.mp4')
clip = video.get_clip(start_sec=0, end_sec=1.0)  # Extract a 2-second segment
clip['video'].shape

torch.Size([3, 30, 720, 1280])

In [123]:
transform = ApplyTransformToKey(
    key="video",
    transform=Compose([
        UniformTemporalSubsample(32),
        Lambda(lambda x: x / 255.0),
        NormalizeVideo(mean=[0.45, 0.45, 0.45], std=[0.225, 0.225, 0.225]),
        ShortSideScale(256),
        CenterCropVideo(224),
    ])
)

In [124]:
clip['video'].shape

torch.Size([3, 30, 720, 1280])

In [125]:
myclip = {'video' :  clip['video']}
myclip['video'].shape

torch.Size([3, 30, 720, 1280])

In [126]:
ts_image = transform(myclip)
final_ts_image = ts_image['video'].unsqueeze(0)
final_ts_image.shape

torch.Size([1, 3, 32, 224, 224])

In [127]:
def pack_pathway_output(frames, alpha=4):
    """
    Convert video to SlowFast input format:
    - fast_pathway: the original (e.g. 30 frames)
    - slow_pathway: subsampled by alpha (e.g. 30 // 4 = ~8 frames)
    """
    fast_pathway = frames
    slow_pathway = frames[:, :, ::alpha, :, :]  # temporal subsampling
    return [slow_pathway, fast_pathway]


In [128]:
final_ts_image.shape

torch.Size([1, 3, 32, 224, 224])

In [129]:
ready = pack_pathway_output(final_ts_image)
ready[0].shape , ready[1].shape

(torch.Size([1, 3, 8, 224, 224]), torch.Size([1, 3, 32, 224, 224]))

In [130]:
with torch.inference_mode(): 
    output = slow_fast_model(ready)
    output

In [132]:
output.shape

torch.Size([1, 400])