# Imports

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import MultiStepLR
import torchvision
from torchvision.models import mobilenet_v2
from torchvision.transforms import v2
from torchvision.ops import Conv2dNormActivation

from data_helper import SQLDataset_Informative
import mysql.connector as connector
from pathlib import Path
import tqdm # progress bar
import os

# importing accuracy metric functions
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [None]:
# Pick the compute device once: the GPU when CUDA is usable, the CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device {device}')

# Connecting to SQL Server

In [None]:
home = os.path.expanduser('~')
os.chdir(home)  # later cells build their paths starting from the home directory

# Connection settings. Every value can be overridden with an environment variable so
# that personal credentials do not have to be edited into (and committed with) the
# notebook; the fallbacks are the values this notebook used before.
host = os.environ.get('MYSQL_HOST', '127.0.0.1')
user = os.environ.get('MYSQL_USER', 'root')  # or change to your username
password = os.environ.get('MYSQL_PASSWORD', 'ethan1')  # or change to your password
database = os.environ.get('MYSQL_DATABASE', 'ai_proj_2025')  # we should all have this as the db name

try:
    conn = connector.connect(
        host=host,
        user=user,
        password=password,
        database=database,
    )
except connector.Error as err:
    # Show the driver's message, then stop: without a connection `conn` is never
    # defined and the dataset cells below would fail with an unrelated NameError.
    print(err)
    raise
else:
    print('success')

# Creating and Transforming Datasets with SQL

We are utilizing the CrisisMMD dataset, which can be found here: https://crisisnlp.qcri.org/crisismmd

This dataset includes images and text data from Twitter in 2017, a year of several major natural disasters across the globe. Some posts (images and/or text) are informative to humanitarian aid officials, while others may be irrelevant and distracting. The text and images were each labeled as informative or not informative based on human inference for supervised learning purposes. However, we are most interested in correctly classifying posted images as informative or not to ultimately pinpoint geographical areas that require the most or most urgent humanitarian aid.

In [None]:
# set up transforms
# Training transforms: random crop + horizontal flip as augmentation, then conversion
# to float32 in [0, 1] and channel-wise normalization.
transformations = v2.Compose([
    v2.RandomResizedCrop(size=(224, 224), antialias=True),  # random crop, resized to a common size
    v2.RandomHorizontalFlip(p=0.5),  # add random changes in the image
    v2.ToDtype(torch.float32, scale=True),
    v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),  # normalize channel-wise
])

# Evaluation transforms: same output size and normalization, but no randomness, so that
# validation/test metrics are reproducible and are not measured on augmented crops.
eval_transformations = v2.Compose([
    v2.Resize(size=256, antialias=True),  # shorter side -> 256, aspect ratio preserved
    v2.CenterCrop(size=(224, 224)),
    v2.ToDtype(torch.float32, scale=True),
    v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

data_dir = Path(home, 'OneDrive - Stephen F. Austin State University', 'CrisisMMD_v2.0', 'CrisisMMD_v2.0')

# One dataset per split; only the training split receives the augmenting transforms.
train_set = SQLDataset_Informative(conn=conn, img_col='image_path', label_col='image_info',
                                   transform=transformations, data_dir=data_dir, is_train=True)
val_set = SQLDataset_Informative(conn=conn, img_col='image_path', label_col='image_info',
                                 transform=eval_transformations, data_dir=data_dir, is_val=True)
test_set = SQLDataset_Informative(conn=conn, img_col='image_path', label_col='image_info',
                                  transform=eval_transformations, data_dir=data_dir, is_test=True)

In [None]:
# Batch the three splits. The training loader reshuffles every epoch so that batches
# are not tied to the order in which rows come back from the database; evaluation
# loaders keep a fixed order.
# NOTE(review): shuffle=True assumes SQLDataset_Informative is a map-style dataset
# (defines __len__/__getitem__) - confirm against data_helper.
train_loader = DataLoader(train_set, batch_size=256, shuffle=True)
val_loader = DataLoader(val_set, batch_size=128)
test_loader = DataLoader(test_set, batch_size=128)

# First CNN Model

Since we are dealing with image data, we will implement a Convolutional Neural Network (CNN) to classify whether Twitter images are informative in the context of a humanitarian crisis or disaster.

In [None]:
class ConvNet(nn.Module):
    """Baseline CNN: two conv -> ReLU -> max-pool stages followed by three linear layers.

    The first linear layer is sized for 3x224x224 inputs: each unpadded 3x3 convolution
    trims 2 pixels and each 2x2 pool halves the size (224 -> 222 -> 111 -> 109 -> 54),
    leaving 8 feature maps of 54x54. The network returns raw logits for 2 classes.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 16, 3)  # 3 input channels, 16 filters of size 3x3
        self.pool = nn.MaxPool2d(2, 2)  # 2x2 pooling window moved with step size 2
        self.conv2 = nn.Conv2d(16, 8, 3)  # input depth = number of filters in conv1
        self.fc1 = nn.Linear(54*54*8, 120)
        self.fc2 = nn.Linear(120, 60)
        self.fc3 = nn.Linear(60, 2)  # one output logit per class

    def forward(self, x):
        """Return logits of shape (batch, 2) for an image batch of shape (batch, 3, 224, 224)."""
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        # Flatten everything except the batch dimension. Unlike view(-1, 54*54*8), this
        # cannot silently fold a spatial-size mismatch into the batch dimension; a
        # wrongly sized input fails loudly at fc1 instead.
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))    # fully connected, relu
        x = F.relu(self.fc2(x))
        return self.fc3(x)         # output

**IMPORTANT**: Make sure to change the run_name (and 'architecture' parameter of the wandb `run` variable if necessary) with each new run. 

## Training and Validation Loops

In [None]:
# validation loop
def dev(model, val_loader):
    """Evaluate `model` on every batch of `val_loader` and return the overall accuracy.

    Each batch is expected to be a mapping with an 'image' tensor and a 'label' tensor.
    The model is moved to the module-level `device` and switched to eval mode; callers
    that keep training afterwards must switch it back to train mode themselves.

    Accuracy, macro-averaged precision/recall/F1 and the mean per-batch cross-entropy
    loss are printed; only the accuracy is returned.
    """
    model.to(device)
    model.eval()
    avg = 'macro'  # averaging mode for precision / recall / f1
    criterion = nn.CrossEntropyLoss()

    eval_loss = 0.0
    all_preds = []
    all_trues = []

    with torch.no_grad():
        for batch in tqdm.tqdm(val_loader, total=len(val_loader), desc="Processing validation data"):
            images = batch['image'].to(device)
            labels = batch['label'].to(device)

            raw_logits = model(images)

            # predicted class = index of the largest logit
            # https://discuss.pytorch.org/t/cross-entropy-loss-get-predicted-class/58215
            preds = torch.argmax(raw_logits, dim=1)

            eval_loss += criterion(raw_logits, labels).item()

            all_preds.extend(preds.tolist())
            all_trues.extend(labels.tolist())

    # metrics
    acc_total = accuracy_score(y_true=all_trues, y_pred=all_preds)
    metrics = {
        'accuracy': acc_total,
        'precision': precision_score(y_true=all_trues, y_pred=all_preds, zero_division=0, average=avg),
        'recall': recall_score(y_true=all_trues, y_pred=all_preds, zero_division=0, average=avg),
        'f1': f1_score(y_true=all_trues, y_pred=all_preds, zero_division=0, average=avg),
        # mean of the per-batch losses; max(..., 1) avoids dividing by zero on an empty loader
        'avg_eval_loss': eval_loss / max(len(val_loader), 1),
    }

    print('****Evaluation****')
    print(f'total_accuracy: {acc_total}')
    # report the remaining metrics too, instead of computing and discarding them
    for name in ('precision', 'recall', 'f1', 'avg_eval_loss'):
        print(f'{name}: {metrics[name]}')

    return acc_total

In [None]:
# training loop
def train_eval(model, num_epochs, lr):
    """Train `model` for `num_epochs` epochs and return its best validation accuracy.

    Batches come from the module-level `train_loader`; after every epoch the model is
    evaluated with `dev` on the module-level `val_loader`.

    Args:
        model: network returning raw class logits for a batch of images.
        num_epochs: number of passes over the training data.
        lr: initial learning rate for Adam. It is multiplied by 0.1 at epoch 50 and
            every 30 epochs after that (while below `num_epochs`).

    Returns:
        The highest accuracy `dev` reported over all epochs (0.0 if no epoch improved on that).
    """
    model.to(device)

    best_val_acc = 0.0
    criterion = nn.CrossEntropyLoss()  # takes raw logits as input
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    lr_sched = MultiStepLR(optimizer=optimizer, milestones=list(range(50, num_epochs, 30)), gamma=.1)  # decaying lr

    for epoch in range(num_epochs):
        print(f'Epoch {epoch+1}')

        # dev() leaves the model in eval mode, so re-enable training behaviour each epoch
        model.train()

        for b, batch in tqdm.tqdm(enumerate(train_loader),
                                  total=len(train_loader), desc=f"Processing training data in epoch {epoch+1}"):
            images = batch['image'].to(device)
            labels = batch['label'].to(device)

            # the optimizer holds every model parameter, so one zero_grad is enough
            optimizer.zero_grad()

            raw_logits = model(images)  # forward pass
            loss = criterion(raw_logits, labels)

            loss.backward()  # backprop
            optimizer.step()  # gradient update

            # log periodically; printing every batch drowns out the progress bar
            if (b+1) % 20 == 0:
                print(f'batch: {b+1} ; loss: {loss.item()}')

        # each epoch, run validation
        acc = dev(model=model, val_loader=val_loader)

        if acc > best_val_acc:
            best_val_acc = acc
            # TODO: save a checkpoint of the best model here

        lr_sched.step()

    return best_val_acc

## Hyperparameter Tuning - Grid Search

In [None]:
num_epochs = 200
os.chdir(home)

# Learning-rate sweep for the baseline ConvNet: a fresh model is trained for each
# candidate and the accuracy returned by train_eval is recorded next to its lr.
history = []
for lr in (10**-3, 10**-2, 10**-1, 30**-1, 1):
    model = ConvNet()
    acc = train_eval(model, num_epochs, lr=lr)

    my_dict = {'lr': lr, 'acc': acc}
    history.append(my_dict)

In [None]:
# print/report final model hyperparameters and metrics

# Updated CNN Model

Since the previous CNN worked well, we will tweak that model architecture by adding layers to hopefully improve performance.

In [None]:
class ConvNet2(nn.Module):
    """Deeper CNN: three conv stages (the first two batch-normalized) and three linear layers.

    The first linear layer is sized for 3x224x224 inputs: each unpadded 3x3 convolution
    trims 2 pixels and each 2x2 pool halves the size
    (224 -> 222 -> 111 -> 109 -> 54 -> 52 -> 26), leaving 64 feature maps of 26x26.
    The network returns raw logits for 2 classes.
    """

    def __init__(self):
        # zero-argument super(): the previous super(ConvNet, self) named the wrong class
        # and raised a TypeError as soon as ConvNet2 was instantiated
        super().__init__()
        self.conv1 = nn.Conv2d(3, 16, 3)  # 3 input channels, 16 filters of size 3x3
        self.pool = nn.MaxPool2d(2, 2)  # 2x2 pooling window moved with step size 2
        self.conv2 = nn.Conv2d(16, 32, 3)  # input depth = number of filters in previous conv layer
        self.conv3 = nn.Conv2d(32, 64, 3)
        self.fc1 = nn.Linear(64*26*26, 120)
        self.fc2 = nn.Linear(120, 60)
        self.fc3 = nn.Linear(60, 2)  # one output logit per class
        self.bn1 = nn.BatchNorm2d(16)  # normalizes the output of conv1
        self.bn2 = nn.BatchNorm2d(32)  # normalizes the output of conv2

    def forward(self, x):  # forward pass
        """Return logits of shape (batch, 2) for an image batch of shape (batch, 3, 224, 224)."""
        x = self.pool(F.relu(self.bn1(self.conv1(x))))
        x = self.pool(F.relu(self.bn2(self.conv2(x))))
        x = self.pool(F.relu(self.conv3(x)))
        x = torch.flatten(x, 1)    # flatten everything except the batch dimension
        x = F.relu(self.fc1(x))    # fully connected, relu
        x = F.relu(self.fc2(x))
        return self.fc3(x)

## Hyperparameter Tuning - Grid Search

In [None]:
num_epochs = 200
os.chdir(home)

# Learning-rate sweep for ConvNet2: a fresh model is trained for each candidate and
# the accuracy returned by train_eval is recorded next to its lr.
history = []
for lr in (10**-3, 10**-2, 10**-1, 30**-1, 1):
    model = ConvNet2()
    acc = train_eval(model, num_epochs, lr=lr)

    my_dict = {'lr': lr, 'acc': acc}
    history.append(my_dict)

In [None]:
# print final model metrics

# MobileNetV2 Model

Because disaster reports are inherently spontaneous, we would like to investigate more efficient models such as MobileNetV2 and analyze the efficiency-accuracy tradeoff when predicting whether an image is informative or not informative.

In [None]:
# Involution Block - Filter for each "location" in the image
class Inv2d(nn.Module):
    """Involution layer: a k x k kernel is generated from the input at every spatial position.

    The kernel branch is: optional average pool (only when stride > 1) -> 1x1 conv
    reducing the channels by `reduction` -> batch norm -> ReLU -> 1x1 conv producing
    `kernel_size**2` weights per channel group at each position. Channels are split
    into `in_channels // group_ch` groups and all `group_ch` channels of a group share
    one kernel. The output has as many channels as the input.

    Args:
        in_channels: number of input (and output) channels; must be divisible by `group_ch`.
        reduction: channel reduction factor inside the kernel-generation branch.
        kernel_size: side length k of the generated kernels.
        group_ch: number of channels sharing one generated kernel.
        stride, padding, dilation: forwarded to `nn.Unfold` for patch extraction;
            `stride` also sets the average pool used in the kernel branch.
    """

    def __init__(self, in_channels, reduction=1, kernel_size=3, group_ch=1, stride=1, padding=1, dilation=1):
        super().__init__()

        # fail early: an uneven split would only surface later as an opaque view() error
        if in_channels % group_ch != 0:
            raise ValueError('in_channels must be divisible by group_ch')

        self.k = kernel_size
        self.group_ch = group_ch
        self.in_channels = in_channels
        self.out_channels = in_channels
        self.g = in_channels // group_ch  # number of channel groups
        self.r = reduction

        self.stride = stride
        self.padding = padding
        self.dilation = dilation

        # avg pool is for adjusting the kernel and therefore output based on the stride
        self.o = nn.AvgPool2d(kernel_size=stride, stride=stride) if stride > 1 else nn.Identity()
        self.reduce = nn.Conv2d(in_channels, in_channels // reduction, 1)
        self.batch_norm = nn.BatchNorm2d(in_channels // reduction)
        self.span = nn.Conv2d(in_channels // reduction, self.g*(kernel_size**2), 1)
        self.unfold = nn.Unfold(kernel_size, padding=padding, stride=stride, dilation=dilation)

    def forward(self, X):
        '''
        Input should be an image tensor of shape: (batch_size, channels, h, w)

        Returns a tensor of shape (batch_size, channels, h_out, w_out), where
        (h_out, w_out) is the spatial size of the generated kernel map.
        '''
        # KERNEL GENERATION
        W = self.reduce(self.o(X))
        W = F.relu(self.batch_norm(W))
        W = self.span(W)

        b, _, h, w = W.shape
        # one k*k kernel per group and position; the singleton axis broadcasts it
        # over the group_ch channels of the group
        W = W.view(b, self.g, 1, self.k**2, h*w)

        # INVOLUTION
        # unfold yields (b, channels * k*k, positions), positions in row-major (h, w) order
        patches = self.unfold(X)
        patches = patches.view(b, self.g, self.group_ch, self.k**2, h*w)
        out = patches*W

        # sum over the k*k kernel taps to get one value per channel and position
        out = out.view(b, self.out_channels, self.k**2, h*w).sum(dim=2)
        # positions are row-major, so restore (h, w) - not (w, h), which scrambles
        # every non-square input
        out = out.view(b, self.out_channels, h, w)

        return out
    

# Inverted residual block - bottleneck architecture with residual connection (like resnet)
class InvertedResidual(nn.Module):
    """Inverted residual (bottleneck) block of MobileNetV2.

    Layers, in order: an optional 1x1 expansion conv (skipped when `expand_ratio == 1`),
    a 3x3 depthwise conv that carries the block's stride, and a linear 1x1 projection
    followed by `norm_layer`. The input is added back to the output only when the block
    keeps both the resolution (stride 1) and the channel count.

    Args:
        in_channels: number of input channels.
        out_channels: number of output channels.
        stride: 1 or 2; stride of the depthwise conv.
        expand_ratio: hidden width as a multiple of `in_channels`.
        norm_layer: normalization layer constructor; defaults to `nn.BatchNorm2d`.

    Raises:
        ValueError: if `stride` is neither 1 nor 2.
    """

    def __init__(self, in_channels, out_channels, stride, expand_ratio, norm_layer=None):
        super().__init__()

        if stride not in (1, 2):
            raise ValueError('stride must be 1 or 2')

        if norm_layer is None:
            norm_layer = nn.BatchNorm2d

        hidden_dim = int(round(in_channels * expand_ratio))

        layers = []

        if expand_ratio != 1:
            # 1x1 expansion. It must not be strided: only the depthwise conv below
            # downsamples, otherwise a stride-2 block would shrink the feature map 4x.
            layers.append(Conv2dNormActivation(in_channels, hidden_dim, kernel_size=1,
                                               norm_layer=norm_layer, activation_layer=nn.ReLU6))
        # depthwise conv => pointwise => norm_layer
        # hidden_dim == in_channels if expand_ratio==1
        # b/c groups = hidden_dim, each input channel is convolved with its own set of filters
        # b/c instead of using multiple 3d kernels, we want multiple 2d kernels that each go across a different channel
        layers.extend([
            Conv2dNormActivation(hidden_dim, hidden_dim, kernel_size=3, stride=stride, groups=hidden_dim,
                                 norm_layer=norm_layer, activation_layer=nn.ReLU6),
            nn.Conv2d(hidden_dim, out_channels, 1, 1, 0, bias=False),  # point-wise (1x1 filter)
            norm_layer(out_channels),
        ])

        self.conv = nn.Sequential(*layers)

        self.stride = stride
        self.out_channels = out_channels
        self.in_channels = in_channels
        self._is_cn = stride > 1  # downsample indicator
        self.use_res_conn = stride == 1 and in_channels == out_channels

    def forward(self, x):
        """Apply the block, adding the residual connection when shapes allow it."""
        if self.use_res_conn:
            return x + self.conv(x)
        return self.conv(x)
        

def _make_divisible(v, divisor, min_value=None):
    '''
    Description: snaps v to a multiple of divisor (halves round up). The result is
    never below min_value (divisor when min_value is None), and one extra divisor is
    added whenever the snapped value would fall below 90% of v.
    '''
    lower_bound = divisor if min_value is None else min_value
    snapped = int(v + divisor / 2) // divisor * divisor
    candidate = max(lower_bound, snapped)
    # rounding down may not cost more than 10% of the requested value
    return candidate + divisor if candidate < 0.9 * v else candidate


# mobilenetv2 architecture
class MobileNetV2(nn.Module):
    """
        MobileNet V2 backbone. There is no classifier head: the forward pass returns the
        globally average-pooled, flattened feature vector of length `last_channel`.

        Args:
            width_mult (float): Width multiplier - scales the number of channels in each layer
            inverted_residual_setting: Network structure, one [t, c, n, s] row per stage
            round_nearest (int): Round the number of channels in each layer to be a multiple of this number
            Set to 1 to turn off rounding
            block: Module specifying inverted residual building block for mobilenet
            norm_layer: Module specifying the normalization layer to use
            dropout: accepted by the constructor but not used anywhere in this class

        """
    def __init__(self, width_mult=1.0, inverted_residual_setting = None, round_nearest=8,
                  block = None, norm_layer = None, dropout = 0.2):
        super().__init__()
        block = InvertedResidual if block is None else block
        norm_layer = nn.BatchNorm2d if norm_layer is None else norm_layer

        if inverted_residual_setting is None:
            # columns: t - expansion factor, c - output channels of the stage,
            #          n - number of repeated blocks, s - stride of the stage's first block
            inverted_residual_setting = [
                [1, 16, 1, 1],
                [6, 24, 2, 2],
                [6, 32, 3, 2],
                [6, 64, 4, 2],
                [6, 96, 3, 1],
                [6, 160, 3, 2],
                [6, 320, 1, 1],
            ]

        # channel counts of the stem (base 32) and of the final 1x1 conv (base 1280);
        # the final width is never scaled below its base value
        current_channels = _make_divisible(32 * width_mult, round_nearest)
        self.last_channel = _make_divisible(1280 * max(1.0, width_mult), round_nearest)

        # stem
        stages = [Conv2dNormActivation(3, current_channels, kernel_size=3, stride=2,
                                       norm_layer=norm_layer, activation_layer=nn.ReLU6)]

        # inverted residual stages: only the first block of a stage uses the stage's stride
        for expansion, channels, repeats, first_stride in inverted_residual_setting:
            stage_channels = _make_divisible(channels*width_mult, round_nearest)
            for position in range(repeats):
                block_stride = first_stride if position == 0 else 1
                stages.append(block(current_channels, stage_channels, block_stride,
                                    expand_ratio=expansion, norm_layer=norm_layer))
                current_channels = stage_channels

        # final 1x1 conv up to last_channel
        stages.append(Conv2dNormActivation(current_channels, self.last_channel, kernel_size=1,
                                           norm_layer=norm_layer, activation_layer=nn.ReLU6))

        # complete CNN architecture
        self.features = nn.Sequential(*stages)

        self._init_weights()

    def _init_weights(self):
        # Kaiming-normal (fan_out) for convolutions, unit scale / zero shift for
        # normalization layers, small normal weights for linear layers; biases are zeroed.
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight, mode="fan_out")
                if module.bias is not None:
                    nn.init.zeros_(module.bias)
            elif isinstance(module, (nn.BatchNorm2d, nn.GroupNorm)):
                nn.init.ones_(module.weight)
                nn.init.zeros_(module.bias)
            elif isinstance(module, nn.Linear):
                nn.init.normal_(module.weight, 0, 0.01)
                nn.init.zeros_(module.bias)

    def _forward_impl(self, x):
        feature_maps = self.features(x)  # pass input through layers
        # average each channel over all spatial positions, then drop the 1x1 spatial dims
        pooled = nn.functional.adaptive_avg_pool2d(feature_maps, (1, 1))
        return torch.flatten(pooled, 1)

    def forward(self, x):
        return self._forward_impl(x)

        

## Transfer Learning with MobileNetV2

We test whether transfer learning can improve the performance and efficiency of the MobileNetV2 model by initializing it with pretrained torchvision weights (`weights='IMAGENET1K_V2'`).

In [None]:
num_epochs = 200

# Pretrained backbone weights, fetched once and reused for every learning rate.
# Only the `features` sub-module is taken: the keys of the full torchvision state dict
# carry 'features.' / 'classifier.' prefixes and therefore match nothing inside
# `model.features` (with strict=False that mismatch was silently ignored and no
# pretrained weight was loaded at all).
pretrained = mobilenet_v2(weights='IMAGENET1K_V2')
weights = pretrained.features.state_dict()

history = []
for lr in [10**-2, 10**-1, 30**-1, 1]:
    my_dict = {}
    my_dict['lr'] = lr

    # instantiate our model
    backbone = MobileNetV2()

    # load pretrained weights; strict loading (the default) fails loudly on any key mismatch
    backbone.features.load_state_dict(weights)

    # turn off all but the topmost layers
    freeze_up_to = 9
    for param in backbone.features[:freeze_up_to].parameters():
        param.requires_grad = False

    # classification head on top of the backbone
    # 1280 is the num of features outputted by MobileNetv2 after flattening
    model = nn.Sequential(backbone, nn.Linear(1280, 512), nn.ReLU(), nn.Linear(512, 128), nn.ReLU(), nn.Linear(128, 2))

    # train_eval takes (model, num_epochs, lr); the old call passed an undefined
    # `run_name` as an extra positional argument
    acc = train_eval(model, num_epochs, lr=lr)
    my_dict['acc'] = acc
    history.append(my_dict)

In [None]:
# print final mobilenet metrics, is it better than the updated CNN?

# Grad CAM

In [None]:
# grad cam implementation

# Conclusion

In [None]:
# final model choice and analysis on metrics (compared to baseline model or existing models)

# Contribution From Group Members

In [None]:
# add stuff as needed

Agafia's Contribution: Implemented grid-search to optimize hyperparameters and transfer learning for MobileNetV2. Implemented involution class. Wrote code to connect to SQL server and build dataset in python.

Ethan's Contribution: Implemented and commented on ConvNet class and training/validation loops. 

Hudson's Contribution: Implemented and commented on MobileNetV2, InvertedResidual classes. Created final ipynb file.