# Final Assessment: Making Predictions on a Dataset without Domain Knowledge

## Tasks & Submission

In [436]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.utils.class_weight import compute_class_weight
from torch.utils.data import DataLoader, TensorDataset
from torchvision import transforms

class CNNModel(nn.Module):
    """
    Small convolutional classifier for 3-channel images and 3 classes.

    Two conv / max-pool / LeakyReLU stages are followed by global average
    pooling and a three-layer fully connected head that emits raw logits.
    The same preprocessing is applied in ``fit`` and ``predict`` so the
    network always sees inputs on the same scale.
    """
    def __init__(self):
        """
        Constructor for Model class.

        Parameters
        ----------
        self : object
            The instance of the object passed by Python.
        """
        super(CNNModel, self).__init__()
        self.conv = nn.Sequential(
                        nn.Conv2d(3, 32, (3,3)),
                        nn.MaxPool2d((2, 2)),
                        nn.LeakyReLU(0.1),
                        nn.Conv2d(32, 64, (3,3)),
                        nn.MaxPool2d((2, 2)),
                        nn.LeakyReLU(0.1)
                    )

        self.fc = nn.Sequential(
                        nn.Linear(64, 256),
                        nn.LeakyReLU(0.1),
                        nn.Linear(256, 128),
                        nn.LeakyReLU(0.1),
                        nn.Linear(128, 3)
                    )

        # Set by scale(..., fit=True); scale() is currently not part of the
        # CNN pipeline but is kept for experimentation.
        self.scaler = None

    def forward(self, x):
        """
        Compute class logits for a batch of images.

        Parameters
        ----------
        x : torch.Tensor of shape (n_samples, 3, height, width)
            Batch of preprocessed images.

        Returns
        -------
        torch.Tensor of shape (n_samples, 3)
            Unnormalised class scores (logits).
        """
        x = self.conv(x)
        #x = x.view(x.shape[0], 32, 4).mean(2) # GAP – do not remove this line
        # Global average pooling over the spatial dimensions -> (n_samples, 64)
        x = x.mean(dim=(2, 3))
        x = self.fc(x)

        return x

    @staticmethod
    def _fill_invalid_pixels(X):
        """Return a float copy of X with out-of-range/NaN pixels imputed.

        Values outside [0, 255] are treated as missing. Every missing pixel is
        replaced by the mean of the valid pixels of its own image channel.
        """
        X = np.asarray(X)
        dtype = X.dtype if np.issubdtype(X.dtype, np.floating) else np.float64
        # Work on a copy so the caller's array is never modified.
        X = X.astype(dtype, copy=True)

        X[(X > 255) | (X < 0)] = np.nan

        missing = np.isnan(X)
        if missing.any():
            # One mean per (image, channel), accumulated in float64.
            channel_means = np.nanmean(X, axis=(2, 3), keepdims=True,
                                       dtype=np.float64)
            empty = np.isnan(channel_means)
            if empty.any():
                print("There are NaN values in the channel means")
                # A channel without a single valid pixel has no mean; fall
                # back to 0 so NaNs cannot leak into the network.
                channel_means[empty] = 0.0
            X[missing] = np.broadcast_to(channel_means, X.shape)[missing]

        return X

    def preprocess(self, X):
        """
        Clean and normalise a batch of images.

        Out-of-range and NaN pixels are replaced by their channel mean, then
        pixel values are mapped from [0, 255] to [-1, 1]. The input array is
        left untouched.

        Parameters
        ----------
        X : ndarray of shape (n_samples, channel, height, width)
            Raw images.

        Returns
        -------
        ndarray of shape (n_samples, channel, height, width)
            Normalised copy of the images.
        """
        X = self._fill_invalid_pixels(X)

        # Same result as ToTensor + Normalize([0.5], [0.5]) on 8-bit images:
        # [0, 255] -> [0, 1] -> [-1, 1]. Done in NumPy because ToTensor does
        # not rescale floating point arrays.
        #transforms.RandomRotation(45)
        X = (X / 255.0 - 0.5) / 0.5

        return X

    def scale(self, X, fit=True):
        """
        Flatten each sample and standardise every feature.

        Parameters
        ----------
        X : ndarray of shape (n_samples, ...)
            Data to scale.
        fit : bool, default=True
            If True a new scaler is fitted on X and remembered; if False the
            previously fitted scaler is reused.

        Returns
        -------
        ndarray of shape (n_samples, n_features)
            Standardised, flattened data.
        """
        reshaped = X.reshape(X.shape[0], -1)
        if fit:
            self.scaler = StandardScaler()
            return self.scaler.fit_transform(reshaped)
        if self.scaler is None:
            raise RuntimeError("scale(fit=False) called before a scaler was fitted")

        return self.scaler.transform(reshaped)

    @staticmethod
    def _balanced_class_weights(y, n_classes):
        """Return a length-n_classes tensor of 'balanced' class weights.

        Classes that do not occur in y keep a neutral weight of 1 so the
        weight vector always matches the number of network outputs.
        """
        present = np.unique(y)
        balanced = compute_class_weight('balanced', classes=present, y=y)
        weights = np.ones(n_classes, dtype=np.float64)
        weights[present.astype(int)] = balanced

        return torch.tensor(weights, dtype=torch.float)

    def fit(self, X, y):
        """
        Train the model using the input data.

        Parameters
        ----------
        X : ndarray of shape (n_samples, channel, height, width)
            Training data.
        y : ndarray of shape (n_samples,)
            Target values. Samples whose label is NaN are ignored.

        Returns
        -------
        self : object
            Returns an instance of the trained model.
        """
        # predict() switches to eval mode; make sure we train in train mode.
        self.train()

        # Remove data with a corresponding NaN value in y
        y = np.asarray(y, dtype=np.float64)
        labelled = ~np.isnan(y)
        if not labelled.any():
            raise ValueError("fit requires at least one labelled sample")
        X = self.preprocess(np.asarray(X)[labelled])
        y = y[labelled]
        #X = self.scale(X)

        # Apply PCA
        #self.pca = PCA(n_components=0.99)
        #self.pca.fit(X)
        # Define the fc1 using the number of components from PCA
        #self.fc1 = nn.Linear(self.pca.n_components_, self.n_hidden1)
        #X_pca = self.pca.transform(X)

        # Calculate class weights for imbalanced data
        class_weights = self._balanced_class_weights(y, self.fc[-1].out_features)

        # Convert to tensors
        X = torch.from_numpy(X).float()
        y = torch.from_numpy(y).long()

        # Initialize dataloader
        train_loader = DataLoader(TensorDataset(X, y), shuffle=True, batch_size=512)

        optimizer = optim.Adam(self.parameters(), lr=1e-3)
        loss_fn = nn.CrossEntropyLoss(class_weights)

        # Train the model
        num_epochs = 200
        for epoch in range(num_epochs):
            epoch_loss = 0
            for inputs, labels in train_loader:
                outputs = self(inputs)
                loss = loss_fn(outputs, labels)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                epoch_loss += loss.item()

            # Mean loss over the mini-batches of this epoch
            epoch_loss = epoch_loss / len(train_loader)
            print ("Epoch: {}, Loss: {}".format(epoch, epoch_loss))

        return self

    def predict(self, X):
        """
        Use the trained model to make predictions.

        Parameters
        ----------
        X : ndarray of shape (n_samples, channel, height, width)
            Input data.

        Returns
        -------
        ndarray of shape (n_samples,)
        Predicted target values per element in X.

        """
        self.eval()
        # Apply exactly the preprocessing used during training.
        X = self.preprocess(X)
        #X = self.scale(X)
        #X = self.pca.transform(X)
        X = torch.from_numpy(X).float()
        with torch.no_grad():
            outputs = self(X)

        return torch.argmax(outputs, dim=1).numpy()

In [489]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.utils.class_weight import compute_class_weight
from torch.utils.data import DataLoader, TensorDataset

class MLPModel(nn.Module):
    """
    Fully connected classifier operating on PCA-compressed images.

    Pipeline: impute invalid pixels -> standardise -> PCA (99% variance) ->
    four-layer MLP with LeakyReLU activations and a log-softmax output.
    The scaler and PCA fitted in ``fit`` are reused unchanged in ``predict``.
    """
    def __init__(self):
        """
        Constructor for Model class.

        Parameters
        ----------
        self : object
            The instance of the object passed by Python.
        """
        super(MLPModel, self).__init__()
        self.lrelu = nn.LeakyReLU(0.1)
        self.n_hidden1 = 1024
        self.n_hidden2 = 512
        self.n_hidden3 = 256
        self.n_classes = 3
        # fc1 is created in fit() because its input width is the number of
        # PCA components, which is only known once PCA has been fitted.
        self.fc2 = nn.Linear(self.n_hidden1, self.n_hidden2)
        self.fc3 = nn.Linear(self.n_hidden2, self.n_hidden3)
        self.fc4 = nn.Linear(self.n_hidden3, self.n_classes)
        self.log_softmax = nn.LogSoftmax(dim=1)

        # Fitted preprocessing state, populated by fit().
        self.scaler = None
        self.pca = None

    def forward(self, x):
        """
        Compute class log-probabilities.

        Parameters
        ----------
        x : torch.Tensor of shape (n_samples, n_components)
            PCA-transformed samples.

        Returns
        -------
        torch.Tensor of shape (n_samples, n_classes)
            Log-probabilities of each class.
        """
        x = self.fc1(x)
        x = self.lrelu(x)
        x = self.fc2(x)
        x = self.lrelu(x)
        x = self.fc3(x)
        x = self.lrelu(x)
        x = self.fc4(x)
        x = self.log_softmax(x)

        return x

    def preprocess(self, X):
        """
        Replace invalid pixels by the mean of their image channel.

        Values outside [0, 255] and NaNs count as invalid. The input array is
        left untouched; a cleaned floating point copy is returned.

        Parameters
        ----------
        X : ndarray of shape (n_samples, channel, height, width)
            Raw images.

        Returns
        -------
        ndarray of shape (n_samples, channel, height, width)
            Cleaned copy of the images.
        """
        X = np.asarray(X)
        dtype = X.dtype if np.issubdtype(X.dtype, np.floating) else np.float64
        # Work on a copy so the caller's array is never modified.
        X = X.astype(dtype, copy=True)

        # Normalize the data by replacing outliers with NaN
        X[(X > 255) | (X < 0)] = np.nan

        # Replace NaN values with the mean of the channel
        missing = np.isnan(X)
        if missing.any():
            # One mean per (image, channel), accumulated in float64.
            channel_means = np.nanmean(X, axis=(2, 3), keepdims=True,
                                       dtype=np.float64)
            empty = np.isnan(channel_means)
            if empty.any():
                print("There are NaN values in the channel means")
                # A channel without a single valid pixel has no mean; fall
                # back to 0 so NaNs cannot reach the scaler / PCA.
                channel_means[empty] = 0.0
            X[missing] = np.broadcast_to(channel_means, X.shape)[missing]

        return X

    def scale(self, X, fit=True):
        """
        Flatten each sample and standardise every feature.

        Parameters
        ----------
        X : ndarray of shape (n_samples, ...)
            Data to scale.
        fit : bool, default=True
            If True a new scaler is fitted on X and remembered; if False the
            scaler fitted earlier is reused (required at prediction time so
            test data is scaled with the training statistics).

        Returns
        -------
        ndarray of shape (n_samples, n_features)
            Standardised, flattened data.
        """
        reshaped = X.reshape(X.shape[0], -1)
        if fit:
            self.scaler = StandardScaler()
            return self.scaler.fit_transform(reshaped)
        if self.scaler is None:
            raise RuntimeError("scale(fit=False) called before a scaler was fitted")

        return self.scaler.transform(reshaped)

    def _balanced_class_weights(self, y):
        """Return a length-n_classes tensor of 'balanced' class weights.

        Classes that do not occur in y keep a neutral weight of 1 so the
        weight vector always matches the number of network outputs.
        """
        present = np.unique(y)
        balanced = compute_class_weight('balanced', classes=present, y=y)
        weights = np.ones(self.n_classes, dtype=np.float64)
        weights[present.astype(int)] = balanced

        return torch.tensor(weights, dtype=torch.float)

    def fit(self, X, y):
        """
        Train the model using the input data.

        Parameters
        ----------
        X : ndarray of shape (n_samples, channel, height, width)
            Training data.
        y : ndarray of shape (n_samples,)
            Target values. Samples whose label is NaN are used to fit the
            scaler and PCA but are excluded from supervised training.

        Returns
        -------
        self : object
            Returns an instance of the trained model.
        """
        # predict() switches to eval mode; make sure we train in train mode.
        self.train()

        y = np.asarray(y, dtype=np.float64)
        labelled = ~np.isnan(y)
        if not labelled.any():
            raise ValueError("fit requires at least one labelled sample")

        X = self.preprocess(X)
        X = self.scale(X, fit=True)

        # Apply PCA (unsupervised, so unlabelled samples contribute as well)
        self.pca = PCA(n_components=0.99)
        X_pca = self.pca.fit_transform(X)
        # Define the fc1 using the number of components from PCA
        self.fc1 = nn.Linear(int(self.pca.n_components_), self.n_hidden1)

        # Remove data with a corresponding NaN value in y
        X_pca = X_pca[labelled]
        y = y[labelled]

        # Calculate class weights for imbalanced data
        class_weights = self._balanced_class_weights(y)

        # Convert to tensors
        X = torch.from_numpy(X_pca).float()
        y = torch.from_numpy(y).long()

        # Initialize dataloader
        train_loader = DataLoader(TensorDataset(X, y), shuffle=True, batch_size=64)

        # Created after fc1 so that its parameters are optimised too.
        optimizer = optim.Adam(self.parameters(), lr=1e-3)
        # forward() already returns log-probabilities, so the matching loss is
        # NLLLoss (CrossEntropyLoss would apply log-softmax a second time).
        loss_fn = nn.NLLLoss(weight=class_weights)

        # Train the model
        num_epochs = 100
        for epoch in range(num_epochs):
            epoch_loss = 0
            for inputs, labels in train_loader:
                outputs = self(inputs)
                loss = loss_fn(outputs, labels)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                epoch_loss += loss.item()

            # Mean loss over the mini-batches of this epoch
            epoch_loss = epoch_loss / len(train_loader)
            print ("Epoch: {}, Loss: {}".format(epoch, epoch_loss))

        return self

    def predict(self, X):
        """
        Use the trained model to make predictions.

        Parameters
        ----------
        X : ndarray of shape (n_samples, channel, height, width)
            Input data.

        Returns
        -------
        ndarray of shape (n_samples,)
        Predicted target values per element in X.

        """
        if self.scaler is None or self.pca is None:
            raise RuntimeError("predict called before fit")

        self.eval()
        X = self.preprocess(X)
        # Reuse the training-time scaler; never re-fit on test data.
        X = self.scale(X, fit=False)
        X = self.pca.transform(X)
        X = torch.from_numpy(X).float()
        with torch.no_grad():
            outputs = self(X)

        return torch.argmax(outputs, dim=1).numpy()

#### Local Evaluation

You may test your solution locally by running the following code. Do note that the results may not reflect your performance in Coursemology. You should not be submitting the code below in Coursemology. The code here is meant only for you to do local testing.

In [36]:
# Import packages
import pandas as pd
import numpy as np
import os
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split

In [3]:
# Read the pickled dataset dictionary and pull out images and labels.
# NOTE: allow_pickle=True unpickles the file content; only load trusted files.
with open('data.npy', 'rb') as f:
    data = np.load(f, allow_pickle=True).item()
X, y = data['image'], data['label']

In [493]:
# Hold out 10% of the samples for local testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

# Keep only the held-out samples that actually have a label.
# In Coursemology, the test data is guaranteed to have labels
mask = ~np.isnan(y_test)
nan_indices = np.argwhere(~mask).squeeze()
X_test, y_test = X_test[mask], y_test[mask]

# Train and predict
model = MLPModel()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluate model prediction
# Learn more: https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics
# You may encounter errors here; you are expected to figure out what the issue is.
macro_f1 = f1_score(y_test, y_pred, average='macro')
print("F1 Score (macro): {0:.2f}".format(macro_f1))

Epoch: 0, Loss: 0.879564479798884
Epoch: 1, Loss: 0.321165699813817
Epoch: 2, Loss: 0.11935280822217464
Epoch: 3, Loss: 0.035500711547462524
Epoch: 4, Loss: 0.016074027913320507
Epoch: 5, Loss: 0.010740979283975714
Epoch: 6, Loss: 0.003127313926155251
Epoch: 7, Loss: 0.0013565582548098235
Epoch: 8, Loss: 0.00044996448952981546
Epoch: 9, Loss: 0.0002582809415675788
Epoch: 10, Loss: 0.00016979354679918019
Epoch: 11, Loss: 0.00013469944747772093
Epoch: 12, Loss: 0.00010899466139341491
Epoch: 13, Loss: 9.100555512673423e-05
Epoch: 14, Loss: 7.308857798430049e-05
Epoch: 15, Loss: 6.194561432789055e-05
Epoch: 16, Loss: 5.538049874696692e-05
Epoch: 17, Loss: 4.664921721701536e-05
Epoch: 18, Loss: 3.795214633027098e-05
Epoch: 19, Loss: 3.730957808550778e-05
Epoch: 20, Loss: 3.42695875588106e-05
Epoch: 21, Loss: 2.9910915981344186e-05
Epoch: 22, Loss: 2.7275329213463574e-05
Epoch: 23, Loss: 2.3614136525056154e-05
Epoch: 24, Loss: 2.2741821966691703e-05
Epoch: 25, Loss: 2.0058607628311006e-05
Ep