# Tools / Modules

In [1]:
import numpy as np # for handling numbers
import pandas as pd # for handling spreadsheet data
import os # to retrive data-set files
import cv2 # computer vision for extracting features from images
import torch # pytorch for NN/CNN


# General Data loading and pre-processing

In [2]:
# Function to load the dataset into data frames.
# Encapsulated in a function to restrict the scope of variables that will not be needed in
# later code blocks.
# grayscale: flag to convert images to gray scale; off by default.
# resizeX and resizeY: set both to resize images to the desired size; if either
# is 0, resizing is off.

def loadData(grayscale: bool = False, resizeX: int = 0, resizeY: int = 0):
    """Load the train/test image folders into two pandas DataFrames.

    The images are read from `train/REAL`, `train/FAKE`, `test/REAL` and
    `test/FAKE` under the current working directory.

    Args:
        grayscale: when truthy, convert every image to a single gray channel
            and store it flattened; otherwise store it as rows of 3 channel
            values (OpenCV loads colour images in BGR order).
        resizeX: target width passed to cv2.resize; resizing only happens
            when both resizeX and resizeY are > 0.
        resizeY: target height passed to cv2.resize.

    Returns:
        (trainData, testData): DataFrames with the columns
        'image' (absolute file path), 'label' ('REAL' or 'FAKE') and
        'features' (numpy array holding the pixel values).

    Files that OpenCV cannot decode (cv2.imread returns None, e.g. for
    thumbnails or other stray non-image files) are skipped with a warning
    instead of crashing later in the pipeline.
    """
    import warnings  # local import: only used to report unreadable files

    # helper function to fill the data dictionary with data from one class folder
    def fillData(data: dict, folder: str, label: str):
        # every entry in the folder is expected to be an image file
        for fileName in os.listdir(folder):
            absImagePath = os.path.join(folder, fileName)
            fileImage = cv2.imread(absImagePath)
            # imread signals failure by returning None rather than raising
            if fileImage is None:
                warnings.warn(f"Skipping unreadable image file: {absImagePath}")
                continue
            if grayscale:
                fileImage = cv2.cvtColor(fileImage, cv2.COLOR_BGR2GRAY)
            if resizeX > 0 and resizeY > 0:
                fileImage = cv2.resize(fileImage, (resizeX, resizeY))
            if grayscale:
                # single channel: just flatten it
                features = fileImage.flatten()
            else:
                # colour: one row of 3 channel values per pixel
                features = fileImage.reshape((-1, 3))
            # append only after the image was processed so the three lists
            # always stay the same length
            data['image'].append(absImagePath)
            data['label'].append(label)
            data['features'].append(features)

    # dictionaries that temporarily house the data
    # image = image path list, label = fake or real
    trainData = {'image': [], 'label': [], 'features': []}
    testData = {'image': [], 'label': [], 'features': []}

    # Build the folder paths with os.path.join so this works on every
    # platform Python runs on.
    currentDir = os.getcwd()
    trainDirReal = os.path.join(currentDir, 'train', 'REAL')
    trainDirFake = os.path.join(currentDir, 'train', 'FAKE')
    testDirReal = os.path.join(currentDir, 'test', 'REAL')
    testDirFake = os.path.join(currentDir, 'test', 'FAKE')

    # read the file list of each folder and append paths, labels and features
    fillData(trainData, trainDirReal, 'REAL')
    fillData(trainData, trainDirFake, 'FAKE')
    fillData(testData, testDirReal, 'REAL')
    fillData(testData, testDirFake, 'FAKE')

    # convert from dictionaries to dataframes for ease of access and
    # compatibility with ML library function calls
    return pd.DataFrame(trainData), pd.DataFrame(testData)
# Execute the loader; it returns two dataframes (train and test), each holding both classes.
# Train and test data are kept in separate dataframes to enforce data hygiene.
trainData,testData = loadData(grayscale=False)
# Check that the data was loaded successfully by displaying the first 10 entries.
trainData.head(10)

Unnamed: 0,image,label,features
0,c:\Users\bryan\Documents\Programming\Git\ML-Pr...,REAL,"[[146, 166, 124], [137, 142, 80], [218, 189, 8..."
1,c:\Users\bryan\Documents\Programming\Git\ML-Pr...,REAL,"[[237, 218, 227], [219, 205, 211], [209, 202, ..."
2,c:\Users\bryan\Documents\Programming\Git\ML-Pr...,REAL,"[[195, 174, 176], [193, 175, 176], [194, 176, ..."
3,c:\Users\bryan\Documents\Programming\Git\ML-Pr...,REAL,"[[74, 102, 133], [0, 23, 53], [3, 20, 47], [35..."
4,c:\Users\bryan\Documents\Programming\Git\ML-Pr...,REAL,"[[181, 165, 152], [182, 166, 153], [183, 167, ..."
5,c:\Users\bryan\Documents\Programming\Git\ML-Pr...,REAL,"[[56, 37, 29], [48, 31, 22], [81, 63, 56], [17..."
6,c:\Users\bryan\Documents\Programming\Git\ML-Pr...,REAL,"[[90, 92, 92], [86, 88, 88], [66, 67, 71], [86..."
7,c:\Users\bryan\Documents\Programming\Git\ML-Pr...,REAL,"[[160, 211, 207], [156, 207, 200], [178, 222, ..."
8,c:\Users\bryan\Documents\Programming\Git\ML-Pr...,REAL,"[[110, 98, 98], [112, 100, 100], [115, 102, 10..."
9,c:\Users\bryan\Documents\Programming\Git\ML-Pr...,REAL,"[[252, 252, 255], [250, 253, 255], [243, 251, ..."


# CNN Class

In [3]:
import torch.nn as nn # importing nueral networks from pytorch
import torch.nn.functional as F # API for NN operations
import torch.optim as optim # loss function and optimizer
class CNN(nn.Module):
    """Small LeNet-style convolutional classifier for 32x32 images.

    Two conv + max-pool stages are followed by three fully connected layers.
    The first fully connected layer expects 16 * 5 * 5 inputs, which is what
    the conv/pool stack produces for a 32x32 input
    (32 -> conv5 -> 28 -> pool -> 14 -> conv5 -> 10 -> pool -> 5).

    Args:
        in_channels: number of channels of the input images
            (3 for colour, 1 for gray scale). Defaults to 3.
        num_classes: number of output neurons / classes. Defaults to 2.
    """

    def __init__(self, in_channels: int = 3, num_classes: int = 2):
        super().__init__()
        # in_channels input channels, 6 output channels, 5x5 kernel
        self.conv1 = nn.Conv2d(in_channels, 6, 5)
        # max pooling layer with a 2x2 kernel and a stride of 2 (shared by both stages)
        self.pool = nn.MaxPool2d(2, 2)
        # 6 input channels (from the prior layer), 16 output channels, 5x5 kernel
        self.conv2 = nn.Conv2d(6, 16, 5)
        # fully connected layers with (input, output) sizes
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        # one output neuron per class
        self.fc3 = nn.Linear(84, num_classes)

    def forward(self, x):
        """Return the raw class scores (logits) for a batch of images."""
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = torch.flatten(x, 1)  # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        # no activation here: the logits are returned as-is
        return self.fc3(x)

# model instance that is trained and evaluated in the cells below
cnn = CNN()

# loss function: cross entropy over the network's raw class scores
cnn_criterion = nn.CrossEntropyLoss()
# optimizer: SGD with momentum, bound to the parameters of `cnn`
cnn_optimizer = optim.SGD(cnn.parameters(), lr = 0.001, momentum=0.9)


# Custom dataloader / Dataset Class For Pytorch

In [4]:
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
from sklearn.preprocessing import LabelEncoder

class customDataset(Dataset):
    """PyTorch Dataset wrapping a pre-processed pandas dataframe.

    Each row of the dataframe must provide a 'features' entry (pixel values
    that can be reshaped to `image_shape`) and a 'label' entry
    ('FAKE' or 'REAL').

    Args:
        dataframe: the pre-processed dataframe from the loading step.
        transform: optional callable applied to the reshaped numpy image;
            when None the image is returned as a tensor without any
            transformation.
        image_shape: (height, width, channels) the stored features are
            reshaped to. Defaults to (32, 32, 3).
    """

    # init = class constructor
    def __init__(self, dataframe=None, transform=None, image_shape=(32, 32, 3)):
        self.data = dataframe
        # the transform is only stored here; it is applied per sample in __getitem__
        self.transform = transform
        self.image_shape = tuple(image_shape)

    # returns # of samples in dataset
    def __len__(self):
        # a dataset constructed without a dataframe is simply empty
        return 0 if self.data is None else len(self.data)

    # loads and returns a sample from the dataset at the given index 'idx'
    def __getitem__(self, idx):
        # grab row #idx
        sample = self.data.iloc[idx]
        # the 'features' column holds the pixel data of one image;
        # restore it to (height x width x color channels)
        image_data = np.reshape(sample['features'], self.image_shape)
        if self.transform is not None:
            # the transform receives the numpy image (not a tensor)
            image_tensor = self.transform(image_data)
        else:
            # no transform given: hand back the pixels unchanged as a tensor
            image_tensor = torch.from_numpy(image_data)
        # encode the label string as a number: 'FAKE' -> 0, anything else -> 1
        label = 0 if sample['label'] == 'FAKE' else 1
        # return the image tensor and encoded label
        return image_tensor, label

# Preprocessing for NN
Use the class above to convert the data into tensors whose format and dimensions match what PyTorch expects.

In [5]:
# batch size used by both data loaders
batchSize = 20
# ToTensor scales the uint8 pixel values from [0,255] to [0,1]; Normalize then maps them
# to the [-1,1] range, which is common practice for CNNs.
# This reduces training time and increases the rate of convergence.
# The first parameter is the mean, the second the std. Each is a 3-tuple because it is given per color channel.
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
# create the custom pytorch datasets from the pre-processed pandas dataframes
traindataset = customDataset(dataframe=trainData, transform=transform)
testdataset = customDataset(dataframe=testData,transform=transform)
# create a pytorch DataLoader for each custom dataset
# NOTE(review): shuffling the test loader does not change the reported metrics — confirm it is intended
traindataloader = DataLoader(traindataset, batch_size=batchSize,shuffle=True)
testdataloader = DataLoader(testdataset,batch_size=batchSize,shuffle=True)

# Training NN Function

In [6]:
# defaults to 5 iterations
def trainNN(model, dataloader, iterations=5, criterion=None, optimizer=None, log_every=2000):
    """Train `model` on the batches produced by `dataloader`.

    Args:
        model: the network to train.
        dataloader: iterable yielding (inputs, labels) mini-batches.
        iterations: number of passes (epochs) over the dataloader. Defaults to 5.
        criterion: loss function; falls back to the notebook-level
            `cnn_criterion` when None.
        optimizer: optimizer that must be bound to `model`'s parameters;
            falls back to the notebook-level `cnn_optimizer` when None
            (which is only correct when `model` is `cnn`).
        log_every: print the average loss every `log_every` mini-batches;
            a value <= 0 disables the progress output. Defaults to 2000.
    """
    # Resolve the defaults at call time so the notebook-level objects are
    # only required when the caller does not supply their own.
    if criterion is None:
        criterion = cnn_criterion
    if optimizer is None:
        optimizer = cnn_optimizer

    # Make sure the model is in training mode, e.g. after an earlier
    # evaluation run switched it to eval mode.
    model.train()

    for epoch in range(iterations):  # loop over the dataset multiple times
        running_loss = 0.0
        # each item is one mini-batch of [inputs, labels]; its size is set by the dataloader
        for i, (inputs, labels) in enumerate(dataloader):
            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            if log_every > 0 and (i + 1) % log_every == 0:
                print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / log_every:.3f}')
                running_loss = 0.0
    print('Finished Training')
      

# Training the CNN

In [7]:
# call the function above to train the CNN on the training data loader (default number of iterations)
trainNN(cnn,traindataloader)

Finished Training


# Test CNN

In [8]:
from sklearn.metrics import classification_report
def testNNModel(NN, testloader):        
    correct = 0
    total = 0
    y_true = []
    y_pred = []
    # since we're not training, we don't need to calculate the gradients for our outputs
    with torch.no_grad():
        # ensures expected behavior when evaluating the model by putting it into eval mode
        NN.eval()
        for data in testloader:
            images, labels = data
            # calculate outputs by running images through the network
            outputs = NN(images)
            # the class with the highest energy is what we choose as prediction
            _, predicted = torch.max(outputs.data, 1)
            
            # update the true and predicted labels
            y_true.extend(labels.numpy())
            y_pred.extend(predicted.numpy())
            
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    print(f'Accuracy of the network on the {len(testData)} test images: {100 * correct // total} %')

    # calculate the confusion matrix
    print(classification_report(y_true, y_pred))
# run the function above to evaluate the trained CNN on the test data loader
testNNModel(cnn,testdataloader)

Accuracy of the network on the 10000 test images: 84 %
              precision    recall  f1-score   support

           0       0.82      0.88      0.85      5000
           1       0.87      0.81      0.84      5000

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000

