# Support Vector Machine

**Importing RAPIDS CuML library**

The RAPIDS library can be used to accelerate this Kaggle notebook on a GPU, in particular through its cudf and cuml packages. Note: the RAPIDS setup cells below are currently commented out, so the notebook as written runs scikit-learn on the CPU.

In [1]:
# # This Python 3 environment comes with many helpful analytics libraries installed
# # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# # For example, here's several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# import matplotlib.pyplot as plt
# # Input data files are available in the read-only "../input/" directory
# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os

# # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# # Setup

# import sys
# !cp ../input/rapids/rapids.0.18.0 /opt/conda/envs/rapids.tar.gz
# !cd /opt/conda/envs/ && tar -xzvf rapids.tar.gz > /dev/null
# sys.path = ["/opt/conda/envs/rapids/lib/python3.7/site-packages"] + sys.path
# sys.path = ["/opt/conda/envs/rapids/lib/python3.7"] + sys.path
# sys.path = ["/opt/conda/envs/rapids/lib"] + sys.path 
# !cp /opt/conda/envs/rapids/lib/libxgboost.so /opt/conda/lib/

In [3]:
# !conda install -c conda-forge cupy -y

In [4]:
# !conda install -c rapidsai -c conda-forge -c nvidia rmm cuda-version=11.8 -y

In [5]:
# import cudf
# import cuml

***Function to track memory usage***

In [6]:
def mem_usage():
    """Report the current process's memory use as a printable string.

    Reads the first field of ``psutil.Process(...).memory_info()`` for this
    process (documented by psutil as the resident set size), converts it from
    bytes to units of 2**30 bytes and rounds to two decimals.

    Returns:
        str: text of the form ``'memory usage: <value> GB\\n'``.
    """
    bytes_per_unit = 2. ** 30
    this_process = psutil.Process(os.getpid())
    used = np.round(this_process.memory_info()[0] / bytes_per_unit, 2)
    return 'memory usage: ' + str(used) + " GB\n"

**Importing SVM and other libraries**

In [7]:
# !pip install scikit-image --

In [None]:
from sklearn import datasets, metrics, svm
from sklearn.metrics import ConfusionMatrixDisplay, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
# import seaborn as sns
from skimage import io, transform
import numpy as np
# import pandas as pd

import time
import psutil
import os
import gc

Define functions that load the images and flatten each one into a one-dimensional array (feature vector). The split into training and testing datasets is performed later, inside `create_svc`.

In [None]:
# Reclaim unreferenced objects before any images are loaded.
gc.collect()

# Root folder of the ALASKA2 steganalysis data on Kaggle; it holds one
# sub-folder per image category (Cover, JMiPOD, JUNIWARD, UERD).
dataset_dir = "/kaggle/input/alaska2-image-steganalysis"

# Stego categories to compare against the cover images. The first entry asks
# defineData to mix images from all three embedding algorithms.
classes = [
    "all three stego algorithms",
    "JMiPOD",
    "JUNIWARD",
    "UERD",
]

def collectImages(image_paths, images, labels):
    """Read every image in ``image_paths`` and append it, with its label, in place.

    Args:
        image_paths: iterable of image file paths (strings).
        images: list that receives the result of ``io.imread`` for each path.
        labels: list that receives ``"cover"`` when the path contains the
            substring ``"Cover"`` and ``"stego"`` otherwise.

    Returns:
        None. ``images`` and ``labels`` are extended in place, one entry per path.
    """
    for path in image_paths:
        # Images are stored as read; no resizing is applied.
        images.append(io.imread(path))
        if "Cover" in path:
            labels.append("cover")
        else:
            labels.append("stego")
        
def _list_jpgs(folder):
    """Return full paths of the ``.jpg`` files in ``dataset_dir/folder``, sorted by name.

    ``os.listdir`` returns entries in arbitrary order, so the names are sorted
    to make the sampled subset reproducible between runs and machines.
    """
    folder_path = dataset_dir + "/" + folder + "/"
    return [
        folder_path + file_name
        for file_name in sorted(os.listdir(folder_path))
        if file_name.endswith(".jpg")
    ]


def defineData(stego_class, class_size=100):
    """Load a balanced cover-vs-stego sample and flatten it into feature vectors.

    Args:
        stego_class: name of the stego sub-folder to compare against Cover
            ("JMiPOD", "JUNIWARD" or "UERD"), or "all three stego algorithms"
            to take roughly one third of the stego images from each folder.
        class_size: maximum number of images taken per class (cover / stego).
            Defaults to 100.

    Returns:
        tuple: ``(data, labels)`` where ``data`` is a float32 array of shape
        ``(n_samples, n_features)`` with pixel values scaled by 1/255, and
        ``labels`` is an array of ``"cover"`` / ``"stego"`` strings.

    Raises:
        ValueError: if no ``.jpg`` image was found for the requested classes.
    """
    print("Binary classification between Cover and", stego_class)

    # Parallel lists filled in place by collectImages.
    images = []
    labels = []

    collectImages(_list_jpgs("Cover")[:class_size], images, labels)

    if stego_class == "all three stego algorithms":
        # Take disjoint index ranges from the three folders so the stego class
        # totals class_size images.
        first_cut = class_size // 3
        second_cut = 2 * class_size // 3
        collectImages(_list_jpgs("JMiPOD")[:first_cut], images, labels)
        collectImages(_list_jpgs("JUNIWARD")[first_cut:second_cut], images, labels)
        collectImages(_list_jpgs("UERD")[second_cut:class_size], images, labels)
    else:
        collectImages(_list_jpgs(stego_class)[:class_size], images, labels)

    if not images:
        raise ValueError("No .jpg images found under " + dataset_dir + " for " + str(stego_class))

    images = np.array(images)
    labels = np.array(labels)

    n_samples = len(images)

    # Flatten every image into a single feature vector.
    data = images.reshape((n_samples, -1))
    print("\nvector shape:")
    print(data.shape)

    # Scale pixel values by 1/255. The previous per-row loop only rebound its
    # loop variable, so the returned data was never actually scaled.
    # float32 keeps the memory footprint at half that of float64.
    data = data.astype(np.float32)
    data /= 255

    print(mem_usage())
    return data, labels

**Let's create an SVM model, train it, and then make predictions on the test dataset.**

`create_svc` builds a support vector classifier with one of the kernels listed below. Each kernel produces a different decision boundary and therefore different accuracy. The driver loop at the end of the cell currently runs only the RBF kernel.

* Linear kernel
* Polynomial kernel
* RBF kernel

In [None]:
# Wall-clock start of the experiment; the final print of this cell subtracts it from time.time() to report total elapsed seconds.
start_time = time.time()

# Historical results (recorded with the earlier preprocessing, kept for reference):
#      * linear: 53% w/ ~70/30 train/test split, random_state = 0
#      * poly: timed out after 12 hours
#      * rbf: 50% w/ ~70/30 train/test split, random_state = 1234, C = 1000, gamma = 'auto'
def _square_features(features):
    """Return the element-wise square of ``features`` computed in floating point."""
    features = np.asarray(features)
    # Integer pixel data (e.g. uint8) would wrap around when squared, so
    # promote it to float first; floating-point input keeps its dtype.
    if not np.issubdtype(features.dtype, np.floating):
        features = features.astype(np.float64)
    return features ** 2


def create_svc(kernel_in, data, labels, algorithm_class):
    """Train an SVM on a 70/30 split and report how well it separates cover from stego.

    Args:
        kernel_in: SVC kernel name. 'linear' and 'poly' get their own settings;
            any other value (e.g. 'rbf') is standardised and uses C=1000.
        data: 2-D array of feature vectors, one row per image.
        labels: array of class labels aligned with ``data``.
        algorithm_class: stego category name, used only in printed text and
            in the confusion-matrix title.

    Returns:
        tuple: ``(model, classifier, prediction)``. ``model`` is the value
        returned by ``classifier.fit`` (scikit-learn returns the estimator
        itself), ``classifier`` is the SVC instance and ``prediction`` holds
        the predicted labels for the test split.
    """
    # split the data
    X_train, X_test, y_train, y_test = train_test_split(data, labels, random_state = 12, test_size=0.3, shuffle=True)

    # Apply the squared-feature transform to BOTH splits. Previously only the
    # training split was squared, so the model was evaluated on features that
    # differed from the ones it was trained on.
    X_train = _square_features(X_train)
    X_test = _square_features(X_test)

    print("Finished splitting the data")
    print(mem_usage())

    # create SVC with chosen kernel
    print("Creating SVC from svm")
    if kernel_in == 'linear':
        classifier = svm.SVC(kernel = kernel_in, C=10, random_state = 0, class_weight = 'balanced')
    elif kernel_in == 'poly':
        classifier = svm.SVC(kernel = kernel_in, C=10, random_state = 0, degree = 2, gamma = 'auto', class_weight = 'balanced')
    else:
        sc = StandardScaler()
        X_train = sc.fit_transform(X_train)
        # Reuse the statistics learned from the training split; re-fitting on
        # the test split would scale the two splits differently.
        X_test = sc.transform(X_test)
        classifier = svm.SVC(kernel = kernel_in, C=1000, random_state = 1234, gamma = 'auto', class_weight = 'balanced')
    print(mem_usage())

    # fit and test classifier
    print("fitting SVC")
    model = classifier.fit(X = X_train, y = y_train)
    print(mem_usage())

    print("finished fitting SVC, testing SVC")
    prediction = classifier.predict(X_test)
    print(mem_usage())

    # print classifier results
    print("Classification report for " + algorithm_class + " classifier %s:\n%s\n"
      % (classifier, metrics.classification_report(y_test, prediction)))

    # display confusion matrix, built from the predictions computed above so
    # the classifier does not have to predict the test split a second time
    np.set_printoptions(precision=2)
    disp = ConfusionMatrixDisplay.from_predictions(
        y_test,
        prediction,
        display_labels = ["cover", "stego"],
        cmap = plt.cm.Blues,
        normalize = None,
    )
    disp.ax_.set_title("Binary classification of images: cover vs " + algorithm_class)
    plt.show()
    accuracy_percentage = metrics.accuracy_score(y_test, prediction) * 100
    print(f"Accuracy:\t{(accuracy_percentage):.2f}%")

    return model, classifier, prediction

# Train and evaluate one RBF-kernel classifier per entry in `classes`.
for algorithm_class in classes:
    data, labels = defineData(algorithm_class)
    create_svc(
        kernel_in='rbf',
        data=data,
        labels=labels,
        algorithm_class=algorithm_class,
    )

# Wall-clock seconds elapsed since `start_time` was recorded.
print(f"Total time taken: {time.time() - start_time:.2f} seconds")
 