<b>GPU-accelerated Support Vector Classifier (SVC)</b>

In this notebook we use the RAPIDS cuML GPU-accelerated implementation of SVC to try to classify images into 1000 classes. The approach is not efficient for this specific problem; however, it still conveys the idea.

[RAPIDS SVC Library Link](https://docs.rapids.ai/api/cuml/stable/api.html?highlight=svc#cuml.svm.SVC)

In [1]:
# RAPIDS cuML SVC model
import cudf, cuml, cupy
import pickle
import numpy as np
import gc
import pandas as pd

from cuml.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# Report the cuML release in use so results can be tied to a library version.
installed_cuml_version = cuml.__version__
print('cuML version', installed_cuml_version)

#global X,Y,Q_TST

def unpickle(file):
    """Deserialize and return the object stored in the pickle file at ``file``.

    The file is opened in binary mode and decoded with ``encoding="latin1"``.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(file, 'rb') as handle:
        return pickle.load(handle, encoding="latin1")

# Load the three pickled inputs, in order: the training_x, training_y and
# validation_x files. They are later passed to DATA_Processing as the
# data list, the class list and the validation list respectively.
data_FULL_training, Label_FULL_training, data_Validation = (
    unpickle(pickle_path)
    for pickle_path in (
        r'../input/trimagi/training_x.dat',
        r'../input/trimagi/training_y.dat',
        r'../input/trimagi/validation_x.dat',
    )
)


    
def DATA_Processing(data_list,class_list,validation_list,expected_shape=(8, 8, 3)):
    """Drop malformed training images, then flatten channel 0 of every image.

    Parameters
    ----------
    data_list : sequence of array-like
        Training images; each item must expose a ``.shape`` attribute.
    class_list : sequence
        One label per entry of ``data_list`` (same length, same order).
    validation_list : sequence of array-like
        Validation images. These are NOT filtered; they are only converted
        to an array, reduced to channel 0 and flattened.
    expected_shape : tuple of int, optional
        The (height, width, channels) shape a training image must have to be
        kept. Defaults to ``(8, 8, 3)``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(data, classes, validation)`` where ``data`` has shape
        ``(n_kept, height*width)``, ``classes`` has shape ``(n_kept,)`` and
        ``validation`` has shape ``(n_validation, height*width)``.

    Raises
    ------
    ValueError
        If ``data_list`` and ``class_list`` differ in length, or if no
        training image has the expected shape.

    Notes
    -----
    The input lists are left untouched; filtered copies are built instead.
    """
    if len(data_list) != len(class_list):
        raise ValueError(
            "data_list and class_list must have the same length "
            "(got %d and %d)" % (len(data_list), len(class_list))
        )

    expected_shape = tuple(expected_shape)

    #Remove unformatted data.
    # Every index (including 0) is checked, and the whole shape tuple is
    # compared so items with the wrong number of dimensions are rejected
    # instead of raising IndexError. Building new lists (rather than popping
    # by index) keeps image/label pairs aligned when several items are bad.
    kept_images = []
    kept_labels = []
    for idx, (image, label) in enumerate(zip(data_list, class_list)):
        if tuple(image.shape) != expected_shape:
            print("Removing value at index",idx)
            continue
        kept_images.append(image)
        kept_labels.append(label)

    if not kept_images:
        raise ValueError("no training image has the expected shape %r" % (expected_shape,))

    #Convert lists to arrays
    data_arr=np.array(kept_images)
    class_arr=np.array(kept_labels)
    validation_arr=np.array(validation_list)

    #Keep only channel 0 and flatten each HxW image into an H*W-element vector
    data_arr=data_arr[:,:,:,0]
    data_arr = data_arr.reshape(data_arr.shape[0], data_arr.shape[1]*data_arr.shape[2])

    validation_arr=validation_arr[:,:,:,0]
    validation_arr = validation_arr.reshape(validation_arr.shape[0], validation_arr.shape[1]*validation_arr.shape[2])

    return data_arr,class_arr,validation_arr



#Call Function to load and process data
X,Y,VALIDATION_TST = DATA_Processing(data_FULL_training,Label_FULL_training,data_Validation)

#Convert to float64 type for RAPIDS.
# The cast happens BEFORE normalisation so that subtracting the training
# minimum from the validation data cannot wrap around if the raw arrays
# use an unsigned integer dtype.
X = np.asarray(X, dtype="float64")
Y = np.asarray(Y, dtype="float64")
VALIDATION_TST = np.asarray(VALIDATION_TST, dtype="float64")

# Normalize data between 0 and 1.
# Both sets are scaled with the TRAINING min/range so a given raw pixel value
# maps to the same feature value in either set. Validation values may
# therefore fall slightly outside [0, 1] if its range exceeds the training range.
train_min = np.min(X)
train_range = np.max(X) - train_min
if train_range == 0:
    raise ValueError("Training data is constant; min-max normalisation is undefined")
X = (X - train_min) / train_range
VALIDATION_TST = (VALIDATION_TST - train_min) / train_range

print("Training Dataset's Dimensions:" ,X.shape,"class Dim: ", Y.shape)
print()
print("Validation Dataset's Dimensions:" ,VALIDATION_TST.shape)

#Delete extra data and clear memory space
del data_Validation
del data_FULL_training
del Label_FULL_training
gc.collect()

cuML version 21.10.02
Removing value at index 216805
Training Dataset's Dimensions: (1281166, 64) class Dim:  (1281166,)

Validation Dataset's Dimensions: (48238, 64)


30

**Apply SMOTE and split the data into training and test sets**

In [2]:
# Oversample minority classes with SMOTE. The generator is seeded so that
# the synthetic samples (and every downstream number) are reproducible.
# NOTE(review): oversampling BEFORE the train/test split means synthetic points
# interpolated from training samples can land in the held-out set, so the
# reported accuracy is optimistic. Consider splitting first and applying SMOTE
# to the training portion only.
smote = SMOTE(sampling_strategy='auto', random_state=0)
X_sm, y_sm = smote.fit_resample(X, Y)

print("Total Training Dataset's Dimensions:" ,X_sm.shape,"class Dim: ", y_sm.shape)

# Absolute number of samples held out for evaluation; the remainder of the
# resampled data is used to fit the model.
HOLDOUT_SIZE = 1200000
X1, Q_TST1, Y1, Q_Label1 = train_test_split(X_sm, y_sm, test_size=HOLDOUT_SIZE, random_state=0, stratify=y_sm)
print("Training Dataset's Dimensions:" ,X1.shape,"class Dim: ", Y1.shape)

Total Training Dataset's Dimensions: (1300000, 64) class Dim:  (1300000,)
Training Dataset's Dimensions: (100000, 64) class Dim:  (100000,)


**Generate SVC model and fit training data**

In [3]:
# Hyper-parameters for the cuML SVC: RBF kernel, one-vs-rest multiclass.
# NOTE(review): cache_size=20 looks very small if the unit is MiB - confirm
# against the cuML documentation.
svc_params = dict(
    kernel='rbf',
    C=1.0,
    cache_size=20,
    verbose=2,
    multiclass_strategy='ovr',
)
model = SVC(**svc_params)

# Fit on the training split; as the last expression, the return value of
# fit() is what the cell displays.
model.fit(X1, Y1)

SVC()

**Predict and calculate accuracy**

In [4]:
# Predict labels for the held-out split and report the fraction that match.
BATCH = model.predict(Q_TST1)
BATCH_Acc = accuracy_score(y_true=Q_Label1, y_pred=BATCH)
print(BATCH_Acc)

0.014715
