<b>In this notebook, we demonstrate how SMOTE (Synthetic Minority Over-sampling Technique) works</b>

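SMOTE is one of the ways to balance an imbalanced dataset: it adds synthetic samples to the under-represented classes until every class has as many samples as the largest one.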

In [1]:
# RAPIDS cuML kNN model
import cudf, cuml, cupy
from cuml.neighbors import KNeighborsClassifier as cuKNeighbors
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.model_selection import cross_val_score

import pickle
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

print('cuML version',cuml.__version__)

from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE  #Different techniques for SMOTE


def unpickle(file):
    """Load and return the object stored in the pickle file at path ``file``.

    The file is opened in binary mode and decoded with ``encoding="latin1"``.

    NOTE: unpickling can execute arbitrary code, so only call this on files
    that come from a trusted source.
    """
    with open(file, 'rb') as handle:
        loaded = pickle.load(handle, encoding="latin1")
    return loaded

# Training samples and their class labels, loaded from the pickled input files.
# Both objects are passed to DATA_Processing below, which filters them in parallel by index.
data_FULL_training = unpickle(r'../input/trimagi/training_x.dat')
Label_FULL_training = unpickle(r'../input/trimagi/training_y.dat')

# Unlabelled samples that the fitted model will predict on.
data_Validation = unpickle(r'../input/trimagi/validation_x.dat')


    
def DATA_Processing(data_list,class_list,validation_list):
    """Filter malformed training images and flatten images to feature vectors.

    Parameters
    ----------
    data_list : sequence of array-like
        Training images. Every entry whose shape is not exactly (8, 8, 3) is
        dropped, together with the label at the same position.
    class_list : sequence
        Labels, parallel to ``data_list``.
    validation_list : sequence of array-like
        Validation images. They are not filtered; they must stack into one
        4-D array (n, height, width, channels).

    Returns
    -------
    (data, labels, validation) : tuple of numpy.ndarray
        ``data`` has shape (n_kept, 64) and ``validation`` has shape
        (n_validation, height * width); both hold only channel 0 of each
        image. ``labels`` has one entry per kept training image.

    The input sequences are not modified.
    """
    expected_shape = (8, 8, 3)

    # Check every sample (index 0 included). np.shape also copes with entries
    # that have fewer than three dimensions instead of raising IndexError.
    remove_idx = {
        i for i, sample in enumerate(data_list)
        if np.shape(sample) != expected_shape
    }
    for idx in sorted(remove_idx):
        print("Removing value at index", idx)

    # Build filtered copies rather than popping in place: popping in ascending
    # order shifts the remaining indices and removes the wrong samples as soon
    # as more than one entry is malformed.
    kept_data = [s for i, s in enumerate(data_list) if i not in remove_idx]
    kept_labels = [c for i, c in enumerate(class_list) if i not in remove_idx]

    # The explicit reshape keeps the array 4-D even when nothing is left.
    data_list = np.array(kept_data).reshape((-1,) + expected_shape)
    class_list = np.array(kept_labels)
    validation_list = np.array(validation_list)

    # Keep channel 0 only and flatten each 8x8 image into a 64-element vector.
    data_list = data_list[:, :, :, 0]
    data_list = data_list.reshape(data_list.shape[0], data_list.shape[1] * data_list.shape[2])

    validation_list = validation_list[:, :, :, 0]
    validation_list = validation_list.reshape(
        validation_list.shape[0],
        validation_list.shape[1] * validation_list.shape[2],
    )

    return data_list, class_list, validation_list


X,Y,VALIDATION_TST = DATA_Processing(data_FULL_training,Label_FULL_training,data_Validation)

# Cast to float64 BEFORE doing arithmetic, so that subtracting the training
# minimum from the validation data cannot wrap around should the pickled
# arrays hold unsigned integers (their dtype is not visible here).
X = np.asarray(X, dtype="float64")
Y = np.asarray(Y, dtype="float64")
VALIDATION_TST = np.asarray(VALIDATION_TST, dtype="float64")

# Min-max scale both sets with the TRAINING statistics. kNN compares raw
# distances, so training and validation features must share one scale; scaling
# the validation set with its own min/max would shift it whenever its value
# range differs from the training range. Validation values may therefore fall
# slightly outside [0, 1].
train_min = X.min()
train_range = X.max() - train_min
X = (X - train_min) / train_range
VALIDATION_TST = (VALIDATION_TST - train_min) / train_range


###SPLIT THE Training DATA --- Not required when prediction is being done on full dataset.

#X, Q_TST, Y, Q_Label = train_test_split(X, Y, test_size=0.05, random_state=0, stratify=Y)
#X, Q_TST, Y, Q_Label = train_test_split(X, Y, test_size=10000, train_size=50000, random_state=0, stratify=Y)

#Q_TST = np.array(Q_TST).astype("float32")
#Q_Label = np.array(Q_Label).astype("float32")


print("Training Dataset's Dimensions:" ,X.shape,"class Dim: ", Y.shape)
#print("Test Dataset's Dimensions:" ,Q_TST.shape,"class Dim: ", Q_Label.shape)
print()

print("Validation Dataset's Dimensions:" ,VALIDATION_TST.shape)




cuML version 21.10.02
Removing value at index 216805
Training Dataset's Dimensions: (1281166, 64) class Dim:  (1281166,)

Validation Dataset's Dimensions: (48238, 64)


**Before SMOTE**

In [15]:
# Per-class sample counts of the original labels, largest classes first.
# The classes are far from balanced; class 42 is the smallest one.
df_Y_Full_New = pd.DataFrame(Y)
counts_before = df_Y_Full_New.value_counts(sort=True)
print(counts_before)
print()
print("Total datapoints before SMOTE", len(Y))

1.0      1300
649.0    1300
637.0    1300
638.0    1300
639.0    1300
         ... 
99.0      772
46.0      755
207.0     754
31.0      738
42.0      732
Length: 1000, dtype: int64

Total datapoints before SMOTE 1281166


**After SMOTE**

In [14]:
# Oversample with SMOTE so that every class ends up with as many samples as
# the largest class (1300 each for all 1000 classes in the recorded run).
#
# random_state is fixed: SMOTE draws its synthetic samples at random, so
# without a seed every run would produce a different training set and
# different predictions.
smote = SMOTE(sampling_strategy='auto', random_state=0)
X_sm, y_sm = smote.fit_resample(X, Y)


# Per-class counts after resampling, to confirm the classes are now balanced.
df_Y_Full_New = pd.DataFrame(y_sm)
print(df_Y_Full_New.value_counts(sort=True))
print()
print("Total datapoints After SMOTE" , y_sm.shape[0])

1.0       1300
672.0     1300
659.0     1300
660.0     1300
661.0     1300
          ... 
339.0     1300
340.0     1300
341.0     1300
342.0     1300
1000.0    1300
Length: 1000, dtype: int64

Total datapoints After SMOTE 1300000


In [2]:
# Balance the training data with SMOTE; the model below is fitted on
# X_sm / y_sm instead of the raw X / Y.
#
# random_state is fixed because SMOTE generates its synthetic samples at
# random: without a seed the balanced dataset, and hence the predictions,
# would change on every run.
smote = SMOTE(sampling_strategy='auto', random_state=0)
X_sm, y_sm = smote.fit_resample(X, Y)

print("Balanced Dataset's Dimensions:" ,X_sm.shape,"class Dim: ", y_sm.shape)

Balanced Dataset's Dimensions: (1300000, 64) class Dim:  (1300000,)


**Fit Model**

In [3]:
# GPU k-nearest-neighbours classifier (RAPIDS cuML) with K = 500,
# fitted on the SMOTE-balanced training data.
model = cuKNeighbors(n_neighbors=500)
model.fit(X_sm, y_sm)

# Predict the validation set, then turn the labels into integer strings of at
# most four characters, which is the form written to the CSV later on.
BATCH = model.predict(VALIDATION_TST).astype('int').astype('<U4')

# Same three lines of output as separate prints: dtype, shape, values.
print(BATCH.dtype, BATCH.shape, BATCH, sep="\n")

<U4
(48238,)
['549' '549' '903' ... '194' '772' '903']


In [4]:
# Write the predictions to a CSV file wrapped in a zip archive.

# 1-based row ids, one per validation sample.
ind = np.arange(1, len(VALIDATION_TST) + 1, dtype=np.int32)
print(ind.shape)

# NOTE(review): `predictions` is not used by any visible cell; kept so the
# notebook namespace stays unchanged.
predictions = pd.DataFrame(BATCH)

# Two-column table: sample id and predicted class label.
column_names = ["Id", "Class"]
combine_DF = pd.DataFrame({"Id": ind, "Class": BATCH}, columns=column_names)

# to_csv zips the output; archive_name is the name of the CSV inside the zip.
compression_opts = {'method': 'zip', 'archive_name': 'OSAMA_KNN.csv'}
combine_DF.to_csv('./OSAMA_KNN_K500_GPU.zip', index=False, compression=compression_opts)


(48238,)
