# 데이터 분석을 위한 추가 전처리
1. 'stos0', 'dtos0' 제거
2. 'label' 에서 'target' 생성 및 'label' 제거
3. 숫자형 데이터 MinMaxScaling

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from torch.utils.data import DataLoader,Dataset

from scipy.optimize import linear_sum_assignment as linear_assignment

# Prefer the first CUDA GPU when torch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(f'{device} is available')

cuda:0 is available


In [3]:
dataset=pd.read_csv('/content/drive/MyDrive/캡스톤/dataset/CTU-13(9)_preprocessed.csv')

In [4]:
# column 명들 소문자로 변환
dataset.columns = dataset.columns.str.lower()

In [5]:
# 'stos0' 'dtos0' 모두 1.0 의 값을 가지므로 분류에 필요하지 않은 것 같아 삭제 (9번 시나리오에서는)
dataset.drop(['stos0','dtos0'],axis=1,inplace=True)
dataset

Unnamed: 0,dur,proto0,proto1,proto2,proto3,sport0,sport1,sport2,sport3,sport4,...,dport3,dport4,state0,state1,state2,state3,totpkts,totbytes,srcbytes,label
0,1752.578735,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,467,274517,79210,flow=From-Normal-V50-Jist
1,0.000000,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1,130,130,flow=From-Normal-V50-UDP-CVUT-DNS-Server
2,1532.028076,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,71,4556,2154,flow=From-Normal-V50-Stribrek
3,3570.125732,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,117,7020,3480,flow=From-Normal-V50-Stribrek
4,3570.125000,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,519,74059,19656,flow=From-Normal-V50-Stribrek
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214949,0.000282,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,2,400,74,flow=From-Normal-V50-Stribrek
214950,0.018888,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,6,412,272,flow=From-Normal-V50-Stribrek
214951,0.000378,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,2,244,81,flow=From-Normal-V50-Stribrek
214952,0.000319,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,2,280,81,flow=From-Normal-V50-Stribrek


In [6]:
# Separate Botnet from Normal flows -> normal 0, abnormal 1
def convertLabel(sample):
  """Return 1 if the label text contains 'Botnet', otherwise 0."""
  return 1 if 'Botnet' in sample else 0

dataset['target'] = dataset['label'].apply(convertLabel)

In [7]:
# label feature 제거
dataset.drop(['label'],axis=1,inplace=True)

In [8]:
# Numeric features -> MinMaxScaling
# NOTE(review): the scaler is fitted on the whole dataset, before the
# train/test split further down, so test rows (Botnet flows included) take
# part in determining the min/max applied to the training rows — confirm
# that this is intended.
columns=['dur','totpkts','totbytes','srcbytes']
scaler=MinMaxScaler()
dataset[columns]=scaler.fit_transform(dataset[columns])

In [9]:
dataset

Unnamed: 0,dur,proto0,proto1,proto2,proto3,sport0,sport1,sport2,sport3,sport4,...,dport3,dport4,state0,state1,state2,state3,totpkts,totbytes,srcbytes,target
0,4.868403e-01,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.006758,3.996590e-03,0.001442,0
1,0.000000e+00,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.000000,9.902101e-07,0.000002,0
2,4.255746e-01,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.001015,6.544124e-05,0.000039,0
3,9.917278e-01,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.001682,1.013218e-04,0.000063,0
4,9.917276e-01,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.007512,1.077538e-03,0.000358,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214949,7.833540e-08,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.000015,4.921927e-06,0.000001,0
214950,5.246805e-06,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.000073,5.096670e-06,0.000005,0
214951,1.050028e-07,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.000015,2.650268e-06,0.000001,0
214952,8.861345e-08,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.000015,3.174497e-06,0.000001,0


## Train data, Test data 나누기
**논문에 나와있는 대로**
1. Train data -> Normal data(40%)
2. Test data -> Normal data(60%) + Abnormal data(60%)

In [10]:
# Split rows by target: normal (target == 0) / abnormal (target == 1)
normal_dataset=dataset[dataset['target']==0]
abnormal_dataset=dataset[dataset['target']==1]

# Normal rows -> train normal / test normal, 40% : 60%
normal_dataset = normal_dataset.drop(['target'],axis=1) # drop the 'target' column
train_normal,test_normal=train_test_split(normal_dataset, test_size=0.6, random_state=42)

In [11]:
train_normal

Unnamed: 0,dur,proto0,proto1,proto2,proto3,sport0,sport1,sport2,sport3,sport4,...,dport2,dport3,dport4,state0,state1,state2,state3,totpkts,totbytes,srcbytes
12073,5.833487e-08,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.000015,0.000002,0.000001
208130,6.873070e-05,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.000015,0.000001,0.000001
33424,9.166908e-08,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.000015,0.000004,0.000001
36975,9.679965e-01,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.000334,0.000031,0.000020
1751,5.332881e-03,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.000073,0.000005,0.000005
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
212197,1.222254e-07,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.000015,0.000005,0.000001
5440,1.108001e-02,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.000261,0.000058,0.000021
860,8.615164e-02,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.000435,0.000100,0.000027
19091,1.356223e-02,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.000261,0.000121,0.000032


In [12]:
# Randomly draw 60% of the abnormal rows for the test set.
# A fixed random_state (the same seed as the train_test_split call used for
# the normal rows) makes the sampled rows reproducible across notebook runs.
test_abnormal = abnormal_dataset.sample(frac=0.6, random_state=42)
test_abnormal = test_abnormal.drop(['target'],axis=1) # drop the 'target' column

In [13]:
test_abnormal

Unnamed: 0,dur,proto0,proto1,proto2,proto3,sport0,sport1,sport2,sport3,sport4,...,dport2,dport3,dport4,state0,state1,state2,state3,totpkts,totbytes,srcbytes
175855,4.985880e-03,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.000102,6.145127e-06,0.000004
74564,2.503693e-03,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.000029,1.805677e-06,0.000003
185070,6.193386e-05,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.000015,6.858661e-06,0.000002
56016,8.343526e-04,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.000015,9.028386e-07,0.000002
160673,4.365921e-05,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.000015,1.980420e-06,0.000001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63570,8.023378e-05,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.000015,3.159935e-06,0.000001
197015,9.492750e-05,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.000015,2.664830e-06,0.000001
104535,1.688156e-05,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.000015,3.407488e-06,0.000001
130081,9.333580e-08,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.000015,2.126039e-06,0.000001


# Dataframe 을 pytorch tensor로 변환 

In [14]:
# pytorch tensor 로 변환 

# determine the supported device
def get_device():
  """Return torch.device('cuda:0') when CUDA is available, else torch.device('cpu')."""
  # fall back to the CPU when no GPU is present
  name = 'cuda:0' if torch.cuda.is_available() else 'cpu'
  return torch.device(name)

# convert a df to tensor to be used in pytorch
def dataframe_to_tensor(df):
   """Return the DataFrame's values as a float tensor placed on get_device()."""
   target_device = get_device()
   values = df.values
   as_float = torch.from_numpy(values).float()
   return as_float.to(target_device)

In [15]:
train_normal = dataframe_to_tensor(train_normal)

# Deep K-means
1. layer 수는 5
2. mini-batch size 는 100 
3. 각 layer에서 얼만큼 줄일지 + 활성화 함수는 어떤 것을 사용할지 ?
4. learning rate=0.001 인데 어느 정도로 해야할지?

--> 3차원으로 latent vector 크기 잡아도 될듯

In [16]:
# Build the train_loader
train_loader = DataLoader(train_normal,batch_size=100,shuffle=True)
# Fetch one batch with the built-in next(): DataLoader iterators implement
# __next__, while the Python 2 style .next() method is not available on them
# in current PyTorch releases.
train_batch = next(iter(train_loader))
print(train_batch.shape)


torch.Size([100, 27])


In [17]:
class TestDataset(Dataset):
  """Map-style dataset pairing feature rows with integer class labels.

  Args:
    x_data: Array-like of feature rows; stored as a float32 tensor.
    y_data: Array-like of labels, one per row of ``x_data``; stored as an
      int64 tensor (float labels such as 0.0 / 1.0 are converted to integers).

  Raises:
    ValueError: If ``x_data`` and ``y_data`` do not have the same length.
  """

  def __init__(self,x_data,y_data):
    super().__init__()
    # Convert with an explicit dtype so lists, NumPy arrays of any numeric
    # dtype and existing tensors are all accepted.
    self.x_data = torch.as_tensor(x_data, dtype=torch.float32)
    self.y_data = torch.as_tensor(y_data, dtype=torch.int64)
    # Fail early instead of hitting an IndexError in the middle of iteration
    # (or silently ignoring surplus labels).
    if len(self.x_data) != len(self.y_data):
      raise ValueError(
          'x_data and y_data must have the same length, got %d and %d'
          % (len(self.x_data), len(self.y_data)))
    self.len = len(self.x_data)

  def __getitem__(self,index):
    """Return the (features, label) pair stored at ``index``."""
    return self.x_data[index], self.y_data[index]

  def __len__(self):
    """Return the number of samples."""
    return self.len


In [18]:
# Build the test_loader
test_data = pd.concat([test_normal, test_abnormal]).values
# Label counts are derived from the frames themselves rather than hard-coded
# row counts, so the labels stay aligned with the data whenever the split or
# the sampled fraction changes. Normal rows get 0 and abnormal rows get 1, in
# the same order as the concatenation above.
test_label = np.concatenate([np.zeros(len(test_normal)),np.ones(len(test_abnormal))])

test_data = TestDataset(test_data,test_label)

test_loader = DataLoader(test_data,batch_size=100,shuffle=True)

In [19]:
class Encoder(nn.Module):
  """Two-layer MLP encoder mapping an input vector to a latent code.

  Args:
    latent_size: Width of the latent code returned by forward().
    input_size: Number of input features. Defaults to 27, the width that was
      previously hard-coded.
    hidden_size: Width of the hidden layer. Defaults to 12, the width that was
      previously hard-coded.
  """
  def __init__(self,latent_size,input_size=27,hidden_size=12):
    super(Encoder,self).__init__()

    self.encoder = nn.Sequential(
                      nn.Linear(input_size,hidden_size),
                      nn.ReLU(),
                      nn.Linear(hidden_size,latent_size),
                      # the closing ReLU keeps every latent coordinate >= 0
                      nn.ReLU())

  def forward(self, x):
    """Encode ``x`` (last dimension ``input_size``) into the latent space."""
    return self.encoder(x)

class Decoder(nn.Module):
  """Two-layer MLP decoder mapping a latent code back to feature space.

  Args:
    latent_size: Width of the latent code accepted by forward().
    output_size: Number of reconstructed features. Defaults to 27, the width
      that was previously hard-coded.
    hidden_size: Width of the hidden layer. Defaults to 12, the width that was
      previously hard-coded.
  """
  def __init__(self, latent_size, output_size=27, hidden_size=12):
    super(Decoder,self).__init__()

    self.decoder = nn.Sequential(
                      nn.Linear(latent_size,hidden_size),
                      nn.ReLU(),
                      nn.Linear(hidden_size,output_size),
                      # Sigmoid bounds every reconstructed value to (0, 1)
                      nn.Sigmoid())

  def forward(self,x):
    """Decode ``x`` (last dimension ``latent_size``) into a reconstruction."""
    return self.decoder(x)

In [20]:
class Kmeans(nn.Module):
  """K-means layer holding trainable centroids in the latent space.

  Args:
    num_clusters: Number of centroids.
    latent_size: Dimensionality of each centroid.

  Attributes:
    centroids: nn.Parameter of shape (num_clusters, latent_size), initialised
      uniformly at random and placed on CUDA when it is available.
  """
  def __init__(self,num_clusters, latent_size):
    super(Kmeans,self).__init__()
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    self.num_clusters = num_clusters
    self.centroids = nn.Parameter(torch.rand((self.num_clusters,latent_size)).to(device))

  def argminl2distance(self,a,b):
    """Return the row index with the smallest squared L2 distance between ``a`` and ``b``."""
    return torch.argmin(torch.sum((a-b)**2,dim=1),dim=0)

  def forward(self,x):
    """Assign every row of ``x`` to its nearest centroid.

    Args:
      x: Tensor of shape (batch, latent_size).

    Returns:
      A tuple ``(y_assign, assigned_centroids)``: ``y_assign`` is a list of
      ``batch`` Python ints (centroid indices) and ``assigned_centroids`` is
      ``self.centroids`` indexed by those assignments, shape
      (batch, latent_size).
    """
    # One broadcasted distance computation for the whole batch replaces a
    # Python loop that called .item() once per sample. The argmin itself is
    # not differentiable, so it runs without autograd tracking.
    with torch.no_grad():
      diffs = x.unsqueeze(1) - self.centroids.unsqueeze(0)  # (batch, k, latent)
      sq_dist = torch.sum(diffs**2,dim=2)                   # (batch, k)
      nearest = torch.argmin(sq_dist,dim=1)                 # (batch,)
    # Indexing the parameter outside no_grad keeps gradients flowing to the
    # selected centroids.
    return nearest.tolist(), self.centroids[nearest]

In [21]:
def cluster_acc(y_true,y_pred):
  """Clustering accuracy under the best one-to-one cluster/class mapping.

  Builds the contingency table between predicted clusters and true classes,
  picks the one-to-one assignment that maximises the number of matched samples
  (linear sum assignment on ``w.max() - w``) and returns the matched fraction.

  Args:
    y_true: Sequence of true class labels (non-negative integers).
    y_pred: Sequence of predicted cluster ids (non-negative integers), same
      length as ``y_true``.

  Returns:
    Fraction of samples matched by the optimal mapping, in [0, 1].

  Raises:
    ValueError: If the inputs are empty or differ in length.
  """
  # Cast to int64 so labels stored as floats (e.g. 0.0 / 1.0) can be used as
  # array indices; non-integer values are truncated.
  y_true = np.asarray(y_true).reshape(-1).astype(np.int64)
  y_pred = np.asarray(y_pred).reshape(-1).astype(np.int64)
  if y_pred.size == 0:
    raise ValueError('cluster_acc requires at least one sample')
  if y_pred.size != y_true.size:
    raise ValueError(
        'y_true and y_pred must have the same length, got %d and %d'
        % (y_true.size, y_pred.size))
  D = int(max(y_pred.max(), y_true.max())) + 1
  w = np.zeros((D,D), dtype = np.int64)
  # Unbuffered add: repeated (pred, true) pairs each increment their cell.
  np.add.at(w, (y_pred, y_true), 1)
  # Minimising (w.max() - w) is the same as maximising the matched counts.
  rows, cols = linear_assignment(w.max() - w)
  return w[rows, cols].sum() * 1.0 / y_pred.size

In [22]:
def evaluation(testloader, encoder, kmeans, device):
  """Return the clustering accuracy of encoder + kmeans over ``testloader``.

  Every batch of (inputs, labels) is moved to ``device``, encoded, and
  assigned to clusters by ``kmeans``; the collected assignments are scored
  against the collected labels with cluster_acc.
  """
  y_true = []
  y_hat = []

  with torch.no_grad():
    for features, targets in testloader:
      features = features.to(device)
      targets = targets.to(device)
      assigned, _ = kmeans(encoder(features))

      y_hat.extend(assigned)
      y_true.extend(targets.cpu().tolist())

  return cluster_acc(y_true, y_hat)

#모델 학습


In [23]:
latent_size=4
num_clusters=2

In [24]:
# Instantiate the encoder, decoder and k-means layer and move them to `device`.
encoder = Encoder(latent_size).to(device)
decoder = Decoder(latent_size).to(device)
kmeans = Kmeans(num_clusters, latent_size).to(device)
# Two separate MSELoss instances: criterion1 is used for the reconstruction
# term and criterion2 for the clustering term in the training loop.
criterion1 = torch.nn.MSELoss()
criterion2 = torch.nn.MSELoss()
# One Adam optimizer updates the encoder, decoder and k-means parameters jointly.
optimizer = torch.optim.Adam(list(encoder.parameters())+list(decoder.parameters())+list(kmeans.parameters()), lr=0.001)

In [45]:
T1=100
T2=200
lam = 1e-3

In [46]:
# Joint training: reconstruction loss plus an alpha-weighted clustering loss.
for ep in range(350):
  # Weight of the clustering term: the constant lam/(T2-T1) while ep <= T1,
  # and lam afterwards.
  # NOTE(review): T2 is only used as a divisor here, so there is no gradual
  # ramp between T1 and T2 — confirm this matches the intended schedule.
  if ep <= T1:
    alpha = lam/(T2-T1)
  else:
    alpha = lam

  # Train only on normal data (label=0)
  running_loss = 0.0
  for batch in train_loader:
    inputs = batch.to(device)
    optimizer.zero_grad()
    latent_var = encoder(inputs)
    # Cluster assignment is computed on a detached copy of the latents; the
    # returned centroids are the centroid rows selected for each sample.
    _, centroids = kmeans(latent_var.detach())
    outputs = decoder(latent_var)

    # l_rec: reconstruction error; l_clt: distance between each (non-detached)
    # latent vector and its assigned centroid.
    l_rec = criterion1(inputs,outputs)
    l_clt = criterion2(latent_var,centroids)
    loss = l_rec + alpha*l_clt

    loss.backward()
    optimizer.step()
    running_loss+=loss.item()
  
  ### mean per-batch loss for this epoch
  avg_loss = running_loss/len(train_loader)

  # report every 10th epoch
  if(ep%10==0):
    print('[%d] Train loss:%.4f' %(ep,avg_loss) )

[0] Train loss:0.2298
[10] Train loss:0.0025
[20] Train loss:0.0011
[30] Train loss:0.0007
[40] Train loss:0.0006
[50] Train loss:0.0005
[60] Train loss:0.0004
[70] Train loss:0.0004
[80] Train loss:0.0004
[90] Train loss:0.0003
[100] Train loss:0.0003
[110] Train loss:0.0005
[120] Train loss:0.0005
[130] Train loss:0.0004
[140] Train loss:0.0004
[150] Train loss:0.0004
[160] Train loss:0.0004
[170] Train loss:0.0004
[180] Train loss:0.0003
[190] Train loss:0.0003
[200] Train loss:0.0003
[210] Train loss:0.0003
[220] Train loss:0.0003
[230] Train loss:0.0003
[240] Train loss:0.0003
[250] Train loss:0.0003
[260] Train loss:0.0003
[270] Train loss:0.0003
[280] Train loss:0.0003
[290] Train loss:0.0003
[300] Train loss:0.0003
[310] Train loss:0.0003
[320] Train loss:0.0003
[330] Train loss:0.0003
[340] Train loss:0.0003


In [47]:
# Persist the trained parameters (state_dict) of each module to Google Drive.
torch.save(encoder.state_dict(),'/content/drive/MyDrive/캡스톤/models(CTU-13(9))/encoder.pth')
torch.save(decoder.state_dict(),'/content/drive/MyDrive/캡스톤/models(CTU-13(9))/decoder.pth')
torch.save(kmeans.state_dict(),'/content/drive/MyDrive/캡스톤/models(CTU-13(9))/kmeans.pth') 

# 학습 후 One class svm ensemble

In [25]:
# Restore the trained weights. map_location=device remaps tensors that were
# saved from a CUDA session onto the device selected for this session, so the
# checkpoints also load on a CPU-only runtime.
encoder.load_state_dict(torch.load('/content/drive/MyDrive/캡스톤/models(CTU-13(9))/encoder.pth', map_location=device))
decoder.load_state_dict(torch.load('/content/drive/MyDrive/캡스톤/models(CTU-13(9))/decoder.pth', map_location=device))
kmeans.load_state_dict(torch.load('/content/drive/MyDrive/캡스톤/models(CTU-13(9))/kmeans.pth', map_location=device))

<All keys matched successfully>

In [26]:
# Build the latent vectors used to fit the SVMs,
# together with the cluster label k-means assigns to each of them

train_latent_vectors = [] # one latent vector (list of floats) per training sample
train_predict_label= []

# Inference only: no_grad avoids recording an autograd graph for every batch.
with torch.no_grad():
  for data in train_loader:
    inputs = data.to(device)
    latent_vector= encoder(inputs)
    predict_label, _ = kmeans(latent_vector)

    train_latent_vectors+=latent_vector.cpu().tolist()
    train_predict_label+=predict_label

In [27]:
# Build the test latent vectors for the SVMs,
# together with the k-means cluster label and the true label of each sample

test_latent_vectors = [] # one latent vector (list of floats) per test sample
test_predict_label = []
test_label= []

# Inference only: no_grad avoids recording an autograd graph for every batch.
with torch.no_grad():
  for data,labels in test_loader:
    inputs = data.to(device)
    latent_vector= encoder(inputs)
    predict_label, _ = kmeans(latent_vector)

    test_latent_vectors+=latent_vector.cpu().tolist()
    test_predict_label+=predict_label
    test_label+=labels.cpu().tolist()

# OneSvm Ensemble

In [28]:
#OneclassSVM Ensemble
from sklearn.svm import OneClassSVM
import numpy as np
import pandas as pd

class OCSVMEnsemble():
  """Ensemble of One-Class SVMs, one fitted per cluster of latent vectors.

  Args:
    nu: ``nu`` parameter passed to every OneClassSVM.

  Attributes (populated by fit()):
    instance: Fitted OneClassSVM models, ordered by ascending cluster label.
    cluster_labels: Cluster label belonging to each model in ``instance``.
    num_cluster: Number of distinct cluster labels seen by fit().
  """

  def __init__(self,nu=0.5):
    self.nu = nu
    #self.gamma = gamma
    self.instance = []
    self.cluster_labels = []
    self.num_cluster = 0

  def fit(self, latent, pred):
    """Fit one OneClassSVM per cluster label that occurs in ``pred``.

    Args:
      latent: Array-like of shape (n_samples, n_features) of latent vectors.
      pred: Array-like of shape (n_samples,) with the cluster label of each row.
    """
    latent = np.asarray(latent)
    pred = np.asarray(pred)
    # list that receives the fitted OneClassSVM instances
    self.instance = []
    self.cluster_labels = []
    present = np.unique(pred)
    self.num_cluster = len(present)

    # Iterate over the labels that actually occur instead of range(k): if a
    # cluster id received no training sample, range(k) would try to fit an SVM
    # on empty data and shift the label -> model correspondence.
    for clu in present:
      # samples assigned to this cluster
      clu_data = latent[pred == clu]
      ocsvm = OneClassSVM(kernel = 'rbf',gamma = 'scale', nu = self.nu).fit(clu_data)
      self.instance.append(ocsvm)
      self.cluster_labels.append(clu.item())

  @staticmethod
  def _is_inlier(model, x):
    """Return True when ``model`` classifies the single sample ``x`` as an inlier (+1)."""
    verdict = np.asarray(model.predict(x)).reshape(-1)
    if verdict.size != 1:
      raise ValueError('expected exactly one sample, got %d' % verdict.size)
    return bool(verdict[0] == 1)

  # Checks every OneClassSVM in the ensemble:
  # inlier for at least one model -> 0, outlier for all models -> 1
  def predictLabel(self,x):
    """Return 0 if any model accepts the single sample ``x`` (shape (1, n_features)), else 1."""
    for model in self.instance:
      if self._is_inlier(model, x): # predict(): inliers 1, outliers -1
        return 0
    return 1
  
  # Classifies a test sample using only the SVM of the cluster it was assigned to.
  # Takes one test latent vector and its one assigned cluster label.
  def l_predict(self,x,cluster_label):
    """Return 0 (inlier) or 1 (outlier) for the single sample ``x`` using its cluster's SVM.

    A sample whose ``cluster_label`` had no training data (and therefore has
    no fitted SVM) is reported as an outlier (1) instead of raising
    IndexError.
    """
    # Fall back to positional labels when `instance` was filled without fit().
    labels = self.cluster_labels or list(range(len(self.instance)))
    if cluster_label not in labels:
      return 1
    model = self.instance[labels.index(cluster_label)]
    return 0 if self._is_inlier(model, x) else 1


In [29]:
# train_latent_vectors, latent_vectors_labels -> numpy 로 변환
train_latent_vectors=np.array(train_latent_vectors)
train_predict_label=np.array(train_predict_label)

In [30]:
# test_latent_vectors, test_predict_label, test_label -> numpy 로 변환
test_latent_vectors = np.array(test_latent_vectors)
test_predict_label = np.array(test_predict_label)
test_label = np.array(test_label)

In [31]:
# 학습 과정
ocsvm = OCSVMEnsemble(nu=0.8) 
ocsvm.fit(train_latent_vectors, train_predict_label) # train data 에서 latent vector 들 + 가장 가까운 중심의 index

### One-Class-SVM 여러 개 중 1개에라도 들어가면 정상 아니면 비정상

In [32]:
# Feed each test latent vector, reshaped to a single-row 2-D array, through
# the ensemble and collect the predicted labels.
y_pred = [ocsvm.predictLabel(vec.reshape(1,-1)) for vec in test_latent_vectors]

In [33]:
# confusion matrix
from sklearn.metrics import confusion_matrix
Confusion_Matrix=confusion_matrix(test_label,y_pred)
print(Confusion_Matrix.T) # 행은 예측 값, 열은 실제 값

[[  5868   3719]
 [ 12113 107273]]


In [34]:
# classification report 
from sklearn.metrics import classification_report
print(classification_report(test_label,y_pred))

              precision    recall  f1-score   support

           0       0.61      0.33      0.43     17981
           1       0.90      0.97      0.93    110992

    accuracy                           0.88    128973
   macro avg       0.76      0.65      0.68    128973
weighted avg       0.86      0.88      0.86    128973



### 가장 가까운 cluster에서 One-Class-SVM

In [97]:
# Predict each test latent vector with the SVM of the cluster it was assigned to.
y_pred2 = [
    ocsvm.l_predict(vec.reshape(1,-1), cluster)
    for vec, cluster in zip(test_latent_vectors,test_predict_label)
]

In [98]:
# confusion matrix
from sklearn.metrics import confusion_matrix
Confusion_Matrix2=confusion_matrix(test_label,y_pred2)
print(Confusion_Matrix2.T) # 행은 예측 값, 열은 실제 값

[[  5871   3655]
 [ 12110 107337]]


In [99]:
# classification report 
from sklearn.metrics import classification_report
print(classification_report(test_label,y_pred2))

              precision    recall  f1-score   support

           0       0.62      0.33      0.43     17981
           1       0.90      0.97      0.93    110992

    accuracy                           0.88    128973
   macro avg       0.76      0.65      0.68    128973
weighted avg       0.86      0.88      0.86    128973



#### 비교해야 할 One-Class SVM

In [35]:
from sklearn.svm import OneClassSVM

# Baseline for comparison: one One-Class SVM fitted on all training latent
# vectors, without the per-cluster split.
# NOTE(review): this rebinds `ocsvm` (earlier bound to the OCSVMEnsemble
# instance) and `y_pred`; re-running the ensemble prediction cells after this
# cell would call them on the plain OneClassSVM — consider distinct names.
ocsvm = OneClassSVM(kernel = 'rbf',gamma = 'scale', nu = 0.8).fit(train_latent_vectors)
y_pred=ocsvm.predict(test_latent_vectors)
# map predict() output: +1 (inlier) -> 0, anything else (outlier) -> 1
y_pred3 =np.where(y_pred == 1,0,1)
print(classification_report(test_label,y_pred3))

              precision    recall  f1-score   support

           0       0.57      0.18      0.28     17981
           1       0.88      0.98      0.93    110992

    accuracy                           0.87    128973
   macro avg       0.72      0.58      0.60    128973
weighted avg       0.84      0.87      0.84    128973

