In [1]:
import torch
print(torch.__version__)

1.11.0


In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.utils.data import random_split
from torch import Tensor
from torch.nn import Linear
from torch.nn import ReLU
from torch.nn import Sigmoid
from torch.nn import Module
from torch.optim import SGD, Adam
from torch.nn import BCELoss, BCEWithLogitsLoss
from torch.nn.init import kaiming_uniform_
from torch.nn.init import xavier_uniform_
from IPython.display import display
import seaborn as sns
import matplotlib.pyplot as plt
from torchinfo import summary
from livelossplot import PlotLosses

In [3]:
PATH = 'train.csv'

device = torch.device("cpu")

EPOCHS = 50
BATCH_SIZE = 64
LEARNING_RATE = 0.001

In [4]:
class CSVDataset(Dataset):

  def __init__(self, path):
    df = pd.read_csv(path, header=None)
    print(df.columns)
    print(df[1].describe())
    self.X = df.values[1:,:]
    self.X = np.delete(self.X,1,1) #remover a idade
    self.y = df.values[1:, 1]
    #print(self.y)
    self.X = self.X.astype('float32')
    self.y = LabelEncoder().fit_transform(self.y)
    self.y = self.y.astype('float32')
    self.y = self.y.reshape((len(self.y), 1)) 

  def __len__(self):
    return len(self.X)

  def __getitem__(self, idx):
    return [self.X[idx], self.y[idx]] 
  
  def get_splits(self, n_test=0.33): 
    test_size = round(n_test * len(self.X)) 
    train_size = len(self.X) - test_size 
    return random_split(self, [train_size, test_size]) 

In [6]:
def prepare_data(path):
        dataset = CSVDataset(path)
        train, test = dataset.get_splits()
        train_dl = DataLoader(train, batch_size=len(train), shuffle=True)
        test_dl = DataLoader(test, batch_size=1024, shuffle=False)
        train_dl_all = DataLoader(train, batch_size=len(train), shuffle=False)
        test_dl_all = DataLoader(test, batch_size=len(test), shuffle=False)
        return train_dl, test_dl, train_dl_all, test_dl_all
        
train_dl, test_dl,  train_dl_all, test_dl_all = prepare_data(PATH)

x,y = next(iter(train_dl))
print(x.shape, y.shape)
x,y = next(iter(test_dl))
print(x.shape, y.shape)

Int64Index([0, 1, 2, 3], dtype='int64')
count     113
unique     49
top        16
freq        8
Name: 1, dtype: object
torch.Size([75, 3]) torch.Size([75, 1])
torch.Size([37, 3]) torch.Size([37, 1])


In [7]:
def visualize_data(path):
  df = pd.read_csv(path, header=None)
  display(df)

def visualize_dataset(train_dl, test_dl):
  print(f"Quantidade de casos de Treino:{len(train_dl.dataset)}")
  print(f"Quantidade de casos de Teste:{len(test_dl.dataset)}")
  x, y = next(iter(train_dl))
  print(f"Shape tensor batch casos treino, input: {x.shape}, output: {y.shape}")
  x, y = next(iter(test_dl))
  print(f"Shape tensor batch casos teste, input: {x.shape}, output: {y.shape}")

visualize_data(PATH)
visualize_dataset(train_dl, test_dl)

Unnamed: 0,0,1,2,3
0,id,age,sex,education
1,1,13,1,7
2,2,14,0,8
3,3,15,1,9
4,4,15,1,9
...,...,...,...,...
108,108,77,1,4
109,109,67,0,4
110,110,55,0,4
111,111,76,1,3


Quantidade de casos de Treino:75
Quantidade de casos de Teste:37
Shape tensor batch casos treino, input: torch.Size([75, 3]), output: torch.Size([75, 1])
Shape tensor batch casos teste, input: torch.Size([37, 3]), output: torch.Size([37, 1])


In [8]:
def visualize_holdout_balance(y_train, y_test):
  _, y_train = next(iter(train_dl_all))
  _, y_test = next(iter(test_dl_all))
  sns.set_style('whitegrid')
  casos_treino=len(y_train)
  casos_test=len(y_test)
  b_Train=np.count_nonzero(y_train == 0)
  g_Train = np.count_nonzero(y_train == 1)
  b_Test=np.count_nonzero(y_test == 0)
  g_Test = np.count_nonzero(y_test == 1)
  print("casos_treino:",casos_treino)
  print("g_Train: ", g_Train)
  print("b_Train: ", b_Train)
  print("g_Train/b_Train: ", g_Train/b_Train) 
  print("casos_test:",casos_test)
  print("g_Test: ", g_Test)
  print("b_Test: ", b_Test)
  print("g_Test/b_Test: ", g_Test/b_Test) 
  grafico=sns.barplot(
      x=['g_Train','b_Train', 'g_Test', 'b_Test'], 
      y=[g_Train,b_Train, g_Test, b_Test])
  grafico.set_title('Data balance ')
  plt.xticks(rotation=70)
  plt.tight_layout()
  plt.show() 

visualize_holdout_balance(train_dl_all, test_dl_all)

casos_treino: 75
g_Train:  0
b_Train:  1
g_Train/b_Train:  0.0
casos_test: 37
g_Test:  1
b_Test:  0


ZeroDivisionError: division by zero