In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from torch.autograd import Variable 
import torchvision
from torchvision import transforms
from torch import utils
from sklearn.preprocessing import StandardScaler
import random
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc, 
from tqdm import tqdm, notebook
import matplotlib.pyplot as plt

SyntaxError: trailing comma not allowed without surrounding parentheses (<ipython-input-1-fd24ebce1d12>, line 14)

In [None]:
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
torch.backends.cudnn.benchmark = True
use_cuda

In [None]:
class LogisticRegressionModel(torch.nn.Module):
    def __init__(self, input_dim, n_hiden, first_hiden = None, center_hiden = None , last_hiden = 2): 
        if first_hiden == None:
            first_hiden = input_dim
        if center_hiden == None:
            center_hiden = first_hiden
        super(LogisticRegressionModel, self).__init__()
        self.leyer_list = nn.ModuleList()
        layer_w = input_dim
        hiden_dim = first_hiden
        for i in range(n_hiden+1):
            self.leyer_list.append(nn.Linear(layer_w, hiden_dim))
            if i < (n_hiden/2)-1:
                layer_w = hiden_dim
                hiden_dim = (int)((i+1)*(2*(center_hiden-first_hiden)/n_hiden)+first_hiden)
            else:
                layer_w = hiden_dim
                hiden_dim = (int)((i-n_hiden/2+1)*(2*(last_hiden-center_hiden)/n_hiden)+center_hiden)
        self.leyer_list.append(nn.Linear(layer_w, 1))
    def forward(self, x): 
        Relu = nn.LeakyReLU()
        Sigm = nn.Sigmoid()
        for layer in self.leyer_list[:-1]:
            x = Relu(layer(x))
        y = Sigm(self.leyer_list[-1](x))
        return y

In [None]:
from torch.utils import data

class Dataset(data.Dataset):
    'Characterizes a dataset for PyTorch'
    def __init__(self, features, labels):
        'Initialization'
        self.labels = labels
        self.features = features

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.features)

    def __getitem__(self, index):
        'Generates one sample of data'
        x = self.features[index]
        y = self.labels[index]
        return x, y


In [None]:
def acc(features, targets):
    targets = targets.to(device).view(-1,1).detach()
    outputs = model(features.to(device)).detach()
    loss_eval = loss_fun(outputs, targets)
    pred = (outputs>0.5).float()
    pred = (pred.to(device) == targets).sum()
    return pred.float()/len(outputs), loss_eval.detach()

In [None]:
def plot_training_classification(run_hist):
    """Plot the training history of the classification model."""
    fig, ax = plt.subplots(1,2, figsize=(20,6), sharex=True)
    x = np.arange(len(run_hist["train_loss"]))
    ax[0].plot(x, run_hist["train_loss"],'b', marker='.', label="train loss")
    ax[0].plot(x, run_hist["test_loss"],'r', marker='.', label="test loss")
    ax[0].legend()
    ax[1].plot(x, run_hist["train_acc"],'b', marker='.', label="train accuracy")
    ax[1].plot(x, run_hist["test_acc"],'r', marker='.', label="test accuracy")
    ax[1].legend()

In [2]:
def plot_roc(x, y, model):
    output = model(x.to(device))
    fpr, tpr, thresholds = roc_curve(y.cpu().detach().numpy(), output.cpu().detach().numpy())
    plt.figure()
    lw = 2
    auc_val = auc(fpr, tpr)
    plt.plot(fpr, tpr, color='darkorange',
           lw=lw, label='ROC curve (area = %0.2f)' % auc_val)
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.show()
    return auc_val

In [3]:
train_data_file = "~/data2.csv"
data = pd.read_csv(train_data_file)
features = data.drop(["Unnamed: 0", "eventID", "track_particle_key", "track_position_z", "track_ghostProbability", "track_chi2", "track_phi", "track_position_phi"] , axis=1).astype(np.float64)
targets = np.array([(is_down and down_in_velo) for is_down, down_in_velo in zip(data.particle_isDown_noVelo.values, data.particle_isDown.values)]).astype(np.float64)
features = features.drop(["particle_isDown_noVelo", "particle_isDown", features.columns[0]], axis=1).values
print(targets)

NameError: name 'pd' is not defined

In [None]:
scaler = StandardScaler()
scaler.fit(features)
X = scaler.transform(features)
X_train, X_test, y_train, y_test = train_test_split(X, targets, test_size = 0.2)
print(X[0])

In [None]:
X_train = torch.FloatTensor(X_train)
y_train = torch.FloatTensor(y_train)
X_test = torch.FloatTensor(X_test).to(device)
y_test = torch.FloatTensor(y_test).to(device)

In [None]:
epochs = 500
batch_size = 10000
input_dim = X_train.shape[1]
output_dim = 1
lr_rate = 0.05
n_hiden = 10
first_hiden = 25
center_hiden = 20
last_hiden = 5
params = [n_hiden, first_hiden, center_hiden, last_hiden]
training_set = Dataset(X_train, y_train)
training_generator = utils.data.DataLoader(training_set, batch_size = batch_size, shuffle = True)

In [None]:
model = LogisticRegressionModel(input_dim, *params)
print(model)

In [None]:
stat_dict = {
    "train_loss" : [],
    "test_loss" : [],
    "train_acc" : [],
    "test_acc" : []
}
print("n_hiden: {}, first_hiden: {}, center_hiden: {} last_hiden: {}".format(*params))
loss_fun = torch.nn.BCELoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr_rate)
model.to(device)
n_epoch = len(stat_dict["train_acc"])
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.2)
for epoch in range(n_epoch, epochs):
    train_acc, train_loss = acc(X_train, y_train)
    test_acc, test_loss = acc(X_test, y_test)
    for batch_features, batch_targets in training_generator:
        batch_features, batch_targets = batch_features.to(device).view(-1,input_dim), batch_targets.to(device).view(-1,1)
        def closure(batch_features, batch_targets):
            optimizer.zero_grad()
            pred = model(batch_features)
            loss_val = loss_fun(pred, batch_targets)
            loss_val.backward()
        optimizer.step(closure(batch_features, batch_targets))
        optimizer.zero_grad()
    scheduler.step()
    stat_dict["test_loss"].append(test_loss)
    stat_dict["test_acc"].append(test_acc)
    stat_dict["train_loss"].append(train_loss)    
    stat_dict["train_acc"].append(train_acc)
    print("epoch {0}, train_loss {1:.4f}, train_accuracy = {2:.4f}, test_loss {3:.4f}, test_accuracy = {4:.4f}".format(epoch+1, train_loss, train_acc, test_loss, test_acc))
    if epoch!=0 and epoch%100==0:
        plot_training_classification(stat_dict)
        auc_val = plot_roc(X_test, y_test, model)


In [None]:
#plot_training_classification(stat_dict)
auc_val = plot_roc(X_test, y_test, model)

In [None]:
# outputs = model(torch.FloatTensor(X)).detach().numpy()
# print(len(targets))
# for i in range(1500):
#     print(outputs[i], targets[i])
# fpr, tpr, thresholds = roc_curve(targets, outputs)
# auc(fpr, tpr)

In [None]:
print(acc(torch.FloatTensor(X), torch.FloatTensor(targets)))