In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from torch.autograd import Variable 
import torchvision
from torchvision import transforms
from torch import utils
from sklearn.preprocessing import StandardScaler
import random
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc

In [23]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
if use_cuda:
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
# Turn on cuDNN's benchmark mode.
torch.backends.cudnn.benchmark = True
use_cuda

True

In [24]:
# Mount Google Drive under /content/gdrive/ (forcing a remount); the training
# CSV used by this notebook is read from a path below that mount point.
from google.colab import drive
drive.mount('/content/gdrive/', force_remount=True)

Mounted at /content/gdrive/


In [0]:
from torch.utils import data

class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs each feature row with its label.

    The base class is referenced through ``torch.utils.data`` instead of the
    bare ``data`` alias: this notebook later rebinds ``data`` to a DataFrame,
    so re-running this cell with the alias would raise ``AttributeError``.

    Args:
        features: Indexable collection of samples; must support ``len()``.
        labels: Indexable collection of targets, aligned index-by-index with
            ``features``.

    Raises:
        ValueError: If ``features`` and ``labels`` have different lengths.
    """

    def __init__(self, features, labels):
        """Store the aligned feature and label collections."""
        # Fail fast: a mismatch would otherwise surface as an IndexError in the
        # middle of an epoch, or silently ignore trailing labels.
        if len(features) != len(labels):
            raise ValueError(
                "features and labels must have the same length, got {} and {}".format(
                    len(features), len(labels)
                )
            )
        self.labels = labels
        self.features = features

    def __len__(self):
        """Return the total number of samples."""
        return len(self.features)

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        return self.features[index], self.labels[index]

In [0]:
# Read the training table from the mounted Drive.
train_data_file = "/content/gdrive/My Drive/PITE/data.csv"
data = pd.read_csv(train_data_file)

# Columns that are removed before the feature matrix is built.
columns_to_drop = [
    "is_true_seed",
    "has_MCParticle",
    "is_downstream_reconstructible",
    "has_MCParticle_not_electron",
    "is_downstream_reconstructible_not_electron",
    "seed_mva_value",
    "seed_p",
]
features = data.drop(columns=columns_to_drop).astype(np.float64)

# The classification target is the `is_downstream_reconstructible` column.
targets = data["is_downstream_reconstructible"].values

# The first remaining column is discarded as well before converting to NumPy.
featuresnp = features.drop(columns=features.columns[0]).values

In [27]:
# Quick visual check of the feature matrix (NumPy abbreviates large arrays with "...").
print(featuresnp)

[[ 1.44792874e+00  1.08352274e+03  2.70000000e+01 ... -1.43095899e+02
  -7.22092939e-01 -3.86714296e-02]
 [ 3.54944730e-01  1.34514854e+03  2.40000000e+01 ... -2.84193871e+02
  -1.29122137e-01 -3.79203554e-02]
 [ 2.90652352e+00  1.01388841e+03  2.40000000e+01 ... -2.70235334e+02
  -4.56234802e-01 -1.51978073e-02]
 ...
 [ 5.49376297e+00  9.27026952e+02  1.50000000e+01 ... -3.18414374e+02
  -1.17171352e+00 -3.95624097e-02]
 [ 6.95178340e+00  9.63163094e+02  1.40000000e+01 ...  2.12572602e+02
  -1.06405209e+00 -7.97020466e-03]
 [ 3.93933640e+00  1.14446986e+03  1.80000000e+01 ...  1.15643206e+03
   4.69775485e-01  1.83560605e-01]]


In [0]:
class LogisticRegressionModel(torch.nn.Module):
    """Fully connected binary classifier with a widen-then-narrow layer stack.

    Despite its name this is a multi-layer perceptron: ``n_hiden`` linear
    layers with ReLU activations, followed by one linear layer with a sigmoid.
    During the first half of the hidden layers every width is the previous
    width multiplied by 1.4 (truncated to an integer); during the second half
    it is divided by 1.4. Widths are clamped to at least 1, because for small
    ``input_dim`` the truncating division would otherwise reach 0 and create
    zero-width layers whose output no longer depends on the input.

    Args:
        input_dim: Number of input features per sample.
        n_hiden: Number of hidden linear layers (spelling kept for callers).

    Attributes:
        leyer_list: ``nn.ModuleList`` with the hidden layers followed by the
            output layer. The (misspelled) name is kept because code outside
            this class reads ``model.leyer_list``.
    """

    def __init__(self, input_dim, n_hiden):
        super(LogisticRegressionModel, self).__init__()
        self.leyer_list = nn.ModuleList()
        layer_w = input_dim
        factor = 1.4
        for i in range(n_hiden):
            if i < n_hiden / 2:
                # First half of the stack: widen.
                next_w = int(layer_w * factor)
            else:
                # Second half of the stack: narrow.
                next_w = int(layer_w / factor)
            # Never let truncation produce an empty layer.
            next_w = max(1, next_w)
            self.leyer_list.append(nn.Linear(layer_w, next_w))
            layer_w = next_w
        # Single output unit; the sigmoid is applied in forward().
        self.leyer_list.append(nn.Linear(layer_w, 1))

    def forward(self, x):
        """Return sigmoid outputs with a trailing dimension of size 1.

        Args:
            x: Tensor whose last dimension equals ``input_dim``.

        Returns:
            Tensor of values in [0, 1], one per input row.
        """
        *hidden_layers, output_layer = self.leyer_list
        for layer in hidden_layers:
            x = torch.relu(layer(x))
        return torch.sigmoid(output_layer(x))

In [0]:
# Fixed seed so that Restart & Run All reproduces exactly the same splits.
SPLIT_SEED = 0

# Split BEFORE scaling: the scaler must only see training rows, otherwise the
# mean/variance of the held-out rows leak into the preprocessing.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    featuresnp, targets, test_size=100000, random_state=SPLIT_SEED
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full standardized matrix, kept under its original name for any cell that uses it.
X = scaler.transform(featuresnp)

# 100000-row sample of the training rows, used to monitor training accuracy
# on a set of the same size as the test set.
_, X_train_test, _, y_train_test = train_test_split(
    X_train, y_train, test_size=100000, random_state=SPLIT_SEED
)

In [0]:
# Convert every split to a torch.FloatTensor, rebinding the same variable names.
(
    X_train, y_train,
    X_test, y_test,
    X_train_test, y_train_test,
) = map(
    torch.FloatTensor,
    (X_train, y_train, X_test, y_test, X_train_test, y_train_test),
)

In [0]:
# --- Training hyper-parameters ---
epochs = 100
batch_size = 200
lr_rate = 1e-4
n_hiden = 32

# --- Dimensions ---
input_dim = X_train.shape[1]  # number of feature columns
output_dim = 1

# Total number of optimisation steps over all epochs; this is a float because
# the per-epoch batch count is computed with true division.
n_iters = epochs * (len(y_train) / batch_size)

In [32]:
# Build the network and move it to the selected device.
model = LogisticRegressionModel(input_dim, n_hiden)
model = model.to(device)

# Binary cross-entropy loss applied to the model's sigmoid output.
criterion = nn.BCELoss()

# Show the generated layer stack.
print(model.leyer_list)

optimizer = torch.optim.Adam(model.parameters(), lr=lr_rate)

ModuleList(
  (0): Linear(in_features=9, out_features=12, bias=True)
  (1): Linear(in_features=12, out_features=16, bias=True)
  (2): Linear(in_features=16, out_features=22, bias=True)
  (3): Linear(in_features=22, out_features=30, bias=True)
  (4): Linear(in_features=30, out_features=42, bias=True)
  (5): Linear(in_features=42, out_features=58, bias=True)
  (6): Linear(in_features=58, out_features=81, bias=True)
  (7): Linear(in_features=81, out_features=113, bias=True)
  (8): Linear(in_features=113, out_features=158, bias=True)
  (9): Linear(in_features=158, out_features=221, bias=True)
  (10): Linear(in_features=221, out_features=309, bias=True)
  (11): Linear(in_features=309, out_features=432, bias=True)
  (12): Linear(in_features=432, out_features=604, bias=True)
  (13): Linear(in_features=604, out_features=845, bias=True)
  (14): Linear(in_features=845, out_features=1183, bias=True)
  (15): Linear(in_features=1183, out_features=1656, bias=True)
  (16): Linear(in_features=1656, ou

In [0]:
# Wrap the training tensors and serve them as shuffled mini-batches.
training_set = Dataset(features=X_train, labels=y_train)
training_generator = DataLoader(
    dataset=training_set,
    batch_size=batch_size,
    shuffle=True,
)


In [0]:
def acc(X, y, device):
  """Return the fraction of rows of ``X`` that the global ``model`` classifies correctly.

  A row counts as predicted positive when the model output is above 0.5.

  Args:
    X: Input tensor that is passed to ``model`` after being moved to ``device``.
    y: Tensor of targets; it is reshaped to a column to line up with the
      model output.
    device: ``torch.device`` on which the evaluation runs.

  Returns:
    0-dim float tensor with the accuracy (correct predictions / ``len(output)``).
  """
  target = y.to(device).view(-1, 1)
  # Evaluation only: without no_grad() the whole forward pass would record an
  # autograd graph that is never used, which wastes memory on large inputs.
  with torch.no_grad():
    output = model(X.to(device))
    pred = (output > 0.5).float()
    correct = (pred == target).sum()
  return correct.float() / len(output)
  

In [0]:
# Per-epoch training history.
statdict = {
    "loss": [],
    "train_acc": [],
    "test_acc": [],
}

# Accuracy of the untrained model, recorded as the first history entry.
# Results are moved to the CPU so the history does not keep device memory alive.
test_acc = acc(X_test, y_test, device).cpu()
train_acc = acc(X_train_test, y_train_test, device).cpu()
statdict["test_acc"].append(test_acc)
statdict["train_acc"].append(train_acc)
print("train_accuracy = {}, test_accuracy = {}".format(train_acc, test_acc))

for epoch in range(epochs):
    # The batch variables deliberately do NOT reuse the names `features` /
    # `targets`: elsewhere in this notebook those names hold the full feature
    # DataFrame and the label array, and overwriting them here would make the
    # data-preparation cells fail (or silently use one batch) when re-run.
    for batch_features, batch_targets in training_generator:
        batch_features = batch_features.to(device).view(-1, input_dim)
        batch_targets = batch_targets.to(device).view(-1, 1)
        optimizer.zero_grad()
        pred = model(batch_features)
        loss = criterion(pred, batch_targets)
        loss.backward()
        optimizer.step()

    # Loss of the LAST mini-batch of the epoch (not an epoch average), detached
    # from the graph and copied to the CPU before being stored.
    epoch_loss = loss.detach().cpu()
    statdict["loss"].append(epoch_loss)

    test_acc = acc(X_test, y_test, device).cpu()
    train_acc = acc(X_train_test, y_train_test, device).cpu()
    statdict["test_acc"].append(test_acc)
    statdict["train_acc"].append(train_acc)
    print("epoch {}, loss {}, train_accuracy = {}, test_accuracy = {}".format(epoch, epoch_loss, train_acc, test_acc))
    
        

train_accuracy = 0.46097999811172485, test_accuracy = 0.46219998598098755
epoch 0, loss 0.42365360260009766, train_accuracy = 0.7899199724197388, test_accuracy = 0.7900899648666382
