In [1]:
!pip install Torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
from pandas import DataFrame
from sklearn import metrics
import pandas as pd
import numpy as np
import torch as T
device = T.device("cpu")

from sklearn.preprocessing import normalize, StandardScaler
from keras.models import Sequential
from keras.backend import log, mean
from keras.layers import Dense, Flatten, Activation, Dropout, Input
from keras.optimizers import Adam
from keras.losses import BinaryFocalCrossentropy
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [3]:
from google.colab import drive

drive.mount('/content/drive/', force_remount=True)
%cd '/content/drive/My Drive/IRES UCy/Colab Notebooks/PU Learning/'
#will need to modify for wherever your data set is stored

Mounted at /content/drive/
/content/drive/My Drive/IRES UCy/Colab Notebooks/PU Learning


In [4]:
class SolarDataset(T.utils.data.Dataset):
 #label	DCArrayOutput_W_	    Vmp	         Imp	  CellTemperature_C_	PlaneOfArrayIrradiance_W_m_2_   Fill Factor	    Gamma       	  Pmp	         Voc 	      Isc
 #-2	  3772.327	     36.33172302	  1.095984553   	16.107	                281.111               2.595129269	  0.141648698	  39.81900722	28.80097715	3.587915457
 #-2	  4715.409	     36.33172302	  1.369980764	    16.107	                281.111	              4.669694852  	0.177060882 	49.77376167	44.33885631	5.242090076
 #........
 # [0]        [1]         [2]             [3]           [4]                     [5]                     [6]             [7]           [8]         [9]         [10]

  def __init__(self, fn, tot_num_pos, tot_num_unl):
    self.rnd = np.random.RandomState(1)

    self.tot_num_pos = tot_num_pos  # number positive in data
    self.tot_num_unl = tot_num_unl  # num unlabeled in data

    pos_x_lst = []; pos_y_lst = []  # lists of numpy vectors
    unl_x_lst = []; unl_y_lst = []

    ln = 0  # line number (not including comments)
    j = 0  # counter for unlabeleds

    self.unl_idx_to_line_num = dict()
    # key = idx of an unlabeled item in memory,
    # val = corresponding line number in src data file

    fin = open(fn, "r")  # read into four lists of arrays
    for line in fin:
      line = line.strip()
      if line.startswith("#"): continue

      arr = np.fromstring(line, sep="\t", dtype=np.float32)
      if arr[0] == 1:
        pos_x = arr[[1,2,3,4,5,6,7,8,9,10]]
        pos_y = 1  # always 1 but allows multi-class
        pos_x_lst.append(pos_x)
        pos_y_lst.append(pos_y)
      elif arr[0] == -2:  # unlabeled
        unl_x = arr[[1,2,3,4,5,6,7,8,9,10]]
        unl_y = 0  # treat unlabeleds as negative (0)
        unl_x_lst.append(unl_x)
        unl_y_lst.append(unl_y)
        self.unl_idx_to_line_num[j] = ln
        j +=1
      else:
        print("Fatal: unknown label encountered in file")

      ln += 1  # only data lines

    fin.close()
 
    # data actual storage in 4 tensor-arrays
    self.train_x_pos = T.tensor(pos_x_lst, dtype=T.float32) # predictors for positives
    self.train_y_pos = T.tensor(pos_y_lst, dtype=T.float32).reshape(-1,1) # labels for positives (1s)
    self.train_x_unl = T.tensor(unl_x_lst, dtype=T.float32) # predictors for unlabels
    self.train_y_unl = T.tensor(unl_y_lst, dtype=T.float32).reshape(-1,1) # labels for unlabeleds (0s)

    self.num_pos_unl = 2 * tot_num_pos  # num items in virtual ds

    # set up indices of active and inactive unlabeled items
    all_unl_indices = np.arange(tot_num_unl)  # 180 indices
    self.rnd.shuffle(all_unl_indices)
    self.p = all_unl_indices[0 : tot_num_pos]  # 20 active unlabled
    self.q = all_unl_indices[tot_num_pos : tot_num_unl]  # inactive

  def __len__(self):
    return self.num_pos_unl  # virtual ds size 

  def __getitem__(self, idx):
    if idx < self.tot_num_pos:  # small index = pos = fetch direct
      return (self.train_x_pos[idx], self.train_y_pos[idx])
    else:   # large index = an unlabeled = map index
      ofset = idx - self.tot_num_pos
      ii = self.p[ofset]  # index of active unlabeled item
      return (self.train_x_unl[ii], self.train_y_unl[ii])

  def reinit(self):  # get (20) different unlabeled items
    all_unl_indices = np.arange(self.tot_num_unl)
    self.rnd.shuffle(all_unl_indices)
    self.p = all_unl_indices[0 : self.tot_num_pos] 
    self.q = all_unl_indices[self.tot_num_pos : self.tot_num_unl]

In [5]:
class Net(T.nn.Module):
  # binary classifier for Solar data

  def __init__(self):
    super(Net, self).__init__()
    self.hid1 = T.nn.Linear(10, 30)  # 10-(10-10)-1
    self.hid2 = T.nn.Linear(30, 20)
    self.hid3 = T.nn.Linear(20, 10)
    self.oupt = T.nn.Linear(10, 1)

    T.nn.init.xavier_uniform_(self.hid1.weight) 
    T.nn.init.zeros_(self.hid1.bias)
    T.nn.init.xavier_uniform_(self.hid2.weight) 
    T.nn.init.zeros_(self.hid2.bias)
    T.nn.init.xavier_uniform_(self.hid3.weight) 
    T.nn.init.zeros_(self.hid3.bias)
    T.nn.init.xavier_uniform_(self.oupt.weight) 
    T.nn.init.zeros_(self.oupt.bias)

  def forward(self, x):
    z = T.tanh(self.hid1(x))
    z = T.tanh(self.hid2(z))
    z = T.tanh(self.hid3(z))
    z = T.sigmoid(self.oupt(z))  # see BCELoss() below
    return z

In [6]:
def train(net, ds, bs, me, le, lr, verbose):
  # NN, dataset, batch_size, max_epochs,
  # log_every, learn_rate. optimizer and loss hard-coded.
  net.train()
  data_ldr = T.utils.data.DataLoader(ds, batch_size=bs, shuffle=True)
  loss_func = T.nn.BCELoss()  # assumes sigmoid activation
  opt = T.optim.Adam(net.parameters(), lr=lr)
  scheduler = T.optim.lr_scheduler.StepLR(opt, step_size=100, gamma=0.025) #decreases learning rate by gamma every step_size
  for epoch in range(0, me):
    epoch_loss = 0.0
    for (batch_idx, batch) in enumerate(data_ldr):
      X = batch[0]  # inputs
      Y = batch[1]  # targets

      opt.zero_grad()                # prepare gradients
      oupt = net(X)                  # compute output/target
      loss_val = loss_func(oupt, Y)  # a tensor
      epoch_loss += loss_val.item()  # accumulate for display
      loss_val.backward()            # compute gradients
      opt.step()                     # update weights
      scheduler.step()               # update learning rate

      if epoch % le == 0 and verbose == True:
        print(" epoch = %4d   loss = %0.4f" % (epoch, epoch_loss))

In [7]:
def truth_of_line(ln):
  # actual label for 0-based line number of PUL file
  if ln % 2 == 0: return 0  # files set up this way 
  else: return 1

In [8]:
def confusion_matrix_ascii(TN, FP, FN, TP):
  cm = np.array([[TN, FP], [FN, TP]])
  print("Predicted  0   1")
  print("True 0    " + str(cm[:,0]))
  print("True 1    " + str(cm[:,1]))

In [10]:
# 0. get started
print("\nSolar PUL using PyTorch \n")
T.manual_seed(1)
np.random.seed(1)

# 1. create data objects and standardize
print("Creating dynamic Solar train Dataset ")
print("Dataset has 20 positive and 180 unlabeled ")
df = pd.read_csv("asu_solar_dataset_PUL_200.csv")
df_x = df[['DCArrayOutput_W_', 'Vmp', 'Imp', 'CellTemperature_C_', 'PlaneOfArrayIrradiance_W_m_2_', 'Fill Factor', 'Gamma', 'Pmp', 'Voc', 'Isc']]
df[['DCArrayOutput_W_', 'Vmp', 'Imp', 'CellTemperature_C_', 'PlaneOfArrayIrradiance_W_m_2_', 'Fill Factor', 'Gamma', 'Pmp', 'Voc', 'Isc']] = (df_x-df_x.mean())/df_x.std()
df_csv = df.to_csv('train_file_200.txt', sep='\t', index=False, header=False)
train_file = 'train_file_200.txt'
train_ds = SolarDataset(train_file, 20, 180)

# 2. create neural network
print("\nCreating 10-(10-10)-1 binary NN classifier ")
net = Net().to(device)

# 3. prepare for training multiple times
print("\nSetting training parameters \n")
bat_size = 32
lrn_rate = 0.05
max_epochs = 800
ep_log_interval = 100

print("batch size = " + str(bat_size))
print("initial lrn_rate = %0.2f " % lrn_rate)
print("max_epochs = " + str(max_epochs))
print("loss function = BCELoss() ")
print("optimizer = Adam \n")

# track number times each inactive unlabeled is evaluated
# accumulate sum of p-values from each evaluation
eval_counts = np.zeros(180, dtype=np.int64)
eval_sums = np.zeros(180, dtype=np.float32)

# ----------------------------------------------------------

# 4. accumulate p-values for inactive items after session
num_trials = 8  # number times to train on a subset
for trial in range(num_trials):
  print("Training model " + str(trial) + " of " + str(num_trials), end="")
  train(net, train_ds, bat_size, max_epochs, ep_log_interval, lrn_rate, verbose=True) 

  print("  Done. Scoring inactive unlabeled items ")
  net.eval()
  for i in train_ds.q:  # idxs of inactive unlabeleds
    x = train_ds.train_x_unl[i]  # predictors
    with T.no_grad():
      p = net(x)          # between 0.0 and 1.0
    eval_counts[i] += 1
    eval_sums[i] += p.item()

  train_ds.reinit()   # get different unlabeleds

# ----------------------------------------------------------

# 5. guess 0 or 1 labels for unlabeled items
print("\nGuessing 0 or 1 for unlabeled items ")

lo = 0.45; hi = 0.55   # tune for accuracy vs. quantity

# there is more information about positives so
# to label an unknown as positive you need a higher 
# p-value criterion.

print("pseudo-prob thresholds: %0.2f  %0.2f " % (lo, hi))

#initialize counting variables
TN, TP, FP, FN = 0, 0, 0, 0
num_correct = 0; num_wrong = 0
for i in range(180):  # process each unlabeled data item
  ln = train_ds.unl_idx_to_line_num[i]  # line num in PUL file

  if eval_counts[i] == 0:
    print("Fatal: Never evaluated this unlabeled item ")
    input()
  else:
    avg_p = (eval_sums[i] * 1.0) / eval_counts[i]
    if avg_p >= lo and avg_p <= hi:  #too close to 0.5
      pass
    elif avg_p < lo and truth_of_line(ln) == 0: #predicted 0, actual 0
      num_correct += 1
      TN += 1
    elif avg_p > hi and truth_of_line(ln) == 1: #predicted 1, actual 1
      num_correct += 1
      TP += 1
    elif avg_p < lo and truth_of_line(ln) == 1: #predicted 0, actual 1
      num_wrong += 1
      FN += 1
    elif avg_p > hi and truth_of_line(ln) == 0: #predicted 1, actual 0
      num_wrong += 1
      FP += 1
    else :
      print("error in guessing, please restart")


print("\n---------------\n")
num_guessed = num_correct + num_wrong
print("num labels guessed = " + str(num_guessed))
print("num correct guessed labels = " + str(num_correct))
print("num wrong guessed labels   = " + str(num_wrong))
acc = (1.0 * num_correct) / (num_correct + num_wrong) 
pct = (1.0 * (num_correct + num_wrong)) / 180

print("pct of unlabeled items guessed = %0.4f " % pct)
print("accuracy of guessed items = %0.4f " % acc)

print("\n---------------\n")
print("confusion matrix : ")
confusion_matrix_ascii(TN, FP, FN, TP)


Solar PUL using PyTorch 

Creating dynamic Solar train Dataset 
Dataset has 20 positive and 180 unlabeled 

Creating 10-(10-10)-1 binary NN classifier 

Setting training parameters 

batch size = 32
initial lrn_rate = 0.05 
max_epochs = 800
loss function = BCELoss() 
optimizer = Adam 

Training model 0 of 8 epoch =    0   loss = 0.8801
 epoch =    0   loss = 1.6575
 epoch =  100   loss = 0.1583
 epoch =  100   loss = 0.9753
 epoch =  200   loss = 0.2249
 epoch =  200   loss = 0.7735
 epoch =  300   loss = 0.2684
 epoch =  300   loss = 0.6430
 epoch =  400   loss = 0.2230
 epoch =  400   loss = 0.7792
 epoch =  500   loss = 0.3392
 epoch =  500   loss = 0.4305
 epoch =  600   loss = 0.2606
 epoch =  600   loss = 0.6663
 epoch =  700   loss = 0.3050
 epoch =  700   loss = 0.5332
  Done. Scoring inactive unlabeled items 
Training model 1 of 8 epoch =    0   loss = 0.5494
 epoch =    0   loss = 0.8152
 epoch =  100   loss = 0.2552
 epoch =  100   loss = 0.3525
 epoch =  200   loss = 0.139