In [None]:
!git clone https://github.com/cruiseresearchgroup/TSCP2.git

fatal: destination path 'TSCP2' already exists and is not an empty directory.


In [None]:
!pip3 install pytorch-lightning

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import os
import numpy as np
from torch.utils.data import TensorDataset, DataLoader, Dataset
import torch

from math import floor
import pandas as pd
import scipy.io as sio
import csv
from tqdm import tqdm

import tscp_new as tscp
from IPython.display import clear_output
import matplotlib.pyplot as plt

from sklearn.metrics import confusion_matrix,f1_score

In [None]:
def load_usc_ds(path, window, mode='train'):
    """Load USC windows and return the partition selected by ``mode``.

    Windows and labels are produced by ``extract_windows(path, window, mode)``.
    The first 80% of the windows (in their original order) make up the
    training partition; the remainder is the test partition.

    Args:
        path: Forwarded unchanged to ``extract_windows``.
        window: Window length, forwarded to ``extract_windows``.
        mode: ``"all"`` returns every window unshuffled, ``"train"`` returns
            the training partition in a random order (global NumPy RNG),
            and any other value returns the test partition in original order.

    Returns:
        Tuple ``(windows, labels)`` for the requested partition.
    """
    samples, labels = extract_windows(path, window, mode)
    if mode == "all":
        return samples, labels

    split_at = int(floor(0.8 * samples.shape[0]))

    # Everything that is neither "all" nor "train" is served the held-out tail.
    if mode != "train":
        held_out_x = samples[split_at:]
        held_out_lbl = labels[split_at:]
        print('test shape {} and number of change points {} '.format(held_out_x.shape, np.where(held_out_lbl > 0)[0].size))
        return held_out_x, held_out_lbl

    head_x, head_lbl = samples[:split_at], labels[:split_at]
    # Shuffle windows and labels with one shared permutation so they stay aligned.
    order = np.arange(head_x.shape[0])
    np.random.shuffle(order)
    print('train samples : ', split_at)
    return head_x[order], head_lbl[order]


def extract_windows(path, window_size, mode="train", stride=5):
    """Cut the series stored in ``usc.mat`` into overlapping windows.

    The file ``usc.mat`` inside ``path`` is read with ``scipy.io.loadmat``.
    Only the first column of its ``'Y'`` variable (the series) and of its
    ``'L'`` variable (per-sample markers, where ``1`` flags a change point)
    is used.

    Args:
        path: Directory that contains ``usc.mat``. A trailing path separator
            is optional.
        window_size: Number of consecutive samples per window.
        mode: Not used by this function; kept so existing callers that pass
            it keep working.
        stride: Step between the start indices of consecutive windows.
            Defaults to 5, the value that used to be hard-coded.

    Returns:
        Tuple ``(windows, labels)``. ``windows`` stacks one row per window.
        ``labels[k]`` is the offset, inside window ``k``, of the first sample
        whose marker equals 1, or 0 when the window contains no such sample.
        A marker on the very first sample of a window therefore also yields
        0, although that window is still counted in the printed total.

    Raises:
        ValueError: If ``stride`` is smaller than 1.
    """
    if stride < 1:
        raise ValueError("stride must be a positive integer.")

    # os.path.join works with or without a trailing separator on ``path``.
    dataset = sio.loadmat(os.path.join(path, "usc.mat"))

    ts = np.array(dataset['Y'])[:, 0]
    cp = np.array(dataset['L'])[:, 0]

    windows = []
    lbl = []
    num_cp = 0
    # The upper bound is exclusive, so the window that would end exactly on
    # the last sample is not produced (kept as in the original sampling).
    for start in range(0, ts.shape[0] - window_size, stride):
        stop = start + window_size
        windows.append(np.array(ts[start:stop]))

        cp_offsets = np.where(cp[start:stop] == 1)[0]
        if cp_offsets.size == 0:
            lbl.append(0)
        else:
            num_cp += 1
            lbl.append(cp_offsets[0])

    print("number of samples : {} /  number of samples with change point : {}".format(len(windows), num_cp))

    return np.array(windows), np.array(lbl)

def load_dataset(path, ds_name, win, bs, mode="train"):
    """Load a dataset and pack labels and windows into one 2-D array.

    The loader for ``ds_name`` is asked for windows of length ``2 * win``.
    The labels are turned into a column and placed in front of the window
    values, so column 0 of the result is the label and the remaining columns
    are the samples of the window.

    Args:
        path: Data directory, forwarded to the dataset-specific loader.
        ds_name: ``'HASC'`` or ``"USC"``.
        win: Half of the window length requested from the loader.
        bs: Not used by this function.
        mode: Forwarded to the loader; also selects the return type.

    Returns:
        The packed NumPy array when ``mode == "test"``; otherwise a
        ``torch.utils.data.TensorDataset`` wrapping the same array.

    Raises:
        ValueError: If ``ds_name`` is neither ``'HASC'`` nor ``"USC"``.
    """
    if ds_name not in ('HASC', "USC"):
        raise ValueError("Undefined Dataset.")

    doubled_window = 2 * win
    if ds_name == "USC":
        features, labels = load_usc_ds(path, window=doubled_window, mode=mode)
    else:
        # NOTE(review): load_hasc_ds is not defined in the visible part of this
        # notebook - confirm it exists before using ds_name='HASC'.
        features, labels = load_hasc_ds(path, window=doubled_window, mode=mode)

    label_column = labels.reshape((labels.shape[0], 1))
    print(features.shape, label_column.shape)
    packed = np.concatenate((label_column, features), axis=1)
    print("dataset shape : ", packed.shape)

    if mode == "test":
        return packed

    # Wrap the array in a PyTorch TensorDataset for the training data loader.
    return TensorDataset(torch.from_numpy(packed))

In [None]:
def estimate_CPs(sim, gt, name, train_name, metric='cosine', threshold=0.5):
    """Threshold similarity scores into change points and score them against ``gt``.

    A sample is predicted as a change point when its similarity is below
    ``threshold``. Two sets of scores are printed and returned:

    * point-wise: confusion-matrix counts and F1 over individual samples;
    * sequence-wise: every run of consecutive ones in ``gt`` is one positive
      segment. A segment is a true positive when at least one prediction
      falls inside it, otherwise a false negative. Predictions inside
      ground-truth segments are then cleared, and every remaining 0 -> 1
      transition in the predictions counts as one false positive.

    Args:
        sim: NumPy array of similarity scores, one per sample.
        gt: NumPy array of 0/1 ground-truth flags with the same length.
        name: Not used by this function.
        train_name: Not used by this function.
        metric: Not used by this function.
        threshold: Similarity below which a change point is predicted.

    Returns:
        One comma-separated text line (terminated by a newline) holding the
        point-wise and sequence-wise counts and F1 scores.
    """
    est_cp = np.zeros(sim.shape[0])
    est_cp[np.where(sim < threshold)[0]] = 1

    # Passing labels=[0, 1] keeps the matrix 2x2 even when only one class
    # occurs in gt/est_cp, so the four-way unpacking below cannot fail.
    tn, fp, fn, tp = confusion_matrix(gt, est_cp, labels=[0, 1]).ravel()
    f1 = f1_score(gt, est_cp)

    print("tn {}, fp {}, fn {}, tp {} ----- f1-score {}".format(tn, fp, fn, tp, f1))

    ## continuous series
    pos, seq_tp, seq_fn, seq_fp = 0, 0, 0, 0
    n_samples = gt.shape[0]

    # Start at index 0 so a ground-truth segment beginning on the very first
    # sample is scored as well.
    i = 0
    while i < n_samples:
        if gt[i] == 1:
            pos += 1
            j = i
            # Bounds check first: gt may end inside a positive segment.
            while i < n_samples and gt[i] == 1:
                i += 1

            if np.sum(est_cp[j:i]) > 0:
                seq_tp += 1
                # Clear matched detections so they are not counted as false positives.
                est_cp[j:i] = 0
            else:
                seq_fn += 1

        i += 1

    # NOTE(review): a leftover detection run that starts at index 0 has no
    # rising edge and is therefore not counted here.
    seq_fp = np.where(np.diff(est_cp) == 1)[0].shape[0]

    # With no segments and no detections the F1 ratio is 0/0; report 0.0.
    denominator = 2 * seq_tp + seq_fn + seq_fp
    seq_f1 = (2 * seq_tp) / denominator if denominator else 0.0

    print("SEQ : Pos {}, fp {}, fn {}, tp {} ----- f1-score {}".format(pos, seq_fp, seq_fn, seq_tp, seq_f1))
    result = "tn, {}, fp, {}, fn, {}, tp, {}, f1-score, {}, Pos, {}, seqfp, {}, seqfn, {}, seqtp, {}, seqf1, {}\n".format(tn, fp, fn, tp, f1, pos, seq_fp, seq_fn, seq_tp, seq_f1)
    return result

In [None]:
# -------------------------------
# 0 EXPERIMENT CONFIGURATION
# -------------------------------
# Dataset and file locations.
DS_NAME = 'USC'
DATA_PATH = './TSCP2/data/'
OUTPUT_PATH = os.path.join('./output/', DS_NAME)
MODEL_PATH = os.path.join('./output/', "model")

# Loss / similarity identifiers (used in the run name below).
LOSS = 'nce'
SIM = 'cosine'
GPU = 0

# Model and optimisation hyper-parameters.
WIN = 100
CODE_SIZE = 10
BATCH_SIZE = 32
EPOCHS = 30
LR = 1e-4
TEMP = 0.5
TAU = 0.1
BETA = 1
EVALFREQ = 25
decay_steps = 1000

# Run identifier that encodes the main hyper-parameters.
train_name = "CP2_model_{}_T{}_WIN{}_BS{}_CS{}_lr{}_LOSS{}_SIM{}_TAU{}_BETA{}".format(
    DS_NAME, TEMP, WIN, BATCH_SIZE, CODE_SIZE, LR, LOSS, SIM, TAU, BETA
)
print("------------------------------------>>> " + train_name)

# -------------------------------
# 1 PREPARE DATASET
# -------------------------------
# NOTE(review): no random seed is set before loading; the training windows are
# shuffled with the global NumPy RNG, so their order differs between runs.
train_ds = load_dataset(DATA_PATH, DS_NAME, WIN, BATCH_SIZE, mode="train")
test_ds = load_dataset(DATA_PATH, DS_NAME, WIN, BATCH_SIZE, mode="test")


------------------------------------>>> CP2_model_USC_T0.5_WIN100_BS32_CS10_lr0.0001_LOSSnce_SIMcosine_TAU0.1_BETA1
number of samples : 18687 /  number of samples with change point : 1400
train samples :  14949
(14949, 200) (14949, 1)
dataset shape :  (14949, 201)
number of samples : 18687 /  number of samples with change point : 1400
test shape (3738, 200) and number of change points 274 
(3738, 200) (3738, 1)
dataset shape :  (3738, 201)


In [None]:
# Encoder that maps one window of WIN samples to a CODE_SIZE-dimensional code.
prep_model = tscp.Encoder(code_size=CODE_SIZE, seq_len=WIN)

# Kept as a module-level alias; not referenced by the cells visible in this notebook.
similarity = tscp._cosine_simililarity_dim2

# Wrapper that owns the encoder together with the train/test data.
tscp_model = tscp.TSCP_model(
    prep_model,
    train_ds,
    test_ds,
    batch_size=BATCH_SIZE,
    temperature=TEMP,
    lr=LR,
    decay_steps=decay_steps,
    window_1=WIN,
)
optimizer = tscp_model.configure_optimizers()

# `lr` is the learning-rate *scheduler* (not the learning rate itself).
# NOTE(review): the training loop calls lr.step() once per batch, so T_max is
# counted in optimiser steps - confirm decay_steps matches the intended schedule.
lr = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer=optimizer, T_max=decay_steps)


In [None]:
# Manual training loop: one optimiser/scheduler step per batch, with a short
# validation pass every REPORT_EVERY optimiser steps.
REPORT_EVERY = 400      # steps between progress prints / validation passes
MAX_VAL_BATCHES = 12    # validation batches evaluated per validation pass

train_loader = tscp_model.train_dataloader()
val_loader = tscp_model.val_dataloader()

train_losses = []       # mean training loss per epoch
val_losses = []         # mean validation loss per validation pass (all epochs)

epoch_wise_sim = []
epoch_wise_neg = []
for epoch in tqdm(range(EPOCHS)):

    iteration = 0
    train_losses_iters = []
    step_wise_sim = []
    step_wise_neg = []
    epoch_val_losses = []   # validation passes that ran during this epoch only

    for index, batch in enumerate(train_loader):

        loss, sim, neg = tscp_model.training_step(batch, index)
        train_losses_iters.append(float(loss))
        step_wise_sim.append(float(sim))
        step_wise_neg.append(float(neg))
        if not iteration % REPORT_EVERY:
            print("train_losses_iters", train_losses_iters[-1])

        loss.backward()
        optimizer.step()
        lr.step()
        optimizer.zero_grad()

        iteration += 1

        if not iteration % REPORT_EVERY:
            tscp_model.eval()
            vall = []
            with torch.no_grad():
                for c, b in enumerate(val_loader):
                    val_loss, _, _ = tscp_model.validation_step(b, c)
                    vall.append(float(val_loss.detach()))
                    if c + 1 >= MAX_VAL_BATCHES:
                        break
            pass_mean = np.mean(vall)
            print("val_loss", pass_mean)
            val_losses.append(pass_mean)
            epoch_val_losses.append(pass_mean)

            tscp_model.train()

    print("epoch_train_loss", np.mean(train_losses_iters))
    train_losses.append(np.mean(train_losses_iters))
    # Report the validation loss of THIS epoch only; averaging `val_losses`
    # would mix in every earlier epoch and hide the current trend.
    if epoch_val_losses:
        print("epoch_val_loss", np.mean(epoch_val_losses))
    epoch_wise_sim.append(np.mean(step_wise_sim))
    epoch_wise_neg.append(np.mean(step_wise_neg))



train_losses_iters 0.3131650984287262
val_loss 0.3027332586546739
train_losses_iters 0.033841852098703384


  3%|▎         | 1/30 [01:53<55:04, 113.95s/it]

epoch_train_loss 0.06819479631530678
epoch_val_loss 0.3027332586546739
train_losses_iters 0.057341329753398895
val_loss 0.3027282655239105
train_losses_iters 0.03157064691185951


  7%|▋         | 2/30 [03:48<53:22, 114.36s/it]

epoch_train_loss 0.03598870670932353
epoch_val_loss 0.30273076208929217
train_losses_iters 0.03559800982475281
val_loss 0.30271531144777936
train_losses_iters 0.032975003123283386


 10%|█         | 3/30 [05:44<51:48, 115.12s/it]

epoch_train_loss 0.03377771104534722
epoch_val_loss 0.3027256118754546
train_losses_iters 0.03375104069709778
val_loss 0.3043800046046575
train_losses_iters 0.037485864013433456


 13%|█▎        | 4/30 [07:41<50:15, 115.97s/it]

epoch_train_loss 0.03583169599167175
epoch_val_loss 0.3031392100577553
train_losses_iters 0.03560640662908554
val_loss 0.30416984111070633
train_losses_iters 0.038103386759757996


In [None]:
# Bundle the model and optimiser state so training can be resumed later,
# then write the bundle to disk.
checkpoint_state = {
    'model_state_dict': tscp_model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
}
torch.save(checkpoint_state, './tscp1001032_opt.pt')

In [None]:
# Restore model and optimiser state from a checkpoint file on disk.
# NOTE(review): this reads 'tscp7588_opt.pt', while the save cell of this
# notebook writes './tscp1001032_opt.pt'. On a fresh "Restart & Run All" this
# file must already exist - confirm which checkpoint is meant to be evaluated.
cpt = torch.load('tscp7588_opt.pt')
# The checkpoint is a dict with the keys 'model_state_dict' and 'optimizer_state_dict'.
tscp_model.load_state_dict(cpt['model_state_dict'])
optimizer.load_state_dict(cpt['optimizer_state_dict'])

In [None]:
# Column 0 of the packed test array holds the label, the rest is the window.
x_test, lbl_test = test_ds[:,1:], test_ds[:,0]

num = x_test.shape[0]
lbl_test = np.array(lbl_test).reshape((lbl_test.shape[0], 1))

# Encode the first WIN samples ("history") and the last WIN samples ("future")
# of every window. Inference needs no gradients, so autograd is switched off
# to avoid building a graph over the whole test set.
# NOTE(review): the encoder is used in whatever train/eval mode it is currently
# in - confirm whether it should be put into eval mode first.
with torch.no_grad():
    history = prep_model(torch.from_numpy(x_test[:, 0:WIN].reshape((num, 1, WIN))).float())
    future = prep_model(torch.from_numpy(x_test[:, WIN:].reshape((num, 1, WIN))).float())
    rep_sim = tscp._cosine_simililarity_dim1(history, future)

# Label plus both codes per row; not consumed by the cells visible in this notebook.
pred_out = np.concatenate((lbl_test, history.detach().numpy(), future.detach().numpy()), 1)

print('Average similarity for test set : Reps : {}'.format(np.mean(rep_sim.detach().numpy())))

# A window counts as a positive only when its label (the offset of the change
# point inside the window) lies strictly between 15% and 85% of the 2*WIN window.
gt = np.zeros(lbl_test.shape[0])
gt[np.where((lbl_test > int(2 * WIN * 0.15)) & (lbl_test < int(2 * WIN * 0.85)))[0]] = 1

result = estimate_CPs(rep_sim.detach().numpy(), gt, os.path.join(OUTPUT_PATH, train_name),
                    os.path.join(OUTPUT_PATH, "Evaluation.txt"),
                    metric='cosine', threshold=0.5)

In [None]:
# Score the same test similarities again with a lower threshold (0.3), so only
# windows whose similarity is below 0.3 are predicted as change points.
result = estimate_CPs(
    sim=rep_sim.detach().numpy(),
    gt=gt,
    name=os.path.join(OUTPUT_PATH, train_name),
    train_name=os.path.join(OUTPUT_PATH, "Evaluation.txt"),
    metric='cosine',
    threshold=0.3,
)

tn 3577, fp 17, fn 112, tp 34 ----- f1-score 0.3451776649746193
SEQ : Pos 7, fp 2, fn 5, tp 2 ----- f1-score 0.36363636363636365
