<a href="https://colab.research.google.com/github/Blacknahil/Predictive-coding-Network./blob/main/pcn_for_sms_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Loading and preprocessing the CSV file

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder


# Load the dataset
import os  # local import: only needed to create the output folder below

csv_file="sample_data/spam.csv"
out_dir = "sample_data/sms_spam"  # folder that receives the .npy splits

# Keep only the first two columns and name them ourselves; skiprows=1 drops
# the file's own header row so it is not parsed as a data record.
data = pd.read_csv(csv_file, sep=",", encoding="latin-1", usecols=[0, 1], names=["Label", "Message"], skiprows=1)

print("Loading the data done!")
print("the data head is ", data.head())

# Map the textual class names to integers: spam -> 1, ham -> 0.
data["Label"] = data["Label"].map({"spam":1,"ham":0})

# vectorize: one TF-IDF row per message, capped at 1000 features
# NOTE(review): the vectorizer is fitted on the full dataset *before* the
# train/validation split, so validation messages influence the vocabulary and
# IDF weights. Consider fitting on the training split only.
vectorize = TfidfVectorizer(max_features=1000)
X = vectorize.fit_transform(data["Message"]).toarray()

print("vectorizing the data done ")

# Encode labels as one-hot vectors
encoder = OneHotEncoder()
Y = encoder.fit_transform(data[["Label"]]).toarray()

# split into training and validation sets (80% / 20%, fixed seed)
X_train, X_valid, Y_train, Y_valid = train_test_split(X, Y, test_size=0.2, random_state=42)

# save the data into .npy files
# np.save does not create missing parent directories, so make sure the target
# folder exists first; exist_ok=True keeps re-runs of this cell harmless.
os.makedirs(out_dir, exist_ok=True)

np.save("sample_data/sms_spam/trainX.npy", X_train)
np.save("sample_data/sms_spam/trainY.npy", Y_train)

np.save("sample_data/sms_spam/validX.npy", X_valid)
np.save("sample_data/sms_spam/validY.npy", Y_valid)


print("Data processing Done!")
print("Files saved in sample_data/sms_spam folder ")



Loading the data done!
the data head is    Label                                            Message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
vectorizing the data done 
Data processing Done!
Files saved in sample_data/sms_spam folder 


## Importing the PCN model and training it on the preprocessed SMS spam dataset

In [None]:
from jax import numpy as jnp, random
import sys, getopt as gopt, optparse, time
from pcn import PCN ## bring in model from museum
## bring in ngc-learn analysis tools
from ngclearn.utils.metric_utils import measure_ACC, measure_CatNLL


## ---- experiment configuration ----
## paths of the pre-processed arrays (train split and dev split)
dataX = "sample_data/sms_spam/trainX.npy"
dataY = "sample_data/sms_spam/trainY.npy"
devX = "sample_data/sms_spam/validX.npy"
devY = "sample_data/sms_spam/validY.npy"
## verbosity level (0 - fairly minimal, 1 - prints multiple lines on I/O)
verbosity = 0

print("Train-set: X: {} | Y: {}".format(dataX, dataY))
print("  Dev-set: X: {} | Y: {}".format(devX, devY))
print(dataX)
print(dataY)

## ---- load design matrices / label matrices ----
_X, _Y = jnp.load(dataX), jnp.load(dataY)
Xdev, Ydev = jnp.load(devX), jnp.load(devY)

## input / output widths are read off the second axis of the loaded arrays
x_dim = _X.shape[1]
y_dim = _Y.shape[1]
## square patch side: integer part of sqrt(x_dim)
_side = int(jnp.sqrt(x_dim))
patch_shape = (_side, _side)

## ---- optimization schedule ----
n_iter = 30       ## number of passes over the training set
mb_size = 50      ## training mini-batch size
save_point = 20   ## save model params every modulo "save_point"
## whole mini-batches per epoch (any remainder rows are not visited)
n_batches = int(_X.shape[0]/mb_size)

## ---- JAX seeding: one carried key plus nine sub-keys ----
dkey = random.PRNGKey(1234)
_keys = random.split(dkey, 10)
dkey, subkeys = _keys[0], list(_keys[1:])

## ---- build model ----
print("--- Building Model ---")
model = PCN(
    subkeys[1],
    x_dim,
    y_dim,
    hid1_dim=512,
    hid2_dim=512,
    T=20,
    dt=1.,
    tau_m=20.,
    act_fx="sigmoid",
    eta=0.001,
    exp_dir="exp",
    model_name="pcn",
)
## write the freshly constructed (not yet trained) model to disk
model.save_to_disk()
print("--- Starting Simulation ---")

def eval_model(model, Xdev, Ydev, mb_size): ## evals model's test-time inference performance
    """Score `model` on a labelled dataset without adapting its synapses.

    The data are walked in consecutive blocks of at most `mb_size` rows,
    including the final partial block, and the per-block scores are combined
    into sample-weighted averages over every row that was evaluated.

    Args:
        model: object exposing `process(obs=..., lab=..., adapt_synapses=...)`
            that returns a 3-tuple whose first item is the prediction scored
            here.
        Xdev: 2-D array of inputs, one row per sample.
        Ydev: 2-D array of targets, row-aligned with `Xdev`.
        mb_size: maximum number of rows sent through the model per call.

    Returns:
        Tuple `(nll, acc)`: the dataset-level categorical negative
        log-likelihood and accuracy.

    Raises:
        ValueError: if `Xdev` contains no rows.
    """
    n_total = Xdev.shape[0]

    n_samp_seen = 0
    nll = 0. ## negative Categorical log likelihood
    acc = 0. ## accuracy
    ## step through *all* rows; the last block may be shorter than mb_size
    for idx in range(0, n_total, mb_size):
        ## extract data block/batch
        Xb = Xdev[idx: idx + mb_size,:]
        Yb = Ydev[idx: idx + mb_size,:]
        n_in_block = Yb.shape[0]
        ## run model inference
        yMu_0, _, _ = model.process(obs=Xb, lab=Yb, adapt_synapses=False)
        ## record metric measurements; the metric helpers are presumed to
        ## return per-block means, so weight them by block size (un-normalize)
        nll += measure_CatNLL(yMu_0, Yb) * n_in_block
        acc += measure_ACC(yMu_0, Yb) * n_in_block

        n_samp_seen += n_in_block

    if n_samp_seen == 0:
        raise ValueError("eval_model received an empty dataset")

    ## normalize by the number of samples actually scored so the averages are
    ## not deflated when the dataset size is not a multiple of mb_size
    nll = nll/n_samp_seen ## calc full dev-set nll
    acc = acc/n_samp_seen ## calc full dev-set acc
    return nll, acc

def _save_run_history():
    """Write the accumulated train-acc / dev-acc / EFE curves under exp/."""
    jnp.save("exp/trAcc.npy", jnp.asarray(trAcc_set))
    jnp.save("exp/acc.npy", jnp.asarray(acc_set))
    jnp.save("exp/efe.npy", jnp.asarray(efe_set))

## per-epoch histories (index 0 holds the pre-training baseline)
trAcc_set = []
acc_set = []
efe_set = []

sim_start_time = time.time() ## start time profiling

## baseline: score the untrained model on the training and dev sets
_, tr_acc = eval_model(model, _X, _Y, mb_size=1000)
nll, acc = eval_model(model, Xdev, Ydev, mb_size=1000)
print("-1: Dev: Acc = {}  NLL = {} | Tr: Acc = {} EFE = --".format(acc, nll, tr_acc))
if verbosity >= 2:
    print(model._get_norm_string())
trAcc_set.append(tr_acc) ## random guessing is where models typically start
acc_set.append(acc)
efe_set.append(-2000.) ## placeholder: no EFE is measured before training
## NOTE(review): this initial snapshot goes to "dev_acc.npy" while every later
## snapshot of acc_set goes to "acc.npy" -- confirm whether that is intended.
jnp.save("exp/dev_acc.npy", jnp.asarray(acc_set))
jnp.save("exp/efe.npy", jnp.asarray(efe_set))

for i in range(n_iter):
    ## shuffle data (to ensure i.i.d. assumption holds)
    dkey, *subkeys = random.split(dkey, 2)
    ptrs = random.permutation(subkeys[0],_X.shape[0])
    X = _X[ptrs,:]
    Y = _Y[ptrs,:]

    ## begin a single epoch
    n_samp_seen = 0
    train_EFE = 0. ## training free energy (online) estimate
    for j in range(n_batches):
        ## the sub-key is unused, but the split is kept so the sequence of
        ## keys (and therefore the later shuffles) stays reproducible
        dkey, *subkeys = random.split(dkey, 2)
        ## sample mini-batch of patterns
        idx = j * mb_size
        Xb = X[idx: idx + mb_size,:]
        Yb = Y[idx: idx + mb_size,:]
        ## perform a step of inference/learning
        yMu_0, yMu, _EFE = model.process(obs=Xb, lab=Yb, adapt_synapses=True)
        ## track online training EFE, weighted by the rows actually processed
        train_EFE += _EFE * Yb.shape[0]
        n_samp_seen += Yb.shape[0]
        if verbosity >= 1:
            print("\r EFE = {} over {} samples ".format((train_EFE/n_samp_seen),
                                                        n_samp_seen), end="")
    if verbosity >= 1:
        print()

    ## evaluate current progress of model on dev-set
    nll, acc = eval_model(model, Xdev, Ydev, mb_size=1000)
    _, tr_acc = eval_model(model, _X, _Y, mb_size=1000)
    epoch_EFE = train_EFE/n_samp_seen
    ## record current generalization stats *before* checkpointing, so the
    ## files written at a save point already include this epoch
    trAcc_set.append(tr_acc)
    acc_set.append(acc)
    efe_set.append(epoch_EFE)
    if (i+1) % save_point == 0 or i == (n_iter-1):
        model.save_to_disk(params_only=True) # save current state of synapses to disk
        _save_run_history()
    ## print stats to I/O
    io_str = ("{} Dev: Acc = {}, NLL = {} | "
              "Tr: Acc = {}, EFE = {}"
             ).format(i, acc, nll, tr_acc, epoch_EFE)
    if verbosity >= 1:
        print(io_str)
    else:
        ## minimal mode: overwrite the same console line every epoch
        print("\r{}".format(io_str), end="")
    if verbosity >= 2:
        print(model._get_norm_string())
if verbosity == 0:
    print("") ## terminate the carriage-return progress line

## stop time profiling
sim_end_time = time.time()
sim_time = sim_end_time - sim_start_time
sim_time_hr = (sim_time/3600.0) # convert time to hours

print("------------------------------------")
vAcc_best = jnp.amax(jnp.asarray(acc_set))
print(" Trial.sim_time = {} h  ({} sec)  Best Acc = {}".format(sim_time_hr, sim_time, vAcc_best))

_save_run_history()


Train-set: X: sample_data/sms_spam/trainX.npy | Y: sample_data/sms_spam/trainY.npy
  Dev-set: X: sample_data/sms_spam/validX.npy | Y: sample_data/sms_spam/validY.npy
sample_data/sms_spam/trainX.npy
sample_data/sms_spam/trainY.npy
--- Building Model ---




--- Starting Simulation ---
-1: Dev: Acc = 0.11479820311069489  NLL = 12.722171783447266 | Tr: Acc = 0.1191384345293045 EFE = --
29 Dev: Acc = 0.8807175159454346, NLL = 0.072615846991539 | Tr: Acc = 0.8925285935401917, EFE = -0.5954628586769104
------------------------------------
 Trial.sim_time = 0.15903820719983844 h  (572.5375459194183 sec)  Best Acc = 0.8852018713951111
