# Imports

In [1]:
import pandas as pd
import numpy as np
import torch
import nltk
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import torch.utils.data as torch_data
import scipy
from sklearn.metrics import confusion_matrix

from nltk.stem import WordNetLemmatizer 

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

# from Utils.pytorch_utils import torch_net
from Utils.pytorch_utils import sparse_to_matrix #, accuracy_test

from Utils.NLP_utils import accuracy, find_senteces_with_lemma, get_wordnet_pos, load_and_lemmatize_data, load_processed_data

# pickle file, data set as readable json file, since original data set is a 'pseudo json', written in text file.
DATA_SET_FILE = r"C:\Users\גורים\PycharmProjects\NLP_training\datasets\News_Category_Dataset_v2_mod.pkl"
PROCESSED_DATA_SET = r"C:\Users\גורים\PycharmProjects\NLP_training\datasets\News_Category_Dataset_v2_mod_processed.pkl"

CrossEntropyLoss = "CrossEntropyLoss"
MSELoss = "MSELoss"

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\אבינעם\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\אבינעם\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\אבינעם\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


# set loss type

In [2]:
# loss_name = CrossEntropyLoss
loss_name = MSELoss

# helper functions

In [3]:
def sparse_to_matrix(A):
    if type(A) == scipy.sparse.csr.csr_matrix:
        return np.array(A.todense())
    return A


In [4]:
def confusion_matrix_disp(y_pred,y_true,label = False):
    cm = confusion_matrix(y_pred,y_true)
    if not label:
        label_t,label_p = set(y_true) , set(y_pred)
    else:
        label_t,label_p = label
    cm_pd = pd.DataFrame(cm,columns = ["{}_P".format(i)  for i in label_p],index = ["{}_T".format(i)  for i in label_t])
    return cm_pd

def accuracy_test_dummies(model, x, y, data_set_name = 'test',print_sample = False):
    predicted = torch.argmax(model(torch.tensor(x, dtype=torch.float)), dim=-1).numpy()
    truth = np.argmax(y, axis=-1)
    # print(np.array((predicted, truth)))
    print (f"Accuracy {data_set_name} = ",
           round( np.array(predicted == truth).mean()* 100, 3 ),
           "%")
    if print_sample:
        ps = print_sample
        sample = pd.DataFrame((predicted[:ps],truth[:ps]),index=['predicted','truth'])
        print (sample)
        print (pd.value_counts(predicted))
        print ("\tConfusion Matrix:\n",confusion_matrix_disp (predicted,truth))
        
def accuracy_test_classes(*args, **kwargs):
    ## TODO support accuracy test for 
    return None       

if loss_name == MSELoss:
    accuracy_test = accuracy_test_dummies
elif loss_name == CrossEntropyLoss:
    accuracy_test = accuracy_test_classes


## load data

In [5]:
# loading data

# dataset, headlines, headlines_orig = load_and_lemmatize_data(DATA_SET_FILE)
dataset, headlines, headlines_orig = load_processed_data(PROCESSED_DATA_SET)

# reduce dataset to n categories:

In [6]:
categories = dataset['category']
pd.value_counts(categories)

# filter data for two categories, to make problem easier
filter_index =  (categories == 'RELIGION')  | (categories == 'SCIENCE')  | (categories == 'TASTE')  | (categories == 'PARENTING')
dataset   = dataset[filter_index]
headlines = np.array(headlines)[filter_index]
headlines_orig = np.array(headlines_orig)[filter_index]

In [7]:
categories = dataset['category']
Y  = np.array(pd.get_dummies(categories))

def categories_to_index(categories):
    d = {}
    for i, cat in enumerate(set(categories)):
        d[cat] = i
        
    r = np.array(range(len(categories)))

    for cat,i in d.items():
        # print (cat,i)
        r[categories == cat ] = i
    return r

In [8]:
if loss_name == CrossEntropyLoss:  
    Y = categories_to_index(categories)

In [9]:
# split data and lables to train/test

headlines_train, headlines_test,\
headlines_train_orig, headlines_test_orig,\
Y_train, Y_test,\
cat_train, cat_test\
    = sklearn.model_selection.train_test_split(
    headlines,headlines_orig, Y, categories, test_size = 0.3)

In [10]:
# extract features (Bag Of Words) using Vectorizer

max_features=1000

vectorizer = CountVectorizer
# vectorizer = TfidfVectorizer
matrix = vectorizer(max_features=max_features, ngram_range=(1, 2), max_df=0.1 ,min_df = 5)
matrix.fit(headlines_train)
X_train = matrix.transform(headlines_train)# .todense()
X_test = matrix.transform(headlines_test)# .todense()

# --- convert to data frame for display and debug ---
# tokens = matrix.get_feature_names()
# X_train= pd.DataFrame(X_train,columns=tokens)
# X_test= pd.DataFrame(X_test,columns=tokens)

assert X_train.shape[1]==max_features, X_train.shape[1]

# build Model

In [11]:
def torch_net(X_in, Y_in, X_test, Y_test,
              hidden_layers=[10], device=torch.device('cpu'), epoch=30, batch_size=17):
    
    def set_learning_rate(optimizer,lr):
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr
            
    def print_learning_rate(optimizer):
        for param_group in optimizer.param_groups:
            print('lr', param_group['lr'])
    
    # hiden_layers = [size1,size2...]

    dtype = torch.float
    # device = torch.device("cuda:0") # Uncomment this to run on GPU

    # N is batch size; D_in is input dimension;
    # H is hidden dimension; D_out is output dimension.
    N, D_in = X_in.shape
    D_out = Y_in.shape[-1]

    # Create random input and output data

    [X_in, Y_in, X_test, Y_test] = \
        [sparse_to_matrix(A) for A in[X_in, Y_in, X_test, Y_test]]

    X = torch.tensor(X_in, device=device, dtype=dtype)
    Y = torch.tensor(Y_in, device=device, dtype=dtype)

    #create neural network net with multiple hidden layers with H dimetions:
    dims = [D_in, *hidden_layers, D_out]
    layers = []
    for dim_ind in range(len(dims)-2):
        layers.append(torch.nn.Linear(dims[dim_ind], dims[dim_ind+1]))
        layers.append(torch.nn.ReLU())
    layers.append(torch.nn.Dropout(0.5))
    layers.append(torch.nn.Linear(dims[-2], D_out))   
    
    model = torch.nn.Sequential(*layers)
    if loss_name == CrossEntropyLoss:
        loss_fn = torch.nn.CrossEntropyLoss(reduction='mean')     
    elif loss_name == MSELoss:
        loss_fn = torch.nn.MSELoss(reduction='mean') 

    # Use the optim package to define an Optimizer that will update the weights of
    # the model for us. Here we will use Adam; the optim package contains many other
    # optimization algoriths. The first argument to the Adam constructor tells the
    # optimizer which Tensors it should update.
    learning_rate = 0.05
    weight_decay = 0.00001
    lr_decay = 0.9

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    dataloader = torch_data.DataLoader(
        torch_data.TensorDataset(X, Y), batch_size=batch_size,
        shuffle=True, num_workers=4)
    print ("Start training:")
    print ("loss name is: ",loss_name, " Using loss function: ", loss_fn)
    accuracy_test(model, X_test, Y_test, data_set_name='test',print_sample = 16)
    accuracy_test(model, X_in, Y_in, data_set_name='train',print_sample = 16)
    
    epoch_lr = learning_rate

    for e in range(epoch):
        for t,(x_batch, y_batch) in enumerate(dataloader):
            # Forward pass: compute predicted y by passing x to the model.
            y_pred = model(x_batch)
#             print ("x_batch.shape", x_batch.shape)
#             print ("y_batch.shape", y_batch.shape)
#             print ("y_pred.shape", y_pred.shape)
#             print ("y_batch[0,0]", y_batch[0,0])
#             print ("y_pred[0,0]", y_pred[0,0])
#             y_pred_soft = torch.nn.functional.softmax(y_pred, dim = -1)
            # Compute and print loss.
#             batch_class_weights = torch.ones((y_batch.dim()))
#             loss = loss_fn(y_pred_soft, y_batch, weight = batch_class_weights)
#             loss = loss_fn(y_pred_soft, y_batch)
            loss = loss_fn(y_pred, y_batch)
            if not ( (t +1) % 2000 ) :
                print(f"iter-{t+1}, loss {round(loss.item(),3)}")

            # Before the backward pass, use the optimizer object to zero all of the
            # gradients for the variables it will update (which are the learnable
            # weights of the model). This is because by default, gradients are
            # accumulated in buffers( i.e, not overwritten) whenever .backward()
            # is called. Checkout docs of torch.autograd.backward for more details.
            optimizer.zero_grad()

            # Backward pass: compute gradient of the loss with respect to model
            # parameters

            #  $$$ this command destroy exit() command $$$
            loss.backward()

            # Calling the step function on an Optimizer makes an update to its
            # parameters
            optimizer.step()
        accuracy_test(model, X_test, Y_test, data_set_name= 'test', print_sample=15)
        accuracy_test(model, X_in, Y_in, data_set_name= 'train', print_sample = 15)
        
        epoch_lr = epoch_lr * lr_decay
        print_learning_rate(optimizer)
        print(f"epoch-{e+1}, loss {round(loss.item(),3)}")
        print("------------------------------------")
        set_learning_rate(optimizer,epoch_lr)
        
    print ("DONE, returning model")
    return model

print ("updated torch nn")

updated torch nn


# Train the model

In [None]:
# train model using pytorch
model = torch_net(X_train, Y_train,X_test,Y_test,[],epoch=1000)


Start training:
loss name is:  MSELoss  Using loss function:  MSELoss()
Accuracy test =  20.524 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   1   3   3   2   1   1   1   0   2   1   3   1   2   1   1   0
truth       3   0   3   2   2   2   0   0   0   0   0   0   2   1   1   1
1    1872
3    1526
0     664
2     591
dtype: int64
	Confusion Matrix:
       0_P  1_P  2_P  3_P
0_T   385  101   86   92
1_T  1001  298  289  284
2_T   334   88   84   85
3_T   872  275  191  188
Accuracy train =  20.444 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   2   3   1   1   1   3   3   2   3   3   3   1   0   0   0   3
truth       0   0   0   1   0   2   1   3   0   3   0   1   2   1   3   0
1    4446
3    3355
0    1604
2    1449
dtype: int64
	Confusion Matrix:
       0_P  1_P  2_P  3_P
0_T   951  215  207  231
1_T  2337  735  704  670
2_T   852  196  194  207
3_T  1945  648  423  339
Accuracy test =  46.228 %
         

Accuracy test =  61.724 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14
predicted   0   0   3   0   0   0   0   0   0   0   0   0   2   0   0
truth       3   0   3   2   2   2   0   0   0   0   0   0   2   1   1
0    3167
1     563
3     485
2     438
dtype: int64
	Confusion Matrix:
       0_P  1_P  2_P  3_P
0_T  2143  380  350  294
1_T   172  278   56   57
2_T   135   68  194   41
3_T   142   36   50  257
Accuracy train =  62.659 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14
predicted   0   0   0   0   0   2   1   3   0   0   0   0   0   0   0
truth       0   0   0   1   0   2   1   3   0   3   0   1   2   1   3
0    7328
1    1296
3    1154
2    1076
dtype: int64
	Confusion Matrix:
       0_P  1_P  2_P  3_P
0_T  5051  855  814  608
1_T   350  692  119  135
2_T   344  140  473  119
3_T   340  107  122  585
lr 0.021523360500000012
epoch-9, loss 0.278
------------------------------------
Accuracy test =  60.628 %
           0   1   2   3

Accuracy test =  68.279 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14
predicted   0   0   0   2   0   0   0   0   0   0   0   0   0   0   1
truth       3   0   3   2   2   2   0   0   0   0   0   0   2   1   1
0    3309
3     506
2     426
1     412
dtype: int64
	Confusion Matrix:
       0_P  1_P  2_P  3_P
0_T  2313  381  323  292
1_T    61  298   30   23
2_T    98   48  256   24
3_T   120   35   41  310
Accuracy train =  69.864 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14
predicted   0   0   0   2   0   2   1   3   0   0   0   0   0   1   3
truth       0   0   0   1   0   2   1   3   0   3   0   1   2   1   3
0    7657
1    1090
3    1077
2    1030
dtype: int64
	Confusion Matrix:
       0_P  1_P  2_P  3_P
0_T  5450  854  718  635
1_T   175  778   85   52
2_T   228   95  651   56
3_T   232   67   74  704
lr 0.00833859084983329
epoch-18, loss 0.133
------------------------------------
