# Imports

In [2]:
import pandas as pd
import numpy as np
import torch
import nltk
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import torch.utils.data as torch_data
import scipy
from sklearn.metrics import confusion_matrix

from nltk.stem import WordNetLemmatizer 

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

# from Utils.pytorch_utils import torch_net
from Utils.pytorch_utils import sparse_to_matrix #, accuracy_test

from Utils.NLP_utils import accuracy, find_senteces_with_lemma, get_wordnet_pos, load_and_lemmatize_data, load_processed_data

# pickle file, data set as readable json file, since original data set is a 'pseudo json', written in text file.
DATA_SET_FILE = r"C:\Users\גורים\PycharmProjects\NLP_training\datasets\News_Category_Dataset_v2_mod.pkl"
PROCESSED_DATA_SET = r"C:\Users\גורים\PycharmProjects\NLP_training\datasets\News_Category_Dataset_v2_mod_processed.pkl"


REQUIRED_CATEGORIES = ['RELIGION','SCIENCE', 'TASTE','PARENTING']
NUM_CATEGORIES = len(REQUIRED_CATEGORIES)
    
    
CrossEntropyLoss = "CrossEntropyLoss"
MSELoss = "MSELoss"

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\אבינעם\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\אבינעם\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\אבינעם\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


# set loss type

In [3]:
loss_name = CrossEntropyLoss
# loss_name = MSELoss

# helper functions

In [4]:
def sparse_to_matrix(A):
    if type(A) == scipy.sparse.csr.csr_matrix:
        return np.array(A.todense())
    return A


In [20]:
def confusion_matrix_disp(y_pred,y_true,label = False):
    cm = confusion_matrix(y_pred,y_true)
    if not label:
        label = range(NUM_CATEGORIES)
    cm_pd = pd.DataFrame(cm,index = ["{}_P".format(i)  for i in label],columns = ["{}_T".format(i)  for i in label])
    return cm_pd

def accuracy_test_dummies(model, x, y, data_set_name = 'test',print_sample = False):
    predicted = torch.argmax(model(torch.tensor(x, dtype=torch.float)), dim=-1).numpy()
    truth = np.argmax(y, axis=-1)
    # print(np.array((predicted, truth)))
    print (f"Accuracy {data_set_name} = ",
           round( np.array(predicted == truth).mean()* 100, 3 ),
           "%")
    if print_sample:
        ps = print_sample
        sample = pd.DataFrame((predicted[:ps],truth[:ps]),index=['predicted','truth'])
        print (sample)
        print (pd.value_counts(predicted))
        print ("\tConfusion Matrix:\n",confusion_matrix_disp (predicted,truth))
        
def accuracy_test_classes(model, x, y, data_set_name = 'test',print_sample = False):
    predicted = torch.argmax(model(torch.tensor(x, dtype=torch.float)), dim=-1).numpy()
    truth = y.ravel()
    # print(np.array((predicted, truth)))
    print (f"Accuracy {data_set_name} = ",
           round( np.array(predicted == truth).mean()* 100, 3 ),
           "%")
    if print_sample:
        ps = print_sample
        sample = pd.DataFrame((predicted[:ps],truth[:ps]),index=['predicted','truth'])
        print (sample)
        print (pd.value_counts(predicted))
        print ("\tConfusion Matrix:\n",confusion_matrix_disp (predicted,truth))

if loss_name == MSELoss:
    accuracy_test = accuracy_test_dummies
elif loss_name == CrossEntropyLoss:
    accuracy_test = accuracy_test_classes


## load data

In [6]:
# loading data

# dataset, headlines, headlines_orig = load_and_lemmatize_data(DATA_SET_FILE)
dataset, headlines, headlines_orig = load_processed_data(PROCESSED_DATA_SET)

# reduce dataset to n categories:

In [7]:
categories = dataset['category']
pd.value_counts(categories)

# filter data for desired categories, to make problem easier
filter_categories = True
if filter_categories:
    filter_index =  categories.isin(REQUIRED_CATEGORIES)
    dataset   = dataset[filter_index]
    headlines = np.array(headlines)[filter_index]
    headlines_orig = np.array(headlines_orig)[filter_index]
    
else:
    NUM_CATEGORIES = len(set(categories))
    


In [8]:
def categories_to_index(categories):
    d = {}
    for i, cat in enumerate(set(categories)):
        d[cat] = i
        
    r = np.array(range(len(categories)))

    for cat,i in d.items():
        # print (cat,i)
        r[categories == cat ] = i
    return r

categories = dataset['category']
if loss_name == CrossEntropyLoss:  
    Y = categories_to_index(categories)[:,np.newaxis]
else:
    Y  = np.array(pd.get_dummies(categories)) 

In [9]:
# split data and lables to train/test

headlines_train, headlines_test,\
headlines_train_orig, headlines_test_orig,\
Y_train, Y_test,\
    = sklearn.model_selection.train_test_split(
    headlines,headlines_orig, Y, test_size = 0.3)

In [10]:
# extract features (Bag Of Words) using Vectorizer

max_features=1000

vectorizer = CountVectorizer
# vectorizer = TfidfVectorizer
matrix = vectorizer(max_features=max_features, ngram_range=(1, 2), max_df=0.1 ,min_df = 5)
matrix.fit(headlines_train)
X_train = matrix.transform(headlines_train)# .todense()
X_test = matrix.transform(headlines_test)# .todense()

# --- convert to data frame for display and debug ---
# tokens = matrix.get_feature_names()
# X_train= pd.DataFrame(X_train,columns=tokens)
# X_test= pd.DataFrame(X_test,columns=tokens)

assert X_train.shape[1]==max_features, X_train.shape[1]

# build Model

In [16]:
def torch_net(X_in, Y_in, X_test, Y_test,
              hidden_layers=[10], device=torch.device('cpu'), epoch=30, batch_size=17):
    
    def set_learning_rate(optimizer,lr):
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr
            
    def print_learning_rate(optimizer):
        for param_group in optimizer.param_groups:
            print('lr', param_group['lr'])
    
    # hiden_layers = [size1,size2...]

        
    dtype = torch.float
    # device = torch.device("cuda:0") # Uncomment this to run on GPU

    # N is batch size; D_in is input dimension;
    # H is hidden dimension; D_out is output dimension.
    N, D_in = X_in.shape
    
    if loss_name == MSELoss: 
        D_out = Y_in.shape[-1] 
    elif loss_name == CrossEntropyLoss: 
        D_out = NUM_CATEGORIES

    # Create random input and output data

    [X_in, Y_in, X_test, Y_test] = \
        [sparse_to_matrix(A) for A in[X_in, Y_in, X_test, Y_test]]

    X = torch.tensor(X_in, device=device, dtype=dtype)
    if loss_name == CrossEntropyLoss:
        y_dtype = torch.int64
    else:
        y_dtype = dtype
    Y = torch.tensor(Y_in, device=device, dtype=y_dtype)
#     print (Y)

    #create neural network net with multiple hidden layers with H dimetions:
    dims = [D_in, *hidden_layers, D_out]
    layers = []
    for dim_ind in range(len(dims)-2):
        layers.append(torch.nn.Linear(dims[dim_ind], dims[dim_ind+1]))
        layers.append(torch.nn.ReLU())
    layers.append(torch.nn.Dropout(0.5))
    layers.append(torch.nn.Linear(dims[-2], D_out))   
    
    model = torch.nn.Sequential(*layers)
    if loss_name == CrossEntropyLoss:
        weights = (1/pd.value_counts(Y.tolist(),normalize=True)).to_list()
        weights = torch.tensor(weights)
        loss_fn = torch.nn.CrossEntropyLoss(reduction='mean', weight=weights)     
    elif loss_name == MSELoss:
        loss_fn = torch.nn.MSELoss(reduction='mean') 

    # Use the optim package to define an Optimizer that will update the weights of
    # the model for us. Here we will use Adam; the optim package contains many other
    # optimization algoriths. The first argument to the Adam constructor tells the
    # optimizer which Tensors it should update.
    learning_rate = 0.05
    weight_decay = 0.0001
    lr_decay = 0.9

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    dataloader = torch_data.DataLoader(
        torch_data.TensorDataset(X, Y), batch_size=batch_size,
        shuffle=True, num_workers=4)
    print ("Start training:")
    print ("loss name is: ",loss_name, " Using loss function: ", loss_fn)
    accuracy_test(model, X_test, Y_test, data_set_name='test',print_sample = 16)
    accuracy_test(model, X_in, Y_in, data_set_name='train',print_sample = 16)
    
    epoch_lr = learning_rate

    for e in range(epoch):
        for t,(x_batch, y_batch) in enumerate(dataloader):
            # Forward pass: compute predicted y by passing x to the model.
            y_pred = model(x_batch)
#             print ("x_batch.shape", x_batch.shape)
#             print ("y_batch.shape", y_batch.shape)
#             print ("y_pred.shape", y_pred.shape)
#             print ("y_batch[0,0]", y_batch[0,0])
#             print ("y_pred[0,0]", y_pred[0,0])
            y_pred_soft = torch.nn.functional.softmax(y_pred, dim = -1)
            # Compute and print loss.
#             batch_class_weights = torch.ones((y_batch.dim()))
#             loss = loss_fn(y_pred_soft, y_batch, weight = batch_class_weights)
#             loss = loss_fn(y_pred_soft, y_batch)
#             print (y_batch)
            loss = loss_fn(y_pred, y_batch)
            if not ( (t +1) % 2000 ) :
                print(f"iter-{t+1}, loss {round(loss.item(),3)}")

            # Before the backward pass, use the optimizer object to zero all of the
            # gradients for the variables it will update (which are the learnable
            # weights of the model). This is because by default, gradients are
            # accumulated in buffers( i.e, not overwritten) whenever .backward()
            # is called. Checkout docs of torch.autograd.backward for more details.
            optimizer.zero_grad()

            # Backward pass: compute gradient of the loss with respect to model
            # parameters

            #  $$$ this command destroy exit() command $$$
            loss.backward()

            # Calling the step function on an Optimizer makes an update to its
            # parameters
            optimizer.step()
            
        if not (e%10): 
            accuracy_test(model, X_test, Y_test, data_set_name= 'test', print_sample=15)
            accuracy_test(model, X_in, Y_in, data_set_name= 'train', print_sample = 15)
        
        epoch_lr = epoch_lr * lr_decay
        print_learning_rate(optimizer)
        print(f"epoch-{e+1}, loss {round(loss.item(),3)}")
        print("------------------------------------")
        set_learning_rate(optimizer,epoch_lr)
        
    print ("DONE, returning model")
    return model

print ("updated torch nn")

updated torch nn


# Train the model

In [21]:
# train model using pytorch
model = torch_net(X_train, Y_train[:,0],X_test,Y_test,[100,50,50],epoch=1000)


Start training:
loss name is:  CrossEntropyLoss  Using loss function:  CrossEntropyLoss()
Accuracy test =  13.282 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   3   0   0   3   3   0   3   0   3   0   3   3   0   3   3   0
truth       1   0   2   1   1   0   1   1   1   1   1   1   2   0   1   3
0    2530
3    2123
dtype: int64
	Confusion Matrix:
      0_T   1_T  2_T  3_T
0_P  337  1369  442  382
1_P    0     0    0    0
2_P    0     0    0    0
3_P  281  1206  355  281
Accuracy train =  13.746 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   3   0   0   3   0   3   3   3   0   0   0   3   3   0   3   0
truth       3   1   1   1   1   1   1   1   2   3   1   1   0   1   1   1
0    5927
3    4927
dtype: int64
	Confusion Matrix:
      0_T   1_T  2_T  3_T
0_P  838  3337  973  779
1_P    0     0    0    0
2_P    0     0    0    0
3_P  722  2765  786  654
Accuracy test =  56.286 %
           0   1   2   3   4   

KeyboardInterrupt: 