# Imports

In [1]:
import pandas as pd
import numpy as np
import torch
from torch.optim.lr_scheduler import ReduceLROnPlateau
import nltk
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import torch.utils.data as torch_data
import scipy
from sklearn.metrics import confusion_matrix

from nltk.stem import WordNetLemmatizer 

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

# from Utils.pytorch_utils import torch_net
from Utils.pytorch_utils import sparse_to_matrix #, accuracy_test

from Utils.NLP_utils import accuracy, find_senteces_with_lemma, get_wordnet_pos, load_and_lemmatize_data, load_processed_data

# pickle file, data set as readable json file, since original data set is a 'pseudo json', written in text file.
DATA_SET_FILE = r"C:\Users\גורים\PycharmProjects\NLP_training\datasets\News_Category_Dataset_v2_mod.pkl"
PROCESSED_DATA_SET = r"C:\Users\גורים\PycharmProjects\NLP_training\datasets\News_Category_Dataset_v2_mod_processed.pkl"

ALL_CATEGORIES = ['POLITICS', 'WELLNESS', 'ENTERTAINMENT', 'TRAVEL', 'STYLE & BEAUTY',
       'PARENTING', 'HEALTHY LIVING', 'QUEER VOICES', 'FOOD & DRINK',
       'BUSINESS', 'COMEDY', 'SPORTS', 'BLACK VOICES', 'HOME & LIVING',
       'PARENTS', 'THE WORLDPOST', 'WEDDINGS', 'WOMEN', 'IMPACT', 'DIVORCE',
       'CRIME', 'MEDIA', 'WEIRD NEWS', 'GREEN', 'WORLDPOST', 'RELIGION',
       'STYLE', 'SCIENCE', 'WORLD NEWS', 'TASTE', 'TECH', 'MONEY', 'ARTS',
       'FIFTY', 'GOOD NEWS', 'ARTS & CULTURE', 'ENVIRONMENT', 'COLLEGE',
       'LATINO VOICES', 'CULTURE & ARTS', 'EDUCATION']

# REQUIRED_CATEGORIES = ['RELIGION','SCIENCE', 'TASTE','PARENTING' , 'COLLEGE' ,'POLITICS' ]
REQUIRED_CATEGORIES = (np.array(ALL_CATEGORIES)[::3]).tolist()
print (REQUIRED_CATEGORIES)
NUM_CATEGORIES = len(REQUIRED_CATEGORIES)
    
    
CrossEntropyLoss = "CrossEntropyLoss"
MSELoss = "MSELoss"

['POLITICS', 'TRAVEL', 'HEALTHY LIVING', 'BUSINESS', 'BLACK VOICES', 'THE WORLDPOST', 'IMPACT', 'MEDIA', 'WORLDPOST', 'SCIENCE', 'TECH', 'FIFTY', 'ENVIRONMENT', 'CULTURE & ARTS']


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\אבינעם\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\אבינעם\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\אבינעם\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


# set loss type

In [2]:
loss_name = CrossEntropyLoss
# loss_name = MSELoss

# helper functions

In [3]:
def sparse_to_matrix(A):
    if type(A) == scipy.sparse.csr.csr_matrix:
        return np.array(A.todense())
    return A


In [4]:
def confusion_matrix_disp(y_pred,y_true,label = False):
    cm = confusion_matrix(y_pred,y_true)
    if not label:
        label = range(NUM_CATEGORIES)
    cm_pd = pd.DataFrame(cm,index = ["{}_P".format(i)  for i in label],columns = ["{}_T".format(i)  for i in label])
    return cm_pd

# def accuracy_test_dummies(model, x, y, data_set_name = 'test',print_sample = False):
#     predicted = torch.argmax(model(torch.tensor(x, dtype=torch.float)), dim=-1).numpy()
#     truth = np.argmax(y, axis=-1)
#     # print(np.array((predicted, truth)))
#     print (f"Accuracy {data_set_name} = ",
#            round( np.array(predicted == truth).mean()* 100, 3 ),
#            "%")
#     if print_sample:
#         ps = print_sample
#         sample = pd.DataFrame((predicted[:ps],truth[:ps]),index=['predicted','truth'])
#         print (sample)
#         print (pd.value_counts(predicted))
#         print ("\tConfusion Matrix:\n",confusion_matrix_disp (predicted,truth))
        
# def accuracy_test_classes(model, x, y, data_set_name = 'test',print_sample = False):
#     predicted = torch.argmax(model(torch.tensor(x, dtype=torch.float)), dim=-1).numpy()
#     truth = y.ravel()
#     # print(np.array((predicted, truth)))
#     print (f"Accuracy {data_set_name} = ",
#            round( np.array(predicted == truth).mean()* 100, 3 ),
#            "%")
#     if print_sample:
#         ps = print_sample
#         sample = pd.DataFrame((predicted[:ps],truth[:ps]),index=['predicted','truth'])
#         print (sample)
#         print (pd.value_counts(predicted))
#         print ("\tConfusion Matrix:\n",confusion_matrix_disp (predicted,truth))

def accuracy_test(model, x, y, data_set_name = 'test',print_sample = False,top_n_guess = 1):
          
    if loss_name == MSELoss:
        truth = np.argmax(y, axis=-1)
    elif loss_name == CrossEntropyLoss:
        truth = y.ravel()
    
    model_result = model(torch.tensor(x, dtype=torch.float))
    top_guesses = torch.argsort( model_result , dim = -1, descending=True ).numpy() [:,:top_n_guess]
    # predicted   = torch.argmax ( model_result, dim=-1).numpy()
    true_predicted = (top_guesses == truth[:,None]).any(1)
    
#     print ("model_result = ", model_result[:10])
#     print ("top_guesses =", top_guesses[:10])
#     print ("predicted = ", predicted[:10])

    print (f"Accuracy (top {top_n_guess} guesses) - {data_set_name} = ",
           round( np.array(true_predicted).mean()* 100, 3 ),
           "%")
    if print_sample:
        ps = print_sample
        sample = pd.DataFrame((top_guesses[:ps],truth[:ps]),index=['predicted','truth'])
        print (sample)
        # print (pd.value_counts(predicted))
    print (f"\tConfusion Matrix {data_set_name}:\n",confusion_matrix_disp (top_guesses[:,0],truth))
    
    
        
# if loss_name == MSELoss:
#     accuracy_test = accuracy_test_dummies
# elif loss_name == CrossEntropyLoss:
#     accuracy_test = accuracy_test_classes


## load data

In [5]:
# loading data

# dataset, headlines, headlines_orig = load_and_lemmatize_data(DATA_SET_FILE)
dataset, headlines, headlines_orig = load_processed_data(PROCESSED_DATA_SET)

# reduce dataset to n categories:

In [6]:
categories = dataset['category']
pd.value_counts(categories)

# filter data for desired categories, to make problem easier
filter_categories = True
if filter_categories:
    filter_index =  categories.isin(REQUIRED_CATEGORIES)
    dataset   = dataset[filter_index]
    headlines = np.array(headlines)[filter_index]
    headlines_orig = np.array(headlines_orig)[filter_index]
    
else:
    NUM_CATEGORIES = len(set(categories))
    


In [8]:
def categories_to_index(categories):
    d = {}
    for i, cat in enumerate(REQUIRED_CATEGORIES):
        d[cat] = i
        
    r = np.array(range(len(categories)))

    for cat,i in d.items():
        # print (cat,i)
        r[categories == cat ] = i
    return r

categories = dataset['category']
if loss_name == CrossEntropyLoss:  
    Y = categories_to_index(categories)[:,np.newaxis]
else:
    Y  = np.array(pd.get_dummies(categories)) 

In [9]:
# split data and lables to train/test

headlines_train, headlines_test,\
headlines_train_orig, headlines_test_orig,\
Y_train, Y_test,\
    = sklearn.model_selection.train_test_split(
    headlines,headlines_orig, Y, test_size = 0.3)

In [32]:
# extract features (Bag Of Words) using Vectorizer

max_features=2000

vectorizer = CountVectorizer
# vectorizer = TfidfVectorizer
matrix = vectorizer(max_features=max_features, ngram_range=(1, 2), max_df=0.1 ,min_df = 5)
matrix.fit(headlines_train)
X_train = matrix.transform(headlines_train)# .todense()
X_test = matrix.transform(headlines_test)# .todense()

# --- convert to data frame for display and debug ---
# tokens = matrix.get_feature_names()
# X_train= pd.DataFrame(X_train,columns=tokens)
# X_test= pd.DataFrame(X_test,columns=tokens)

assert X_train.shape[1]==max_features, X_train.shape[1]

# build Model

In [35]:
def torch_net(X_train, Y_train, X_test, Y_test,
              hidden_layers=[10], device=torch.device('cpu'), epoch=30, batch_size=17):
    
    def set_learning_rate(optimizer,lr):
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr
            
    def print_learning_rate(optimizer):
        for param_group in optimizer.param_groups:
            print('lr', param_group['lr'])
    
    # hiden_layers = [size1,size2...]

        
    dtype = torch.float
    # device = torch.device("cuda:0") # Uncomment this to run on GPU

    # N is batch size; D_in is input dimension;
    # H is hidden dimension; D_out is output dimension.
    N, D_in = X_train.shape
    
    if loss_name == MSELoss: 
        D_out = Y_train.shape[-1] 
    elif loss_name == CrossEntropyLoss: 
        D_out = NUM_CATEGORIES

    # Create random input and output data

    [X_train, Y_train, X_test, Y_test] = \
        [sparse_to_matrix(A) for A in[X_train, Y_train, X_test, Y_test]]

    X = torch.tensor(X_train, device=device, dtype=dtype)
    if loss_name == CrossEntropyLoss:
        y_dtype = torch.int64
    else:
        y_dtype = dtype
    Y = torch.tensor(Y_train, device=device, dtype=y_dtype)
#     print (Y)

    #create neural network net with multiple hidden layers with H dimetions:
    dims = [D_in, *hidden_layers, D_out]
    layers = []
    for dim_ind in range(len(dims)-2):
        layers.append(torch.nn.Linear(dims[dim_ind], dims[dim_ind+1]))
        layers.append(torch.nn.ReLU())
    layers.append(torch.nn.Dropout(0.5))
    layers.append(torch.nn.Linear(dims[-2], D_out))   
    
    model = torch.nn.Sequential(*layers)
    if loss_name == CrossEntropyLoss:
        weights = (np.power(1/pd.value_counts(Y.tolist(),normalize=True),1.2)).to_list()
        weights = torch.tensor(weights)
        loss_fn = torch.nn.CrossEntropyLoss(reduction='mean', weight=weights)     
    elif loss_name == MSELoss:
        loss_fn = torch.nn.MSELoss(reduction='mean') 

    # Use the optim package to define an Optimizer that will update the weights of
    # the model for us. Here we will use Adam; the optim package contains many other
    # optimization algoriths. The first argument to the Adam constructor tells the
    # optimizer which Tensors it should update.
    learning_rate = 0.005
    weight_decay = 0.001
    lr_decay = 0.9

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.9, patience=10000, verbose=True, threshold=0.0001,
                                  threshold_mode='rel', cooldown=2000, min_lr=1e-10, eps=1e-08)
    
    dataloader = torch_data.DataLoader(
        torch_data.TensorDataset(X, Y), batch_size=batch_size,
        shuffle=True, num_workers=4)
    print ("Start training:")
    print ("loss name is: ",loss_name, " Using loss function: ", loss_fn)
    accuracy_test(model, X_test, Y_test, data_set_name='test',print_sample = 10, top_n_guess= 1 )
    # accuracy_test(model, X_train, Y_train, data_set_name='train',print_sample = 0, top_n_guess= 3)
    
    epoch_lr = learning_rate
    min_loss = 100.0
    i_iter =-1
    for e in range(epoch):
        for t,(x_batch, y_batch) in enumerate(dataloader):
            i_iter +=1
            # Forward pass: compute predicted y by passing x to the model.
            y_pred = model(x_batch)
#             print ("x_batch.shape", x_batch.shape)
#             print ("y_batch.shape", y_batch.shape)
#             print ("y_pred.shape", y_pred.shape)
#             print ("y_batch[0,0]", y_batch[0,0])
#             print ("y_pred[0,0]", y_pred[0,0])
            y_pred_soft = torch.nn.functional.softmax(y_pred, dim = -1)
            # Compute and print loss.
#             batch_class_weights = torch.ones((y_batch.dim()))
#             loss = loss_fn(y_pred_soft, y_batch, weight = batch_class_weights)
#             loss = loss_fn(y_pred_soft, y_batch)
#             print (y_batch)
            loss = loss_fn(y_pred, y_batch)
            if loss < min_loss:
                print (f"loss < min_loss, updating min_loss to {loss}, i_iter {i_iter}")
#                 accuracy_test(model, X_train, Y_train, data_set_name= 'train', print_sample = 0, top_n_guess= 1)
#                 accuracy_test(model, X_test, Y_test, data_set_name= 'test', print_sample=0, top_n_guess= 1)
                min_loss = loss
            if not ( (t +1) % 2000 ) :
                print(f"iter-{t+1}, loss {round(loss.item(),3)}")

            # Before the backward pass, use the optimizer object to zero all of the
            # gradients for the variables it will update (which are the learnable
            # weights of the model). This is because by default, gradients are
            # accumulated in buffers( i.e, not overwritten) whenever .backward()
            # is called. Checkout docs of torch.autograd.backward for more details.
            optimizer.zero_grad()

            # Backward pass: compute gradient of the loss with respect to model
            # parameters

            #  $$$ this command destroy exit() command $$$
            loss.backward()

            # Calling the step function on an Optimizer makes an update to its
            # parameters
            optimizer.step()
            
            scheduler.step(loss)
            
            
        if not ((e+1) %10): 
            accuracy_test(model, X_train, Y_train, data_set_name= 'train', print_sample = 0, top_n_guess= 1)
            accuracy_test(model, X_test, Y_test, data_set_name= 'test', print_sample=0, top_n_guess= 1)
        
#         epoch_lr = epoch_lr * lr_decay
        print_learning_rate(optimizer)
        print(f"epoch-{e+1}, loss {round(loss.item(),3)}")
        print("------------------------------------")
        # set_learning_rate(optimizer,epoch_lr)
        
    print ("DONE, returning model")
    return model

print ("updated torch nn")

updated torch nn


# Train the model

In [None]:
# train model using pytorch
# model = torch_net(X_train, Y_train[:,0],X_test,Y_test,[100,50,50],epoch=1000)
model = torch_net(X_train, Y_train[:,0],X_test,Y_test,[100,70,50],epoch=100)


Start training:
loss name is:  CrossEntropyLoss  Using loss function:  CrossEntropyLoss()
Accuracy (top 1 guesses) - test =  7.035 %
                                                           0
predicted  [[3], [4], [4], [3], [3], [3], [3], [3], [3], ...
truth                        [0, 10, 8, 0, 0, 8, 7, 1, 0, 0]
	Confusion Matrix test:
        0_T   1_T   2_T   3_T  4_T  5_T  6_T  7_T  8_T  9_T  10_T  11_T  12_T  \
0_P      2     0     0     0    0    0    0    0    0    0     0     0     0   
1_P    107    26    22    13   16   11    5    8    4   12     8     2     2   
2_P      0     0     0     0    0    0    0    0    0    0     0     0     0   
3_P   6932  2128  1426  1268  983  781  729  618  523  479   420   303   301   
4_P   2713   837   562   515  399  283  295  220  219  196   185   115   104   
5_P      0     0     0     0    0    0    0    0    0    0     0     0     0   
6_P      7     4     1     0    0    0    0    0    0    0     0     0     0   
7_P      0     0   

lr 0.0045000000000000005
epoch-13, loss 1.151
------------------------------------
Epoch 43080: reducing learning rate of group 0 to 4.0500e-03.
iter-2000, loss 1.804
lr 0.004050000000000001
epoch-14, loss 0.634
------------------------------------
iter-2000, loss 1.736
lr 0.004050000000000001
epoch-15, loss 3.175
------------------------------------
iter-2000, loss 1.971
lr 0.004050000000000001
epoch-16, loss 2.712
------------------------------------
iter-2000, loss 1.905
Epoch 55081: reducing learning rate of group 0 to 3.6450e-03.
lr 0.0036450000000000007
epoch-17, loss 1.002
------------------------------------
iter-2000, loss 1.417
lr 0.0036450000000000007
epoch-18, loss 1.555
------------------------------------
iter-2000, loss 1.834
lr 0.0036450000000000007
epoch-19, loss 1.86
------------------------------------
iter-2000, loss 1.035
Accuracy (top 1 guesses) - train =  46.426 %
	Confusion Matrix train:
        0_T   1_T   2_T   3_T   4_T   5_T   6_T   7_T  8_T  9_T  10_T  11_T

iter-2000, loss 2.453
lr 0.002391484500000001
epoch-32, loss 2.456
------------------------------------
iter-2000, loss 1.56
lr 0.002391484500000001
epoch-33, loss 1.866
------------------------------------
iter-2000, loss 1.61
lr 0.002391484500000001
epoch-34, loss 2.217
------------------------------------
iter-2000, loss 1.158
Epoch 115086: reducing learning rate of group 0 to 2.1523e-03.
lr 0.002152336050000001
epoch-35, loss 2.391
------------------------------------
iter-2000, loss 1.54
lr 0.002152336050000001
epoch-36, loss 2.447
------------------------------------
iter-2000, loss 1.962
lr 0.002152336050000001
epoch-37, loss 0.924
------------------------------------
iter-2000, loss 1.892
lr 0.002152336050000001
epoch-38, loss 1.915
------------------------------------
Epoch 127087: reducing learning rate of group 0 to 1.9371e-03.
iter-2000, loss 1.547
lr 0.001937102445000001
epoch-39, loss 0.292
------------------------------------
iter-2000, loss 1.725
Accuracy (top 1 guesses

iter-2000, loss 2.16
lr 0.0014121476824050009
epoch-51, loss 0.816
------------------------------------
iter-2000, loss 1.055
lr 0.0014121476824050009
epoch-52, loss 0.315
------------------------------------
iter-2000, loss 1.544
Epoch 175091: reducing learning rate of group 0 to 1.2709e-03.
lr 0.0012709329141645008
epoch-53, loss 0.325
------------------------------------
iter-2000, loss 1.365
lr 0.0012709329141645008
epoch-54, loss 1.284
------------------------------------
iter-2000, loss 0.995
lr 0.0012709329141645008
epoch-55, loss 1.323
------------------------------------
iter-2000, loss 0.967
lr 0.0012709329141645008
epoch-56, loss 2.911
------------------------------------
Epoch 187092: reducing learning rate of group 0 to 1.1438e-03.
iter-2000, loss 1.062
lr 0.0011438396227480508
epoch-57, loss 1.025
------------------------------------
iter-2000, loss 1.914
lr 0.0011438396227480508
epoch-58, loss 3.303
------------------------------------
iter-2000, loss 1.121
lr 0.00114383