# Imports

In [1]:
import pandas as pd
import numpy as np
import torch
from torch.optim.lr_scheduler import ReduceLROnPlateau
import nltk
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import torch.utils.data as torch_data
import scipy
from sklearn.metrics import confusion_matrix

from nltk.stem import WordNetLemmatizer 

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

# from Utils.pytorch_utils import torch_net
from Utils.pytorch_utils import sparse_to_matrix #, accuracy_test

from Utils.NLP_utils import accuracy, find_senteces_with_lemma, get_wordnet_pos, load_and_lemmatize_data, load_processed_data

# pickle file, data set as readable json file, since original data set is a 'pseudo json', written in text file.
DATA_SET_FILE = r"datasets\News_Category_Dataset_v2_mod.pkl"
PROCESSED_DATA_SET = r"datasets\News_Category_Dataset_v2_mod_processed.pkl"

ALL_CATEGORIES = ['POLITICS', 'WELLNESS', 'ENTERTAINMENT', 'TRAVEL', 'STYLE & BEAUTY',
       'PARENTING', 'HEALTHY LIVING', 'QUEER VOICES', 'FOOD & DRINK',
       'BUSINESS', 'COMEDY', 'SPORTS', 'BLACK VOICES', 'HOME & LIVING',
       'PARENTS', 'THE WORLDPOST', 'WEDDINGS', 'WOMEN', 'IMPACT', 'DIVORCE',
       'CRIME', 'MEDIA', 'WEIRD NEWS', 'GREEN', 'WORLDPOST', 'RELIGION',
       'STYLE', 'SCIENCE', 'WORLD NEWS', 'TASTE', 'TECH', 'MONEY', 'ARTS',
       'FIFTY', 'GOOD NEWS', 'ARTS & CULTURE', 'ENVIRONMENT', 'COLLEGE',
       'LATINO VOICES', 'CULTURE & ARTS', 'EDUCATION']

# REQUIRED_CATEGORIES = ['RELIGION','SCIENCE', 'TASTE','PARENTING' , 'COLLEGE' ,'POLITICS' ]
REQUIRED_CATEGORIES = (np.array(ALL_CATEGORIES)[:10]).tolist()
print (REQUIRED_CATEGORIES)
NUM_CATEGORIES = len(REQUIRED_CATEGORIES)
    
    
CrossEntropyLoss = "CrossEntropyLoss"
MSELoss = "MSELoss"

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


['POLITICS', 'FOOD & DRINK', 'WEDDINGS', 'WORLDPOST', 'ARTS', 'EDUCATION']


# set loss type

In [2]:
loss_name = CrossEntropyLoss
# loss_name = MSELoss

# helper functions

In [3]:
def sparse_to_matrix(A):
    if type(A) in [scipy.sparse.csr.csr_matrix, torch.sparse]:
        print ("type of matrix onverted")
        return np.array(A.todense())
    return A

def spy_sparse2torch_sparse(data):
    """

    :param data: a scipy sparse csr matrix
    :return: a sparse torch tensor
    """
    samples=data.shape[0]
    features=data.shape[1]
    values=data.data
    coo_data=data.tocoo()
    indices=torch.LongTensor([coo_data.row,coo_data.col])
    t=torch.sparse.FloatTensor(indices,torch.from_numpy(values).float(),[samples,features])
    return t


In [4]:
def confusion_matrix_disp(y_pred,y_true,label = False):
    cm = confusion_matrix(y_pred,y_true)
    if not label:
        label = range(NUM_CATEGORIES)
    cm_pd = pd.DataFrame(cm,index = ["{}_P".format(i)  for i in label],columns = ["{}_T".format(i)  for i in label])
    return cm_pd

def accuracy_test(model, x, y, data_set_name = 'test',print_sample = False,top_n_guess = 1):
          
    if loss_name == MSELoss:
        truth = np.argmax(y, axis=-1)
    elif loss_name == CrossEntropyLoss:
        truth = y.ravel()
    
    model_result = model(torch.tensor(x, dtype=torch.float))
    top_guesses = torch.argsort( model_result , dim = -1, descending=True ).numpy() [:,:top_n_guess]
    # predicted   = torch.argmax ( model_result, dim=-1).numpy()
    true_predicted = (top_guesses == truth[:,None]).any(1)
    

    print (f"Accuracy (top {top_n_guess} guesses) - {data_set_name} = ",
           round( np.array(true_predicted).mean()* 100, 3 ),
           "%")
    if print_sample:
        ps = print_sample
        sample = pd.DataFrame((top_guesses[:ps],truth[:ps]),index=['predicted','truth'])
        print (sample)
        # print (pd.value_counts(predicted))
    print (f"\tConfusion Matrix {data_set_name}:\n",confusion_matrix_disp (top_guesses[:,0],truth))
    

## load data

In [5]:
# loading data

# dataset, headlines, headlines_orig = load_and_lemmatize_data(DATA_SET_FILE)
dataset, headlines, headlines_orig = load_processed_data(PROCESSED_DATA_SET)

# reduce dataset to n categories:

In [6]:
categories = dataset['category']
pd.value_counts(categories)

# filter data for desired categories, to make problem easier
filter_categories = True
if filter_categories:
    filter_index =  categories.isin(REQUIRED_CATEGORIES)
    dataset   = dataset[filter_index]
    headlines = np.array(headlines)[filter_index]
    headlines_orig = np.array(headlines_orig)[filter_index]
    
else:
    NUM_CATEGORIES = len(set(categories))
    


In [7]:
def categories_to_index(categories):
    d = {}
    for i, cat in enumerate(REQUIRED_CATEGORIES):
        d[cat] = i
        
    r = np.array(range(len(categories)))

    for cat,i in d.items():
        # print (cat,i)
        r[categories == cat ] = i
    return r

categories = dataset['category']
if loss_name == CrossEntropyLoss:  
    Y = categories_to_index(categories)[:,np.newaxis]
else:
    Y  = np.array(pd.get_dummies(categories)) 

In [8]:
# split data and lables to train/test

headlines_train, headlines_test,\
headlines_train_orig, headlines_test_orig,\
Y_train, Y_test,\
    = sklearn.model_selection.train_test_split(
    headlines,headlines_orig, Y, test_size = 0.3)

In [9]:
# extract features (Bag Of Words) using Vectorizer

max_features=2000

vectorizer = CountVectorizer
# vectorizer = TfidfVectorizer
matrix = vectorizer(max_features=max_features, ngram_range=(1, 2), max_df=0.1 ,min_df = 5)
matrix.fit(headlines_train)
X_train = matrix.transform(headlines_train)# .todense()
X_test = matrix.transform(headlines_test)# .todense()

# --- convert to data frame for display and debug ---
# tokens = matrix.get_feature_names()
# X_train= pd.DataFrame(X_train,columns=tokens)
# X_test= pd.DataFrame(X_test,columns=tokens)

assert X_train.shape[1]==max_features, X_train.shape[1]

In [10]:
(X_train, X_test) = [spy_sparse2torch_sparse(A) for A in (X_train, X_test)]

In [11]:
def print_learning_rate(optimizer):
        for param_group in optimizer.param_groups:
            print('lr', param_group['lr'])
    

# build Model

In [12]:
def torch_net(X_train, Y_train, X_test, Y_test,
              hidden_layers=[10], device=torch.device('cpu'), n_epoch=30, batch_size=17):
    
    dtype = torch.float
    # device = torch.device("cuda:0") # Uncomment this to run on GPU

    # N is batch size; D_in is input dimension;
    # H is hidden dimension; D_out is output dimension.
    N, D_in = X_train.shape
    
    if loss_name == MSELoss: 
        D_out = Y_train.shape[-1] 
    elif loss_name == CrossEntropyLoss: 
        D_out = NUM_CATEGORIES

    # Create random input and output data
    [X_train, Y_train, X_test, Y_test] = \
        [sparse_to_matrix(A) for A in[X_train, Y_train, X_test, Y_test]]

    X = torch.tensor(X_train, device=device, dtype=dtype)
    if loss_name == CrossEntropyLoss:
        y_dtype = torch.int64
    else:
        y_dtype = dtype
    Y = torch.tensor(Y_train, device=device, dtype=y_dtype)
#     print (Y)

    #create neural network net with multiple hidden layers with H dimetions:
    dims = [D_in, *hidden_layers, D_out]
    layers = []
    for dim_ind in range(len(dims)-2):
        layers.append(torch.nn.Linear(dims[dim_ind], dims[dim_ind+1]))
        layers.append(torch.nn.ReLU())
    layers.append(torch.nn.Dropout(0.5))
    layers.append(torch.nn.Linear(dims[-2], D_out))   
    
    model = torch.nn.Sequential(*layers)
    if loss_name == CrossEntropyLoss:
        weights = (np.power(1/pd.value_counts(Y.tolist(),normalize=True),1.2)).to_list()
        weights = torch.tensor(weights)
        loss_fn = torch.nn.CrossEntropyLoss(reduction='mean', weight=weights)     
    elif loss_name == MSELoss:
        loss_fn = torch.nn.MSELoss(reduction='mean') 

    # Use the optim package to define an Optimizer that will update the weights of
    # the model for us. Here we will use Adam; the optim package contains many other
    # optimization algoriths. The first argument to the Adam constructor tells the
    # optimizer which Tensors it should update.
    learning_rate = 0.005
    weight_decay = 0.001
    lr_decay = 0.9

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.9, patience=10000, verbose=True, threshold=0.0001,
                                  threshold_mode='rel', cooldown=2000, min_lr=1e-10, eps=1e-08)
  
    dataloader = torch_data.DataLoader(
        torch_data.TensorDataset(X.to_dense(), Y), 
        batch_size=batch_size,
        shuffle=True,
        # num_workers=4,
        )
#     print ("Start training:")
#     print ("loss name is: ",loss_name, " Using loss function: ", loss_fn)
#     accuracy_test(model, X_test, Y_test, data_set_name='test',print_sample = 10, top_n_guess= 1 )
    # accuracy_test(model, X_train, Y_train, data_set_name='train',print_sample = 0, top_n_guess= 3)
    
    epoch_lr = learning_rate
    
    
    return model, dataloader, optimizer, loss_fn, scheduler
    
print ("updated torch nn")

updated torch nn


In [13]:
def train_model(model, n_epoch, dataloader, optimizer, loss_fn, scheduler):
    enumerate(dataloader)
    i_iter =-1
    min_loss = 100.0

    for e in range(n_epoch):
        for t,(x_batch, y_batch) in enumerate(dataloader):
            i_iter +=1
            # Forward pass: compute predicted y by passing x to the model.
            y_pred = model(x_batch)

            # y_pred_soft  is need for MSE computation
            # y_pred_soft = torch.nn.functional.softmax(y_pred, dim = -1)
            # Compute and print loss.
            #  batch_class_weights = torch.ones((y_batch.dim()))
            # loss = loss_fn(y_pred_soft, y_batch, weight = batch_class_weights)
            # loss = loss_fn(y_pred_soft, y_batch)

            loss = loss_fn(y_pred, y_batch)
            if loss < min_loss:
                print (f"loss < min_loss, updating min_loss to {loss}, i_iter {i_iter}")
#                 accuracy_test(model, X_train, Y_train, data_set_name= 'train', print_sample = 0, top_n_guess= 1)
#                 accuracy_test(model, X_test, Y_test, data_set_name= 'test', print_sample=0, top_n_guess= 1)
                min_loss = loss
            if not ( (t +1) % 2000 ) :
                print(f"iter-{t+1}, loss {round(loss.item(),3)}")

            # Before the backward pass, use the optimizer object to zero all of the
            # gradients for the variables it will update (which are the learnable
            # weights of the model). This is because by default, gradients are
            # accumulated in buffers( i.e, not overwritten) whenever .backward()
            # is called. Checkout docs of torch.autograd.backward for more details.
            optimizer.zero_grad()

            # Backward pass: compute gradient of the loss with respect to model
            # parameters

            #  $$$ this command destroy exit() command $$$
            loss.backward()

            # Calling the step function on an Optimizer makes an update to its
            # parameters
            optimizer.step()

            if scheduler:
                scheduler.step(loss)


        if not ((e+1) %3): 
            accuracy_test(model, X_train, Y_train, data_set_name= 'train', print_sample = 0, top_n_guess= 1)
            accuracy_test(model, X_test, Y_test, data_set_name= 'test', print_sample=0, top_n_guess= 1)

#         epoch_lr = epoch_lr * lr_decay
        print_learning_rate(optimizer)
        print(f"epoch-{e+1}, loss {round(loss.item(),3)}")
        print("------------------------------------")
        # set_learning_rate(optimizer,epoch_lr)

    print ("DONE, returning model")
    return model, optimizer, loss, n_epoch


# Define model, optimizer, loss, dataloader, scheduler

In [14]:
# train model using pytorch
# model = torch_net(X_train, Y_train[:,0],X_test,Y_test,[100,50,50],epoch=1000)
model, dataloader, optimizer, loss_fn, scheduler =  torch_net(X_train, Y_train[:,0],X_test,Y_test,[100,70,50])



In [41]:
print ("Before training:")
print ("loss name is: ",loss_name, " Using loss function: ", loss_fn)
accuracy_test(model, X_test, Y_test, data_set_name='test',print_sample = 0, top_n_guess= 1 )
accuracy_test(model, X_train, Y_train, data_set_name='train',print_sample = 0, top_n_guess= 1)

n_epoch = 3
model, optimizer, loss, epoch = train_model(model, n_epoch, dataloader, optimizer, loss_fn, scheduler)

print ("After training:")
print ("loss name is: ",loss_name, " Using loss function: ", loss_fn)
accuracy_test(model, X_test, Y_test, data_set_name='test',print_sample = 0, top_n_guess= 1 )
accuracy_test(model, X_train, Y_train, data_set_name='train',print_sample = 0, top_n_guess= 1)

Before training:
loss name is:  CrossEntropyLoss  Using loss function:  CrossEntropyLoss()


  from ipykernel import kernelapp as app


Accuracy (top 1 guesses) - test =  38.85 %
                                                           0
predicted  [[13], [8], [0], [13], [3], [11], [9], [7], [9...
truth                         [2, 0, 0, 5, 3, 4, 2, 0, 9, 6]
	Confusion Matrix test:
        0_T   1_T   2_T  3_T  4_T  5_T  6_T  7_T  8_T  9_T  10_T  11_T  12_T  \
0_P   3688     7    54   93   24   65   20   82   25    6    13     5    10   
1_P    185  1797   125   80   52   33   51   18   54   41    19    22    48   
2_P     55     5    27   19    4    3   26    2    2    3     2     4     1   
3_P    765    80    94  572   33   30   85   45   32   17    77    19    16   
4_P    530    57    50   39  648   27   61   47   26   22    14    22    15   
5_P    575    18    16   29   16  537   25   21  276    9     8     1     7   
6_P    487    55   161  109   81   61  310   18   51   15    34    16    10   
7_P   1976    30    35  117  154   50   32  454   28   20    22     7    11   
8_P    336    62    40   37   32  194 

13_P   160  
Accuracy (top 1 guesses) - train =  35.382 %
	Confusion Matrix train:
        0_T   1_T   2_T   3_T   4_T  5_T  6_T   7_T  8_T  9_T  10_T  11_T  \
0_P   4098     2    17    33     7   14    6    21    1    1     3     1   
1_P    219  3873   113    97    32   49   34     9   33   22    14    52   
2_P    384   155  1204   275    34    8  160    10   26   41    44   246   
3_P   1839   281   278  1747    54   41  183    70   64   26   125    32   
4_P   2418   241   252   182  2042  111  339   237  118   79    47    54   
5_P    762    58    10    41     5  936   26    20  428    4     4     0   
6_P   1169   167   483   237   157  138  884    36  119   37    36    47   
7_P   6462    39    66   208   214   69   35  1188   29   10    27     7   
8_P   1541   187    72   112    55  899   93    69  668   26    20     9   
9_P    900   281   485   164   115   70  130    49   68  946    87    45   
10_P   762   187   160   503   127   62   96   141   45   51   918    21   
11_P

In [15]:
def save_model(epoch, model, optimizer, loss, scheduler, PATH):
    torch.save( {'epoch': 3,
                'model': model,
                'optimizer': optimizer,
                'loss': loss,
                'scheduler': scheduler,    
                },
                PATH)


# running n times, and saving after eveny m epochs

In [20]:
PATH_TEMPLATE = "saved_models\model_{}.pt"

epoch_per_iter = 10
iters = 10
for iter in range(iters):
    model, optimizer, loss, epoch = train_model(model, epoch_per_iter, dataloader, optimizer, loss_fn, scheduler)
    
    print ("After training:")
    accuracy_test(model, X_test, Y_test, data_set_name='test',print_sample = 0, top_n_guess= 1 )
    accuracy_test(model, X_train, Y_train, data_set_name='train',print_sample = 0, top_n_guess= 1)

    print ("saving the model")
    save_model(epoch, model, optimizer, loss, scheduler, PATH_TEMPLATE.format(iter))

loss < min_loss, updating min_loss to 0.14967256784439087, i_iter 0
loss < min_loss, updating min_loss to 0.12891797721385956, i_iter 27
loss < min_loss, updating min_loss to 0.06754700094461441, i_iter 33
loss < min_loss, updating min_loss to 0.028285078704357147, i_iter 45
lr 0.005
epoch-1, loss 0.5
------------------------------------
lr 0.005
epoch-2, loss 0.706
------------------------------------
DONE, returning model
After training:


  from ipykernel import kernelapp as app


Accuracy (top 1 guesses) - test =  64.193 %
	Confusion Matrix test:
       0_T   1_T  2_T  3_T  4_T  5_T
0_P  5702     4    1   18    5    1
1_P   233  1493   40   22   35   18
2_P   268    42  944    7   25   10
3_P  1872    50   11  535   64   19
4_P   743   218   57  115  295   41
5_P  1032    54   18   74   28  219
Accuracy (top 1 guesses) - train =  67.064 %
	Confusion Matrix train:
        0_T   1_T   2_T   3_T  4_T  5_T
0_P  13448     1     3    18    2    4
1_P    418  3716    75    31   54   13
2_P    643    57  2333    19   33    5
3_P   4206    79    29  1466   74   22
4_P   1688   437   112   196  856   75
5_P   2486    75    28    78   38  577
saving the model
loss < min_loss, updating min_loss to 0.37200048565864563, i_iter 0
loss < min_loss, updating min_loss to 0.15625405311584473, i_iter 13
loss < min_loss, updating min_loss to 0.0983743965625763, i_iter 41
loss < min_loss, updating min_loss to 0.0924290269613266, i_iter 77
loss < min_loss, updating min_loss to 0.07016

In [29]:
model1 = torch.load(PATH)

In [30]:
model1

{'epoch': 3, 'model': Sequential(
   (0): Linear(in_features=2000, out_features=100, bias=True)
   (1): ReLU()
   (2): Linear(in_features=100, out_features=70, bias=True)
   (3): ReLU()
   (4): Linear(in_features=70, out_features=50, bias=True)
   (5): ReLU()
   (6): Dropout(p=0.5, inplace=False)
   (7): Linear(in_features=50, out_features=14, bias=True)
 ), 'optimizer': Adam (
 Parameter Group 0
     amsgrad: False
     betas: (0.9, 0.999)
     eps: 1e-08
     lr: 0.005
     weight_decay: 0.001
 ), 'loss': tensor(3.2718, requires_grad=True), 'scheduler': <torch.optim.lr_scheduler.ReduceLROnPlateau at 0x1a294e50ef0>}

In [103]:
enumerate(torch_data.TensorDataset(X_train,torch.tensor(Y_train)))

<enumerate at 0x1a2c66fb3f0>