# Imports

In [2]:
import re

import nltk
import numpy as np
import pandas as pd
import scipy
import scipy.sparse
import sklearn
import torch
import torch.utils.data as torch_data
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Download the NLTK resources used by this notebook (the captured cell output
# shows these calls report "already up-to-date" when the data is present).
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

# from Utils.pytorch_utils import torch_net
# NOTE(review): sparse_to_matrix is defined again in the "helper functions"
# cell of this notebook; that later definition shadows this import.
from Utils.pytorch_utils import sparse_to_matrix #, accuracy_test

from Utils.NLP_utils import accuracy, find_senteces_with_lemma, get_wordnet_pos, load_and_lemmatize_data, load_processed_data

# Pickled copies of the data set. Per the original note, the source data set is
# a 'pseudo JSON' text file, so it was converted to a readable form and pickled.
# NOTE(review): these are absolute, machine-specific paths; adjust them before
# running the notebook on another machine.
DATA_SET_FILE = r"C:\Users\גורים\PycharmProjects\NLP_training\datasets\News_Category_Dataset_v2_mod.pkl"
PROCESSED_DATA_SET = r"C:\Users\גורים\PycharmProjects\NLP_training\datasets\News_Category_Dataset_v2_mod_processed.pkl"


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\אבינעם\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\אבינעם\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\אבינעם\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


# helper functions

In [3]:
def sparse_to_matrix(A):
    """Return ``A`` as a dense ``numpy.ndarray`` if it is a SciPy sparse matrix.

    Every SciPy sparse format (CSR, CSC, COO, ...) is densified, not only CSR,
    and the check no longer depends on the private ``scipy.sparse.csr`` module
    path.

    Parameters
    ----------
    A : object
        A SciPy sparse matrix/array, or anything else.

    Returns
    -------
    numpy.ndarray or object
        A dense array when ``A`` is sparse; otherwise ``A`` itself, unchanged.
    """
    if scipy.sparse.issparse(A):
        # toarray() yields a plain ndarray (todense() would give np.matrix).
        return A.toarray()
    return A


# neural network build

In [57]:
def torch_net(X_in, Y_in, X_test, Y_test,
              hidden_layers=(10,), device=torch.device('cpu'), epoch=30, batch_size=10,
              learning_rate=0.05, weight_decay=0.99, lr_decay=0.96,
              dropout=0.5, num_workers=4):
    """Train a fully connected classifier and report accuracy after every epoch.

    The network is ``Linear -> ReLU`` for each hidden layer, followed by
    ``Dropout`` and a final ``Linear`` output layer. It is trained with Adam on
    a summed MSE loss against ``Y_in``; the learning rate is multiplied by
    ``lr_decay`` after every epoch.

    Parameters
    ----------
    X_in, X_test : array-like or scipy sparse matrix
        Feature matrices, one row per sample (sparse inputs are densified).
    Y_in, Y_test : array-like
        Target matrices whose last axis has one column per class (the loss
        compares them directly with the network output, and accuracy takes the
        argmax over that axis).
    hidden_layers : sequence of int
        Sizes of the hidden layers, in order. An empty sequence gives a single
        linear layer (preceded by dropout).
    device : torch.device
        Device holding the training tensors and the model.
    epoch : int
        Number of passes over the training data.
    batch_size : int
        Mini-batch size.
    learning_rate, weight_decay : float
        Initial Adam learning rate and L2 weight decay.
        NOTE(review): the default ``weight_decay=0.99`` is kept for backward
        compatibility but is unusually large — confirm it is intended.
    lr_decay : float
        Per-epoch multiplicative learning-rate decay.
    dropout : float
        Dropout probability applied before the output layer.
    num_workers : int
        Worker processes used by the ``DataLoader``.

    Returns
    -------
    torch.nn.Sequential
        The trained model, left in evaluation mode (dropout disabled). Call
        ``model.train()`` before training it further.
    """

    def set_learning_rate(optimizer, lr):
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr

    def print_learning_rate(optimizer):
        for param_group in optimizer.param_groups:
            print(param_group['lr'])

    dtype = torch.float

    X_in, Y_in, X_test, Y_test = \
        [sparse_to_matrix(A) for A in (X_in, Y_in, X_test, Y_test)]

    # D_in is the input dimension; D_out is the output dimension.
    D_in = X_in.shape[-1]
    D_out = Y_in.shape[-1]

    X = torch.tensor(X_in, device=device, dtype=dtype)
    Y = torch.tensor(Y_in, device=device, dtype=dtype)

    # Build the network: one Linear+ReLU pair per hidden layer, then dropout
    # and the output layer.
    dims = [D_in, *hidden_layers, D_out]
    layers = []
    for fan_in, fan_out in zip(dims[:-2], dims[1:-1]):
        layers.append(torch.nn.Linear(fan_in, fan_out))
        layers.append(torch.nn.ReLU())
    layers.append(torch.nn.Dropout(dropout))
    layers.append(torch.nn.Linear(dims[-2], D_out))

    # The model must live on the same device as the training tensors.
    model = torch.nn.Sequential(*layers).to(device)
    loss_fn = torch.nn.MSELoss(reduction='sum')
    # loss_fn = torch.nn.CrossEntropyLoss()

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    dataloader = torch_data.DataLoader(
        torch_data.TensorDataset(X, Y), batch_size=batch_size,
        shuffle=True, num_workers=num_workers)
    print ("Start training:")

    epoch_lr = learning_rate

    for e in range(epoch):
        print(f"\t +++ epoch: {e+1} +++")
        # Re-enable dropout: the previous epoch ended in evaluation mode.
        model.train()
        for t, (x_batch, y_batch) in enumerate(dataloader):
            # Forward pass and loss.
            y_pred = model(x_batch)
            loss = loss_fn(y_pred, y_batch)
            if not ((t + 1) % 2000):
                print(f"iter-{t+1}, loss {round(loss.item(),3)}")

            # Gradients accumulate across backward() calls, so clear them first.
            optimizer.zero_grad()

            # NOTE (from the original author): this call reportedly interferes
            # with exit().
            loss.backward()
            optimizer.step()

        # Evaluate with dropout disabled and without building autograd graphs;
        # otherwise the reported accuracy is randomly perturbed by dropout.
        model.eval()
        with torch.no_grad():
            accuracy_test(model, X_test, Y_test, data_set_name='test', print_sample=16)
            accuracy_test(model, X_in, Y_in, data_set_name='train', print_sample=16)

        epoch_lr = epoch_lr * lr_decay
        set_learning_rate(optimizer, epoch_lr)
        print ("printing learning rate after update:")
        print_learning_rate(optimizer)

    print ("DONE, returning model")
    return model

print ("updated torch nn")

updated torch nn


In [5]:
def accuracy_test(model, x, y, data_set_name = 'test',print_sample = False):
    """Print (and return) the classification accuracy of ``model`` on ``x``/``y``.

    The prediction for each row is the argmax of the model output; the truth is
    the argmax of ``y`` along its last axis. The forward pass runs in
    evaluation mode (dropout off) with autograd disabled, on the device of the
    model's parameters; the model's previous train/eval mode is restored
    afterwards.

    Parameters
    ----------
    model : callable
        Usually a ``torch.nn.Module``; any callable mapping a float tensor to
        per-class scores is accepted.
    x : array-like
        Dense feature matrix, one row per sample.
    y : array-like
        Target matrix with one column per class.
    data_set_name : str
        Label used in the printed message.
    print_sample : int or bool
        When truthy, also print the first ``print_sample`` predicted/true
        labels and the distribution of predicted labels.

    Returns
    -------
    float
        Accuracy in percent, rounded to 3 decimals.
    """
    is_module = isinstance(model, torch.nn.Module)

    # Build the input on the model's device (CPU for parameter-free models).
    device = torch.device('cpu')
    if is_module:
        first_param = next(model.parameters(), None)
        if first_param is not None:
            device = first_param.device

    was_training = is_module and model.training
    if is_module:
        model.eval()
    try:
        with torch.no_grad():
            scores = model(torch.tensor(x, dtype=torch.float, device=device))
    finally:
        if is_module:
            model.train(was_training)

    predicted = torch.argmax(scores, dim=-1).cpu().numpy()
    truth = np.argmax(y, axis=-1)
    accuracy_pct = round(np.array(predicted == truth).mean() * 100, 3)
    print (f"Accuracy {data_set_name} = ",
           accuracy_pct,
           "%")
    if print_sample:
        ps = print_sample
        sample = pd.DataFrame((predicted[:ps], truth[:ps]), index=['predicted','truth'])
        print (sample)
        # Series.value_counts replaces the deprecated top-level pd.value_counts.
        print (pd.Series(predicted).value_counts())
    return accuracy_pct


## load data

In [46]:
# loading data

# Alternative (disabled): load and lemmatize the data set from DATA_SET_FILE.
# dataset, headlines, headlines_orig = load_and_lemmatize_data(DATA_SET_FILE)
# Load the already-processed data set from PROCESSED_DATA_SET.
# presumably `headlines` holds the lemmatized texts and `headlines_orig` the
# untouched ones, row-aligned with `dataset` — TODO confirm against Utils.NLP_utils
dataset, headlines, headlines_orig = load_processed_data(PROCESSED_DATA_SET)

In [47]:
# Keep only three categories to make the classification problem easier.
KEPT_CATEGORIES = ['RELIGION', 'SCIENCE', 'TASTE']

categories = dataset['category']

# Boolean row mask; the same mask is applied to the frame and to both headline
# arrays so the three stay row-aligned.
filter_index = categories.isin(KEPT_CATEGORIES)
row_mask = filter_index.to_numpy()
dataset   = dataset[row_mask]
headlines = np.array(headlines)[row_mask]
headlines_orig = np.array(headlines_orig)[row_mask]

assert len(headlines) == len(dataset), (len(headlines), len(dataset))
assert len(headlines_orig) == len(dataset), (len(headlines_orig), len(dataset))

In [51]:
# One-hot encode the category labels: one column per distinct category.
categories = dataset['category']
Y = pd.get_dummies(categories).to_numpy()
# Y = np.array(categories)


def categories_to_index(categories):
    """Encode category labels as integer class indices.

    Indices follow the sorted order of the distinct labels, so the mapping is
    identical on every run (iterating a ``set`` of strings is not).

    Parameters
    ----------
    categories : array-like
        One-dimensional collection of labels (NumPy array, pandas Series or
        list).

    Returns
    -------
    numpy.ndarray
        Integer array of the same length as ``categories``; element ``k`` is
        the index of ``categories[k]`` among the sorted distinct labels.
    """
    labels = np.asarray(categories)
    # return_inverse gives, for each element, its position in the sorted
    # array of unique labels.
    _, index = np.unique(labels, return_inverse=True)
    return index

# Y = categories_to_index(categories)

In [52]:
# split data and labels to train/test

# Fixed seed so the split, and every accuracy reported afterwards, is
# reproducible across kernel restarts.
SPLIT_SEED = 42

(headlines_train, headlines_test,
 headlines_train_orig, headlines_test_orig,
 Y_train, Y_test,
 cat_train, cat_test) = train_test_split(
    headlines, headlines_orig, Y, categories,
    test_size=0.3, random_state=SPLIT_SEED)

In [59]:
# extract features (Bag Of Words) using Vectorizer

max_features=1000

vectorizer = CountVectorizer
# vectorizer = TfidfVectorizer
matrix = vectorizer(max_features=max_features, ngram_range=(1, 2), max_df=0.1, min_df=5)
# Fit the vocabulary on the training headlines only, then reuse it for the
# test headlines so both matrices share the same columns.
X_train = matrix.fit_transform(headlines_train)  # sparse; densify with .toarray() if needed
X_test = matrix.transform(headlines_test)

# --- convert to data frame for display and debug ---
# tokens = matrix.get_feature_names_out()
# X_train= pd.DataFrame(X_train.toarray(),columns=tokens)
# X_test= pd.DataFrame(X_test.toarray(),columns=tokens)

# max_features is an upper bound: the vocabulary can be smaller when few terms
# survive the min_df / max_df filters.
assert X_train.shape[1] <= max_features, X_train.shape[1]
assert X_test.shape[1] == X_train.shape[1], (X_test.shape[1], X_train.shape[1])

# Train the model

In [None]:
# Train the PyTorch network on the bag-of-words features:
# a single hidden layer of 50 units, 1000 epochs.
# (earlier run: torch_net(X_train, Y_train.values, X_test, Y_test.values, [50], epoch=100))
model = torch_net(X_train, Y_train, X_test, Y_test,
                  hidden_layers=[50], epoch=1000)


Start training:
	 +++ epoch: 1 +++
Accuracy test =  40.849 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   0   0   2   1   0   1   1   2   1   1   2   1   1   2   1   2
truth       1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
2    1071
1     830
0     148
dtype: int64
Accuracy train =  40.786 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   1   1   1   1   1   1   1   2   2   2   2   2   2   1   1   1
truth       0   0   1   0   1   0   0   0   0   0   1   2   0   2   1   0
2    2409
1    1985
0     387
dtype: int64
printing learning rate after update:
0.048
	 +++ epoch: 2 +++
Accuracy test =  43.387 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   1   0   0   2   0   0   0   0   0   0   0   0   0   0   0   0
truth       1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
0    1658
2     252
1     139
dtype: int64
Accuracy train =  43.38 %
   

Accuracy test =  41.679 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   0   0   0   1   0   0   0   1   1   1   0   1   1   2   0   0
truth       1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
0    1147
1     839
2      63
dtype: int64
Accuracy train =  41.205 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   0   0   0   1   0   0   1   1   1   1   0   1   0   1   0   0
truth       0   0   1   0   1   0   0   0   0   0   1   2   0   2   1   0
0    2716
1    1937
2     128
dtype: int64
printing learning rate after update:
0.028233665617755668
	 +++ epoch: 15 +++
Accuracy test =  36.945 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
truth       1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
0    2048
2       1
dtype: int64
Accuracy train =  37.524 %
           0   1   2   3   4   

Accuracy test =  43.973 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   1   1   1   2   1   1   1   1   1   1   1   1   1   1   1   1
truth       1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
1    1611
0     277
2     161
dtype: int64
Accuracy train =  43.903 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   0   0   1   0   1   1   1   1   0   1   1   2   1   1   1   1
truth       0   0   1   0   1   0   0   0   0   0   1   2   0   2   1   0
1    3686
0     673
2     422
dtype: int64
printing learning rate after update:
0.016607080712817468
	 +++ epoch: 28 +++
Accuracy test =  32.699 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
truth       1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
1    2042
2       7
dtype: int64
Accuracy train =  32.274 %
           0   1   2   3   4   

Accuracy test =  52.562 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   0   0   1   1   1   0   0   0   1   1   2   0   1   1   0   1
truth       1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
0    1238
1     537
2     274
dtype: int64
Accuracy train =  54.758 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   0   0   0   0   1   0   0   1   1   0   0   0   0   2   1   0
truth       0   0   1   0   1   0   0   0   0   0   1   2   0   2   1   0
0    2904
1    1221
2     656
dtype: int64
printing learning rate after update:
0.009768307577765997
	 +++ epoch: 41 +++
Accuracy test =  48.951 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   2   0   0   2   0   2   2   2   0   0   0   0   0   2   0   0
truth       1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
0    1123
2     848
1      78
dtype: int64
Accuracy train =  50.094 %
           0   1   2 

Accuracy test =  58.809 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   0   0   0   0   1   0   0   0   1   1   2   1   1   0   0   0
truth       1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
0    1374
2     371
1     304
dtype: int64
Accuracy train =  60.197 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   0   0   0   2   1   0   0   0   0   0   0   2   0   0   1   0
truth       0   0   1   0   1   0   0   0   0   0   1   2   0   2   1   0
0    3269
2     816
1     696
dtype: int64
printing learning rate after update:
0.005745731870875706
	 +++ epoch: 54 +++
Accuracy test =  61.689 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   1   0   0   2   1   2   0   2   1   1   2   1   1   1   0   0
truth       1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
2    720
0    667
1    662
dtype: int64
Accuracy train =  61.263 %
           0   1   2   3

Accuracy test =  65.203 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   0   0   0   2   1   1   1   2   0   1   2   1   0   2   0   0
truth       1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
0    874
1    651
2    524
dtype: int64
Accuracy train =  65.928 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   0   0   0   2   1   0   1   0   0   0   0   1   0   2   1   0
truth       0   0   1   0   1   0   0   0   0   0   1   2   0   2   1   0
0    2169
1    1426
2    1186
dtype: int64
printing learning rate after update:
0.0033796473410747157
	 +++ epoch: 67 +++
Accuracy test =  63.982 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   0   0   0   2   1   0   1   2   1   0   2   0   0   0   0   0
truth       1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
0    1192
2     588
1     269
dtype: int64
Accuracy train =  64.171 %
           0   1   2   

Accuracy test =  66.715 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   1   0   0   2   1   2   0   2   0   1   2   1   1   2   0   0
truth       1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
0    1124
2     603
1     322
dtype: int64
Accuracy train =  69.211 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   0   0   0   0   1   0   0   0   0   0   0   2   0   2   1   0
truth       0   0   1   0   1   0   0   0   0   0   1   2   0   2   1   0
0    2624
2    1381
1     776
dtype: int64
printing learning rate after update:
0.0019879131861216784
	 +++ epoch: 80 +++
Accuracy test =  68.668 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   1   0   0   2   1   0   0   2   1   1   1   0   1   1   0   0
truth       1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
0    1011
1     628
2     410
dtype: int64
Accuracy train =  70.383 %
           0   1   2

Accuracy test =  72.914 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   1   0   0   2   1   2   2   2   1   1   2   1   1   2   0   1
truth       1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
2    761
1    660
0    628
dtype: int64
Accuracy train =  74.19 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   0   0   0   2   1   0   0   0   0   0   1   2   0   2   1   0
truth       0   0   1   0   1   0   0   0   0   0   1   2   0   2   1   0
2    1758
0    1556
1    1467
dtype: int64
printing learning rate after update:
0.001169293253626807
	 +++ epoch: 93 +++
Accuracy test =  73.206 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   1   0   0   2   1   2   0   2   1   1   2   1   1   1   0   0
truth       1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
0    859
1    599
2    591
dtype: int64
Accuracy train =  75.193 %
           0   1   2   3   4

Accuracy test =  74.134 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   1   0   0   2   1   0   1   2   1   1   2   1   1   1   0   0
truth       1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
0    893
1    665
2    491
dtype: int64
Accuracy train =  76.072 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   0   0   0   0   1   0   0   0   0   0   0   2   0   0   1   0
truth       0   0   1   0   1   0   0   0   0   0   1   2   0   2   1   0
0    2095
1    1507
2    1179
dtype: int64
printing learning rate after update:
0.0006877798902499341
	 +++ epoch: 106 +++
Accuracy test =  74.183 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   0   0   0   2   1   0   0   2   1   1   2   1   1   1   0   0
truth       1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
0    911
2    659
1    479
dtype: int64
Accuracy train =  76.93 %
           0   1   2   3  

Accuracy test =  72.426 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   0   0   0   2   1   0   0   2   0   1   2   0   1   2   0   0
truth       1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
0    1126
2     544
1     379
dtype: int64
Accuracy train =  73.499 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   0   0   0   0   1   0   0   0   0   0   0   2   0   0   1   0
truth       0   0   1   0   1   0   0   0   0   0   1   2   0   2   1   0
0    2651
2    1219
1     911
dtype: int64
printing learning rate after update:
0.0004045530716652777
	 +++ epoch: 119 +++
Accuracy test =  75.451 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   0   0   0   2   1   0   0   2   1   1   2   1   1   2   0   0
truth       1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
0    959
2    599
1    491
dtype: int64
Accuracy train =  76.323 %
           0   1   2  

Accuracy test =  76.672 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   1   0   0   2   1   2   1   2   1   1   2   0   1   2   0   0
truth       1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
0    975
2    551
1    523
dtype: int64
Accuracy train =  77.16 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   0   0   0   0   1   0   0   0   0   0   0   2   0   0   1   0
truth       0   0   1   0   1   0   0   0   0   0   1   2   0   2   1   0
0    2292
2    1275
1    1214
dtype: int64
printing learning rate after update:
0.00023795866979236244
	 +++ epoch: 132 +++
Accuracy test =  74.963 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   1   0   0   2   1   2   0   2   1   1   2   1   1   2   0   0
truth       1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
0    980
2    564
1    505
dtype: int64
Accuracy train =  76.846 %
           0   1   2   3 

Accuracy test =  74.963 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   1   0   0   2   1   2   1   2   1   1   2   1   1   2   0   0
truth       1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
0    1011
2     535
1     503
dtype: int64
Accuracy train =  76.72 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   0   0   0   0   1   0   0   0   0   0   1   2   0   0   1   0
truth       0   0   1   0   1   0   0   0   0   0   1   2   0   2   1   0
0    2382
2    1227
1    1172
dtype: int64
printing learning rate after update:
0.00013996761487007286
	 +++ epoch: 145 +++
Accuracy test =  75.305 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   1   0   0   2   1   2   0   2   1   1   2   1   1   2   0   0
truth       1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
0    969
2    578
1    502
dtype: int64
Accuracy train =  77.348 %
           0   1   2  

Accuracy test =  76.574 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   1   0   0   2   1   0   0   2   1   1   2   1   1   2   0   0
truth       1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
0    968
2    581
1    500
dtype: int64
Accuracy train =  76.888 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   0   0   0   0   1   0   0   0   0   0   0   0   0   0   1   1
truth       0   0   1   0   1   0   0   0   0   0   1   2   0   2   1   0
0    2355
2    1272
1    1154
dtype: int64
printing learning rate after update:
8.232914240742589e-05
	 +++ epoch: 158 +++
Accuracy test =  75.744 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   1   0   0   2   1   0   0   2   1   1   2   0   1   2   0   0
truth       1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
0    1007
2     564
1     478
dtype: int64
Accuracy train =  77.139 %
           0   1   2  

Accuracy test =  75.403 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   1   0   0   2   1   0   0   2   1   1   2   1   1   1   0   0
truth       1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
0    1022
2     562
1     465
dtype: int64
Accuracy train =  76.637 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   0   0   0   2   1   0   0   0   0   0   0   2   0   0   1   1
truth       0   0   1   0   1   0   0   0   0   0   1   2   0   2   1   0
0    2393
2    1303
1    1085
dtype: int64
printing learning rate after update:
4.842611411099686e-05
	 +++ epoch: 171 +++
Accuracy test =  76.672 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   1   0   0   2   1   2   0   2   1   1   2   0   1   2   0   0
truth       1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
0    1005
2     563
1     481
dtype: int64
Accuracy train =  77.055 %
           0   1   

Accuracy test =  75.159 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   1   0   0   2   1   0   0   2   1   1   2   1   1   2   0   0
truth       1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
0    1025
2     550
1     474
dtype: int64
Accuracy train =  77.557 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   0   0   0   0   1   0   0   0   0   0   0   2   0   0   1   0
truth       0   0   1   0   1   0   0   0   0   0   1   2   0   2   1   0
0    2376
2    1292
1    1113
dtype: int64
printing learning rate after update:
2.84843065203576e-05
	 +++ epoch: 184 +++
Accuracy test =  76.184 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   1   0   0   2   1   0   0   2   1   1   2   1   1   2   0   0
truth       1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
0    989
2    575
1    485
dtype: int64
Accuracy train =  77.787 %
           0   1   2   

Accuracy test =  75.695 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   1   0   0   2   1   2   0   2   1   1   2   1   1   1   0   0
truth       1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
0    1032
2     547
1     470
dtype: int64
Accuracy train =  77.034 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   0   0   0   0   1   0   0   0   0   0   0   2   0   0   1   0
truth       0   0   1   0   1   0   0   0   0   0   1   2   0   2   1   0
0    2392
2    1282
1    1107
dtype: int64
printing learning rate after update:
1.675450803436321e-05
	 +++ epoch: 197 +++
Accuracy test =  74.378 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   1   0   0   2   1   2   1   2   1   1   2   1   1   1   0   0
truth       1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
0    1027
2     547
1     475
dtype: int64
Accuracy train =  76.95 %
           0   1   2

Accuracy test =  75.256 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   1   0   0   2   1   2   0   2   1   1   2   1   1   2   0   0
truth       1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
0    1023
2     549
1     477
dtype: int64
Accuracy train =  77.348 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   0   0   0   2   1   0   0   0   0   0   1   2   0   0   1   0
truth       0   0   1   0   1   0   0   0   0   0   1   2   0   2   1   0
0    2387
2    1275
1    1119
dtype: int64
printing learning rate after update:
9.855024529837742e-06
	 +++ epoch: 210 +++
Accuracy test =  75.744 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   1   0   0   2   1   1   0   2   1   1   2   1   1   1   0   0
truth       1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
0    1012
2     547
1     490
dtype: int64
Accuracy train =  77.39 %
           0   1   2

Accuracy test =  75.354 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   1   0   0   2   1   0   0   2   1   1   2   1   1   1   0   0
truth       1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
0    1021
2     556
1     472
dtype: int64
Accuracy train =  77.369 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   0   0   0   2   1   0   0   0   0   0   1   2   0   2   1   0
truth       0   0   1   0   1   0   0   0   0   0   1   2   0   2   1   0
0    2364
2    1301
1    1116
dtype: int64
printing learning rate after update:
5.796738900629554e-06
	 +++ epoch: 223 +++
Accuracy test =  76.428 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
truth       1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
0    999
2    570
1    480
dtype: int64
Accuracy train =  77.034 %
           0   1   2  

Accuracy test =  75.793 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   1   0   0   2   1   0   0   2   1   1   2   1   1   0   0   0
truth       1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
0    1027
2     553
1     469
dtype: int64
Accuracy train =  77.473 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   0   0   0   0   1   0   0   0   0   0   0   2   0   0   1   0
truth       0   0   1   0   1   0   0   0   0   0   1   2   0   2   1   0
0    2388
2    1301
1    1092
dtype: int64
printing learning rate after update:
3.4096497457044043e-06
	 +++ epoch: 236 +++
Accuracy test =  75.061 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   1   0   0   2   1   0   0   2   1   1   2   0   1   2   0   0
truth       1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
0    1033
2     539
1     477
dtype: int64
Accuracy train =  76.888 %
           0   1  

Accuracy test =  75.354 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   1   0   0   2   1   0   0   2   1   1   2   1   1   2   0   0
truth       1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
0    1029
2     547
1     473
dtype: int64
Accuracy train =  77.431 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   0   0   0   2   1   0   0   0   0   0   1   2   0   0   1   0
truth       0   0   1   0   1   0   0   0   0   0   1   2   0   2   1   0
0    2372
2    1295
1    1114
dtype: int64
printing learning rate after update:
2.005560641539246e-06
	 +++ epoch: 249 +++
Accuracy test =  75.988 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   1   0   0   0   1   2   0   2   1   1   2   1   1   2   0   0
truth       1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
0    1005
2     562
1     482
dtype: int64
Accuracy train =  77.327 %
           0   1   

Accuracy test =  74.573 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   1   0   0   2   1   0   1   2   1   1   2   1   1   1   0   0
truth       1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
0    998
2    573
1    478
dtype: int64
Accuracy train =  77.055 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   0   0   0   2   1   0   0   0   0   0   0   2   0   0   1   0
truth       0   0   1   0   1   0   0   0   0   0   1   2   0   2   1   0
0    2397
2    1291
1    1093
dtype: int64
printing learning rate after update:
1.1796735110281374e-06
	 +++ epoch: 262 +++
Accuracy test =  75.256 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   1   0   0   2   1   2   0   2   1   1   2   0   1   2   0   0
truth       1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
0    1010
2     562
1     477
dtype: int64
Accuracy train =  77.16 %
           0   1   2  

Accuracy test =  76.086 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   1   0   0   2   1   0   0   2   1   1   2   1   1   2   0   0
truth       1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
0    997
2    573
1    479
dtype: int64
Accuracy train =  77.348 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   0   0   1   2   1   0   0   0   0   0   1   2   0   0   1   0
truth       0   0   1   0   1   0   0   0   0   0   1   2   0   2   1   0
0    2342
2    1310
1    1129
dtype: int64
printing learning rate after update:
6.938855718435883e-07
	 +++ epoch: 275 +++
Accuracy test =  75.403 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   1   0   0   2   1   0   0   2   1   1   2   0   1   1   0   0
truth       1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
0    1000
2     553
1     496
dtype: int64
Accuracy train =  77.076 %
           0   1   2  

Accuracy test =  75.744 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   1   0   0   2   1   0   0   2   1   1   2   1   1   0   0   0
truth       1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
0    994
2    574
1    481
dtype: int64
Accuracy train =  77.703 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   0   0   0   2   1   0   0   0   0   0   0   2   0   0   1   0
truth       0   0   1   0   1   0   0   0   0   0   1   2   0   2   1   0
0    2377
2    1291
1    1113
dtype: int64
printing learning rate after update:
4.081444419253551e-07
	 +++ epoch: 288 +++
Accuracy test =  76.086 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   1   0   0   2   1   0   1   2   1   1   2   1   1   1   0   0
truth       1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
0    997
2    563
1    489
dtype: int64
Accuracy train =  76.95 %
           0   1   2   3  

Accuracy test =  75.744 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   1   0   0   2   1   0   0   2   1   1   2   1   1   2   0   0
truth       1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
0    1002
2     559
1     488
dtype: int64
Accuracy train =  76.658 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   0   0   0   0   1   0   0   0   0   0   0   2   0   0   1   0
truth       0   0   1   0   1   0   0   0   0   0   1   2   0   2   1   0
0    2375
2    1289
1    1117
dtype: int64
printing learning rate after update:
2.4007111868887446e-07
	 +++ epoch: 301 +++
Accuracy test =  76.574 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   1   0   0   2   1   0   0   2   1   1   2   1   1   2   0   0
truth       1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
0    1001
2     562
1     486
dtype: int64
Accuracy train =  77.536 %
           0   1  

Accuracy test =  74.671 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
truth       1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
0    1031
2     551
1     467
dtype: int64
Accuracy train =  77.118 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   0   0   0   0   1   0   0   0   0   0   0   2   0   0   1   0
truth       0   0   1   0   1   0   0   0   0   0   1   2   0   2   1   0
0    2371
2    1277
1    1133
dtype: int64
printing learning rate after update:
1.4121016019879612e-07
	 +++ epoch: 314 +++
Accuracy test =  75.842 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   0   0   0   2   1   0   0   0   1   1   2   0   1   2   0   0
truth       1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
0    1021
2     562
1     466
dtype: int64
Accuracy train =  76.909 %
           0   1  

Accuracy test =  75.403 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
truth       1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
0    1018
2     556
1     475
dtype: int64
Accuracy train =  77.557 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   0   0   0   0   1   0   0   0   0   0   1   2   0   0   1   0
truth       0   0   1   0   1   0   0   0   0   0   1   2   0   2   1   0
0    2381
2    1285
1    1115
dtype: int64
printing learning rate after update:
8.306000926838583e-08
	 +++ epoch: 327 +++
Accuracy test =  75.403 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   0   0   0   2   1   0   0   2   1   1   2   1   1   2   0   0
truth       1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
0    1004
2     561
1     484
dtype: int64
Accuracy train =  77.16 %
           0   1   2

Accuracy test =  74.817 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   1   0   0   2   1   0   0   2   1   1   2   1   1   2   0   0
truth       1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
0    1013
2     546
1     490
dtype: int64
Accuracy train =  76.448 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   0   0   0   0   1   0   0   0   0   0   1   2   0   0   1   0
truth       0   0   1   0   1   0   0   0   0   0   1   2   0   2   1   0
0    2352
2    1287
1    1142
dtype: int64
printing learning rate after update:
4.885601099773525e-08
	 +++ epoch: 340 +++
Accuracy test =  75.403 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   1   0   0   2   1   0   1   2   1   1   2   1   1   1   0   0
truth       1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
0    1031
2     547
1     471
dtype: int64
Accuracy train =  76.386 %
           0   1   

Accuracy test =  75.451 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   1   0   0   2   1   0   0   2   1   1   2   0   1   2   0   0
truth       1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
0    1021
2     563
1     465
dtype: int64
Accuracy train =  76.616 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   0   0   0   0   1   0   0   0   0   0   0   2   0   0   1   0
truth       0   0   1   0   1   0   0   0   0   0   1   2   0   2   1   0
0    2388
2    1294
1    1099
dtype: int64
printing learning rate after update:
2.873717245682189e-08
	 +++ epoch: 353 +++
Accuracy test =  76.33 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   1   0   0   2   1   0   0   2   1   1   2   1   1   1   0   0
truth       1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
0    990
2    567
1    492
dtype: int64
Accuracy train =  77.055 %
           0   1   2   

Accuracy test =  74.817 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   1   0   0   2   1   2   0   2   1   1   2   1   1   0   0   0
truth       1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
0    993
2    566
1    490
dtype: int64
Accuracy train =  76.762 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   0   0   0   2   1   0   0   0   0   0   0   2   0   0   1   0
truth       0   0   1   0   1   0   0   0   0   0   1   2   0   2   1   0
0    2365
2    1279
1    1137
dtype: int64
printing learning rate after update:
1.6903244123868488e-08
	 +++ epoch: 366 +++
Accuracy test =  75.256 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   1   0   0   2   1   2   0   2   1   1   2   0   1   2   0   0
truth       1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
0    992
2    576
1    481
dtype: int64
Accuracy train =  76.679 %
           0   1   2   3

Accuracy test =  74.183 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   0   0   0   2   1   0   1   2   1   1   2   1   1   1   0   0
truth       1   0   0   2   1   0   1   2   1   1   2   1   1   2   0   0
0    1016
2     549
1     484
dtype: int64
Accuracy train =  77.327 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   0   0   0   0   1   0   0   0   0   0   0   2   0   0   1   0
truth       0   0   1   0   1   0   0   0   0   0   1   2   0   2   1   0
0    2358
2    1305
1    1118
dtype: int64
printing learning rate after update:
9.942511301012424e-09
	 +++ epoch: 379 +++


In [38]:
# Display the `categories` labels; the output below is pandas' truncated preview of the long Series.
categories

0                  CRIME
1          ENTERTAINMENT
2          ENTERTAINMENT
3          ENTERTAINMENT
4          ENTERTAINMENT
5          ENTERTAINMENT
6          ENTERTAINMENT
7          ENTERTAINMENT
8          ENTERTAINMENT
9          ENTERTAINMENT
10         ENTERTAINMENT
11            WORLD NEWS
12                IMPACT
13              POLITICS
14              POLITICS
15              POLITICS
16              POLITICS
17              POLITICS
18              POLITICS
19              POLITICS
20            WEIRD NEWS
21         ENTERTAINMENT
22            WEIRD NEWS
23            WORLD NEWS
24            WORLD NEWS
25            WORLD NEWS
26            WORLD NEWS
27          BLACK VOICES
28          BLACK VOICES
29          BLACK VOICES
               ...      
200823      QUEER VOICES
200824      QUEER VOICES
200825            IMPACT
200826            IMPACT
200827            IMPACT
200828       ENVIRONMENT
200829       ENVIRONMENT
200830       ENVIRONMENT
200831       ENVIRONMENT


In [None]:
# find frequency of lemma per category
# Column labels for the term matrix (presumably `matrix` is the fitted vectorizer — confirm).
tokens = matrix.get_feature_names()
# Densify only while X_train is still sparse. The unconditional `.todense()` made this
# cell fail on a second run, because by then X_train has already been replaced by a
# DataFrame (which has no `.todense()`). `.toarray()` also yields a plain ndarray
# rather than the legacy np.matrix type.
if scipy.sparse.issparse(X_train):
    X_train = pd.DataFrame(X_train.toarray(), columns=tokens)

In [None]:
# Attach a category label to every row of X_train, then group the rows by that label.
# NOTE(review): assigning a Series to a DataFrame column aligns on index, not on position —
# confirm that `categories` shares X_train's row index, otherwise rows get a wrong label or NaN.
# NOTE(review): this adds a non-feature column to X_train; verify that no later cell fits a
# model on X_train after this cell has run.
X_train['LABELED_CATEGORIES'] = categories
X_gb = X_train.groupby('LABELED_CATEGORIES')

In [69]:
# Score `model` on the test split.
# NOTE(review): `model` is not bound in the cells shown here (the regression cell binds
# `linear_model`) — confirm which estimator this refers to.
model.score(X_test,Y_test)

0.25847805599662493

In [None]:
# Sum every column within each LABELED_CATEGORIES group
# (presumably the per-category token totals — confirm what X_train holds at this point).
X_gb.sum()

In [168]:
# Print the tokens present in the current vocabulary but absent from `tokens_100`.
# NOTE(review): `tokens_100` is not defined in the cells shown here — presumably the vocabulary
# from a different vectorizer setting; confirm. The printed set has no guaranteed order.
tokens_01_no_digits = matrix.get_feature_names()
print ( set(tokens_01_no_digits)-set(tokens_100))




In [68]:
# train model using linear regression

linear_model   = sklearn.linear_model.LinearRegression()
# n caps how many leading training rows are used; None means "use every row".
# The previous value of -1 turned the slices into X_train[:-1] / Y_train[:-1],
# which silently dropped the last training sample.
n = None
linear_model.fit(X_train[:n],Y_train[:n])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [19]:
# Predict on the test split and measure accuracy on at most the first n samples.
# NOTE(review): `accuracy` is the project helper imported from Utils.NLP_utils; judging by the
# output below it prints the predictions and the score and returns the score — confirm.
p_test = model.predict(X_test)
n = 50000  # cap on the number of test rows that are scored
accuracy(p_test[:n],Y_test.values[:n])

[38 34 10 ... 22 10 22]
accuracy is 0.079668422


0.079668422

In [226]:
# Map each row's arg-max column index in `p` to the matching column name of Y, then show the
# relative frequency of each resulting category.
# NOTE(review): assumes the columns of `p` are in the same order as Y.columns — confirm.
categories_names = Y.columns
p_cat = pd.Series([categories_names[i] for i in p.argmax(axis = 1) ])
p_cat.value_counts(normalize = True)

POLITICS          0.291940
WELLNESS          0.237761
ENTERTAINMENT     0.135637
STYLE & BEAUTY    0.067890
PARENTING         0.055722
TRAVEL            0.043594
CRIME             0.016360
FOOD & DRINK      0.015892
QUEER VOICES      0.015061
WEDDINGS          0.013707
HOME & LIVING     0.012800
WOMEN             0.012522
DIVORCE           0.010236
COMEDY            0.009390
SPORTS            0.009251
BLACK VOICES      0.008389
MEDIA             0.007000
BUSINESS          0.006482
THE WORLDPOST     0.004874
GREEN             0.004122
COLLEGE           0.003376
TECH              0.002838
ARTS              0.002405
EDUCATION         0.001787
WORLD NEWS        0.001509
SCIENCE           0.001449
PARENTS           0.001394
IMPACT            0.000951
WEIRD NEWS        0.000831
ARTS & CULTURE    0.000772
HEALTHY LIVING    0.000692
WORLDPOST         0.000692
RELIGION          0.000677
MONEY             0.000632
FIFTY             0.000553
GOOD NEWS         0.000488
STYLE             0.000239
T

In [227]:
# Share of each category among the rows of `dataset`.
dataset['category'].value_counts(normalize=True)

POLITICS          0.163000
WELLNESS          0.088756
ENTERTAINMENT     0.079949
TRAVEL            0.049225
STYLE & BEAUTY    0.048040
PARENTING         0.043201
HEALTHY LIVING    0.033328
QUEER VOICES      0.031436
FOOD & DRINK      0.030998
BUSINESS          0.029559
COMEDY            0.025765
SPORTS            0.024316
BLACK VOICES      0.022544
HOME & LIVING     0.020886
PARENTS           0.019691
THE WORLDPOST     0.018242
WEDDINGS          0.018177
WOMEN             0.017376
IMPACT            0.017222
DIVORCE           0.017057
CRIME             0.016953
MEDIA             0.014015
WEIRD NEWS        0.013293
GREEN             0.013054
WORLDPOST         0.012840
RELIGION          0.012726
STYLE             0.011222
SCIENCE           0.010844
WORLD NEWS        0.010839
TASTE             0.010435
TECH              0.010366
MONEY             0.008499
ARTS              0.007513
FIFTY             0.006975
GOOD NEWS         0.006960
ARTS & CULTURE    0.006667
ENVIRONMENT       0.006587
C

In [15]:
# KNN
# Build a 3-nearest-neighbours classifier and fit it on the training features and labels.
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=3).fit(X_train, cat_train)  # fit() returns the estimator
print("fit complete")
# print (neigh.score(X_test,cat_test))

fit complete


In [49]:
n = 100  # number of test rows to preview
# Build the three preview sequences once and reuse them. Previously
# neigh.predict(X_test[:n]) (and both slices) were evaluated twice: once for the
# type print and once for the table, doubling the prediction cost.
preview_columns = (
    neigh.predict(X_test[:n]),     # classifier output for the first n test rows
    cat_test[:n],                  # matching entries of cat_test
    np.array(headlines_test[:n]),  # matching headlines
)
print ([type(t) for t in preview_columns])
# Transpose so each sample is a row; columns 0/1/2 follow the tuple order above.
pd.DataFrame(preview_columns).T

[<class 'numpy.ndarray'>, <class 'pandas.core.series.Series'>, <class 'numpy.ndarray'>]


Unnamed: 0,0,1,2
0,POLITICS,WOMEN,how rape culture and racism combine to hurt as...
1,BUSINESS,TRAVEL,turkey season travel deal be all stuff and no ...
2,QUEER VOICES,HOME & LIVING,"this oscar season , we 're see red"
3,HEALTHY LIVING,HEALTHY LIVING,"for target cancer treatment , just upload your..."
4,ENTERTAINMENT,PARENTS,13 must-reads for blend family
5,FOOD & DRINK,RELIGION,god and the battle over woman 's body ( all to...
6,PARENTS,WELLNESS,acupuncturists spill : the 12 health tip they ...
7,COMEDY,WEDDINGS,drunk groom caught cheat on his bride during r...
8,FOOD & DRINK,BUSINESS,welfare limit left poor adrift a recession hit
9,FOOD & DRINK,ENTERTAINMENT,tom hank will play mr. rogers in 'you be my fr...
