In [9]:
import pandas as pd
import numpy as np
import torch
import nltk
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import torch.utils.data as torch_data

from nltk.stem import WordNetLemmatizer 

# Fetch the NLTK resource packages this notebook relies on (no-op when they are already present).
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

# torch_net and accuracy_test are defined further down in this notebook, so only
# sparse_to_matrix is taken from the project helper module.
# from Utils.pytorch_utils import torch_net
from Utils.pytorch_utils import sparse_to_matrix #, accuracy_test

from Utils.NLP_utils import accuracy, find_senteces_with_lemma, get_wordnet_pos, load_and_lemmatize_data, load_processed_data

# Pickled copies of the data set. The original data set is a "pseudo JSON" text file;
# presumably these pickles hold it in a directly loadable form (raw and pre-processed) - TODO confirm.
# NOTE(review): these are absolute, machine-specific paths; the notebook will not run on
# another machine until they are replaced by a configurable data directory.
DATA_SET_FILE = r"C:\Users\גורים\PycharmProjects\NLP_training\datasets\News_Category_Dataset_v2_mod.pkl"
PROCESSED_DATA_SET = r"C:\Users\גורים\PycharmProjects\NLP_training\datasets\News_Category_Dataset_v2_mod_processed.pkl"


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\אבינעם\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\אבינעם\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\אבינעם\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [48]:
def torch_net(X_in, Y_in, X_test, Y_test,
              hidden_layers=(10,), device=torch.device('cpu'), epoch=30, batch_size=10,
              learning_rate=0.05, weight_decay=0.99, lr_decay=0.5):
    """Train a fully connected ReLU network on (X_in, Y_in) and return the trained model.

    The network is ``Linear -> ReLU`` for every entry of ``hidden_layers``, followed by
    ``Dropout(0.5)`` and a final ``Linear`` layer producing one score per output column.
    It is trained with a summed MSE loss against ``Y_in`` using Adam. Accuracy on the
    test and training sets is printed (via ``accuracy_test``) before training and after
    every epoch.

    Args:
        X_in, Y_in: training features and targets; 2-D, each is passed through
            ``sparse_to_matrix`` before being turned into a float tensor.
            Presumably ``Y_in`` is one-hot encoded (accuracy uses argmax) - TODO confirm.
        X_test, Y_test: held-out features and targets, only used for the accuracy report.
        hidden_layers: sizes of the hidden layers, in order. May be empty, in which
            case the network is ``Dropout -> Linear(D_in, D_out)``.
        device: torch device used for the model and the training tensors.
        epoch: number of passes over the training data.
        batch_size: mini-batch size used by the DataLoader.
        learning_rate: initial Adam learning rate.
        weight_decay: Adam ``weight_decay`` (L2 penalty coefficient).
            NOTE(review): the default of 0.99 is a very strong penalty; the output recorded
            in this notebook shows predictions collapsing to a single class, which this
            may explain - TODO confirm the intended value.
        lr_decay: factor the learning rate is multiplied by after every epoch.
            NOTE(review): with the default 0.5 the rate is ~2.4e-8 after 21 epochs,
            so later epochs barely change the weights.

    Returns:
        The trained ``torch.nn.Sequential`` model (left in training mode, on ``device``).
    """
    dtype = torch.float

    # Input / output widths are read before densifying, as the original code did.
    _n_samples, D_in = X_in.shape
    D_out = Y_in.shape[-1]

    X_in, Y_in, X_test, Y_test = [
        sparse_to_matrix(A) for A in (X_in, Y_in, X_test, Y_test)]

    X = torch.tensor(X_in, device=device, dtype=dtype)
    Y = torch.tensor(Y_in, device=device, dtype=dtype)

    # Build D_in -> h1 -> ... -> hk -> (dropout) -> D_out.
    dims = [D_in, *hidden_layers, D_out]
    layers = []
    for fan_in, fan_out in zip(dims[:-2], dims[1:-1]):
        layers.append(torch.nn.Linear(fan_in, fan_out))
        layers.append(torch.nn.ReLU())
    layers.append(torch.nn.Dropout(0.5))
    # dims[-2] is the last hidden size, or D_in when there are no hidden layers.
    layers.append(torch.nn.Linear(dims[-2], D_out))

    # The model must live on the same device as the batches it is fed.
    model = torch.nn.Sequential(*layers).to(device)
    loss_fn = torch.nn.MSELoss(reduction='sum')

    optimizer = torch.optim.Adam(
        model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    # Multiplies every parameter group's learning rate by lr_decay on each step().
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=lr_decay)

    # Tensors that already live on an accelerator should not be handed to loader
    # worker processes, so load in-process in that case.
    num_workers = 4 if X.device.type == 'cpu' else 0
    dataloader = torch_data.DataLoader(
        torch_data.TensorDataset(X, Y), batch_size=batch_size,
        shuffle=True, num_workers=num_workers)

    def report_accuracy():
        # Dropout must be switched off while measuring accuracy, otherwise half of the
        # last hidden activations are randomly zeroed during evaluation.
        model.eval()
        # Evaluate on the CPU so the NumPy-based accuracy report works for any training device.
        model.cpu()
        accuracy_test(model, X_test, Y_test, data_set_name='test', print_sample=16)
        accuracy_test(model, X_in, Y_in, data_set_name='train', print_sample=16)
        model.to(device)
        model.train()

    print ("Before training:")
    report_accuracy()

    for e in range(epoch):
        print(f"\t +++ epoch: {e+1} +++")
        for t, (x_batch, y_batch) in enumerate(dataloader):
            # Forward pass and loss for this mini-batch.
            y_pred = model(x_batch)
            loss = loss_fn(y_pred, y_batch)
            if not ((t + 1) % 2000):
                print(f"iter-{t+1}, loss {round(loss.item(),3)}")

            # Gradients accumulate across backward() calls, so clear them first.
            optimizer.zero_grad()

            # NOTE(review): the original author observed that exit() stops working once
            # backward() has run; the cause is not established here.
            loss.backward()
            optimizer.step()

        report_accuracy()

        scheduler.step()
        print ("printing learning rate after update:")
        for param_group in optimizer.param_groups:
            print(param_group['lr'])

    print ("DONE, returning model")
    return model

In [43]:
def accuracy_test(model, x, y, data_set_name = 'test',print_sample = False):
    """Print the classification accuracy of ``model`` on ``(x, y)``.

    The predicted class of a row is the argmax of the model output; the true class is
    the argmax of the corresponding row of ``y``.

    The forward pass runs in evaluation mode (so Dropout and similar layers are
    disabled) and without gradient tracking. The model's previous training/eval mode
    is restored afterwards.

    Args:
        model: callable mapping a float tensor of features to per-class scores
            (normally a ``torch.nn.Module``).
        x: features, anything ``torch.tensor`` accepts.
        y: targets with one column per class, anything ``np.asarray`` accepts.
        data_set_name: label used in the printed accuracy line.
        print_sample: when truthy, also print the first ``print_sample`` predicted/true
            class pairs and the frequency of every predicted class.

    Returns:
        None; results are printed.
    """
    # Feed the inputs on whatever device the model's parameters live on.
    first_param = next(iter(model.parameters()), None) if hasattr(model, 'parameters') else None
    device = first_param.device if first_param is not None else torch.device('cpu')

    was_training = bool(getattr(model, 'training', False))
    if was_training:
        model.eval()
    try:
        with torch.no_grad():
            scores = model(torch.tensor(x, dtype=torch.float, device=device))
            predicted = torch.argmax(scores, dim=-1).cpu().numpy()
    finally:
        if was_training:
            model.train()

    # np.asarray keeps the result 1-D even when y is a matrix-like or DataFrame.
    truth = np.argmax(np.asarray(y), axis=-1)
    accuracy_pct = round(np.array(predicted == truth).mean() * 100, 3)
    print (f"Accuracy {data_set_name} = ", accuracy_pct, "%")
    if print_sample:
        ps = print_sample
        sample = pd.DataFrame([predicted[:ps], truth[:ps]], index=['predicted', 'truth'])
        print (sample)
        # Series.value_counts replaces the deprecated top-level pd.value_counts.
        print (pd.Series(predicted).value_counts())


In [19]:
# Load the data set from the processed pickle.
# Three objects come back: `dataset` (has a 'category' column, used below) and two
# headline collections; presumably `headlines` is the lemmatized text and
# `headlines_orig` the untouched text - TODO confirm against Utils.NLP_utils.

# Alternative entry point that starts from the unprocessed pickle instead:
# dataset, headlines, headlines_orig = load_and_lemmatize_data(DATA_SET_FILE)
dataset, headlines, headlines_orig = load_processed_data(PROCESSED_DATA_SET)

In [20]:
# Target labels: the raw category column and its one-hot (dummy) encoding,
# one column per distinct category.
categories = dataset['category']
Y  = pd.get_dummies(categories)

In [45]:
# Split data and labels into train/test sets (70% / 30%).
# The submodule is imported explicitly so the cell does not depend on some other
# import having loaded sklearn.model_selection as a side effect.
from sklearn.model_selection import train_test_split

# Fixed seed: without it every run produces a different split, so accuracies
# reported further down cannot be reproduced or compared between runs.
SPLIT_SEED = 42

(headlines_train, headlines_test,
 headlines_train_orig, headlines_test_orig,
 Y_train, Y_test,
 cat_train, cat_test) = train_test_split(
    headlines, headlines_orig, Y, categories,
    test_size=0.3, random_state=SPLIT_SEED)

In [46]:
# Bag-of-words features: uni- and bi-gram counts, keeping at most `max_features`
# terms that occur in at least 5 headlines and in no more than 10% of them.

max_features = 1000

# Vectorizer class to use; swap in TfidfVectorizer to try TF-IDF weighting instead.
vectorizer = CountVectorizer
matrix = vectorizer(
    max_features=max_features,
    ngram_range=(1, 2),
    max_df=0.1,
    min_df=5,
)

# The vocabulary is learned from the training headlines only, then applied to both
# splits. The results are left as returned by transform(); call .todense() on them
# (and wrap in a DataFrame with the feature names as columns) to inspect them.
X_train = matrix.fit(headlines_train).transform(headlines_train)
X_test = matrix.transform(headlines_test)

# Fails (reporting the actual width) when fewer than `max_features` terms survive the filters.
assert X_train.shape[1] == max_features, X_train.shape[1]

In [None]:
# Train the PyTorch network: one hidden layer of 50 units, 100 epochs.
# Y_train / Y_test are DataFrames of one-hot labels, so their underlying arrays are passed.
model = torch_net(X_train, Y_train.values,X_test,Y_test.values,[50],epoch=100)

Before training:
Accuracy test =  2.352 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2
truth      13  38  32   0   6  24  34  37  26  24  15  24  24  24  24  24
2     56416
37     2564
7       373
27      320
4       312
30      157
35       92
23        8
11        8
0         4
14        1
39        1
dtype: int64
Accuracy train =  2.433 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted   2   2   7   2   2   2   2  27   2   2   2   2   2   2   2   2
truth      24  40  10  39  14   5   3   6  34  24  24  24  10  37  17  24
2     131335
37      6131
7        884
4        793
27       787
30       362
35       207
23        42
11        39
14         9
0          5
39         3
dtype: int64
	 +++ epoch: 1 +++
iter-2000, loss 9.468
iter-4000, loss 10.377
iter-6000, loss 9.77
iter-8000, loss 10.041
iter-10000, loss 9.569
iter-12000, loss 9.362
iter-140

Accuracy train =  16.224 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted  24  24  24  24  24  24  24  24  24  24  24  24  24  24  24  24
truth      24  40  10  39  14   5   3   6  34  24  24  24  10  37  17  24
24    140597
dtype: int64
printing learning rate after update:
4.8828125e-05
	 +++ epoch: 11 +++
iter-2000, loss 9.494
iter-4000, loss 9.426
iter-6000, loss 9.735
iter-8000, loss 9.339
iter-10000, loss 9.759
iter-12000, loss 8.988
iter-14000, loss 9.454
Accuracy test =  16.476 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted  24  24  24  24  24  24  24  24  24  24  24  24  24  24  24  24
truth      13  38  32   0   6  24  34  37  26  24  15  24  24  24  24  24
24    60256
dtype: int64
Accuracy train =  16.224 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted  24  24  24  24  24  24  24  24  24  24  24  24  24  24  24  24
truth      24  40  10  39  14   5   3   6  34  24

iter-8000, loss 9.689
iter-10000, loss 9.676
iter-12000, loss 8.704
iter-14000, loss 9.718
Accuracy test =  16.476 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted  24  24  24  24  24  24  24  24  24  24  24  24  24  24  24  24
truth      13  38  32   0   6  24  34  37  26  24  15  24  24  24  24  24
24    60256
dtype: int64
Accuracy train =  16.224 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted  24  24  24  24  24  24  24  24  24  24  24  24  24  24  24  24
truth      24  40  10  39  14   5   3   6  34  24  24  24  10  37  17  24
24    140597
dtype: int64
printing learning rate after update:
2.384185791015625e-08
	 +++ epoch: 22 +++
iter-2000, loss 9.665
iter-4000, loss 9.257
iter-6000, loss 9.334
iter-8000, loss 9.047
iter-10000, loss 9.663
iter-12000, loss 9.675
iter-14000, loss 9.844
Accuracy test =  16.476 %
           0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
predicted  24  24  24  24

In [None]:
# Find the frequency of each lemma per category: start by turning the training
# features into a DataFrame whose columns are the vectorizer's feature names.
# NOTE(review): this rebinds X_train to a DataFrame, so the cell cannot be run twice
# (a DataFrame has no .todense()) and every later cell sees the DataFrame instead of
# the matrix produced by the vectorizer.
# NOTE(review): newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out() - confirm against the installed version.
tokens = matrix.get_feature_names()
X_train= pd.DataFrame(X_train.todense(),columns=tokens)

In [None]:
# Attach each training row's category and group the token counts by it.
# X_train was rebuilt as a DataFrame with a fresh 0..n-1 index, while `categories`
# is indexed by the rows of the full data set. Assigning that Series would align on
# index and label row i with the category of data-set row i, not with the category of
# the i-th (shuffled) training sample. cat_train comes from the same split as the
# training headlines, so its values are assigned positionally.
assert len(cat_train) == len(X_train), (len(cat_train), len(X_train))
X_train['LABELED_CATEGORIES'] = cat_train.values
X_gb = X_train.groupby('LABELED_CATEGORIES')

In [69]:
# NOTE(review): `model` is bound to the network returned by torch_net in the training
# cell above; .score() looks like an estimator-style call, so this cell presumably ran
# while `model` referred to a different object - confirm which model is meant.
model.score(X_test,Y_test)

0.25847805599662493

In [None]:
# Per-category column sums of the grouped training features.
X_gb.sum()

In [168]:
# Show the feature names of the current vectorizer that are missing from `tokens_100`.
# NOTE(review): `tokens_100` is not defined in any visible cell; it presumably held the
# feature names of an earlier vectorizer configuration - this cell fails on a fresh kernel.
tokens_01_no_digits = matrix.get_feature_names()
print ( set(tokens_01_no_digits)-set(tokens_100))




In [68]:
# Train a linear-regression model on the one-hot labels.
# The class is imported explicitly so the cell does not depend on some other import
# having loaded sklearn.linear_model as a side effect.
from sklearn.linear_model import LinearRegression

linear_model = LinearRegression()
# Number of training rows to use; None means all of them.
# (The previous value of -1 made the [:n] slices silently drop the last row.)
n = None
linear_model.fit(X_train[:n], Y_train[:n])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [19]:
# Predict on the test features and score the first n predictions with the project's
# accuracy() helper.
# NOTE(review): `model` is bound to the torch_net result in the training cell above;
# .predict() looks like an estimator-style call, so this presumably targeted another
# model object (e.g. linear_model) - confirm.
p_test = model.predict(X_test)
n = 50000
accuracy(p_test[:n],Y_test.values[:n])

[38 34 10 ... 22 10 22]
accuracy is 0.079668422


0.079668422

In [226]:
# Relative frequency of each predicted category: map the argmax column of every
# prediction row back to its category name (the one-hot column labels of Y).
# NOTE(review): `p` is not defined in any visible cell; presumably it is a matrix of
# per-category prediction scores such as p_test - confirm.
categories_names = Y.columns
p_cat = pd.Series([categories_names[i] for i in p.argmax(axis = 1) ])
p_cat.value_counts(normalize = True)

POLITICS          0.291940
WELLNESS          0.237761
ENTERTAINMENT     0.135637
STYLE & BEAUTY    0.067890
PARENTING         0.055722
TRAVEL            0.043594
CRIME             0.016360
FOOD & DRINK      0.015892
QUEER VOICES      0.015061
WEDDINGS          0.013707
HOME & LIVING     0.012800
WOMEN             0.012522
DIVORCE           0.010236
COMEDY            0.009390
SPORTS            0.009251
BLACK VOICES      0.008389
MEDIA             0.007000
BUSINESS          0.006482
THE WORLDPOST     0.004874
GREEN             0.004122
COLLEGE           0.003376
TECH              0.002838
ARTS              0.002405
EDUCATION         0.001787
WORLD NEWS        0.001509
SCIENCE           0.001449
PARENTS           0.001394
IMPACT            0.000951
WEIRD NEWS        0.000831
ARTS & CULTURE    0.000772
HEALTHY LIVING    0.000692
WORLDPOST         0.000692
RELIGION          0.000677
MONEY             0.000632
FIFTY             0.000553
GOOD NEWS         0.000488
STYLE             0.000239
T

In [227]:
# Relative frequency of each category in the full data set, for comparison with the
# distribution of predicted categories.
dataset.category.value_counts(normalize = True)

POLITICS          0.163000
WELLNESS          0.088756
ENTERTAINMENT     0.079949
TRAVEL            0.049225
STYLE & BEAUTY    0.048040
PARENTING         0.043201
HEALTHY LIVING    0.033328
QUEER VOICES      0.031436
FOOD & DRINK      0.030998
BUSINESS          0.029559
COMEDY            0.025765
SPORTS            0.024316
BLACK VOICES      0.022544
HOME & LIVING     0.020886
PARENTS           0.019691
THE WORLDPOST     0.018242
WEDDINGS          0.018177
WOMEN             0.017376
IMPACT            0.017222
DIVORCE           0.017057
CRIME             0.016953
MEDIA             0.014015
WEIRD NEWS        0.013293
GREEN             0.013054
WORLDPOST         0.012840
RELIGION          0.012726
STYLE             0.011222
SCIENCE           0.010844
WORLD NEWS        0.010839
TASTE             0.010435
TECH              0.010366
MONEY             0.008499
ARTS              0.007513
FIFTY             0.006975
GOOD NEWS         0.006960
ARTS & CULTURE    0.006667
ENVIRONMENT       0.006587
C

In [15]:
# K-nearest-neighbours baseline (k = 3), trained directly on the category labels.
from sklearn.neighbors import KNeighborsClassifier

neigh = KNeighborsClassifier(n_neighbors=3).fit(X_train, cat_train)
print("fit complete")
# Scoring is left disabled here; enable with: print(neigh.score(X_test, cat_test))

fit complete


In [49]:
# Compare KNN predictions with the true categories for the first n test headlines.
n = 100
# Predict once and reuse the result; the original ran the KNN prediction on the
# same slice for the type printout and again for the table.
knn_pred = neigh.predict(X_test[:n])
headlines_sample = np.array(headlines_test[:n])
print ([type(t) for t in (knn_pred, cat_test[:n], headlines_sample)])
# Rows after the transpose: column 0 = predicted, 1 = true category, 2 = headline text.
pd.DataFrame((knn_pred, cat_test[:n], headlines_sample)).T

[<class 'numpy.ndarray'>, <class 'pandas.core.series.Series'>, <class 'numpy.ndarray'>]


Unnamed: 0,0,1,2
0,POLITICS,WOMEN,how rape culture and racism combine to hurt as...
1,BUSINESS,TRAVEL,turkey season travel deal be all stuff and no ...
2,QUEER VOICES,HOME & LIVING,"this oscar season , we 're see red"
3,HEALTHY LIVING,HEALTHY LIVING,"for target cancer treatment , just upload your..."
4,ENTERTAINMENT,PARENTS,13 must-reads for blend family
5,FOOD & DRINK,RELIGION,god and the battle over woman 's body ( all to...
6,PARENTS,WELLNESS,acupuncturists spill : the 12 health tip they ...
7,COMEDY,WEDDINGS,drunk groom caught cheat on his bride during r...
8,FOOD & DRINK,BUSINESS,welfare limit left poor adrift a recession hit
9,FOOD & DRINK,ENTERTAINMENT,tom hank will play mr. rogers in 'you be my fr...
