In [1]:
import json
import numpy as np
import pandas as pd
from scipy import linalg
from sklearn.feature_extraction.text import CountVectorizer

## Latent Semantic Indexing

For each category (i.e. each sheet in the `tasks.xlsx` file):

1. Create the document-term matrix `A` (called `vectors` below)


2. Apply SVD (singular value decomposition) to decompose `AT` (term-document matrix) into:

    * term-concept matrix `U`
    * singular value matrix `sigma`
    * concept-document matrix `V` in the form: `A = U*sigma*VT`
        * want document-concept matrix `VT` for query mapping

3. query mapping done in `runNLP.ipynb`


The parameters: `U`, `sigma`, `VT`, `vocab` (or terms) are stored as NumPy matrices, and saved in compressed NumPy format `.npz`.

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

file_name = "tasks.xlsx"
npzfiles = ['nlp_1','nlp_2','nlp_3','nlp_4','nlp_5','nlp_6','nlp_7',
            'nlp_8','nlp_9','nlp_10','nlp_11','nlp_12','nlp_13']

vectorizer = CountVectorizer(stop_words='english')

for sheet,npzfile in enumerate(npzfiles):
    
    text = [w[0] for w in pd.read_excel(file_name, sheet_name=sheet, header=None).values]
    vectors = vectorizer.fit_transform(text).todense() # (documents, vocab)
    U, s, VT = linalg.svd(vectors.T, full_matrices=False)
    sigma = np.diag(s)
    vocab = np.array(vectorizer.get_feature_names())
    
    np.savez('nlp_resources/'+npzfile+'.npz',U=U,sigma=sigma,VT=VT,vocab=vocab)

## Saving Tasks

The tasks are parsed from the `tasks.xlsx` file and saved in separate `JSON` files - one per category.


Example (category 2, `tasks_2.json`):

>```
{
	"0": "Did you know: It's better to challenge a fear food...",
	"1": "Did you know: When you eat low-carbs during the day...",
	"2": "Did you know: Asians eat lots of rice and carbs...",
	"3": "Did you know: If you start eating when you're not hungry...",
	"4": "Did you know: If you don't have enough carbs...",
	"5": "Did you know: Fibre helps digestion and is necessary...",
	"6": "Did you know: It is recommended that people consume fish...",
	"7": "Did you know: Many people consume far more protein than they need..."
}```

In [3]:
jsonfiles = ['tasks_1','tasks_2','tasks_3','tasks_4','tasks_5','tasks_6','tasks_7',
             'tasks_8','tasks_9','tasks_10','tasks_11','tasks_12','tasks_13']

for sheet,jsonfile in enumerate(jsonfiles):
    
    text = [w[0] for w in pd.read_excel(file_name, sheet_name=sheet, header=None).values]
    dictOfTasks = { i : text[i] for i in range(0, len(text) ) }
    with open('tasks/'+jsonfile+'.json', 'w') as fp:
        json.dump(dictOfTasks, fp)

## Accuracy

`targets (13 x 5)`: (category by task) - tagged top 5 tasks within each category


**Queries** (which will be journal entries) emulated by web-scraped forum posts
* stored in `journal_entries.xlsx`
* one query per category


Calculated accracy for each **dimension** (dimensionality reduction during SVD)
* Optimal solution: 6 dimensions


---
* **accuracy by category** - _at least one_ of five tasks hit: 12/13 = 92.3%
* **accuracy by task** - consider _all_ tagged tasks: 40/65 = 61.5%


In [2]:
from ipynb.fs.full.runNLP import nlp

In [3]:
targets = np.asarray(
    [[44,51,52,12,25],
     [1,2,4,7,6],
     [31,43,18,42,14],
     [21,24,11,17,22],
     [0,2,8,9,25],
     [25,20,23,27,21],
     [2,5,14,12,18],
     [0,1,15,16,3],
     [5,4,3,21,23],
     [6,27,34,2,39],
     [13,11,12,18,26],
     [9,11,0,22,23],
     [11,8,22,9,20]]
)

entries = [w[0] for w in pd.read_excel('journal_entries.xlsx', header=None).values]

In [6]:
# check dimensions 1 to 8 (smallest category (2) has 8 concepts/dimensions)
for dim in range(8):
    rec = []

    for i,(target,entry) in enumerate(zip(targets,entries)):

        parameters = "nlp_resources/nlp_"+str(i+1)+".npz"
        tasks_json = "tasks/tasks_"+str(i+1)+".json"

        # Verify the targets
        #with open(tasks_json, 'r') as fp:
        #    tasks_dict = json.load(fp)
        #tasks = list(tasks_dict.values())
        #for t in target:
        #    print(tasks[t])
        #print('')

        recs = nlp(entry,parameters,tasks_json,dim)
        rec_str = list(recs.keys())[0:5]
        rec.append([int(r) for r in rec_str])
    
    preds = np.asarray(rec)

    acc = np.asarray([[True if p in target else False for p in pred] for pred,target in zip(preds,targets)])

    print('Dimensions:\t{0}\nAt least one:\t{1}\nAll:\t\t{2}\n'.format(
        dim+1,
        100*(sum((acc).flatten())/(targets.shape[0]*targets.shape[1])),
        100*(sum(np.sum(acc, axis=1) > 0)/targets.shape[0])
    ))



Dimensions:	1
At least one:	16.923076923076923
All:		46.15384615384615

Dimensions:	2
At least one:	26.153846153846157
All:		61.53846153846154

Dimensions:	3
At least one:	29.230769230769234
All:		69.23076923076923

Dimensions:	4
At least one:	38.46153846153847
All:		76.92307692307693

Dimensions:	5
At least one:	46.15384615384615
All:		76.92307692307693

Dimensions:	6
At least one:	61.53846153846154
All:		92.3076923076923

Dimensions:	7
At least one:	47.69230769230769
All:		92.3076923076923

Dimensions:	8
At least one:	47.69230769230769
All:		92.3076923076923

