# Filter dataset concepts

The goal of this notebook is to extract all nouns, proper nouns and verbs from the MSC dataset.
The resulting list is used to create a reduced knowledge graph, which contains only concepts that occur in the MSC dataset.

In [None]:
from dataset.msc_sessions import MSC_Session
import json
import spacy
from tqdm import tqdm

### Define source files and target file

In [None]:
# Datasets: root data directory, MSC dialogue subfolder, and the sessions/split to read.
# NOTE: datadir is an absolute, machine-specific path; adjust it when running elsewhere.
datadir = "/Users/FrankVerhoef/Programming/PEX/data/"
basedir = "msc/msc_dialogue/"
sessions = [1, 2, 3, 4]
subset = 'train'

# File to save the resulting filtered concepts to
kg_datadir = "kg_data/"
target_path = datadir + kg_datadir + "dataset_concepts.txt"

### Add words from MSC corpus to vocab

In [None]:
# Build the MSC corpus for the configured sessions and subset.
# NOTE(review): presumably iterating the corpus yields text strings, since it is
# passed to nlp.pipe below — confirm against MSC_Session.
corpus = MSC_Session(
    basedir=datadir+basedir,
    sessions=sessions,
    subset=subset
)

# spaCy pipeline that provides the part-of-speech tags (token.pos_) used for filtering.
nlp = spacy.load('en_core_web_sm')

def filter(token):
    """
    Decide whether a token is kept as a concept.

    Returns True when the token's part-of-speech tag (``token.pos_``) is
    NOUN, PROPN or VERB, and False for every other tag.
    """
    pos_tag = token.pos_
    if pos_tag in ('NOUN', 'PROPN', 'VERB'):
        return True
    return False

# Count how often each kept token text occurs across the whole corpus.
vocab = {}
i = 0  # number of processed documents; stays 0 when the corpus is empty
for i, doc in enumerate(tqdm(nlp.pipe(corpus), desc="Filter MSC dataset"), start=1):
    for token in doc:
        if not filter(token):
            continue
        # add word to vocab
        word = token.text
        vocab[word] = vocab.get(word, 0) + 1

print("Read {} lines with {} tokens".format(i, sum(vocab.values())))
print("Vocab has {} entries".format(len(vocab)))

### Inspect the resulting vocab

In [None]:
# The 50 most frequent concepts as (word, count) pairs, highest count first.
sorted(vocab.items(), key=lambda x:x[1], reverse=True)[:50]

In [None]:
# The first 50 concepts as (word, count) pairs, sorted ascending by word (string order).
sorted(vocab.items(), key=lambda x:x[0], reverse=False)[:50]

### Save tokens to target file

In [None]:
# Write one concept per line, in the order the words were first seen.
# The encoding is fixed to UTF-8 so the output does not depend on the platform's
# default encoding and tokens with non-ASCII characters are written reliably.
with open(target_path, "w", encoding="utf-8") as f:
    f.writelines(concept + '\n' for concept in vocab)