# Exploratory Data Analysis

In [1]:
from collections import Counter
from string import punctuation
import spacy
import os
import json
import elasticsearch
from elasticsearch import Elasticsearch
import utils
import spacy

In [2]:
# Import and run python-dotenv's loader before any environment lookups.
# NOTE(review): presumably this reads a local .env file that supplies the
# ES_USER / ES_PWD variables used for the Elasticsearch client — confirm.
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
# Shared spaCy pipeline ('en_core_web_sm'); the POS helper functions in this
# notebook call `nlp(text)` to tokenize and tag their input.
nlp = spacy.load('en_core_web_sm')

In [4]:
# Elasticsearch client authenticated with the ES_USER / ES_PWD environment
# variables; indexing os.environ raises KeyError here if either is unset.
# No host is passed, so the connection target is whatever the client defaults to.
es = Elasticsearch(http_auth=(os.environ['ES_USER'], os.environ['ES_PWD']))

In [5]:
# Index name constant; it is not referenced in the visible part of the notebook.
# NOTE(review): presumably the Elasticsearch index queried later — confirm.
INDEX_NAME = 'trec2019_stem'

### Load training data

In [6]:
# Load the training topics from JSON. Later cells iterate over `train_topics`
# and read each topic's 'title' and 'turn' entries.
path = "../treccastweb/2019/data/training/train_topics_v1.0.json"
# JSON text is UTF-8 by specification; pin the encoding so decoding does not
# depend on the platform's locale default (e.g. cp1252 on Windows).
with open(path, 'r', encoding='utf-8') as f:
    train_topics = json.load(f)

In [7]:
# Read the relevance-judgment file into `qrels`: one entry per line of the
# file, with leading/trailing whitespace (including the newline) stripped.
path = "../baseline/train_topics_mod_nowapo.qrel"
with open(path, 'r') as f:
    qrels = [line.strip() for line in f]

### Evaluation of retrieval

In [8]:
# Toy inputs for trying out utils.ndcg.
# NOTE(review): presumably document ids in the order the system ranked them — confirm.
system_rankings = [2, 1, 3, 4, 5, 6, 10, 7, 9, 8]
# doc_id -> graded relevance: 0 non-relevant, 1 poor, 2 good, 3 excellent.
# NOTE(review): ids absent from the mapping are presumably scored as 0 — confirm in utils.ndcg.
ground_truth = {4: 3, 1: 2, 2: 1}

In [9]:
# Score the toy ranking against the toy judgments with k=5
# (presumably NDCG truncated at rank 5 — see utils.ndcg).
utils.ndcg(system_rankings, ground_truth, k=5)

0.7991575453673245

### Add query id for each turn in each topic

In [10]:
# Pass every training topic through utils.add_qid. The return value is
# discarded. NOTE(review): add_qid presumably annotates each topic's turns
# with a query id in place — confirm in utils.
for topic in train_topics:
    utils.add_qid(topic)

## POS tagger
https://universaldependencies.org/u/pos/all.html

In [11]:
def get_pronominal(text):
    """Return the text of every token in *text* that is tagged ``PRON``.

    The input is parsed with the module-level spaCy pipeline ``nlp``. Tokens
    are returned in document order and repeated tokens are kept.
    """
    return [tok.text for tok in nlp(text) if tok.pos_ == 'PRON']

def get_determiner(text):
    """Return the text of every token in *text* that is tagged ``DET``.

    The input is parsed with the module-level spaCy pipeline ``nlp``. Tokens
    are returned in document order and repeated tokens are kept.
    """
    return [tok.text for tok in nlp(text) if tok.pos_ == 'DET']

def get_noun(text):
    """Return the text of every token in *text* that is tagged ``NOUN``.

    The input is parsed with the module-level spaCy pipeline ``nlp``. Tokens
    are returned in document order and repeated tokens are kept.
    """
    return [tok.text for tok in nlp(text) if tok.pos_ == 'NOUN']

def get_proper_noun(text):
    """Return the text of every token in *text* that is tagged ``PROPN``.

    The input is parsed with the module-level spaCy pipeline ``nlp``. Tokens
    are returned in document order and repeated tokens are kept.
    """
    return [tok.text for tok in nlp(text) if tok.pos_ == 'PROPN']
    
def get_adjective(text):
    """Return the text of every token in *text* that is tagged ``ADJ``.

    The input is parsed with the module-level spaCy pipeline ``nlp``. Tokens
    are returned in document order and repeated tokens are kept.
    """
    return [tok.text for tok in nlp(text) if tok.pos_ == 'ADJ']

def get_adverb(text):
    """Return the text of every token in *text* that is tagged ``ADV``.

    The input is parsed with the module-level spaCy pipeline ``nlp``. Tokens
    are returned in document order and repeated tokens are kept.
    """
    return [tok.text for tok in nlp(text) if tok.pos_ == 'ADV']

def get_adposition(text):
    """Return the text of every token in *text* that is tagged ``ADP``.

    The input is parsed with the module-level spaCy pipeline ``nlp``. Tokens
    are returned in document order and repeated tokens are kept.
    """
    return [tok.text for tok in nlp(text) if tok.pos_ == 'ADP']

def get_verb(text):
    """Return the text of every token in *text* that is tagged ``VERB``.

    The input is parsed with the module-level spaCy pipeline ``nlp``. Tokens
    are returned in document order and repeated tokens are kept.
    """
    return [tok.text for tok in nlp(text) if tok.pos_ == 'VERB']
    

def pos_analysis(topic):
    """Print a part-of-speech breakdown of a topic's title and of each turn.

    The title is reported first, under a ``## Title data ##`` banner, followed
    by the ``raw_utterance`` of every entry in ``topic['turn']``. For each
    text, the tokens tagged PRON, DET, NOUN, PROPN, ADJ, ADV, ADP and VERB
    are listed in document order, and a blank line ends the report.

    Each text is run through the module-level spaCy pipeline ``nlp`` exactly
    once and its tokens are bucketed by tag, rather than re-parsing the same
    text once per tag. The printed output is unchanged.

    Args:
        topic: Mapping with a ``'title'`` string and a ``'turn'`` list whose
            items are mappings that contain ``'raw_utterance'``.

    Returns:
        None. The report is written to stdout.

    Raises:
        KeyError: If ``'title'``, ``'turn'`` or a turn's ``'raw_utterance'``
            is missing.
    """
    # (label shown in the report, POS tag), in the order they are printed.
    categories = (
        ('Pronominals', 'PRON'),
        ('Determiner', 'DET'),
        ('Noun', 'NOUN'),
        ('Proper noun', 'PROPN'),
        ('Adjective', 'ADJ'),
        ('Adverb', 'ADV'),
        ('Adposition', 'ADP'),
        ('Verb', 'VERB'),
    )
    title = topic['title']
    for i, turn in enumerate([title] + topic['turn']):
        if i == 0:
            # The first item is the plain title string, not a turn mapping.
            print("## Title data ##")
            text = turn
        else:
            text = turn['raw_utterance']

        # Single parse per text; tokens with other tags are ignored.
        tokens_by_tag = {tag: [] for _, tag in categories}
        for token in nlp(text):
            bucket = tokens_by_tag.get(token.pos_)
            if bucket is not None:
                bucket.append(token.text)

        report = [f'Original text: {text}']
        report.extend(
            f'{label}: {tokens_by_tag[tag]}' for label, tag in categories
        )
        print('\n'.join(report))
        print()

In [12]:
# Print the POS breakdown for three sample training topics (indices 0, 3 and 6).
pos_analysis(train_topics[0])
pos_analysis(train_topics[3])
pos_analysis(train_topics[6])

## Title data ##
Original text: Career choice for Nursing and Physician's Assistant
Pronominals: []
Determiner: []
Noun: ['Career', 'choice']
Proper noun: ['Nursing', 'Physician', 'Assistant']
Adjective: []
Adverb: []
Adposition: ['for']
Verb: []

Original text: What is a physician's assistant?
Pronominals: ['What']
Determiner: ['a']
Noun: ['physician', 'assistant']
Proper noun: []
Adjective: []
Adverb: []
Adposition: []
Verb: []

Original text: What are the educational requirements required to become one?
Pronominals: ['What']
Determiner: ['the']
Noun: ['requirements']
Proper noun: []
Adjective: ['educational']
Adverb: []
Adposition: []
Verb: ['required', 'become']

Original text: What does it cost?
Pronominals: ['What', 'it']
Determiner: []
Noun: []
Proper noun: []
Adjective: []
Adverb: []
Adposition: []
Verb: ['cost']

Original text: What's the average starting salary in the UK?
Pronominals: ['What']
Determiner: ['the', 'the']
Noun: ['starting', 'salary']
Proper noun: ['UK']
Adjecti