In [69]:
import os
import re
import json
import spacy
import numpy as np
import pandas as pd
from spacy import displacy
from read_problems import read_questions_json
from read_problems import read_all_problems

%matplotlib inline

In [2]:
# Load English tokenizer, tagger, parser, NER and word vectors
# NOTE(review): 'en' is a shortcut-link model name; spaCy v3 removed shortcut
# links, so this call presumably requires a spaCy v2 install where the link
# exists -- confirm the pinned spaCy version before re-running.
nlp = spacy.load('en')

In [3]:
def load_documents():
    """Load the collected documents into a DataFrame.

    Reads ``all_data_collected.json`` from the current working directory and
    builds a frame restricted to the ``id`` and ``text`` columns.

    Returns:
        pandas.DataFrame: frame with columns ``['id', 'text']``.
    """
    # A context manager closes the handle deterministically instead of
    # leaving it to the garbage collector.
    with open('all_data_collected.json') as docs_file:
        docs = json.load(docs_file)
    return pd.DataFrame(docs, columns=['id', 'text'])

def get_best_documents(docs_df, keywords):
    """Return the documents whose text contains every keyword.

    Matching is a case-insensitive literal substring test on the ``text``
    column.

    Args:
        docs_df: DataFrame with a ``text`` column.
        keywords: iterable of strings that must all occur in the text. An
            empty iterable selects every document.

    Returns:
        pandas.DataFrame: the rows of ``docs_df`` that match all keywords.
    """
    # Start from an all-True mask so an empty keyword list is handled instead
    # of failing on keywords[0].
    mask = pd.Series(True, index=docs_df.index, dtype=bool)
    for keyword in keywords:
        # na=False: a missing text never matches, and the mask stays boolean.
        mask = mask & docs_df['text'].str.contains(
            keyword, case=False, regex=False, na=False)
    return docs_df[mask]

def display_cfa_text(text):
    """Print each non-empty line of ``text``, followed by a blank line."""
    non_empty_lines = (line for line in text.split('\n') if line != '')
    for paragraph in non_empty_lines:
        print(paragraph)
        print()

def get_text_by_id(docs_df, doc_id):
    """Return the text of the first document whose ``id`` equals ``doc_id``.

    Args:
        docs_df: DataFrame with ``id`` and ``text`` columns.
        doc_id: identifier to look up.

    Returns:
        The ``text`` value of the first matching row.

    Raises:
        IndexError: if no row has the requested id.
    """
    # Select through the boolean mask directly. Feeding an index *label* to
    # .iloc is only correct when the frame has a default 0..n-1 index.
    matches = docs_df.loc[docs_df['id'] == doc_id, 'text']
    if matches.empty:
        raise IndexError('no document with id %r' % (doc_id,))
    return matches.iloc[0]

# Notebook-wide frame of reference documents (columns: id, text).
docs_df = load_documents()

In [32]:
class QuestionCategory(object):
    """Integer codes stored in the ``category`` column of question frames.

    The meaning of each pattern is not spelled out in this notebook.
    NOTE(review): the names look like "<what the question gives>_<what it
    asks for>" (DEF = definition, CHAR = characteristic) -- confirm with the
    author before relying on that reading.
    """
    # Definition / keyword patterns.
    DEF_KEYWORD = 0
    KEYWORD_DEF = 1
    KEYWORD_DEF_START_END = 2
    
    # Characteristic / situation / action patterns.
    CHAR_KEYWORD = 3
    KEYWORD_CHAR = 4
    SITUATION_KEYWORD = 5
    KEYWORD_SITUATION = 6
    SITUATION_CHAR = 7
    CHAR_SITUATION = 8
    SITUATION_ACTION = 9
    ACTION_EFFECT = 10
    ACTION_REASON = 11
    KEYWORD_COMPARISON = 12
    SITUATION_DEF = 13
    CALCULUS = 14
    # Default label: rows are initialised to OTHER before manual labelling.
    OTHER = 666

def set_categories(questions, cat_indexes):
    """Label ``questions`` in place through its ``category`` column.

    Every row is first reset to ``QuestionCategory.OTHER``; then each entry
    of ``cat_indexes`` is applied in order.

    Args:
        questions: DataFrame to label (modified in place).
        cat_indexes: iterable of entries where item 0 is the row label (or
            labels) and item 1 is the category code to assign.
    """
    questions['category'] = QuestionCategory.OTHER
    for entry in cat_indexes:
        row_label, category = entry[0], entry[1]
        questions.loc[row_label, 'category'] = category

In [5]:
def load_glossary():
    """Load the glossary JSON file.

    The file is read from ``../Data/material_handbook/glossary.json``,
    relative to the current working directory.

    Returns:
        The decoded JSON content.
    """
    path = os.path.join('..', 'Data', 'material_handbook', 'glossary.json')
    # A context manager guarantees the handle is closed after parsing.
    with open(path) as glossary_file:
        return json.load(glossary_file)

def calculate_average_question_len(q):
    """Return the mean length of the answer choices of one question row.

    Choices A, B and C are always counted; choice D is counted only when the
    row has a non-missing ``choice_D``.

    Args:
        q: row-like mapping with ``choice_A_len``, ``choice_B_len``,
            ``choice_C_len``, ``choice_D`` and ``choice_D_len`` entries.

    Returns:
        The average of the counted ``choice_*_len`` values.
    """
    choice_lengths = [q['choice_A_len'], q['choice_B_len'], q['choice_C_len']]
    # pd.isna recognises every missing marker (None, float('nan'), np.nan).
    # The former `is np.NaN` identity test only matched one specific object,
    # and that alias no longer exists in NumPy 2.
    if not pd.isna(q['choice_D']):
        choice_lengths.append(q['choice_D_len'])
    return sum(choice_lengths) / len(choice_lengths)

def add_length_features(df):
    """Add character-length feature columns to ``df`` in place.

    Creates ``question_len`` and ``choice_{A..D}_len`` from the string length
    of the matching text columns, then ``average_question_len`` (row-wise via
    ``calculate_average_question_len``) and ``question_choice_len_ratio``
    (question length divided by that average).
    """
    for column in ('question', 'choice_A', 'choice_B', 'choice_C', 'choice_D'):
        df[column + '_len'] = df[column].str.len()
    df['average_question_len'] = df.apply(calculate_average_question_len, axis=1)
    df['question_choice_len_ratio'] = df['question_len'].div(df['average_question_len'])

def sentence_with_term(sentence, glossary):
    """Tell whether ``sentence`` contains any glossary keyword.

    The comparison is a case-insensitive substring test.

    Args:
        sentence: text to inspect. Missing values (NaN, None) and any other
            non-string input yield False.
        glossary: mapping whose keys are the keywords to look for.

    Returns:
        bool: True if at least one keyword occurs in the sentence.
    """
    # Missing cells reach this function as float NaN or None. The previous
    # `is np.NaN` identity test missed NaN objects other than the singleton
    # and then crashed on `.lower()`.
    if not isinstance(sentence, str):
        return False
    lowered = sentence.lower()  # hoisted: invariant across keywords
    return any(keyword.lower() in lowered for keyword in glossary)

def any_choice_in_glossary(df):
    """Tell whether any answer choice of one question row hit the glossary.

    Despite the parameter name, this is applied row-wise: ``df`` is a single
    row. The ``choice_D_in_glossary`` flag is only consulted when the row has
    a non-missing ``choice_D``.

    Args:
        df: row-like mapping with ``choice_{A..D}_in_glossary`` flags and a
            ``choice_D`` entry.

    Returns:
        bool: True if at least one considered flag is truthy.
    """
    flag_columns = ['choice_A_in_glossary',
                    'choice_B_in_glossary',
                    'choice_C_in_glossary']
    # pd.isna replaces the `is np.NaN` identity test, which only recognised
    # one specific NaN object.
    if not pd.isna(df['choice_D']):
        flag_columns.append('choice_D_in_glossary')
    # Generator keeps the short-circuit evaluation of the original or-chain.
    return any(df[column] for column in flag_columns)
            
def get_default_filters(df):
    """Build the default boolean row mask used by ``extract_definitions``.

    A row is kept when its question does not contain ``', CFA,'``, its
    ``question_choice_len_ratio`` is below 12.0, and its
    ``any_choice_in_glossary`` flag equals True.
    """
    without_cfa_marker = ~df['question'].str.contains(', CFA,')
    ratio_below_limit = df['question_choice_len_ratio'] < 12.0
    glossary_hit = df['any_choice_in_glossary'] == True
    return without_cfa_marker & ratio_below_limit & glossary_hit

def extract_definitions(df, filters_factory=None):
    """Return the questions that look like definition questions.

    Works on a copy of ``df``: adds the length features and the per-choice
    glossary flags, then keeps the rows selected by the filter mask.

    Args:
        df: question frame with ``question`` and ``choice_A`` .. ``choice_D``
            columns. It is not modified.
        filters_factory: callable taking the enriched frame and returning a
            boolean row mask. Defaults to ``get_default_filters``.

    Returns:
        pandas.DataFrame: the selected rows of the enriched copy.
    """
    glossary = load_glossary()
    df_copy = df.copy()
    add_length_features(df_copy)
    for letter in 'ABCD':
        choice_column = 'choice_' + letter
        df_copy[choice_column + '_in_glossary'] = df_copy[choice_column].apply(
            lambda choice: sentence_with_term(choice, glossary))
    df_copy['any_choice_in_glossary'] = df_copy.apply(any_choice_in_glossary, axis=1)
    if filters_factory is None:
        filters_factory = get_default_filters
    # The mask must be computed on the enriched copy. The feature columns the
    # default filters read do not exist on the caller's original frame.
    return df_copy[filters_factory(df_copy)]

In [41]:
# Manual labelling of the question frame: every row starts as OTHER, then
# hand-picked rows are re-labelled by index label.
all_df = read_all_problems()
all_df['category'] = QuestionCategory.OTHER

# Rows labelled together in one list share a category.
# NOTE(review): presumably these are occurrences of the same question in
# several exam files -- confirm against the data.
all_df.loc[1479, 'category'] = QuestionCategory.SITUATION_KEYWORD
all_df.loc[[991, 1481], 'category'] = QuestionCategory.KEYWORD_CHAR
all_df.loc[[1255, 1486], 'category'] = QuestionCategory.DEF_KEYWORD
all_df.loc[[1494, 2337], 'category'] = QuestionCategory.KEYWORD_CHAR
all_df.loc[[1497, 2339], 'category'] = QuestionCategory.KEYWORD_CHAR
all_df.loc[[1145, 1504], 'category'] = QuestionCategory.DEF_KEYWORD
all_df.loc[[1508], 'category'] = QuestionCategory.KEYWORD_DEF_START_END
all_df.loc[[1516, 1774, 2119, 2353], 'category'] = QuestionCategory.DEF_KEYWORD
all_df.loc[[1518, 2122, 2354], 'category'] = QuestionCategory.DEF_KEYWORD
all_df.loc[[1556, 2153], 'category'] = QuestionCategory.DEF_KEYWORD
all_df.loc[[1098, 1578, 1745, 1937], 'category'] = QuestionCategory.KEYWORD_COMPARISON
all_df.loc[[1579, 2185], 'category'] = QuestionCategory.DEF_KEYWORD
all_df.loc[[1583], 'category'] = QuestionCategory.DEF_KEYWORD
all_df.loc[[1586, 1875, 2178], 'category'] = QuestionCategory.DEF_KEYWORD
all_df.loc[[1591, 1959, 2189], 'category'] = QuestionCategory.KEYWORD_DEF_START_END
all_df.loc[[1600], 'category'] = QuestionCategory.DEF_KEYWORD
all_df.loc[[1626], 'category'] = QuestionCategory.SITUATION_CHAR
all_df.loc[[1276, 1632], 'category'] = QuestionCategory.DEF_KEYWORD
all_df.loc[[1158, 1643], 'category'] = QuestionCategory.CHAR_KEYWORD
all_df.loc[[1649, 2248], 'category'] = QuestionCategory.SITUATION_KEYWORD
all_df.loc[[1669, 2268], 'category'] = QuestionCategory.SITUATION_CHAR
all_df.loc[[1744], 'category'] = QuestionCategory.KEYWORD_CHAR
all_df.loc[[1749], 'category'] = QuestionCategory.DEF_KEYWORD
all_df.loc[[1757], 'category'] = QuestionCategory.KEYWORD_DEF
all_df.loc[[1763], 'category'] = QuestionCategory.KEYWORD_DEF_START_END
all_df.loc[[1137, 1851], 'category'] = QuestionCategory.CHAR_KEYWORD

# Single-row labels, in ascending row-label order.
all_df.loc[27, 'category'] = QuestionCategory.KEYWORD_DEF_START_END
all_df.loc[93, 'category'] = QuestionCategory.SITUATION_ACTION
all_df.loc[344, 'category'] = QuestionCategory.CHAR_KEYWORD
all_df.loc[386, 'category'] = QuestionCategory.KEYWORD_DEF_START_END
all_df.loc[453, 'category'] = QuestionCategory.SITUATION_ACTION
all_df.loc[557, 'category'] = QuestionCategory.ACTION_EFFECT
all_df.loc[652, 'category'] = QuestionCategory.DEF_KEYWORD
all_df.loc[703, 'category'] = QuestionCategory.DEF_KEYWORD
all_df.loc[764, 'category'] = QuestionCategory.ACTION_EFFECT
all_df.loc[905, 'category'] = QuestionCategory.ACTION_REASON
all_df.loc[1059, 'category'] = QuestionCategory.CALCULUS
all_df.loc[1231, 'category'] = QuestionCategory.DEF_KEYWORD
all_df.loc[1750, 'category'] = QuestionCategory.DEF_KEYWORD
all_df.loc[1849, 'category'] = QuestionCategory.SITUATION_ACTION
all_df.loc[2375, 'category'] = QuestionCategory.SITUATION_DEF

In [72]:
# Rebinds all_df to a fresh read_all_problems() result; the frame labelled in
# the In [41] cell is no longer reachable through this name.
all_df = read_all_problems()
# potential_def = extract_definitions(all_df)
# NOTE(review): with the line above commented out, `potential_def` (read by
# later cells) is never assigned in this notebook -- it presumably survives
# from an earlier kernel state.
all_df

Unnamed: 0,answer,category,choice_A,choice_B,choice_C,choice_D,comments,filename,question,question_nb,topic,year
0,A,666,rely on the integrity of input data,address every aspect of performance measurement,consist of required provisions for firms to fo...,must be applied with the goal of achieving exc...,Global Investment Performance Standards (GIPS)...,2008_part_1.xml,Which of the following is a key characteristic...,1,Ethical and Professional Standards,2008
1,B,666,disclosing potential conflicts of interest,habitually voting with management on proxies t...,disclosing confidential client information to ...,using client brokerage to purchase goods or se...,"Guidance for Standards I-VII, Standards of Pra...",2008_part_1.xml,According to the Standards of Practice Handboo...,2,Ethical and Professional Standards,2008
2,B,666,No,"Yes, because she has breached her duty to her ...","Yes, because she has failed to obtain written ...","Yes, because her allocation procedures contrib...","Guidance for Standards I-VII, Standards of Pra...",2008_part_1.xml,"Carla Scott, CFA, is a portfolio manager for a...",3,Ethical and Professional Standards,2008
3,C,666,suspend the employee,suspend Marshall from her supervisory duties,initiate an investigation to determine the ext...,demand that the employee involved provide assu...,"Guidance for Standards I-VII, Standards of Pra...",2008_part_1.xml,"Kim Li, CFA, is a portfolio manager for an inv...",4,Ethical and Professional Standards,2008
4,D,666,No No,No Yes,Yes No,Yes Yes,"Guidance for Standards I-VII, Standards of Pra...",2008_part_1.xml,"Marcus Takeda, CFA, is an analyst at a small i...",5,Ethical and Professional Standards,2008
5,C,666,No No,No Yes,Yes No,Yes Yes,"Guidance for Standards I-VII, Standards of Pra...",2008_part_1.xml,"David Gunard, CFA, is an equity analyst at Cur...",6,Ethical and Professional Standards,2008
6,A,666,No No,No Yes,Yes No,Yes Yes,"Guidance for Standards I-VII, Standards of Pra...",2008_part_1.xml,According to the Standards of Practice Handboo...,7,Ethical and Professional Standards,2008
7,D,666,No No,No Yes,Yes No,Yes Yes,"Guidance for Standards I-VII, Standards of Pra...",2008_part_1.xml,According to the Standards of Practice Handboo...,8,Ethical and Professional Standards,2008
8,A,666,clients,colleagues,his reputation,the employer's reputation,"Guidance for Standards I-VII, Standards of Pra...",2008_part_1.xml,According to the Standards of Practice Handboo...,9,Ethical and Professional Standards,2008
9,B,666,No,"Yes, because he failed to obtain consent from ...","Yes, because he failed to disclose his new emp...","Yes, because he violated his duty to his emplo...","Standards of Practice Handbook, 9th edition (C...",2008_part_1.xml,"Buta Singh, CFA, has a large extended family a...",10,Ethical and Professional Standards,2008


In [8]:
# Sample two-sentence question used to try out the helpers defined in this cell.
question = 'Which of the following statements regarding correlation and covariance is most likely correct? The correlation  between two random variables is their covariance standardized by the:'

def is_start_end_definition(question):
    """Tell whether ``question`` matches the start/end definition pattern.

    The pattern requires some text ending in ``.``, ``!`` or ``?``, followed
    by a final clause of the form ``The <...> is <...>:`` containing no
    further sentence punctuation.
    """
    pattern = '^.*[.!?] The([^.!?]*) is [^.!?]*:$'
    if re.match(pattern, question) is None:
        return False
    return True

def extract_last_sentence(question):
    """Return the last sentence of ``question``, stripped of outer whitespace.

    Sentences are delimited by ``.``, ``!`` or ``?``; the text may also end
    with ``:``. When the text does not end with one of these terminators,
    the fragment after the last ``.``, ``!`` or ``?`` is returned (the whole
    text if there is none).

    Args:
        question: question text.

    Returns:
        str: the last sentence or trailing fragment, stripped.
    """
    match = re.match('^([^.!?]*[.!?:])+$', question)
    if match is not None:
        # A repeated group keeps its last repetition, i.e. the final sentence.
        return match.group(1).strip()
    # No terminator at the end (or empty text): fall back to the trailing
    # fragment rather than failing on `None.group`.
    return re.split('[.!?]', question)[-1].strip()

# Quick check on the sample question: shows its final "The correlation ...:" sentence.
extract_last_sentence(question)

'The correlation  between two random variables is their covariance standardized by the:'

In [10]:
# NOTE(review): `test` is only assigned in the In [59] cell further down, so
# this cell does not run on a fresh kernel in file order.
test.loc[93]['question']

'Two parties agree to a forward contract to deliver the SP 500 Index at a price of $375,000 in 2 months time. When  the forward contract expires, the price of the SP 500 Index is $350,000 but the long party is unable to pay the cash  settlement. The short party is most likely obligated to:'

In [38]:
from spacy.symbols import nsubj, VERB

def display_root(sentence):
    """Print the first token of the parsed ``sentence`` that is its own head."""
    parsed = nlp(sentence)
    self_headed = [tok for tok in parsed if tok.head == tok]
    print(self_headed[0])

def display_dependencies(sentence):
    """Render the dependency parse of ``sentence`` inline with displaCy."""
    displacy.render(nlp(sentence), style='dep', jupyter=True)

In [37]:
# Print the root token of every extracted last sentence.
# NOTE(review): `last_sentences` is assigned in a cell that appears later in
# the file, so this cell does not run on a fresh kernel in file order.
for sentence in last_sentences:
    display_root(sentence)

is
obligated
is
is
obligated
is
is
is
is
is
is
is
known
described
is
is


In [32]:
# Last sentence of every question in `test`, as a NumPy array.
# NOTE(review): `test` is assigned in the In [59] cell further down, while
# the In [37] cell above already reads `last_sentences` -- the execution
# order differs from the file order.
last_sentences = test['question'].apply(extract_last_sentence).values
last_sentences

array(['The correlation  between two random variables is their covariance standardized by the:',
       'The short party is most likely obligated to:',
       'The type of security that is most likely to yield the lowest recovery in a bankruptcy is a:',
       'The correlation between two random variables is their covariance standardized by the product of the variables’:',
       'The short party is most likely obligated to:',
       'The most likely effect of this change to the company’s credit customers is a four day:',
       'The reinvestment risk for an investor holding the bonds to maturity is greatest for the bond that is:',
       'The most likely recording of this transaction in the cash flow statement is as a(n):',
       'The pair of bonds most likely to meet the investor’s objective is a:',
       "The most likely effect of these transactions on the retailer's accounting equation for the month is that assets will:",
       'The motivation for the company’s behavior is most 

In [33]:
# Render the dependency parse of the third extracted sentence.
# The helper is named display_dependencies; the former spelling
# `display_depencies` is not defined anywhere and raised NameError.
display_dependencies(last_sentences[2])

In [59]:
# Keep the candidate questions matching the start/end definition pattern,
# then build a reduced copy whose `category` column starts as OTHER.
# NOTE(review): `potential_def` is only assigned in a commented-out line of
# this notebook (In [72]) -- it presumably survives from an earlier kernel
# state; confirm before re-running.
def_filter = potential_def['question'].apply(is_start_end_definition)
test = potential_def[def_filter]
test_filtered = test[['answer', 'choice_A', 'choice_B', 'choice_C', 'choice_D', 'question']].copy()
test_filtered['category'] = QuestionCategory.OTHER

In [70]:
test_filtered

Unnamed: 0,answer,choice_A,choice_B,choice_C,choice_D,question,category
27,D,product of the variables' variances.,variance of the dependent variable.,variance of the independent variable.,product of the variables' standard deviations.,Which of the following statements regarding co...,2
93,B,default on the forward contract.,do nothing until the long makes payment.,accept delivery of SP 500 stocks from the long.,deliver the portfolio of SP 500 stocks to the ...,Two parties agree to a forward contract to del...,9
344,B,mortgage bond,debenture bond.,collateral trust bond.,,An analyst is evaluating various debt securiti...,3
386,B,variances.,standard deviations.,coeffi ci ents of vari ati on.,,Which of the following statements best describ...,2
453,B,Default on the forward contract,Do nothing until the long makes payment,Accept delivery of S&P 500 stocks from the long,,Two parties agree to a forward contract to del...,9
557,C,increase in their operating cycle.,decrease in their operating cycle.,decrease in their net operating cycle.,,A company extends its trade credit terms by fo...,10
582,C,a zero-coupon bond.,a coupon bond selling at a discount to par.,a coupon bond selling at a premium to par value.,,Two amortizing bonds have the same maturity da...,666
652,A,disclosure in a note or supplementary schedule.,"outflow from investing activities, and an infl...","outflow from operating activities, and an infl...",,A company issued shares to acquire a large tra...,0
703,B,putable bond and a callable bond.,zero-coupon bond and a Treasury strip.,mortgage-backed-security and an asset-backed s...,,An investor is evaluating a diverse set of bon...,0
764,B,be unchanged.,"increase by $1,000.","decrease by $2,000. By accessing this mock exa...",,"At the start of a month, a retailer paid $5,00...",10


In [79]:
test_filtered.loc[2375, 'question']

'An investor sells a bond at the quoted price of $98.00. In addition, she receives accrued interest of $4.40. The flat price of the bond is equal to the:'

In [None]:
potential_def.loc[27]['question']

In [None]:
potential_def.loc[0:30]

In [None]:
all_df.loc[2380]['question']

In [None]:
get_best_documents(docs_df, ['callable bond', 'negative convexity'])

In [None]:
get_text_by_id(docs_df, 'Investopedia_41106')