In [1]:
import os
import re
import json
import spacy
import numpy as np
import pandas as pd
from spacy import displacy
from read_problems import read_questions_json
from read_problems import read_all_problems
from read_problems import write_problems

%matplotlib inline

In [6]:
# Load English tokenizer, tagger, parser, NER and word vectors
# NOTE(review): 'en' is a shortcut-link model name; spaCy v3 presumably no longer
# resolves shortcut links, so a full package name (e.g. 'en_core_web_sm') may be
# needed on newer installs — confirm against the installed spaCy version.
nlp = spacy.load('en')

In [2]:
class QuestionCategory:
    """Integer codes stored in the ``category`` column of the problems table.

    The member names look like ``<GIVEN>_<ASKED>`` pairs (presumably DEF means
    "definition" and CHAR means "characteristic" — TODO confirm with whoever
    labelled the data). The numeric values are compared against data loaded
    from disk, so they must not be renumbered.
    """

    # Definition/keyword questions; the *_START_END variants presumably mark
    # questions whose text is split around the term — TODO confirm.
    DEF_KEYWORD = 0
    DEF_KEYWORD_START_END = 1
    KEYWORD_DEF = 2
    KEYWORD_DEF_START_END = 3

    # Other question shapes.
    CHAR_KEYWORD = 4
    KEYWORD_CHAR = 5
    SITUATION_KEYWORD = 6
    KEYWORD_SITUATION = 7
    SITUATION_CHAR = 8
    CHAR_SITUATION = 9
    SITUATION_ACTION = 10
    ACTION_EFFECT = 11
    ACTION_REASON = 12
    KEYWORD_COMPARISON = 13
    SITUATION_DEF = 14
    CALCULUS = 15
    REASONING = 16

    # Catch-all code; rows that have not been given a specific category carry
    # this value in the loaded data.
    OTHER = 666

In [3]:
def load_glossary():
    """Load the handbook glossary from ``../Data/material_handbook/glossary.json``.

    The path is resolved relative to the current working directory.

    Returns:
        The parsed JSON content. Callers in this notebook treat it as a
        mapping keyed by glossary term.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    path = os.path.join('..', 'Data', 'material_handbook', 'glossary.json')
    # A context manager closes the handle even when parsing fails; the bare
    # open() call previously left it to the garbage collector.
    with open(path) as glossary_file:
        return json.load(glossary_file)

def calculate_average_question_len(q):
    """Return the mean character length of one problem's answer choices.

    Despite the name, the value averages the *choice* lengths, not the
    question length.

    Args:
        q: a row-like object (DataFrame row or dict) providing
            ``choice_A_len``, ``choice_B_len``, ``choice_C_len``,
            ``choice_D`` and ``choice_D_len``.

    Returns:
        The mean of the three or four choice lengths; choice D is only
        counted when ``choice_D`` is present.
    """
    lengths = [q['choice_A_len'], q['choice_B_len'], q['choice_C_len']]
    # pd.isna recognises every missing-value marker (any float NaN, None, pd.NA).
    # The previous ``is np.NaN`` identity test only matched one specific object
    # and breaks on NumPy 2.0, where the ``np.NaN`` alias was removed.
    if not pd.isna(q['choice_D']):
        lengths.append(q['choice_D_len'])
    return sum(lengths) / len(lengths)

def add_length_features(df):
    """Add character-length feature columns to ``df`` in place.

    New columns: ``question_len``, ``choice_A_len`` .. ``choice_D_len``,
    ``average_question_len`` (row-wise result of
    ``calculate_average_question_len``) and ``question_choice_len_ratio``
    (question length divided by that average).

    Args:
        df: DataFrame with string columns ``question`` and
            ``choice_A`` .. ``choice_D``. It is modified directly.

    Returns:
        None.
    """
    text_columns = ('question', 'choice_A', 'choice_B', 'choice_C', 'choice_D')
    for column in text_columns:
        df[column + '_len'] = df[column].str.len()

    average_len = df.apply(calculate_average_question_len, axis=1)
    df['average_question_len'] = average_len
    df['question_choice_len_ratio'] = df['question_len'] / df['average_question_len']

def sentence_with_term(sentence, glossary):
    """Tell whether ``sentence`` contains any glossary keyword.

    Matching is a case-insensitive substring test, so a keyword also matches
    inside a longer word.

    Args:
        sentence: the text to inspect, or a missing value (NaN / None).
        glossary: mapping (or any iterable) whose keys are the keywords.

    Returns:
        True if at least one keyword occurs in the sentence, False otherwise
        (including when the sentence is missing).
    """
    # pd.isna catches every missing-value marker; the former ``is np.NaN``
    # identity check missed NaN objects other than that singleton and fails
    # on NumPy 2.0, where ``np.NaN`` no longer exists.
    if pd.isna(sentence):
        return False
    lowered = sentence.lower()  # lower-case once instead of once per keyword
    return any(keyword.lower() in lowered for keyword in glossary)

def any_choice_in_glossary(df):
    """Tell whether any answer choice of one problem matched the glossary.

    Args:
        df: a single row (despite the name), providing ``choice_D`` and the
            boolean flags ``choice_A_in_glossary`` .. ``choice_D_in_glossary``.

    Returns:
        True if at least one considered flag is truthy. The D flag is only
        considered when ``choice_D`` is present.
    """
    flags = [
        df['choice_A_in_glossary'],
        df['choice_B_in_glossary'],
        df['choice_C_in_glossary'],
    ]
    # pd.isna replaces the ``is np.NaN`` identity test, which only matched one
    # specific NaN object and fails on NumPy 2.0 (alias removed).
    if not pd.isna(df['choice_D']):
        flags.append(df['choice_D_in_glossary'])
    return any(flags)
            
def get_default_filters(df):
    """Build the default boolean row mask used to select definition questions.

    A row is kept when all three conditions hold:
      * the question text does not contain ``', CFA,'``;
      * ``question_choice_len_ratio`` is strictly below 12.0;
      * ``any_choice_in_glossary`` is True.

    Args:
        df: DataFrame that has the ``question``,
            ``question_choice_len_ratio`` and ``any_choice_in_glossary``
            columns.

    Returns:
        A boolean Series aligned with ``df``.
    """
    mentions_cfa = df['question'].str.contains(', CFA,')
    ratio_ok = df['question_choice_len_ratio'] < 12.0
    has_glossary_choice = df['any_choice_in_glossary'] == True
    return ~mentions_cfa & ratio_ok & has_glossary_choice

def extract_definitions(df, filters_factory=None):
    """Return the rows of ``df`` that look like definition questions.

    Works on a copy of ``df``: adds the length features and the
    ``choice_*_in_glossary`` / ``any_choice_in_glossary`` flags, then keeps
    the rows selected by the mask from ``filters_factory``.

    Args:
        df: problems DataFrame with ``question`` and
            ``choice_A`` .. ``choice_D`` columns. It is not modified.
        filters_factory: callable taking the enriched DataFrame and returning
            a boolean mask. Defaults to ``get_default_filters``.

    Returns:
        The filtered, enriched copy, including the derived columns.
    """
    glossary = load_glossary()
    df_copy = df.copy()
    add_length_features(df_copy)
    for letter in 'ABCD':
        df_copy['choice_{}_in_glossary'.format(letter)] = df_copy['choice_' + letter].apply(
            lambda choice: sentence_with_term(choice, glossary))
    df_copy['any_choice_in_glossary'] = df_copy.apply(any_choice_in_glossary, axis=1)
    if filters_factory is None:
        filters_factory = get_default_filters
    # The mask must come from the enriched copy: the derived columns the
    # default filters read do not exist on the caller's ``df``, so passing
    # ``df`` raised KeyError. The copy keeps every original column, so custom
    # factories written against ``df`` keep working.
    return df_copy[filters_factory(df_copy)]

In [4]:
# Load every problem through the project helper and preview the first rows.
all_df = read_all_problems()
all_df.head()

Unnamed: 0,answer,category,choice_A,choice_B,choice_C,choice_D,comments,filename,question,question_nb,topic,year
0,A,666,rely on the integrity of input data,address every aspect of performance measurement,consist of required provisions for firms to fo...,must be applied with the goal of achieving exc...,Global Investment Performance Standards (GIPS)...,2008_part_1.xml,Which of the following is a key characteristic...,1,Ethical and Professional Standards,2008
1,B,666,disclosing potential conflicts of interest,habitually voting with management on proxies t...,disclosing confidential client information to ...,using client brokerage to purchase goods or se...,"Guidance for Standards I-VII, Standards of Pra...",2008_part_1.xml,According to the Standards of Practice Handboo...,2,Ethical and Professional Standards,2008
2,B,666,No,"Yes, because she has breached her duty to her ...","Yes, because she has failed to obtain written ...","Yes, because her allocation procedures contrib...","Guidance for Standards I-VII, Standards of Pra...",2008_part_1.xml,"Carla Scott, CFA, is a portfolio manager for a...",3,Ethical and Professional Standards,2008
3,C,666,suspend the employee,suspend Marshall from her supervisory duties,initiate an investigation to determine the ext...,demand that the employee involved provide assu...,"Guidance for Standards I-VII, Standards of Pra...",2008_part_1.xml,"Kim Li, CFA, is a portfolio manager for an inv...",4,Ethical and Professional Standards,2008
4,D,666,No No,No Yes,Yes No,Yes Yes,"Guidance for Standards I-VII, Standards of Pra...",2008_part_1.xml,"Marcus Takeda, CFA, is an analyst at a small i...",5,Ethical and Professional Standards,2008


In [15]:
# Experiment: spaCy similarity between the question of problem 26 and four
# candidate texts. A and B are literal texts typed into this cell; C and D are
# the dataset's own choices for that problem.
question = nlp(all_df.loc[26, 'question'])
choiceA = nlp('a probability distribution that summarizes the likelihood that a value will take one of two independent values under a given set of parameters or assumptions')
choiceB = nlp('A psychological phenomenon that explains why people tend to seek out information that confirms their existing opinions and overlook or ignore information that refutes their beliefs')
choiceC = nlp(all_df.loc[26, 'choice_C'])
choiceD = nlp(all_df.loc[26, 'choice_D'])

# One similarity score per line, in A-D order.
print(*(question.similarity(candidate) for candidate in (choiceA, choiceB, choiceC, choiceD)), sep='\n')

# Cell output: whether the parsed choice D has a vector.
choiceD.has_vector

0.4446638748095584
0.34034696278557064
0.27845460590124105
0.14654814882619355


True

In [20]:
# Scratch check: str.join accepts a tuple of strings.
' '.join(("test", "foo"))

'test foo'

In [22]:
def get_search_text(index, df=None):
    """Concatenate a problem's question and answer choices into one string.

    Args:
        index: index label of the problem row.
        df: DataFrame to read from; defaults to the notebook-level ``all_df``.

    Returns:
        The question followed by choices A-D, separated by single spaces.
        Missing fields are skipped: three-choice problems have NaN in
        ``choice_D``, which previously made ``' '.join`` raise TypeError.
    """
    if df is None:
        df = all_df
    problem = df.loc[index]
    fields = ('question', 'choice_A', 'choice_B', 'choice_C', 'choice_D')
    parts = [problem[field] for field in fields if not pd.isna(problem[field])]
    return ' '.join(parts)

get_search_text(26)

'If an analyst estimates the probability of an event for which there is no historical record, this probability is best  described as: a priori objective empirical subjective'

In [5]:
# Show every problem whose question text contains 'described' or 'defined'.
all_df[all_df['question'].str.contains('described') | all_df['question'].str.contains('defined')]

Unnamed: 0,answer,category,choice_A,choice_B,choice_C,choice_D,comments,filename,question,question_nb,topic,year
26,D,1,a priori,objective,empirical,subjective,"""Probability Concepts,"" Richard A. Defusco, De...",2008_part_1.xml,If an analyst estimates the probability of an ...,27,Quantitative Methods,2008
102,A,2,Z-spread minus the option cost,Z-spread plus the cost of the option,value of the security's embedded option,effect of changes in interest rates on the val...,"""Yield Measures, Spot Rates, and Forward Rates...",2008_part_1.xml,The option adjusted spread (OAS) is best descr...,103,Fixed Income Investments,2008
142,A,6,ratio,ordinal,interval,nominal,"“Statistical Concepts and Market Returns,” Ric...",2008_part_2.xml,An analyst gathered the price-earnings ratios ...,23,Quantitative Methods,2008
189,D,6,an externality,project sequencing,a mutually exclusive project,an example of investment synergy,"“Capital Budgeting,” JohnD. Stowe and Jacques ...",2008_part_2.xml,Howard Quarries has recently opened a limeston...,70,Corporate Finance,2008
192,D,3,marginal cost of capital,cost of new debt capital,cost of retained earnings,weighted average cost of capital,"“Cost of Capital,” Yves Courtois, GeneC. Lai, ...",2008_part_2.xml,A company's optimal capital budget is best des...,73,Corporate Finance,2008
318,B,3,only interest payments to debt holders,payments to both debt holders (interest and pr...,both interest and principal payments to debt h...,,"“Understanding the Cash Flow Statement,” Thoma...",2009_afternoon_answers.xml,Free cash flow to equity is most accurately de...,86,Equity Investments,2009
375,C,1,a priori,empirical,subjective,,"“Probability Concepts,” Richard A. Defusco, CF...",2009_morning_answers.xml,If an analyst estimates the probability of an ...,26,Quantitative Methods,2009
391,A,4,Price stability,Full employment,Moderating long-term interest rates,,"“Monetary Policy,” Michael Parkin 2009 Modular...",2009_morning_answers.xml,Which of the following goals of monetary polic...,43,Economics,2009
431,A,0,escalation bias,confirmation bias,overconfidence bias,,"“Efficient Capital Markets,” Frank K. Reilly, ...",2009_morning_answers.xml,The behavior of investors who put more money i...,84,Equity Investments,2009
438,B,0,interest rate cap,interest rate floor,interest rate collar,,"“Option Markets and Contracts,” Don M. Chance ...",2009_morning_answers.xml,A series of interest rate put options that exp...,91,Derivative Investments,2009


In [22]:
all_df.loc[192]['question']

"A company's optimal capital budget is best described as the amount of new capital required to undertake all projects with an  internal rate of return greater than the:"

In [30]:
all_df[all_df['category'] == QuestionCategory.CALCULUS]

Unnamed: 0,answer,category,choice_A,choice_B,choice_C,choice_D,comments,filename,question,question_nb,topic,year
1038,C,15,3.13%,42.40%,73.50%,,"""Understanding Yield Spreads,"" Frank J. Fabozz...",2012_afternoon_answer.xml,If the yield on a 5-year U.S. corporate bond i...,101,Fixed Income Investments,2012


In [18]:
all_df.loc[1693]['question']

'The value at risk of an alternative investment is best described as the:'

In [20]:
all_df = read_all_problems()

In [123]:
write_problems(all_df)

In [120]:
all_df.loc[1226]['question']

'In the audit report, an additional paragraph that explains an exception to an accounting standard is best described as a(n):'

In [7]:
# Keep the problems whose question text contains 'defined' or 'described'.
potential_defs = all_df[(all_df['question'].str.contains('defined') | all_df['question'].str.contains('described'))]
# Of those, show the rows whose category is still OTHER, from index label 1095
# onwards (.loc slices by label, not by position).
potential_defs[potential_defs['category'] == QuestionCategory.OTHER].loc[1095:]

Unnamed: 0,answer,category,choice_A,choice_B,choice_C,choice_D,comments,filename,question,question_nb,topic,year
1270,B,666,size of the contract,original time to maturity,net amount owed by one party to the other,,"""Swap Markets and Contracts,"" Don M. Chance, C...",2013_afternoon_answer.xml,The tenor of a swap is best described as the:,95,Derivative Investments,2013
1294,B,666,market factor model,fundamental factor model,macroeconomic factor model,,"""Portfolio Risk and Return Part II"", Vijay Sin...",2013_afternoon_answer.xml,A return-generating model that provides an est...,119,Portfolio Management,2013
1315,B,666,platykurtotic (less peaked than a normal distr...,leptokurtotic (more peaked than a normal distr...,mesokurtotic (identical to the normal distribu...,,"""Statistical Concepts and Market Returns,"" Ric...",2013_morning_answer.xml,"Equity return series are best described as, fo...",20,Quantitative Methods,2013
1320,A,666,support level,resistance level,change in polarity point,,"""Technical Analysis,"" Barry M. Sine, CFA and R...",2013_morning_answer.xml,A stock is declining in price and reaches a pr...,25,Quantitative Methods,2013
1324,A,666,risk of loss relative to an investment's fair ...,increased sensitivity of the market value of d...,possibility that the borrower will fail to mak...,,"""The Time Value of Money,"" Richard A. DeFusco,...",2013_morning_answer.xml,The liquidity premium can be best described as...,29,Quantitative Methods,2013
1333,C,666,E,G,F+H,,"""Demand and Supply Analysis: Introduction,"" Ri...",2013_morning_answer.xml,The diagram to the right shows the domestic de...,38,Economics,2013
1340,B,666,as liquid,less liquid,more liquid,,"""Understanding Balance Sheets,"" Elaine Henry, ...",2013_morning_answer.xml,The current ratio for an industry is 3.2. Data...,45,Financial Statement Analysis,2013
1354,C,666,timeliness and accrual accounting,understandability and verifiability,relevance and faithful representation,,"""Financial Reporting Standards,"" Elaine Henry,...",2013_morning_answer.xml,According to the International Accounting Stan...,59,Financial Statement Analysis,2013
1386,A,666,swap,futures contract,forward contract,,"""Derivative Markets and Instruments,"" Don M. C...",2013_morning_answer.xml,A corporation issues 5-year fixed-rate bonds. ...,91,Derivative Investments,2013
1405,B,666,price return,collateral yield,convenience yield,,"""Investing in Commodities,"" Ronald G. Layard-L...",2013_morning_answer.xml,If an investor uses derivatives to make a long...,110,Alternative Investments,2013


In [31]:
question = 'Which of the following statements regarding correlation and covariance is most likely correct? The correlation  between two random variables is their covariance standardized by the:'

def is_start_end_definition(question):
    """Tell whether ``question`` ends with a "The <term> is <...>:" clause.

    The text must contain an earlier part ending in ``.``, ``!`` or ``?``,
    followed by a final clause that starts with " The", contains " is ",
    has no further sentence punctuation and ends with a colon.

    Args:
        question: the question text.

    Returns:
        True when the pattern matches, False otherwise.
    """
    pattern = '^.*[.!?] The([^.!?]*) is [^.!?]*:$'
    return re.match(pattern, question) is not None

def extract_last_sentence(question):
    """Return the last sentence of ``question``, stripped of outer whitespace.

    Sentences are delimited by ``.``, ``!`` or ``?``; a colon may also close
    the final sentence.

    Args:
        question: the question text.

    Returns:
        The last sentence including its closing punctuation. When the text
        does not end with a terminator, the trailing fragment after the last
        ``.``/``!``/``?`` is returned instead (the whole text if there is
        none, ``''`` for empty input).
    """
    # Strip first so trailing whitespace does not defeat the end anchor.
    text = question.strip()
    match = re.match('^([^.!?]*[.!?:])+$', text)
    if match is None:
        # No closing terminator: the pattern cannot match, and calling
        # .group() on None used to raise AttributeError.
        return re.split('[.!?]', text)[-1].strip()
    # For a repeated group, group(1) holds the last repetition, i.e. the
    # final sentence.
    return match.group(1).strip()

extract_last_sentence(question)

'The correlation  between two random variables is their covariance standardized by the:'

In [None]:
test.loc[93]['question']

In [32]:
from spacy.symbols import nsubj, VERB

def display_root(sentence):
    """Print the first root token of the dependency parse of ``sentence``.

    A token counts as a root when it is its own head. If the text parses into
    several roots, only the first is printed.

    Args:
        sentence: text to parse with the notebook-level ``nlp`` pipeline.

    Raises:
        IndexError: if the parse contains no root token.
    """
    roots = []
    for token in nlp(sentence):
        if token.head == token:
            roots.append(token)
    print(roots[0])

def display_dependencies(sentence):
    """Render the dependency parse of ``sentence`` inline with displaCy.

    Args:
        sentence: text to parse with the notebook-level ``nlp`` pipeline.
    """
    parsed = nlp(sentence)
    displacy.render(parsed, style='dep', jupyter=True)

In [15]:
# Print the parse root of each extracted last sentence.
# NOTE(review): `last_sentences` is only assigned in a later cell, so this cell
# raises NameError on a fresh top-to-bottom run.
for sentence in last_sentences:
    display_root(sentence)

NameError: name 'last_sentences' is not defined

In [33]:
# Extract the last sentence of every question in `test` as an array.
# NOTE(review): `test` is only assigned in a later cell, so this cell raises
# NameError on a fresh top-to-bottom run.
last_sentences = test['question'].apply(extract_last_sentence).values
last_sentences

NameError: name 'test' is not defined

In [None]:
# Render the dependency parse of the third extracted sentence. The helper is
# named display_dependencies; the former spelling `display_depencies` is not
# defined anywhere and raised NameError.
display_dependencies(last_sentences[2])

In [None]:
# Keep the candidate definition questions whose text matches the
# "... The <term> is <...>:" shape. The candidates live in `potential_defs`;
# the former name `potential_def` is never assigned in this notebook.
def_filter = potential_defs['question'].apply(is_start_end_definition)
test = potential_defs[def_filter]
# Work on an independent copy of the relevant columns and reset its category
# to OTHER.
test_filtered = test[['answer', 'choice_A', 'choice_B', 'choice_C', 'choice_D', 'question']].copy()
test_filtered['category'] = QuestionCategory.OTHER

In [None]:
test_filtered

In [None]:
test_filtered.loc[2375, 'question']

In [None]:
potential_def.loc[27]['question']

In [None]:
potential_def.loc[0:30]

In [None]:
all_df.loc[2380]['question']

In [None]:
get_best_documents(docs_df, ['callable bond', 'negative convexity'])

In [None]:
get_text_by_id(docs_df, 'Investopedia_41106')