In [1]:
import os
import re
import json
import spacy
import numpy as np
import pandas as pd
from spacy import displacy
from read_problems import read_questions_json
from read_problems import read_all_problems
from read_problems import write_problems

%matplotlib inline

In [2]:
# Load English tokenizer, tagger, parser, NER and word vectors
# NOTE(review): the 'en' shortcut name only resolves on older spaCy releases
# (shortcut links were dropped in spaCy v3) — confirm the pinned spaCy version.
nlp = spacy.load('en')

In [102]:
class QuestionCategory(object):
    # Integer codes stored in the 'category' column of the problems DataFrame.
    # NOTE(review): names appear to follow a <given>_<asked> pattern
    # (e.g. DEF_KEYWORD = a definition is given, a keyword is expected);
    # this reading is inferred from the names only — confirm with the author.
    DEF_KEYWORD = 0
    DEF_KEYWORD_START_END = 1
    KEYWORD_DEF = 2
    KEYWORD_DEF_START_END = 3
    
    CHAR_KEYWORD = 4
    KEYWORD_CHAR = 5
    SITUATION_KEYWORD = 6
    KEYWORD_SITUATION = 7
    SITUATION_CHAR = 8
    CHAR_SITUATION = 9
    SITUATION_ACTION = 10
    ACTION_EFFECT = 11
    ACTION_REASON = 12
    KEYWORD_COMPARISON = 13
    SITUATION_DEF = 14
    CALCULUS = 15
    REASONING = 16
    
    # Default label: every row gets this value before the manual labels are applied.
    OTHER = 666

In [4]:
def load_glossary(path=None):
    """Load the glossary JSON file and return its parsed content.

    Args:
        path: optional path of the JSON file to read. When omitted, the file
            ``../Data/material_handbook/glossary.json`` (relative to the
            current working directory) is used, as before.

    Returns:
        The deserialized JSON document. Callers in this notebook iterate it
        with ``.items()``, i.e. they treat it as a keyword -> definition dict.
    """
    if path is None:
        path = os.path.join('..', 'Data', 'material_handbook', 'glossary.json')
    # Use a context manager so the file handle is closed as soon as parsing ends
    # instead of being left open until garbage collection.
    with open(path) as glossary_file:
        return json.load(glossary_file)

def calculate_average_question_len(q):
    """Return the mean length of the answer choices of one question.

    Despite its name, the value is the average of the *choice* lengths
    (``choice_*_len``), not of the question text.

    Args:
        q: row-like mapping (e.g. a DataFrame row) exposing ``choice_A_len``,
            ``choice_B_len``, ``choice_C_len``, ``choice_D_len`` and ``choice_D``.

    Returns:
        The mean of the A-C lengths when ``choice_D`` is missing, otherwise
        the mean of the four lengths.
    """
    total = q['choice_A_len'] + q['choice_B_len'] + q['choice_C_len']
    # pd.isnull() instead of `is np.NaN`: the identity test only matches the
    # NumPy singleton (any other NaN object or None slips through and poisons
    # the average), and the `np.NaN` alias no longer exists in NumPy 2.0.
    if pd.isnull(q['choice_D']):
        return total / 3
    return (total + q['choice_D_len']) / 4

def add_length_features(df):
    """Add character-length feature columns to ``df`` in place.

    For the question and each of the four choices a ``<column>_len`` column is
    created, followed by ``average_question_len`` (computed row by row with
    ``calculate_average_question_len``) and ``question_choice_len_ratio``
    (question length divided by that average). Nothing is returned.
    """
    for text_column in ('question', 'choice_A', 'choice_B', 'choice_C', 'choice_D'):
        df[text_column + '_len'] = df[text_column].str.len()
    average_len = df.apply(calculate_average_question_len, axis=1)
    df['average_question_len'] = average_len
    df['question_choice_len_ratio'] = df['question_len'] / df['average_question_len']

def sentence_with_term(sentence, glossary):
    """Tell whether ``sentence`` contains any glossary keyword (case-insensitive).

    Args:
        sentence: text to inspect; a missing value (NaN / None) yields False.
        glossary: mapping whose keys are the keywords to look for.

    Returns:
        True if at least one keyword occurs as a substring of ``sentence``,
        False otherwise (including for a missing sentence or empty glossary).
    """
    # pd.isnull() instead of `is np.NaN`: the identity test misses NaN objects
    # other than the NumPy singleton (and None), which then crash on .lower().
    if pd.isnull(sentence):
        return False
    # Lower-case the sentence once rather than once per keyword.
    lowered_sentence = sentence.lower()
    return any(keyword.lower() in lowered_sentence for keyword in glossary)

def any_choice_in_glossary(df):
    """Tell whether at least one answer choice of a question hit the glossary.

    Args:
        df: despite its name, a single row-like mapping (the function is used
            with ``DataFrame.apply(..., axis=1)``) exposing ``choice_D`` and
            the ``choice_<X>_in_glossary`` flags for X in A..D.

    Returns:
        bool: True if any considered flag is truthy. The flag of choice D is
        ignored when ``choice_D`` itself is missing.
    """
    flags = [df['choice_A_in_glossary'],
             df['choice_B_in_glossary'],
             df['choice_C_in_glossary']]
    # pd.isnull() instead of `is np.NaN`: with the identity test a NaN that is
    # not the NumPy singleton falls through, and a NaN flag for choice D is
    # truthy, which would wrongly report a glossary hit.
    if not pd.isnull(df['choice_D']):
        flags.append(df['choice_D_in_glossary'])
    return any(flags)
            
def get_default_filters(df):
    """Build the default boolean row mask used to select candidate questions.

    A row is kept when its question does not contain ', CFA,', its
    ``question_choice_len_ratio`` is below 12.0 and its
    ``any_choice_in_glossary`` flag equals True.
    """
    mask = ~df['question'].str.contains(', CFA,')
    remaining_conditions = (
        df['question_choice_len_ratio'] < 12.0,
        df['any_choice_in_glossary'] == True,
    )
    for condition in remaining_conditions:
        mask &= condition
    return mask

def extract_definitions(df, filters_factory=None):
    """Return the rows of ``df`` that look like definition questions.

    The input frame is copied, enriched with the length features and the
    ``choice_<X>_in_glossary`` / ``any_choice_in_glossary`` flags, and then
    filtered with a boolean mask.

    Args:
        df: problems DataFrame with ``question`` and ``choice_A``..``choice_D``
            columns. It is not modified.
        filters_factory: optional callable receiving the enriched frame and
            returning a boolean mask; defaults to ``get_default_filters``.

    Returns:
        The enriched copy restricted to the rows selected by the mask.
    """
    glossary = load_glossary()
    df_copy = df.copy()
    add_length_features(df_copy)
    for letter in 'ABCD':
        choice_column = 'choice_' + letter
        df_copy[choice_column + '_in_glossary'] = df_copy[choice_column].apply(
            lambda choice: sentence_with_term(choice, glossary))
    df_copy['any_choice_in_glossary'] = df_copy.apply(any_choice_in_glossary, axis=1)
    if filters_factory is None:
        filters_factory = get_default_filters
    # Build the mask from the enriched copy: the derived columns the default
    # filter relies on exist only on df_copy, so passing the original df
    # raised a KeyError.
    return df_copy[filters_factory(df_copy)]

In [29]:
# Load every problem and give each row the default label before hand-labelling.
all_df = read_all_problems()
all_df['category'] = QuestionCategory.OTHER

# Hard-coded labels, addressed by index label. Several rows can share one assignment.
# NOTE(review): rows listed together are presumably the same (or near-identical)
# question appearing in several exams — confirm against the data.
all_df.loc[1479, 'category'] = QuestionCategory.SITUATION_KEYWORD
all_df.loc[[991, 1481], 'category'] = QuestionCategory.KEYWORD_CHAR
all_df.loc[[1255, 1486], 'category'] = QuestionCategory.DEF_KEYWORD
all_df.loc[[1494, 2337], 'category'] = QuestionCategory.KEYWORD_CHAR
all_df.loc[[1497, 2339], 'category'] = QuestionCategory.KEYWORD_CHAR
all_df.loc[[1145, 1504], 'category'] = QuestionCategory.DEF_KEYWORD
all_df.loc[[1508], 'category'] = QuestionCategory.KEYWORD_DEF_START_END
all_df.loc[[1516, 1774, 2119, 2353], 'category'] = QuestionCategory.DEF_KEYWORD
all_df.loc[[1518, 2122, 2354], 'category'] = QuestionCategory.DEF_KEYWORD
all_df.loc[[1556, 2153], 'category'] = QuestionCategory.DEF_KEYWORD
all_df.loc[[1098, 1578, 1745, 1937], 'category'] = QuestionCategory.KEYWORD_COMPARISON
all_df.loc[[1579, 2185], 'category'] = QuestionCategory.DEF_KEYWORD
all_df.loc[[1583], 'category'] = QuestionCategory.DEF_KEYWORD
all_df.loc[[1586, 1875, 2178], 'category'] = QuestionCategory.DEF_KEYWORD
all_df.loc[[1591, 1959, 2189], 'category'] = QuestionCategory.KEYWORD_DEF_START_END
all_df.loc[[1600], 'category'] = QuestionCategory.DEF_KEYWORD
all_df.loc[[1626], 'category'] = QuestionCategory.SITUATION_CHAR
all_df.loc[[1276, 1632], 'category'] = QuestionCategory.DEF_KEYWORD
all_df.loc[[1158, 1643], 'category'] = QuestionCategory.CHAR_KEYWORD
all_df.loc[[1649, 2248], 'category'] = QuestionCategory.SITUATION_KEYWORD
all_df.loc[[1669, 2268], 'category'] = QuestionCategory.SITUATION_CHAR
all_df.loc[[1744], 'category'] = QuestionCategory.KEYWORD_CHAR
all_df.loc[[1749], 'category'] = QuestionCategory.DEF_KEYWORD
all_df.loc[[1757], 'category'] = QuestionCategory.KEYWORD_DEF
all_df.loc[[1763], 'category'] = QuestionCategory.KEYWORD_DEF_START_END
all_df.loc[[1137, 1851], 'category'] = QuestionCategory.CHAR_KEYWORD

# Single-row labels.
all_df.loc[27, 'category'] = QuestionCategory.KEYWORD_DEF_START_END
all_df.loc[93, 'category'] = QuestionCategory.SITUATION_ACTION
all_df.loc[344, 'category'] = QuestionCategory.CHAR_KEYWORD
all_df.loc[386, 'category'] = QuestionCategory.KEYWORD_DEF_START_END
all_df.loc[453, 'category'] = QuestionCategory.SITUATION_ACTION
all_df.loc[557, 'category'] = QuestionCategory.ACTION_EFFECT
all_df.loc[652, 'category'] = QuestionCategory.SITUATION_ACTION
all_df.loc[703, 'category'] = QuestionCategory.CHAR_KEYWORD
all_df.loc[764, 'category'] = QuestionCategory.ACTION_EFFECT
all_df.loc[905, 'category'] = QuestionCategory.ACTION_REASON
all_df.loc[1059, 'category'] = QuestionCategory.CALCULUS
all_df.loc[1231, 'category'] = QuestionCategory.SITUATION_CHAR
all_df.loc[1750, 'category'] = QuestionCategory.DEF_KEYWORD
all_df.loc[1849, 'category'] = QuestionCategory.SITUATION_ACTION
all_df.loc[2375, 'category'] = QuestionCategory.SITUATION_DEF

In [None]:
# Hard-coded category labels for rows 26 to 377 (one row per line, by index label).
all_df.loc[26, 'category'] = QuestionCategory.DEF_KEYWORD_START_END
all_df.loc[102, 'category'] = QuestionCategory.KEYWORD_DEF
all_df.loc[142, 'category'] = QuestionCategory.SITUATION_KEYWORD
all_df.loc[189, 'category'] = QuestionCategory.SITUATION_KEYWORD
all_df.loc[192, 'category'] = QuestionCategory.KEYWORD_DEF_START_END
all_df.loc[320, 'category'] = QuestionCategory.KEYWORD_DEF_START_END
all_df.loc[377, 'category'] = QuestionCategory.DEF_KEYWORD_START_END

In [75]:
# Hard-coded category labels for rows 393 to 510 (one row per line, by index label).
all_df.loc[393, 'category'] = QuestionCategory.CHAR_KEYWORD
all_df.loc[434, 'category'] = QuestionCategory.DEF_KEYWORD
all_df.loc[441, 'category'] = QuestionCategory.DEF_KEYWORD
all_df.loc[494, 'category'] = QuestionCategory.SITUATION_KEYWORD
all_df.loc[500, 'category'] = QuestionCategory.CHAR_KEYWORD
all_df.loc[505, 'category'] = QuestionCategory.SITUATION_KEYWORD
all_df.loc[510, 'category'] = QuestionCategory.KEYWORD_DEF_START_END

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [83]:
# Hard-coded category labels for rows 550 to 635 (one row per line, by index label).
all_df.loc[550, 'category'] = QuestionCategory.SITUATION_KEYWORD
all_df.loc[604, 'category'] = QuestionCategory.DEF_KEYWORD
all_df.loc[609, 'category'] = QuestionCategory.CHAR_KEYWORD
all_df.loc[614, 'category'] = QuestionCategory.KEYWORD_CHAR
all_df.loc[635, 'category'] = QuestionCategory.CHAR_KEYWORD

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [95]:
# Hard-coded category labels for rows 671 to 854 (one row per line, by index label).
all_df.loc[671, 'category'] = QuestionCategory.DEF_KEYWORD
all_df.loc[693, 'category'] = QuestionCategory.DEF_KEYWORD
all_df.loc[725, 'category'] = QuestionCategory.KEYWORD_CHAR
all_df.loc[733, 'category'] = QuestionCategory.KEYWORD_DEF_START_END
all_df.loc[736, 'category'] = QuestionCategory.SITUATION_KEYWORD
all_df.loc[799, 'category'] = QuestionCategory.SITUATION_KEYWORD
all_df.loc[852, 'category'] = QuestionCategory.DEF_KEYWORD
all_df.loc[854, 'category'] = QuestionCategory.SITUATION_KEYWORD

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [104]:
# Hard-coded category labels for rows 899 to 967 (one row per line, by index label).
all_df.loc[899, 'category'] = QuestionCategory.CHAR_KEYWORD
all_df.loc[917, 'category'] = QuestionCategory.SITUATION_KEYWORD
all_df.loc[924, 'category'] = QuestionCategory.DEF_KEYWORD_START_END
all_df.loc[964, 'category'] = QuestionCategory.REASONING
all_df.loc[967, 'category'] = QuestionCategory.DEF_KEYWORD

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [121]:
# Hard-coded category labels for rows 972 to 1226 (one row per line, by index label).
all_df.loc[972, 'category'] = QuestionCategory.DEF_KEYWORD
all_df.loc[975, 'category'] = QuestionCategory.DEF_KEYWORD
all_df.loc[1019, 'category'] = QuestionCategory.SITUATION_KEYWORD
all_df.loc[1084, 'category'] = QuestionCategory.DEF_KEYWORD
all_df.loc[1090, 'category'] = QuestionCategory.DEF_KEYWORD
all_df.loc[1095, 'category'] = QuestionCategory.KEYWORD_DEF_START_END
all_df.loc[1159, 'category'] = QuestionCategory.DEF_KEYWORD
all_df.loc[1178, 'category'] = QuestionCategory.DEF_KEYWORD
all_df.loc[1206, 'category'] = QuestionCategory.KEYWORD_CHAR
all_df.loc[1226, 'category'] = QuestionCategory.DEF_KEYWORD

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [20]:
# NOTE(review): this rebinds all_df to a fresh read_all_problems() result. When the
# notebook is run top to bottom it replaces the frame labelled in the cells above, and
# the following cells (write_problems, the 'category' filter) then see the reloaded
# frame — confirm that frame already carries the labels before keeping this cell here.
all_df = read_all_problems()

In [123]:
# Hand the current all_df to write_problems (imported from read_problems);
# where and how it is persisted is not visible in this notebook.
write_problems(all_df)

In [120]:
# Show the question text of row 1226 (row and column selected in one .loc call).
all_df.loc[1226, 'question']

'In the audit report, an additional paragraph that explains an exception to an accounting standard is best described as a(n):'

In [122]:
# Candidate definition questions: the text mentions 'defined' or 'described'.
potential_defs = all_df[
    all_df['question'].str.contains('defined')
    | all_df['question'].str.contains('described')
]
# Show the candidates that are still unlabelled, from index label 1095 onwards.
potential_defs.loc[potential_defs['category'] == QuestionCategory.OTHER].loc[1095:]

Unnamed: 0,answer,category,choice_A,choice_B,choice_C,choice_D,comments,filename,question,question_nb,topic,year
1273,B,666,size of the contract,original time to maturity,net amount owed by one party to the other,,"""Swap Markets and Contracts,"" Don M. Chance, C...",2013_afternoon_answer.xml,The tenor of a swap is best described as the:,95,Derivative Investments,2013
1297,B,666,market factor model,fundamental factor model,macroeconomic factor model,,"""Portfolio Risk and Return Part II"", Vijay Sin...",2013_afternoon_answer.xml,A return-generating model that provides an est...,119,Portfolio Management,2013
1318,B,666,platykurtotic (less peaked than a normal distr...,leptokurtotic (more peaked than a normal distr...,mesokurtotic (identical to the normal distribu...,,"""Statistical Concepts and Market Returns,"" Ric...",2013_morning_answer.xml,"Equity return series are best described as, fo...",20,Quantitative Methods,2013
1323,A,666,support level,resistance level,change in polarity point,,"""Technical Analysis,"" Barry M. Sine, CFA and R...",2013_morning_answer.xml,A stock is declining in price and reaches a pr...,25,Quantitative Methods,2013
1327,A,666,risk of loss relative to an investment's fair ...,increased sensitivity of the market value of d...,possibility that the borrower will fail to mak...,,"""The Time Value of Money,"" Richard A. DeFusco,...",2013_morning_answer.xml,The liquidity premium can be best described as...,29,Quantitative Methods,2013
1336,C,666,E,G,F+H,,"""Demand and Supply Analysis: Introduction,"" Ri...",2013_morning_answer.xml,The diagram to the right shows the domestic de...,38,Economics,2013
1343,B,666,as liquid,less liquid,more liquid,,"""Understanding Balance Sheets,"" Elaine Henry, ...",2013_morning_answer.xml,The current ratio for an industry is 3.2. Data...,45,Financial Statement Analysis,2013
1357,C,666,timeliness and accrual accounting,understandability and verifiability,relevance and faithful representation,,"""Financial Reporting Standards,"" Elaine Henry,...",2013_morning_answer.xml,According to the International Accounting Stan...,59,Financial Statement Analysis,2013
1389,A,666,swap,futures contract,forward contract,,"""Derivative Markets and Instruments,"" Don M. C...",2013_morning_answer.xml,A corporation issues 5-year fixed-rate bonds. ...,91,Derivative Investments,2013
1408,B,666,price return,collateral yield,convenience yield,,"""Investing in Commodities,"" Ronald G. Layard-L...",2013_morning_answer.xml,If an investor uses derivatives to make a long...,110,Alternative Investments,2013


In [None]:
# Sample two-sentence question used to try out the sentence helpers in this cell.
question = 'Which of the following statements regarding correlation and covariance is most likely correct? The correlation  between two random variables is their covariance standardized by the:'

def is_start_end_definition(question):
    """Tell whether ``question`` ends with a 'The <X> is <Y>:' sentence.

    The text must contain some earlier part ending in '.', '!' or '?',
    followed by ' The', then ' is ', and finish with ':'; the final part may
    not contain '.', '!' or '?'.
    """
    pattern = '^.*[.!?] The([^.!?]*) is [^.!?]*:$'
    return re.match(pattern, question) is not None

def extract_last_sentence(question):
    """Return the last sentence of ``question``, stripped of surrounding spaces.

    Sentences are delimited by '.', '!' or '?'; the final one may also end
    with ':'.

    Args:
        question: the full question text (a string).

    Returns:
        The last sentence including its terminator when the text ends with
        one of '.', '!', '?' or ':'. Otherwise the unterminated text after
        the last '.', '!' or '?' (the whole text if there is none).
    """
    match = re.match('^([^.!?]*[.!?:])+$', question)
    if match is None:
        # The text does not end with a terminator; the original code crashed
        # here with AttributeError on `None.group`. Fall back to the trailing
        # unterminated segment.
        return re.split('[.!?]', question)[-1].strip()
    # group(1) holds the last repetition of the group, i.e. the final sentence.
    return match.group(1).strip()

# Try the helper on the sample question; the cell displays the extracted sentence.
extract_last_sentence(question)

In [None]:
# Show the question text of row 93 of `test`.
# NOTE(review): `test` is only assigned in a later cell, so this fails on a fresh top-to-bottom run.
test.loc[93, 'question']

In [None]:
from spacy.symbols import nsubj, VERB

def display_root(sentence):
    """Parse ``sentence`` with the module-level ``nlp`` pipeline and print the
    first token that is its own head (the root of the dependency parse)."""
    self_headed = [tok for tok in nlp(sentence) if tok.head == tok]
    print(self_headed[0])

def display_dependencies(sentence):
    """Parse ``sentence`` with the module-level ``nlp`` pipeline and render its
    dependency tree inline with displaCy."""
    displacy.render(nlp(sentence), style='dep', jupyter=True)

In [None]:
# Print the dependency root of every extracted last sentence.
# NOTE(review): `last_sentences` is only assigned in a later cell, so this fails on a fresh top-to-bottom run.
for sentence in last_sentences:
    display_root(sentence)

In [None]:
# Last sentence of every question in `test`, as a NumPy array; the cell then displays it.
# NOTE(review): `test` is only assigned in a later cell, so this fails on a fresh top-to-bottom run.
last_sentences = test['question'].apply(extract_last_sentence).values
last_sentences

In [None]:
# Render the dependency tree of the third extracted sentence.
# (Fixed the misspelled helper name: the function defined in this notebook is display_dependencies.)
display_dependencies(last_sentences[2])

In [None]:
# Keep the candidate questions whose text ends with a 'The <X> is <Y>:' sentence.
# (Uses potential_defs, the name actually assigned in this notebook; `potential_def` is never defined.)
def_filter = potential_defs['question'].apply(is_start_end_definition)
test = potential_defs[def_filter]
# Work on an explicit copy of the relevant columns so adding 'category' does not touch `test`.
test_filtered = test[['answer', 'choice_A', 'choice_B', 'choice_C', 'choice_D', 'question']].copy()
test_filtered['category'] = QuestionCategory.OTHER

In [None]:
# Display the filtered candidate frame.
test_filtered

In [None]:
# Show the question text of row 2375 of the filtered candidates.
test_filtered.loc[2375, 'question']

In [None]:
# Show the question text of row 27 of the candidates.
# (Uses potential_defs, the name actually assigned in this notebook; `potential_def` is never defined.)
potential_defs.loc[27, 'question']

In [None]:
# Show the candidates with index labels 0 through 30 (label slice, both ends inclusive).
# (Uses potential_defs, the name actually assigned in this notebook; `potential_def` is never defined.)
potential_defs.loc[0:30]

In [None]:
# Show the question text of row 2380 (row and column selected in one .loc call).
all_df.loc[2380, 'question']

In [None]:
# NOTE(review): get_best_documents and docs_df are not defined or imported in any
# cell visible here — confirm where they come from before running this cell.
get_best_documents(docs_df, ['callable bond', 'negative convexity'])

In [None]:
# NOTE(review): get_text_by_id and docs_df are not defined or imported in any
# cell visible here — confirm where they come from before running this cell.
get_text_by_id(docs_df, 'Investopedia_41106')