In [1]:
import xml.etree.ElementTree as ET
from xml.etree.ElementTree import ParseError
import pandas as pd
from glob import glob
import os.path as op
import numpy as np
from collections import Counter
import operator
from string import punctuation
from nltk import SnowballStemmer
from nltk import pos_tag
import nltk

In [2]:
def add_records(path, all_records):
    xml_data = open(path, encoding="utf8").read()
    xml_data = "<problems>" + xml_data + "</problems>"
    problems = ET.XML(xml_data)
    for problem in problems:
        record = {}
        for child in problem:
            if child.tag == "choices":
                for choice in child:
                    id = choice.attrib['id']
                    record[choice.tag + '_' + id] = choice.text
            else:
                record[child.tag] = child.text
            
            #add column year to find from which year the question was taken
            record['year']=int(path[19:23])
        all_records.append(record)

In [3]:
all_records = []
file_paths = sorted(glob(op.join('Data', 'qa_mock_exams', '*', '*.xml')))
for path in file_paths:    
    try:
        add_records(path, all_records)
    except ParseError as e:
        print("Exception in file '{}': {}".format(path, e))

problems_df = pd.DataFrame(all_records)

#Pour eliminer les phrases "Select exactly 1 answer(s) from the following:"
problems_df['question'] = problems_df['question'].map(lambda x: x.replace("Select exactly 1 answer(s) from the following:  \t", ""))

In [4]:
problems_df

Unnamed: 0,answer,choice_A,choice_B,choice_C,choice_D,comments,question,year
0,A,rely on the integrity of input data.,address every aspect of performance measurement.,consist of required provisions for firms to fo...,must be applied with the goal of achieving exc...,Global Investment Performance Standards (GIPS)...,1. Which of the following is a key characteris...,2008
1,B,disclosing potential conflicts of interest.,habitually voting with management on proxies t...,disclosing confidential client information to ...,using client brokerage to purchase goods or se...,"Guidance for Standards I-VII, Standards of Pra...",2. According to the Standards of Practice Hand...,2008
2,B,No.,"Yes, because she has breached her duty to her ...","Yes, because she has failed to obtain written ...","Yes, because her allocation procedures contrib...","Guidance for Standards I-VII, Standards of Pra...","3. Carla Scott, CFA, is a portfolio manager fo...",2008
3,C,suspend the employee.,suspend Marshall from her supervisory duties.,initiate an investigation to determine the ext...,demand that the employee involved provide assu...,"Guidance for Standards I-VII, Standards of Pra...","4. Kim Li, CFA, is a portfolio manager for an ...",2008
4,D,No No,No Yes,Yes No,Yes Yes Select exactly 1 answer(s) from the f...,"Guidance for Standards I-VII, Standards of Pra...","5. Marcus Takeda, CFA, is an analyst at a smal...",2008
5,C,No No,No Yes,Yes No,Yes Yes Select exactly 1 answer(s) from the f...,"Guidance for Standards I-VII, Standards of Pra...","6. David Gunard, CFA, is an equity analyst at ...",2008
6,A,No No,No Yes,Yes No,Yes Yes Select exactly 1 answer(s) from the f...,"Guidance for Standards I-VII, Standards of Pra...",7. According to the Standards of Practice Hand...,2008
7,D,No No,No Yes,Yes No,Yes Yes Select exactly 1 answer(s) from the f...,"Guidance for Standards I-VII, Standards of Pra...",8. According to the Standards of Practice Hand...,2008
8,A,clients.,colleagues.,his reputation.,the employer's reputation.,"Guidance for Standards I-VII, Standards of Pra...",9. According to the Standards of Practice Hand...,2008
9,B,No.,"Yes, because he failed to obtain consent from ...","Yes, because he failed to disclose his new emp...","Yes, because he violated his duty to his emplo...","Standards of Practice Handbook, 9th edition (C...","10. Buta Singh, CFA, has a large extended fami...",2008


In [7]:
#Create a new dataframe
problems_df_split=problems_df

#Add 2 new column to split the question into 2 parts
problems_df_split['question_part1']=''
problems_df_split['question_part2']=''


i=0
for (question, question_part1, question_part2) in zip(problems_df_split["question"],problems_df_split['question_part1'], problems_df_split['question_part2']) :
    parts=question.split(".")
    if (len(parts)==2):
        question_part1=parts[0] + "."
    else:
        question_part1=". ".join(parts[:-1])
                             
    question_part2=parts[-1]
    
    problems_df_split.loc[i, 'question_part1']=question_part1
    problems_df_split.loc[i, 'question_part2']=question_part2
    
    i+=1


In [8]:
#Add column type (type of question) and label
problems_df_split['type']=np.nan
problems_df_split['label']=np.nan

In [9]:
# Fill column label

print("number of null values before adding labels is : ",
      problems_df_split['label'].isnull().sum())

for i in range(0, problems_df_split.shape[0]):
    if "closest to" in problems_df_split.loc[i, 'question_part2']:
        problems_df_split.loc[i, 'label'] = 'Calculate'

print("number of null values after adding labels is : ",
      problems_df_split['label'].isnull().sum())

number of null values before adding labels is :  1414
number of null values after adding labels is :  1051


In [10]:
# Fill column topic
'Ethical and Professional Standards'
i=0
while i < len(problems_df_split):
    if problems_df_split['year'][i] in range (2010, 2014):
        if int(problems_df_split.loc[i, 'question_part1'].split(".")[0]) in range (1,19):
            problems_df_split.loc[i, 'type']='Ethical and Professional Standards'
        
        elif int(problems_df_split.loc[i, 'question_part1'].split(".")[0]) in range (19,33):
            problems_df_split.loc[i, 'type']='Quantitative Methods'
        
        elif int(problems_df_split.loc[i, 'question_part1'].split(".")[0]) in range (33,45):
            problems_df_split.loc[i, 'type']='Economics'
        
        elif int(problems_df_split.loc[i, 'question_part1'].split(".")[0]) in range (45,69):
            problems_df_split.loc[i, 'type']='Financial Statement Analysis'
        
        elif int(problems_df_split.loc[i, 'question_part1'].split(".")[0]) in range (69,79):
            problems_df_split.loc[i, 'type']='Corporate Finance'
        
        elif int(problems_df_split.loc[i, 'question_part1'].split(".")[0]) in range (79,91):
            problems_df_split.loc[i, 'type']='Equity Investments'
        
        elif int(problems_df_split.loc[i, 'question_part1'].split(".")[0]) in range (91,97):
            problems_df_split.loc[i, 'type']='Derivative Investments'
        
        elif int(problems_df_split.loc[i, 'question_part1'].split(".")[0]) in range (97,109):
            problems_df_split.loc[i, 'type']='Fixed Income Investments'
        
        elif int(problems_df_split.loc[i, 'question_part1'].split(".")[0]) in range (109,115):
            problems_df_split.loc[i, 'type']='Alternative Investments'
        
        elif int(problems_df_split.loc[i, 'question_part1'].split(".")[0]) in range (115,121):
            problems_df_split.loc[i, 'type']='Portfolio Management'
            
    i+=1
    
    
problems_df_split.to_csv("dataframe splitted.csv")

In [11]:
problems_df_split

Unnamed: 0,answer,choice_A,choice_B,choice_C,choice_D,comments,question,year,question_part1,question_part2,type,label
0,A,rely on the integrity of input data.,address every aspect of performance measurement.,consist of required provisions for firms to fo...,must be applied with the goal of achieving exc...,Global Investment Performance Standards (GIPS)...,1. Which of the following is a key characteris...,2008,1.,Which of the following is a key characteristi...,,
1,B,disclosing potential conflicts of interest.,habitually voting with management on proxies t...,disclosing confidential client information to ...,using client brokerage to purchase goods or se...,"Guidance for Standards I-VII, Standards of Pra...",2. According to the Standards of Practice Hand...,2008,2.,According to the Standards of Practice Handbo...,,
2,B,No.,"Yes, because she has breached her duty to her ...","Yes, because she has failed to obtain written ...","Yes, because her allocation procedures contrib...","Guidance for Standards I-VII, Standards of Pra...","3. Carla Scott, CFA, is a portfolio manager fo...",2008,"3. Carla Scott, CFA, is a portfolio manager f...",According to the Standards of Practice Handbo...,,
3,C,suspend the employee.,suspend Marshall from her supervisory duties.,initiate an investigation to determine the ext...,demand that the employee involved provide assu...,"Guidance for Standards I-VII, Standards of Pra...","4. Kim Li, CFA, is a portfolio manager for an ...",2008,"4. Kim Li, CFA, is a portfolio manager for an...",According to the Standards of Practice Handb...,,
4,D,No No,No Yes,Yes No,Yes Yes Select exactly 1 answer(s) from the f...,"Guidance for Standards I-VII, Standards of Pra...","5. Marcus Takeda, CFA, is an analyst at a smal...",2008,"5. Marcus Takeda, CFA, is an analyst at a sma...",According to the Standards of Practice Handbo...,,
5,C,No No,No Yes,Yes No,Yes Yes Select exactly 1 answer(s) from the f...,"Guidance for Standards I-VII, Standards of Pra...","6. David Gunard, CFA, is an equity analyst at ...",2008,"6. David Gunard, CFA, is an equity analyst at...",According to the Standards of Practice Handbo...,,
6,A,No No,No Yes,Yes No,Yes Yes Select exactly 1 answer(s) from the f...,"Guidance for Standards I-VII, Standards of Pra...",7. According to the Standards of Practice Hand...,2008,7.,According to the Standards of Practice Handbo...,,
7,D,No No,No Yes,Yes No,Yes Yes Select exactly 1 answer(s) from the f...,"Guidance for Standards I-VII, Standards of Pra...",8. According to the Standards of Practice Hand...,2008,8.,According to the Standards of Practice Handbo...,,
8,A,clients.,colleagues.,his reputation.,the employer's reputation.,"Guidance for Standards I-VII, Standards of Pra...",9. According to the Standards of Practice Hand...,2008,9.,According to the Standards of Practice Handbo...,,
9,B,No.,"Yes, because he failed to obtain consent from ...","Yes, because he failed to disclose his new emp...","Yes, because he violated his duty to his emplo...","Standards of Practice Handbook, 9th edition (C...","10. Buta Singh, CFA, has a large extended fami...",2008,"10. Buta Singh, CFA, has a large extended fam...","With respect to the family portfolios, does ...",,


In [12]:
#Function to eliminate punctuations
def strip_punctuation(s):
    return ''.join(c for c in s if c not in punctuation)

#Remove morphological affixes from words
stemmer = SnowballStemmer("english")
def stemmed_words(w):
    return stemmer.stem(w)


#Count words 
list_words=strip_punctuation(" ".join(problems_df_split['question_part2']).lower()).replace("\t", "").split(" ")
list_words=[stemmed_words(w) for w in list_words]
count_words_question=Counter(list_words)
count_words_question=count_words_question.most_common()

In [13]:
print(count_words_question)

[('', 2891), ('the', 2539), ('of', 1469), ('to', 1028), ('is', 973), ('a', 718), ('like', 544), ('most', 533), ('in', 460), ('follow', 398), ('which', 395), ('closest', 365), ('for', 329), ('compani', 328), ('and', 276), ('standard', 268), ('an', 230), ('on', 221), ('that', 190), ('rate', 188), ('be', 187), ('year', 179), ('least', 177), ('cfa', 173), ('as', 161), ('institut', 158), ('price', 149), ('cost', 148), ('violat', 145), ('will', 145), ('with', 139), ('statement', 136), ('best', 134), ('if', 134), ('valu', 134), ('stock', 126), ('return', 125), ('conduct', 123), ('profession', 122), ('invest', 116), ('are', 116), ('use', 107), ('describ', 104), ('bond', 102), ('market', 100), ('at', 97), ('would', 90), ('capit', 87), ('from', 86), ('by', 85), ('portfolio', 85), ('asset', 85), ('investor', 85), ('share', 84), ('cash', 84), ('inform', 81), ('interest', 81), ('it', 81), ('increas', 81), ('tax', 78), ('total', 78), ('accord', 77), ('when', 77), ('financi', 77), ('account', 73), ('

In [14]:
count_df = pd.DataFrame(count_words_question, columns=['words', 'count'])
count_df=count_df[count_df['count'] > 50]


In [15]:
count_df

Unnamed: 0,words,count
0,,2891
1,the,2539
2,of,1469
3,to,1028
4,is,973
5,a,718
6,like,544
7,most,533
8,in,460
9,follow,398


** Counting bigrams**

In [16]:
#Count pairs 
list_words=strip_punctuation(" ".join(problems_df_split['question_part2']).lower()).replace("\t", "").split(" ")

#To eliminate bigrams with one word
pairs=[list_words[i]+' '+list_words[i+1] for i in range(len(list_words)-1) if list_words[i]!="" and list_words[i]!=" " and list_words[i+1]!="" and list_words[i+1]!=" "]

count_pairs_question=Counter(pairs)
count_pairs_question=count_pairs_question.most_common()


In [17]:
count_df_pairs = pd.DataFrame(count_pairs_question, columns=['pairs', 'count'])

#add column to identify in which question these bigrmas exist
count_df_pairs['question']= count_df_pairs['pairs']

#Create function to return list showing us in which question these bigrams exist
def docs_include(w):
    docs=[]
    for i in range(0, len(problems_df_split)):
        if w in problems_df_split.loc[i, 'question_part2'].lower():
            docs.append(i)
    return docs

count_df_pairs['question'] = count_df_pairs['question'].map(lambda x: docs_include(x))


count_df_pairs=count_df_pairs[count_df_pairs['count'] > 20]

In [130]:
count_df_pairs

Unnamed: 0,pairs,count,question
0,of the,538,"[0, 8, 18, 27, 28, 30, 31, 35, 36, 37, 38, 41,..."
1,most likely,390,"[1, 18, 27, 44, 47, 49, 62, 73, 93, 97, 101, 1..."
2,the following,386,"[0, 20, 27, 28, 31, 35, 36, 37, 38, 46, 51, 53..."
3,closest to,363,"[19, 20, 21, 23, 25, 29, 34, 36, 39, 50, 54, 5..."
4,which of,324,"[0, 27, 28, 31, 35, 37, 38, 46, 51, 57, 58, 59..."
5,is closest,319,"[19, 20, 21, 29, 39, 50, 54, 64, 68, 70, 71, 7..."
6,is most,181,"[1, 27, 44, 65, 87, 93, 110, 199, 217, 218, 22..."
7,standards of,152,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 15, 120, 121, 122,..."
8,cfa institute,149,"[2, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 17, 121..."
9,least likely,146,"[57, 74, 75, 77, 122, 125, 127, 129, 132, 133,..."


In [22]:
print(count_df_pairs.loc[1, 'question'])
problems_df_split.loc[count_df_pairs.loc[1, 'question'], :]

[1, 18, 27, 44, 47, 49, 62, 73, 93, 97, 101, 110, 112, 115, 144, 149, 150, 151, 152, 155, 158, 159, 165, 167, 170, 172, 188, 197, 200, 217, 218, 219, 221, 223, 227, 228, 230, 231, 234, 235, 237, 238, 263, 268, 269, 274, 275, 276, 278, 279, 280, 285, 287, 290, 292, 303, 306, 315, 328, 336, 337, 338, 339, 341, 342, 343, 344, 348, 349, 351, 353, 354, 355, 356, 357, 359, 370, 378, 396, 400, 401, 407, 419, 421, 427, 428, 436, 438, 453, 454, 461, 468, 469, 472, 473, 477, 479, 484, 490, 492, 495, 497, 498, 504, 506, 512, 514, 515, 517, 519, 520, 521, 525, 530, 531, 534, 536, 537, 539, 541, 544, 550, 555, 557, 559, 568, 573, 574, 575, 578, 587, 589, 598, 601, 602, 605, 611, 612, 618, 621, 624, 625, 626, 629, 631, 632, 633, 634, 637, 639, 640, 648, 650, 653, 656, 658, 662, 663, 665, 671, 675, 681, 697, 700, 706, 708, 709, 715, 717, 719, 720, 721, 723, 726, 727, 729, 731, 733, 737, 740, 747, 750, 751, 752, 753, 756, 757, 761, 767, 768, 769, 774, 778, 781, 783, 784, 786, 789, 790, 791, 794, 797, 

Unnamed: 0,answer,choice_A,choice_B,choice_C,choice_D,comments,question,year,question_part1,question_part2,type,label
1,B,disclosing potential conflicts of interest.,habitually voting with management on proxies t...,disclosing confidential client information to ...,using client brokerage to purchase goods or se...,"Guidance for Standards I-VII, Standards of Pra...",2. According to the Standards of Practice Hand...,2008,2.,According to the Standards of Practice Handbo...,,
18,C,liquidity.,maturity.,default risk.,business risk.,"""The Time Value of Money,"" Richard A. Defusco,...",19. The yield to maturity on otherwise identic...,2008,19. The yield to maturity on otherwise identi...,"5% over the life of the bonds, the most likely...",,
27,D,product of the variables' variances.,variance of the dependent variable.,variance of the independent variable.,product of the variables' standard deviations.,"""Probability Concepts,"" Richard A. Defusco, De...",28. Which of the following statements regardin...,2008,28.,Which of the following statements regarding c...,,
44,C,increase in the payables turnover ratio.,decrease in the inventory turnover ratio.,increase in the receivables turnover ratio.,decrease in the payables payment period.,"""Financial Analysis Techniques,"" Thomas R. Rob...",45. Assume U.S. GAAP (generally accepted accou...,2008,45. Assume U. S. GAAP (generally accepted ac...,A company's cash conversion cycle is most ...,,
47,D,"the company's current ratio, but not the compa...","the company's cash flow from operations, but n...",both the company's current ratio and the compa...,neither the company's current ratio nor the co...,"""Understanding the Cash Flow Statement,"" Thoma...",48. Assume U.S. GAAP (generally accepted accou...,2008,48. Assume U. S. GAAP (generally accepted ac...,"0, that company's repayment of $150,000 in sho...",,
49,D,tightened credit policies and increased collec...,"purchased new property, plant, and equipment a...",sold a long-term investment for an amount equa...,increased raw materials inventory in anticipat...,"""Understanding the Cash Flow Statement,"" Thoma...",50. Assume U.S. GAAP (generally accepted accou...,2008,50. Assume U. S. GAAP (generally accepted ac...,"All else equal, the most likely explanation ...",,
62,D,"$21,000 lower.","$9,000 lower.","$9,000 higher.","$21,000 higher.","""Analysis of Inventories,"" Gerald I. White, As...",63. Assume U.S. GAAP (generally accepted accou...,2008,63. Assume U. S. GAAP (generally accepted ac...,An analyst gathered the following informatio...,,
73,A,has served on the board for 14 years.,owns 1000 shares of the corporation's equity.,is a college professor and a certified public ...,has formerly served on the boards of several s...,"""The Corporate Governance of Listed Companies:...","74. Regarding corporate governance, which of t...",2008,74.,"Regarding corporate governance, which of the ...",,
93,B,default on the forward contract.,do nothing until the long makes payment.,accept delivery of SP 500 stocks from the long.,deliver the portfolio of SP 500 stocks to the ...,"""Forward Markets and Contracts,"" Don M. Chance...",94. Two parties agree to a forward contract to...,2008,94. Two parties agree to a forward contract t...,The short party is most likely obligated to:,,
97,D,No No,No Yes,Yes No,Yes Yes Select exactly 1 an...,"""Understanding Yield Spreads,"" Frank J. Fabozz...",98. According to the Liquidity Preference Theo...,2008,98.,"According to the Liquidity Preference Theory,...",,


In [23]:
problems_df_split.loc[count_df_pairs.loc[1, 'question'], :].to_csv("most_likely.csv")