In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Get corpus files

In [74]:
# core folder url with corpus files
path_core = 'corpus/'

# file names for each corpus
imb = 'BSc Information Management For Business corpus.txt'
msci_man = 'BSc-MSci Management Science corpus.txt'
mres_fin = 'MRes and PhD in Financial Economics corpus.txt'
mres_man = 'MRes and PhD in Management corpus.txt'
msc_ba = "MSc Business Analytics corpus.txt"
msc_ent = "MSc Entrepreneurship corpus.txt"
msc_fin = "MSc Finance corpus.txt"
msc_man = "MSc Management corpus.txt"
mba = "The UCL MBA corpus.txt"
mba_peking = "The UCL MBA with Peking University corpus.txt"

# maps each short degree key to the file name of its corpus; kept separate from
# corpus_list so the file names are not overwritten by the file contents
corpus_files = {
    'imb': imb,
    'msci_man': msci_man,
    'mres_fin': mres_fin,
    'mres_man': mres_man,
    'msc_ba': msc_ba,
    'msc_ent': msc_ent,
    'msc_fin': msc_fin,
    'msc_man': msc_man,
    'mba': mba,
    'mba_peking': mba_peking,
}

# corpus_list maps each degree key to the full text of its corpus file
corpus_list = {}
for file, file_name in corpus_files.items():
    # explicit encoding: open() otherwise falls back to a platform-dependent
    # default, so the same notebook could read different text on another machine
    with open(path_core + file_name, 'r', encoding='utf-8') as f:
        corpus = f.read()
    corpus_list[file] = corpus

# Documents to Word Embeddings

In [5]:
!pip install allennlp==0.9.0
!pip install flair==0.9

[1m
         .:::.     .::.       
        ....yy:    .yy.       
        :.  .yy.    y.        
             :y:   .:         
             .yy  .:          
              yy..:           
              :y:.            
              .y.             
             .:.              
        ....:.                
        :::.                  
[0;33m
• Project files and data should be stored in /project. This is shared among everyone
  in the project.
• Personal files and configuration should be stored in /home/faculty.
• Files outside /project and /home/faculty will be lost when this server is terminated.
• Create custom environments to setup your servers reproducibly.
[0m
Collecting allennlp==0.9.0
  Using cached allennlp-0.9.0-py3-none-any.whl (7.6 MB)
Collecting responses>=0.7
  Using cached responses-0.20.0-py3-none-any.whl (27 kB)
Collecting flask-cors>=3.0.7
  Using cached Flask_Cors-3.0.10-py2.py3-none-any.whl (14 kB)
Collecting word2number>=1.1
  Using cached word2number-1.1

Collecting blis<0.3.0,>=0.2.2
  Using cached blis-0.2.4-cp38-cp38-linux_x86_64.whl
Collecting plac<1.0.0,>=0.9.6
  Using cached plac-0.9.6-py2.py3-none-any.whl (20 kB)
Collecting typing-utils>=0.0.3
  Using cached typing_utils-0.1.0-py3-none-any.whl (10 kB)
Installing collected packages: cymem, wasabi, srsly, preshed, plac, murmurhash, blis, typing-utils, torch, thinc, sentencepiece, word2number, unidecode, tensorboardX, spacy, responses, pytorch-transformers, pytorch-pretrained-bert, parsimonious, overrides, jsonpickle, jsonnet, ftfy, flask-cors, flaky, editdistance, conllu, allennlp
Successfully installed allennlp-0.9.0 blis-0.2.4 conllu-1.3.1 cymem-2.0.6 editdistance-0.6.0 flaky-3.7.0 flask-cors-3.0.10 ftfy-6.1.1 jsonnet-0.18.0 jsonpickle-2.1.0 murmurhash-1.0.6 overrides-6.1.0 parsimonious-0.8.1 plac-0.9.6 preshed-2.0.1 pytorch-pretrained-bert-0.6.2 pytorch-transformers-1.1.0 responses-0.20.0 sentencepiece-0.1.96 spacy-2.1.9 srsly-1.0.5 tensorboardX-2.5 thinc-7.0.8 torch-1.11.0 typi

Collecting conllu>=4.0
  Using cached conllu-4.4.1-py2.py3-none-any.whl (15 kB)
Collecting janome
  Using cached Janome-0.4.2-py2.py3-none-any.whl (19.7 MB)
Collecting smart-open>=1.8.1
  Using cached smart_open-5.2.1-py3-none-any.whl (58 kB)
Collecting py4j
  Using cached py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
Collecting overrides<4.0.0,>=3.0.0
  Using cached overrides-3.1.0-py3-none-any.whl
Collecting sacremoses
  Using cached sacremoses-0.0.49-py3-none-any.whl (895 kB)
Collecting tokenizers!=0.11.3,>=0.11.1
  Using cached tokenizers-0.11.6-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
Installing collected packages: smart-open, tokenizers, sentencepiece, sacremoses, py4j, overrides, huggingface-hub, gensim, wikipedia-api, transformers, sqlitedict, segtok, mpld3, more-itertools, langdetect, konoha, janome, hyperopt, gdown, deprecated, conllu, bpemb, flair
  Attempting uninstall: sentencepiece
    Found existing installation: sentencepiece 0.1.96
    Uninstalli

In [9]:
from flair.data import Sentence
from flair.embeddings import ELMoEmbeddings
# Build the ELMo embedder once at module level; elmo_embed() below reuses this
# single instance for every call.
# NOTE(review): 'original' is presumably the name of the pretrained ELMo
# variant to load — confirm against the flair documentation.
elmo_embeddings = ELMoEmbeddings('original')

In [76]:
# create an ELMo embedding
def elmo_embed(sen, mute=False):
    """Embed a piece of text as the sum of its ELMo token embeddings.

    The text is wrapped in a flair ``Sentence``, embedded with the module-level
    ``elmo_embeddings`` instance, and the per-token vectors are added together.
    The sum (rather than the mean) is kept; the callers in this notebook only
    compare the result with cosine similarity.

    Args:
        sen: Text passed to ``Sentence``.
        mute: When True, the progress messages are not printed.

    Returns:
        A numpy array holding the summed token embedding.

    Raises:
        ValueError: If the text yields no tokens, so there is nothing to sum.
    """
    sentence = Sentence(sen)
    if not mute:
        print('sentence created')
    elmo_embeddings.embed(sentence)
    if not mute:
        print('embedded')

    token_vectors = [token.embedding for token in sentence]
    if not token_vectors:
        # Previously this case returned a scalar 0, which only failed later
        # with a shape mismatch when compared against a real embedding.
        raise ValueError('cannot embed text that contains no tokens')

    # Same left-to-right accumulation as before, starting from the first token.
    embedding = sum(token_vectors[1:], token_vectors[0])

    if not mute:
        print('done\n')
    # detach + cpu so the conversion also works when the tensor lives on a GPU
    # or tracks gradients; np.array() then takes an independent copy.
    return np.array(embedding.detach().cpu().numpy())

**Save embeddings for large corpus**

In [77]:
# keep only the two corpora that are relevant here
relevant_keys = ('msc_ba', 'msc_fin')
corpus_list = {
    key: text
    for key, text in corpus_list.items()
    if key in relevant_keys
}
# separate dict with the same keys (and order) that will hold the embeddings
corpus_list_embedded = dict(corpus_list)

In [78]:
# create embedding for each corpus via ELMo
# Built fresh from the raw texts in corpus_list (same keys, same order) instead
# of overwriting the texts in place, so re-running this cell embeds the texts
# again rather than feeding already-computed arrays back into elmo_embed.
corpus_list_embedded = {}
for doc, doc_text in corpus_list.items():
    doc_embedding = elmo_embed(doc_text)
    corpus_list_embedded[doc] = doc_embedding

sentence created
embedded
done

sentence created
embedded
done



**Save embeddings for mini corpus**

In [79]:
# degree key -> name of the folder used for that degree's mini corpus
mini_corpus_folder_mapper = dict(
    msc_ba='business-analytics',
    msc_fin='finance',
)

# Question Similarity - Degree

In [98]:
def get_related_degree(question):
    """Return the key of the corpus whose embedding is closest to the question.

    The question is embedded with ``elmo_embed`` and compared, by cosine
    similarity, against every document embedding in ``corpus_list_embedded``.
    Both similarity scores are also printed.

    Args:
        question: Question text to classify.

    Returns:
        The key in ``corpus_list_embedded`` with the highest cosine similarity.
    """
    # Reshape once: cosine_similarity expects 2-D (n_samples, n_features) input.
    question_embedded = elmo_embed(question, mute=True).reshape(1, -1)

    # Take the labels from the same dict that is being scored, so an index into
    # the scores always maps to the right key even if corpus_list has since
    # been reloaded or filtered differently.
    degrees = list(corpus_list_embedded.keys())
    cos_similarities = [
        cosine_similarity(question_embedded, corpus_list_embedded[doc].reshape(1, -1))[0][0]
        for doc in degrees
    ]

    max_similarity_index = np.argmax(cos_similarities)
    degree = degrees[max_similarity_index]

    # NOTE: the labels are hard-coded, so this line assumes the first two
    # entries of corpus_list_embedded are 'msc_ba' and 'msc_fin', in that order.
    print('Business Analytics >', cos_similarities[0], str(cos_similarities[1]) + ' <', 'Finance')

    return degree

# Testing

**Finance**

In [99]:
# load the Finance question/answer pairs, keeping only the Q and A columns
# under more descriptive names
q_a_train = (
    pd.read_csv('question-answers/Q&A Data - Msc Finance Q&A.csv')[['Q', 'A']]
    .rename(columns={'Q': 'question', 'A': 'answers'})
)

# predicted degree key for every question, in file order
preds = [get_related_degree(question) for question in q_a_train['question'].values]

Business Analytics > 0.40861315 0.42304182 < Finance
Business Analytics > 0.55916977 0.583346 < Finance
Business Analytics > 0.49786603 0.5305347 < Finance
Business Analytics > 0.36440983 0.37960705 < Finance
Business Analytics > 0.3641274 0.38134637 < Finance
Business Analytics > 0.42622173 0.4576794 < Finance
Business Analytics > 0.40677285 0.4566934 < Finance
Business Analytics > 0.39468145 0.40779573 < Finance
Business Analytics > 0.472342 0.5133951 < Finance
Business Analytics > 0.39562315 0.4126351 < Finance
Business Analytics > 0.3734594 0.4152089 < Finance
Business Analytics > 0.33045512 0.35336715 < Finance
Business Analytics > 0.5416429 0.5799994 < Finance
Business Analytics > 0.5832787 0.6334357 < Finance
Business Analytics > 0.46942568 0.49524173 < Finance
Business Analytics > 0.42305988 0.4323452 < Finance
Business Analytics > 0.5295562 0.5628122 < Finance
Business Analytics > 0.48513222 0.5202471 < Finance
Business Analytics > 0.472236 0.47543272 < Finance
Business Analyt

In [101]:
# Share of the predictions in `preds` that name the Finance degree.
# NOTE(review): `preds` is reassigned by the Business Analytics loop further
# down, so this cell must run right after the Finance loop. In the recorded run
# it was executed as In [101], after the Business Analytics loop (In [100]),
# and the two recorded accuracies sum to exactly 1.0 — the Finance figure shown
# was likely computed on the Business Analytics predictions. Re-run to confirm.
fin_hits = preds.count('msc_fin')
print('\n\n\nAccuracy for predicting whether a question belongs to the Finance degree:', fin_hits / len(preds))




Accuracy for predicting whether a question belongs to the Finance degree: 0.5435168738898757


**Business Analytics**

In [100]:
# load the Business Analytics question/answer pairs, keeping only the Q and A
# columns under more descriptive names
q_a_train = (
    pd.read_csv('question-answers/Q&A Data - MSc Business Analytics.csv')[['Q', 'A']]
    .rename(columns={'Q': 'question', 'A': 'answers'})
)

# predicted degree key for every question, in file order
preds = [get_related_degree(question) for question in q_a_train['question'].values]

Business Analytics > 0.48195332 0.4556929 < Finance
Business Analytics > 0.58727187 0.58058786 < Finance
Business Analytics > 0.4883334 0.460242 < Finance
Business Analytics > 0.58258665 0.5862954 < Finance
Business Analytics > 0.53765786 0.54041725 < Finance
Business Analytics > 0.53987086 0.53923434 < Finance
Business Analytics > 0.5457426 0.547849 < Finance
Business Analytics > 0.580479 0.59201896 < Finance
Business Analytics > 0.536468 0.54436964 < Finance
Business Analytics > 0.52987564 0.5231559 < Finance
Business Analytics > 0.53673106 0.53106 < Finance
Business Analytics > 0.5428398 0.5210152 < Finance
Business Analytics > 0.4808547 0.51267445 < Finance
Business Analytics > 0.56986177 0.5552046 < Finance
Business Analytics > 0.5402145 0.5866184 < Finance
Business Analytics > 0.46094373 0.50311357 < Finance
Business Analytics > 0.6163918 0.6044616 < Finance
Business Analytics > 0.686414 0.68204 < Finance
Business Analytics > 0.45906478 0.49867707 < Finance
Business Analytics > 0

Business Analytics > 0.6005807 0.61069274 < Finance
Business Analytics > 0.56223094 0.5556488 < Finance
Business Analytics > 0.5944638 0.6002252 < Finance
Business Analytics > 0.5659051 0.5486733 < Finance
Business Analytics > 0.6237119 0.59840333 < Finance
Business Analytics > 0.6146314 0.5944727 < Finance
Business Analytics > 0.5933651 0.5875113 < Finance
Business Analytics > 0.53494096 0.5400264 < Finance
Business Analytics > 0.5957383 0.59324324 < Finance
Business Analytics > 0.5735967 0.55999315 < Finance
Business Analytics > 0.5561476 0.5516491 < Finance
Business Analytics > 0.63485056 0.6261102 < Finance
Business Analytics > 0.5760484 0.57084954 < Finance
Business Analytics > 0.50652546 0.54035443 < Finance
Business Analytics > 0.48097265 0.5210133 < Finance
Business Analytics > 0.5137804 0.53711426 < Finance
Business Analytics > 0.48247588 0.5104301 < Finance
Business Analytics > 0.50692475 0.5131483 < Finance
Business Analytics > 0.51091284 0.52240324 < Finance
Business Analyt

Business Analytics > 0.4505623 0.4747767 < Finance
Business Analytics > 0.5088681 0.552269 < Finance
Business Analytics > 0.4346719 0.47646728 < Finance
Business Analytics > 0.5193084 0.5755898 < Finance
Business Analytics > 0.49793264 0.54458094 < Finance
Business Analytics > 0.4542544 0.4849895 < Finance
Business Analytics > 0.42961514 0.44964033 < Finance
Business Analytics > 0.5613984 0.55304337 < Finance
Business Analytics > 0.49743932 0.49157745 < Finance
Business Analytics > 0.5126545 0.4982753 < Finance
Business Analytics > 0.45689452 0.44134566 < Finance
Business Analytics > 0.4549155 0.4489923 < Finance
Business Analytics > 0.5045576 0.4898005 < Finance
Business Analytics > 0.52922034 0.4947793 < Finance
Business Analytics > 0.5908074 0.5645678 < Finance
Business Analytics > 0.4666096 0.5124639 < Finance
Business Analytics > 0.39110628 0.42719623 < Finance
Business Analytics > 0.52593184 0.564788 < Finance
Business Analytics > 0.31515172 0.34670323 < Finance
Business Analytic

Business Analytics > 0.5114718 0.5108171 < Finance
Business Analytics > 0.46745813 0.45708334 < Finance
Business Analytics > 0.49195048 0.4840826 < Finance
Business Analytics > 0.5837599 0.5620094 < Finance
Business Analytics > 0.5892568 0.56899554 < Finance
Business Analytics > 0.57988465 0.56546175 < Finance
Business Analytics > 0.6257912 0.60834587 < Finance
Business Analytics > 0.46901527 0.5059008 < Finance
Business Analytics > 0.33986223 0.3628598 < Finance
Business Analytics > 0.421201 0.46265727 < Finance
Business Analytics > 0.46275017 0.4950509 < Finance
Business Analytics > 0.46947193 0.46726766 < Finance
Business Analytics > 0.54550886 0.56528234 < Finance
Business Analytics > 0.45405394 0.475474 < Finance
Business Analytics > 0.5555799 0.59800094 < Finance
Business Analytics > 0.49965268 0.5357245 < Finance
Business Analytics > 0.5499723 0.5454277 < Finance
Business Analytics > 0.5250309 0.5060002 < Finance
Business Analytics > 0.5199498 0.5107741 < Finance
Business Analyt

In [102]:
# Share of the predictions in `preds` that name the Business Analytics degree;
# relies on `preds` still holding the Business Analytics loop's results.
ba_hits = preds.count('msc_ba')
print('\n\n\nAccuracy for predicting whether a question belongs to the Business Analytics degree:', ba_hits / len(preds))




Accuracy for predicting whether a question belongs to the Business Analytics degree: 0.4564831261101243
