In [None]:
import pickle, random, os, json, spacy, seaborn, requests, string, copy
import pandas as pd
import pyterrier as pt 
from autocorrect import Speller
from pyterrier.measures import *
from pathlib import *
from bs4 import BeautifulSoup

#### learning to rank
import fastrank
import lightgbm as lgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

#### setting(jupyter notebook)
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

#### setting(pandas)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', -1)

In [None]:
def getSynonymsByGoogle(query):
    url = "https://www.google.com/search?q=" + query
    res = requests.get(url) 
    Soup = BeautifulSoup(res.text,'html.parser') 
    links = Soup.findAll('a')
    link_img = None
    for link in links:
        if 'Images' in link.get_text():
            link_img = link.get('href')
            break
    synonyms = []
    if link_img:
        url = 'https://www.google.com'+link_img
        res = requests.get(url)
        Soup = BeautifulSoup(res.text,'html.parser') 
        span_synonyms = Soup.find_all("a",class_= "TwVfHd")
        for span in span_synonyms:
            synonyms.append(span.get_text('innerText'))
    return synonyms

def isVerb(text):
    """Check if is verb."""
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)
    return doc[0].pos_ == 'VERB'

def RemoveOneChar(s):
    idx_rm = random.randint(0,len(s)-1)
    return s[:idx_rm]+s[idx_rm+1:]

def InsertOneChar(s):
    char = random.choice(string.ascii_uppercase)
    idx_insert = random.randint(0,len(s)-1)
    return s[:idx_rm]+char+s[idx_rm:]

def replaceOneChar(s):
    char = random.choice(string.ascii_uppercase)
    idx_insert = random.randint(0,len(s)-1)
    return s[:idx_rm]+char+s[idx_rm+1:]

In [188]:
class SearchEngine:
    def __init__(self,pt):
        self.pt = pt    
        self.setPaths()
        self.spell = Speller()
        ## spell('temwork') => teamwork
        
    def splitDataSet(self):
        RANK_CUTOFF = 10
        SEED=42
        self.train_topics, self.test_topics = train_test_split(self.df_query, test_size=4, random_state=SEED)

        
    def setPaths(self): 
        self.paths = dict()
        self.paths['dataset'] = os.path.join(Path(os.getcwd()).parent.absolute(),'dataset')
        self.paths['data_txt']  = os.path.join(self.paths['dataset'], 'data_txt')
        self.paths['data_json'] = os.path.join(self.paths['dataset'], 'data_json')
        self.paths['data_csv']  = os.path.join(self.paths['dataset'], 'pyterrier-index.csv')
        self.paths['pt_index']  = './resume_index'
        self.paths['synonymDict']  = '../dataset/query_expansion.pkl'
        self.paths['query']  = '../dataset/query.csv'
        self.paths['qrels']  = '../dataset/qrel/'
        
    def JsontoDataFrame(self, profiles_json, split=True):
        rows = []
        docno = 0
        for profile in profiles_json:
            meta_data = {
                "education": json.dumps(profile["Education"]),
                "company": None,
                "title": None,
                "period": None,
            }
            for expBlock in profile["Experience"]:
                meta_data["company"] = expBlock["company"]
                meta_data["title"]   = expBlock["title"]
                meta_data["period"]  = expBlock["period"]
                if expBlock["description"]:
                    bulletpoints = expBlock["description"].split('\n') if split else [expBlock["description"]]
                    for bulletpoint in bulletpoints:
                        if len(bulletpoint.split()) >= 5:
                            row = [str(docno),bulletpoint]
                            for item in meta_data.values():
                                row.append(item)
                            rows.append(row)
                            docno += 1
        docs_df =   pd.DataFrame(rows,columns= ['docno','text']+list(meta_data.keys()) )
        return docs_df

    def loadDocs(self, verbose=False):
        if verbose:
            print('----------------load profiles(documents) to profiles----------------')
        
        if not os.path.exists(self.paths['data_csv']):
            # get raw json files & preprocess
            profiles_json = []
            for filename in os.listdir(self.paths['data_json']):
                path_profile  = os.path.join( self.paths['data_json'], filename  )
                with open(path_profile) as ptr:
                    profile_json = json.load(ptr)
                    profiles_json.append(profile_json)
            profiles = self.JsontoDataFrame(profiles_json)

        else:
            profiles = pd.read_csv(self.paths['data_csv'])
            profiles['docno'] = profiles['docno'].astype(str)
            profiles['company'] =  profiles['company'].astype(str)
        self.profiles = profiles
    
    def loadQueries(self, misspell = False, autoCorrect = False, expand = True, verbose=False):
        """Load queries."""
        if verbose:
            print('----------------load queries to df_query----------------')
            
        df_query = pd.read_csv(self.paths['query'])
        df_query['qid'] = df_query['qid'].astype(str)
        self.df_query = df_query
        
        if misspell:
            self.makeMisspelling(numReplace = 0, numRemove = 0, numInsert = 1, verbose=verbose)
            
        if autoCorrect:
            self.autoCorrectQueris(verbose=verbose)
            
        if expand:
            self.expandQuery(verbose=verbose)
        
    def loadQrels(self, verbose=False):
        """Load labeled 7 queries."""
        if verbose:
            print('----------------load queries & labels to qrels----------------')
            
        queries = []
        for i in range(1, 15):
            subpath = self.paths['qrels'] + f'query - {i}.csv'
            if verbose:
                print('load the labels for query - %d'%(i),subpath)
            df = pd.read_csv(subpath)
            queries.append(df)
        qrels = pd.concat(queries)

        qrels = qrels.rename(columns={"score": "label"})

        qrels = qrels.drop(['education', 'company', 'title', 'period', 'text'], axis=1)
        qrels = qrels.reset_index(drop=True)
        qrels = qrels.fillna(-1)

        qrels['qid'] = qrels['qid'].astype(str)
        qrels['docno'] = qrels['docno'].astype(str)
        qrels['label'] = qrels['label'].astype(int)
        self.qrels = qrels
    
    def expandQuery(self,verbose=False):
        """Expand Query with specific weights."""
        if verbose:
            print('----------------expand the queries by appending synonyms----------------')
        
        for ind in self.df_query.index:
            query_tmp = self.df_query['query'][ind]
            synonyms = getSynonymsByGoogle(query_tmp)
            query_expand = [query_tmp]*(len(synonyms)*2) + synonyms
            self.df_query['query'][ind] = ' '.join(query_expand)
            if verbose:
                print(query_tmp,synonyms)
    
    def makeMisspelling(self, numReplace=1, numRemove = 1, numInsert = 1,verbose=False):
        """Reload Query and randomly insert & remove chars"""
        if verbose:
            print('----------------generate misspelling by randomly insert&remove char from original queries----------------')
        for ind in self.df_query.index:
            query_tmp = self.df_query['query'][ind]
            query_misspelling = query_tmp
            
            numRemove = min(numRemove,len(query_misspelling)-1)
            for ite in range(numReplace):
                query_misspelling = replaceOneChar(query_misspelling)
            for ite in range(numRemove):
                query_misspelling = RemoveOneChar(query_misspelling)
            for ite in range(numInsert):
                query_misspelling = InsertOneChar(query_misspelling)
            
                
            self.df_query['query'][ind] = query_misspelling
            if verbose:
                print(query_tmp,' => ',query_misspelling)
    
    def autoCorrectQueris(self,verbose=False):
        if verbose:
            print('----------------Autocorrection----------------')
            
        for ind in self.df_query.index:
            query_tmp = self.df_query['query'][ind]
            query_tmp2 = query_tmp.split()
            query_autoCorrect = ' '.join([self.spell(query) for query in query_tmp2])
            self.df_query['query'][ind] = query_autoCorrect
            if verbose:
                print(query_tmp, ' => ' ,query_autoCorrect)
                
    def indexing(self, verbose=False):
        if verbose:
            print('----------------index the profiles(documents)----------------')
            
        ### load index fils to index obj
        ### If index fils are not existed, create index files by indexer
        #if not os.path.exists(paths['pt_index'] + "/data.properties"):
        if True:
            index_dir = self.paths['pt_index']
            indexer = self.pt.DFIndexer(index_dir, overwrite=True)
            index_ref = indexer.index(self.profiles["text"], self.profiles["docno"], self.profiles["education"], self.profiles["title"], self.profiles["company"], self.profiles["period"], self.profiles["text"])
            #index_ref = indexer.index(profiles["text"], profiles["docno"])

            ### load the index
            index = self.pt.IndexFactory.of(index_ref)
        else:
            index = self.pt.IndexFactory.of(paths['pt_index'])
        
        if verbose:
            ### show the stat 
            print(index.getCollectionStatistics().toString())
   
        self.index = index
        
    def loadIRmodels(self,modelNames = ['TF_IDF','BM25','DPH'], verbose = False):
        if verbose:
            print('----------------load IR models (default: TF_IDF, BM25, DPH)----------------')
        self.models = {  modelName: self.pt.BatchRetrieve(self.index, wmodel=modelName) for modelName in modelNames}
    
    def learning_to_rank(self):
        RANK_CUTOFF = 20
        SEED = 10
        bm25 = self.models['BM25']
        pt   = self.pt
        index = self.index
        
        ltr_feats2 = (bm25 % RANK_CUTOFF) >> pt.text.get_text(index, ["text","title", "education", "company", "period"]) >> (
                            pt.transformer.IdentityTransformer()
                            ** 
                            (pt.apply.query(lambda row: 'test') >> bm25)
                            )
        fnames=["isVerb"]
        
        ### random forest pipe
        rf = RandomForestRegressor(n_estimators=400, verbose=1, random_state=SEED, n_jobs=8)
        rf_pipe2 = ltr_feats2 >> pt.ltr.apply_learned_model(rf)
        %time rf_pipe2.fit(self.df_query, self.qrels)
        self.models['Random_Forest'] = rf_pipe2
        
    def search(self,query,modelName):
        if modelName not in self.models.keys():
            print('Model not found, Available:',list(self.models.keys()))
        else:
            return self.models[modelName].search(query)
             

# Set up ResumeHelper

In [189]:
if not pt.started():
    pt.init()
ResumeHelper = SearchEngine(pt)
ResumeHelper.loadDocs(verbose= True)
ResumeHelper.loadQueries(misspell=False,autoCorrect=False,expand=False, verbose= True)
ResumeHelper.loadQrels(verbose= True)
ResumeHelper.indexing(verbose= True)
ResumeHelper.loadIRmodels(verbose= True)
ResumeHelper.df_query

----------------load profiles(documents) to profiles----------------
----------------load queries to df_query----------------
----------------load queries & labels to qrels----------------
load the labels for query - 1 ../dataset/qrel/query - 1.csv
load the labels for query - 2 ../dataset/qrel/query - 2.csv
load the labels for query - 3 ../dataset/qrel/query - 3.csv
load the labels for query - 4 ../dataset/qrel/query - 4.csv
load the labels for query - 5 ../dataset/qrel/query - 5.csv
load the labels for query - 6 ../dataset/qrel/query - 6.csv
load the labels for query - 7 ../dataset/qrel/query - 7.csv
----------------index the profiles(documents)----------------
Number of documents: 363
Number of terms: 1510
Number of postings: 4637
Number of fields: 0
Number of tokens: 4906
Field names: []
Positions:   false

----------------load IR models (default: TF_IDF, BM25, DPH)----------------


Unnamed: 0,qid,query
0,1,sql
1,2,Information retrieval nlp
2,3,teamwork
3,4,web front end
4,5,aws cloud
5,6,machine learning
6,7,data visualization
7,8,sdk platform build
8,9,data analytics
9,10,kubernetes container


In [190]:
ResumeHelper.loadQueries(misspell=True,autoCorrect=True,expand=False, verbose= True)

----------------load queries to df_query----------------
----------------generate misspelling by randomly insert&remove char from original queries----------------
sql  =>  sqlY
Information retrieval nlp  =>  InfoQrmation retrieval nlp
teamwork  =>  teamVwork
web front end  =>  web Gfront end
aws cloud  =>  aws Xcloud
machine learning  =>  machXine learning
data visualization  =>  dataN visualization
sdk platform build  =>  sdk Rplatform build
data analytics  =>  dataP analytics
kubernetes container  =>  kubeUrnetes container
rest api request  =>  restL api request
quality integrity  =>  qualZity integrity
site-reliability  =>  siteG-reliability
object-oriented  =>  objeHct-oriented
design pattern  =>  desiZgn pattern
data learning model  =>  dataC learning model
web security  =>  web Tsecurity
distributed infrastructure  =>  distAributed infrastructure
system design latency  =>  systYem design latency
architecture system design quality  =>  archRitecture system design quality
---------

# Demo the usage of ResumeHelper (You can skip this block)

In [187]:
docids = ResumeHelper.search(query = 'sql', modelName='DPH')['docid'][:10]
print('Query: sql')
print('IR model: DPH')
print()
print('Top10 relevent documents:')
for rank,docid in enumerate(docids):
    doc = ResumeHelper.profiles.loc[ResumeHelper.profiles['docno'] == str(docid)]['text'].to_string()
    print(doc)

Query: sql
IR model: DPH

Top10 relevent documents:
91    • Analyzed clients’ requirements, designed ER models and constructed corresponding databases using SQL server including 200k records; gathered comprehensive profiles of cars and customers by SQL queries
96    • Accelerated data processing by 50% by automating data retrieval and extraction with SQL queries
152    • Wrote stored procedures, functions, and packages using PL/SQL, reducing project time by 25%
216    • Cooperated with engineers and PMs to design image recommendation feature; analyzed relationship between image usage and ad performance using TB level data (SQL, C#)
243    •Designed an automated customer retention system with Python and SQL to complete weekly client status report for strategy development. Increased customer retention rate by 15%.
176    - Use SQL / python to combine various source of financial data
347    - Implement front-end and back-end for various projects with multiple programming languages and pla

# Exp (w/o query expansion)

In [192]:
# load the original queries
ResumeHelper.loadQueries(misspell=False, autoCorrect=False, expand=False, verbose= False)

# Train RF with origianl queries
ResumeHelper.learning_to_rank()

# Run exp.
ResumeHelper.pt.Experiment(
    retr_systems = [ResumeHelper.models['TF_IDF'],ResumeHelper.models['BM25'],ResumeHelper.models['DPH'],ResumeHelper.models['Random_Forest']],
    topics = ResumeHelper.df_query.loc[1:7],
    qrels = ResumeHelper.qrels,
    names = ['TF_IDF','BM25','DPH','Random_Forest'],
    eval_metrics=["map", NDCG@5 ,NDCG@10] )

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 400 out of 400 | elapsed:    0.4s finished


CPU times: user 1.38 s, sys: 297 ms, total: 1.67 s
Wall time: 882 ms


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 400 out of 400 | elapsed:    0.0s finished


Unnamed: 0,name,map,nDCG@5,nDCG@10
0,TF_IDF,0.383435,0.71827,0.672158
1,BM25,0.383491,0.71827,0.672158
2,DPH,0.400839,0.765254,0.698722
3,Random_Forest,0.36659,0.762776,0.730332


# Exp (with query expansion)

In [193]:
# load the original queries => query expansion
ResumeHelper.loadQueries(misspell=False, autoCorrect=False, expand=True, verbose= False)

# Train RF with augmented queries
ResumeHelper.learning_to_rank()

# Run exp.
ResumeHelper.pt.Experiment(
    retr_systems = [ResumeHelper.models['TF_IDF'],ResumeHelper.models['BM25'],ResumeHelper.models['DPH'],ResumeHelper.models['Random_Forest']],
    topics = ResumeHelper.df_query.loc[1:7],
    qrels = ResumeHelper.qrels,
    names = ['TF_IDF','BM25','DPH','Random_Forest'],
    eval_metrics=["map", NDCG@5 ,NDCG@10] )

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 400 out of 400 | elapsed:    0.4s finished


CPU times: user 1.01 s, sys: 314 ms, total: 1.32 s
Wall time: 924 ms


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 400 out of 400 | elapsed:    0.1s finished


Unnamed: 0,name,map,nDCG@5,nDCG@10
0,TF_IDF,0.494266,0.7684,0.771243
1,BM25,0.496571,0.7684,0.771243
2,DPH,0.506957,0.806511,0.797315
3,Random_Forest,0.48239,0.985868,0.940411


In [194]:
# load the original queries => query expansion
ResumeHelper.loadQueries(misspell=True, autoCorrect=False, expand=False, verbose= False)

# Train RF with augmented queries
ResumeHelper.learning_to_rank()

# Run exp.
ResumeHelper.pt.Experiment(
    retr_systems = [ResumeHelper.models['TF_IDF'],ResumeHelper.models['BM25'],ResumeHelper.models['DPH'],ResumeHelper.models['Random_Forest']],
    topics = ResumeHelper.df_query.loc[1:7],
    qrels = ResumeHelper.qrels,
    names = ['TF_IDF','BM25','DPH','Random_Forest'],
    eval_metrics=["map", NDCG@5 ,NDCG@10] )

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 400 out of 400 | elapsed:    0.4s finished


CPU times: user 664 ms, sys: 234 ms, total: 898 ms
Wall time: 651 ms


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 400 out of 400 | elapsed:    0.1s finished


Unnamed: 0,name,map,nDCG@5,nDCG@10
0,TF_IDF,0.309748,0.682605,0.601278
1,BM25,0.309748,0.682605,0.601278
2,DPH,0.327497,0.740909,0.632336
3,Random_Forest,0.298993,0.740909,0.668672


In [195]:
# load the original queries => query expansion
ResumeHelper.loadQueries(misspell=True, autoCorrect=True, expand=False, verbose= False)

# Train RF with augmented queries
ResumeHelper.learning_to_rank()

# Run exp.
ResumeHelper.pt.Experiment(
    retr_systems = [ResumeHelper.models['TF_IDF'],ResumeHelper.models['BM25'],ResumeHelper.models['DPH'],ResumeHelper.models['Random_Forest']],
    topics = ResumeHelper.df_query.loc[1:7],
    qrels = ResumeHelper.qrels,
    names = ['TF_IDF','BM25','DPH','Random_Forest'],
    eval_metrics=["map", NDCG@5 ,NDCG@10] )

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 400 out of 400 | elapsed:    0.4s finished


CPU times: user 746 ms, sys: 282 ms, total: 1.03 s
Wall time: 792 ms


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 400 out of 400 | elapsed:    0.1s finished


Unnamed: 0,name,map,nDCG@5,nDCG@10
0,TF_IDF,0.365688,0.644799,0.616955
1,BM25,0.365744,0.644799,0.616955
2,DPH,0.38232,0.701326,0.650689
3,Random_Forest,0.348325,0.698849,0.682299


In [196]:
# load the original queries => query expansion
ResumeHelper.loadQueries(misspell=True, autoCorrect=True, expand=True, verbose= False)

# Train RF with augmented queries
ResumeHelper.learning_to_rank()

# Run exp.
ResumeHelper.pt.Experiment(
    retr_systems = [ResumeHelper.models['TF_IDF'],ResumeHelper.models['BM25'],ResumeHelper.models['DPH'],ResumeHelper.models['Random_Forest']],
    topics = ResumeHelper.df_query.loc[1:7],
    qrels = ResumeHelper.qrels,
    names = ['TF_IDF','BM25','DPH','Random_Forest'],
    eval_metrics=["map", NDCG@5 ,NDCG@10] )

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 400 out of 400 | elapsed:    0.4s finished


CPU times: user 936 ms, sys: 335 ms, total: 1.27 s
Wall time: 924 ms


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 400 out of 400 | elapsed:    0.1s finished


Unnamed: 0,name,map,nDCG@5,nDCG@10
0,TF_IDF,0.453224,0.69493,0.723961
1,BM25,0.455761,0.69493,0.723961
2,DPH,0.470159,0.74898,0.762009
3,Random_Forest,0.445887,0.953788,0.901664


In [None]:
# load the original queries => query expansion
ResumeHelper.loadQueries(misspell=True, autoCorrect=True, expand=False, verbose= False)

# Train RF with augmented queries
ResumeHelper.learning_to_rank()

# Run exp.
ResumeHelper.pt.Experiment(
    retr_systems = [ResumeHelper.models['TF_IDF'],ResumeHelper.models['BM25'],ResumeHelper.models['DPH'],ResumeHelper.models['Random_Forest']],
    topics = ResumeHelper.df_query.loc[1:7],
    qrels = ResumeHelper.qrels,
    names = ['TF_IDF','BM25','DPH','Random_Forest'],
    eval_metrics=["map", NDCG@5 ,NDCG@10] )