# SEARCH ENGINEERING

In [1]:
#Import the required libraries
import pandas as pd
import numpy as np
from time import sleep
from random import randint
import unicodedata
import re

## Will combine all the 3 CSV based on the names from the google scholar crawling 

In [None]:
#add the ID column and drop the page index from crawling 
coventry_all = pd.read_csv('coventry1.csv')
coventry_all['ID'] = [x for x in range(1, len(coventry_all.values)+1)]
coventry_all.drop(coventry_all.columns[0], axis=1, inplace=True)
coventry_all.research_field=coventry_all.research_field.astype(str) #avoid been seen as a float 
coventry_all

## Combine Google Scholar data with Faculty data from CU based on researchers present in Google scholar

In [None]:
f1a = pd.DataFrame(pd.read_csv('coventry1.csv'))
f2a = pd.DataFrame(pd.read_csv('Finded_URL_All.csv'))
facultDept = pd.merge(f1a, f2a, how='left', left_on='names', right_on='pernames')
drop_cols = ['Unnamed: 0_x', 'Unnamed: 0_y', 'mainfaculty']
facultyDept = facultDept.drop(drop_cols, axis=1)
facultyDept = facultyDept.drop_duplicates(subset = 'names', keep = 'first') #remove duplicates
facultyDept.to_csv('facultyDept.csv')

In [None]:
facultyDept.head()

In [None]:
f3a = pd.DataFrame(pd.read_csv('facultyDept.csv'))
f4a = pd.DataFrame(pd.read_csv('cov_reasearch.csv'))
facultDept_1 = pd.merge(f3a, f4a, how='left', left_on='names', right_on='names_2')
drop_cols = ['Unnamed: 0_x', 'Unnamed: 0_y','pernames','names_2']
facultyDept_1 = facultDept_1.drop(drop_cols, axis=1)
facultyDept_1.to_csv('cov_facultyDept.csv')

In [None]:
facultyDept_1.head()

### Reading in required data

In [2]:
#add the ID column and drop the page index from crawling 
coventry_all_2 = pd.read_csv('cov_facultyDept.csv')
coventry_all_2['ID'] = [x for x in range(1, len(coventry_all_2.values)+1)]
coventry_all_2.drop(coventry_all_2.columns[0], axis=1, inplace=True)
coventry_all_2.research_field=coventry_all_2.research_field.astype(str) #avoid been seen as a float 
coventry_all_2.head()

Unnamed: 0,names,link,research_field,subfaculty,faculty,link_CU,research_interest,ID
0,Timothy Mason,https://scholar.google.co.uk/citations?hl=en&u...,"sonochemistry, ultrasound, chemistry, environm...",,,,,1
1,Gurnam Singh,https://scholar.google.co.uk/citations?hl=en&u...,"social work, race and racism, critical pedagog...",,,https://pureportal.coventry.ac.uk/en/persons/g...,Research Interests: Emancipatory research and ...,2
2,WD Li,https://scholar.google.co.uk/citations?hl=en&u...,,,,,,3
3,Dr. Mohammad M Ali,https://scholar.google.co.uk/citations?hl=en&u...,"Forecast Information Sharing, ARIMA Modelling,...",,,,,4
4,Petra Wark,https://scholar.google.co.uk/citations?hl=en&u...,"m/eHealth, epidemiology, primary prevention, d...",Faculty Research Centre for Intelligent Health...,Faculty of Health & Life Sciences,https://pureportal.coventry.ac.uk/en/persons/p...,Research Interests: Efficacy and effectiveness...,5


##### Using the google scholar data (with faculty and research interest if available)

In [None]:
#Checking missing data. Missing data are not going to be dropped as student might be looking for name of the professor 
print(coventry_all_2.dtypes)
print('-------------------------')
# to see where you're missing data and how much data is missing 
print(coventry_all_2.isnull().sum())

In [None]:
#coventry_all.rename(columns={'name':'names'}, inplace=True)

## INDEXER

An indexer of a search engine it's basically a smart storage of our data which we can later easily retrieve data given a search query. It parses the name and research field of the data scraped by the crawler to single words. All these words make up the vocabulary of our index. 
Next step is to put the ID of the data in the posting lists of the words that the data contains. For example data called "This happened today" will be stored in posting lists of terms "this", "happened" and "today". 
Before creating the index we preprocess the text of the data in order to get rid of useless information. We the text of accents and turn everything to lowercase. Next we perform lemmatization. This is slightly smarter version of stemming. Essentially, it's a word normalization, e.g. all nouns to singular, all verbs in present tense etc

#### Importing Libraries

In [3]:
import nltk
#nltk.download('averaged_perceptron_tagger')
#nltk.download('wordnet')
#nltk.download('stopwords')
#nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer 
import string


In [4]:
coventry_processed = coventry_all_2.copy()
coventry_all_2a = coventry_all_2.copy()

In [5]:
single_entry = coventry_processed.loc[0,:].copy()

In [6]:
single_entry

names                                                    Timothy Mason
link                 https://scholar.google.co.uk/citations?hl=en&u...
research_field       sonochemistry, ultrasound, chemistry, environm...
subfaculty                                                         NaN
faculty                                                            NaN
link_CU                                                            NaN
research_interest                                                  NaN
ID                                                                   1
Name: 0, dtype: object

#### Text preprocessing 

Turn text to lowercase and remove punctuations

In [7]:
def process_string(text):
    text = text.lower() #to lowercase
    text = text.translate(str.maketrans('', '', string.punctuation)) #strip punctuation
    return text

In [8]:
process_string(single_entry.research_field)

'sonochemistry ultrasound chemistry environment food technology'

Now, lemmatize, i.e. word normalization.

This method requires some additional information about the words. We need to find the word category of each word, e.g. verb, noun etc.

In [9]:
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

Test the function

In [10]:
print("Apple: {}\n Run: {}\n Happy: {}" .format(get_wordnet_pos("apple"), get_wordnet_pos("run"), get_wordnet_pos("happy")))

Apple: n
 Run: v
 Happy: a


We also need to remove stopwords, i.e. words with low informational value and repeats alot making the document bigger

In [11]:
stop = stopwords.words('english')

Now we'll iterate over all words in text, lemmatize and return the transformed string.

In [12]:
lem = WordNetLemmatizer()

def stop_lemmatize(doc):
    tokens = nltk.word_tokenize(doc)
    tmp = ""
    for w in tokens:
        if w not in stop:
            tmp += lem.lemmatize(w, get_wordnet_pos(w)) + " "
    return tmp

In [13]:
stop_lemmatize(doc = single_entry.research_field)

'sonochemistry , ultrasound , chemistry , environment , food technology '

In [14]:
%time process_string(single_entry.research_field)

Wall time: 0 ns


'sonochemistry ultrasound chemistry environment food technology'

Now we apply the process_string function to all names and research fields in our database.

In [None]:
print(coventry_all_2.dtypes)
print('--------------------------')
print(coventry_processed.dtypes)

In [15]:
def transform_df(df):
    df['names'] = df['names'].apply(process_string)
    df['research_field'] = df['research_field'].apply(process_string)
    

In [16]:
%time transform_df(coventry_processed)

Wall time: 7.98 ms


In [None]:
coventry_processed.head()

In [17]:
coventry_processed['text'] = coventry_processed['names'] + " " + coventry_processed['research_field']
drop_cols = ['names','link', 'research_field', 'subfaculty', 'faculty', 'link_CU', 'research_interest']
coventry_processed = coventry_processed.drop(drop_cols, axis=1)

In [None]:
coventry_processed.dtypes

In [18]:
def transform_df(df):
    df = df
    df['names'] = df['names'].apply(process_string)
    df['research_field'] = df['research_field'].apply(process_string)
    df['text'] = df['names'] + " " + df['research_field']
    drop_cols = ['names', 'research_field', 'link']
    df = df.drop(drop_cols, axis=1)
    return df

#### Build Index

First, we'll build index with just one entry.

In [19]:
single_entry = coventry_processed.loc[0,:].copy()
print(single_entry)

ID                                                      1
text    timothy mason sonochemistry ultrasound chemist...
Name: 0, dtype: object


Split the entry to single words and return list and save entry's ID as object.

In [20]:
words = single_entry.text.split()
ID = single_entry.ID
print(words)
print(ID)

['timothy', 'mason', 'sonochemistry', 'ultrasound', 'chemistry', 'environment', 'food', 'technology']
1


Each word in index' vocabulary is a dictionary key and has its own posting list with IDs. Let's construct one word vocabulary as example.

In [21]:
word = words[0]
sample = {word: [ID]}
print(sample)

{'timothy': [1]}


Now we iterate over all words and if they aren't in the vocabulary yet we add them. Also for each word we append the entry ID to the posting list.

In [22]:
index_test = {}
for word in words:
    if word in index_test.keys():
        index_test[word].append(ID)
    else:
        index_test[word] = [ID]

In [23]:
print(index_test)

{'timothy': [1], 'mason': [1], 'sonochemistry': [1], 'ultrasound': [1], 'chemistry': [1], 'environment': [1], 'food': [1], 'technology': [1]}


###### Now this process can be repeated for all entries in the database

In [24]:
def index_itd(single_entry, index):
    words = single_entry.text.split()
    ID = single_entry.ID
    for word in words:
        if word in index.keys():
            index[word].append(ID)
        else:
            index[word] = [ID]
    return index

In [58]:
index_itd(machine)

NameError: name 'machine' is not defined

In [25]:
ind = index_itd(single_entry=single_entry, index= {})
print(ind)

{'timothy': [1], 'mason': [1], 'sonochemistry': [1], 'ultrasound': [1], 'chemistry': [1], 'environment': [1], 'food': [1], 'technology': [1]}


Again we can iterate over all entries in the database, process them append to index.

In [26]:
def index_all(df, index):
    for i in range(len(df)):
        single_entry = df.loc[i,:]
        index = index_itd(single_entry = single_entry, index = index)
    return index

In [27]:
index = index_all(coventry_processed, index = {})
len(index)

2761

Finally we wrap everything in one nice function.

In [28]:
def build_index(df, index):
    to_add = transform_df(df)
    index = index_all(df = to_add, index = index)
    return index

In [29]:
idx = build_index(df = coventry_all_2, index = {})

In [30]:
len(idx)

2761

###### Ranked retrieval
The user would probably prefer the more relevant pages to be displayed before those that are less relevant (hopefully they're at least a bit relevant). For our search engine to support such option we need to store some information about the scraped documents that could be later used for this purpose. We'll use averaged word2vec for this purpose. Word2Vec model is single hidden-layer neural network. The hidden layer is actually what is so useful about this model. Given a word the layer's activation gives a unique vector that word. For each document we can iterate over all words, extract their vectors and then by averaging obtain a document vector. Compared to other methods averaged word2vec has multiple advantages. Unlike simpler methods such as bag-of-words, n-grams and tf-idf the size of the vectors is fixed. For example bag-of-words is also using vectors but the size of these vectors equals the number of unique words in the corpus. This means that the computational and storage requirements get larger as the corpus gets larger. Averaged word2vec is also able to represent the documents on more abstract level than simpler methods and should therefore provide better method of ranking. We're using word2vec rather than doc2vec because we can simply use pretrained word2vec model to compute the document vectors. Using doc2vec would mean training a neural network from scratch which requires computational power, time and rather large dataset.

Import and download pretrained word2vec model

In [31]:
import gensim
from gensim import models
import numpy as np

Load word2vec model

In [32]:
#DOWNLOAD PRETRAINED  NETWORK FROM https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
word2vec = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True, limit= 10**5)

In [33]:
def average_vectors(word2vec_model, doc):
    # remove out-of-vocabulary words
    doc = [word for word in doc if word in word2vec_model.vocab]
    if len(doc) == 0:
        return np.zeros(300)
    else:
        return np.mean(word2vec_model[doc], axis=0)

In [34]:
%time test_vec = average_vectors(word2vec, words)

Wall time: 0 ns


Now we can iterate over documents, compute their vectors and construct a document vectors database.

In [35]:
def prepare_ranking(df):
    corpus = df[['ID', 'text']].copy()
    doc_vecs = {}
    for i in range(len(corpus)):
        row = corpus.loc[i,:]
        text = row.text.split()
        doc_vecs[row.ID]=average_vectors(word2vec, text)
    doc_vecs = pd.DataFrame.from_dict(data=doc_vecs, orient="index")
    doc_vecs['ID'] = doc_vecs.index
    return doc_vecs

In [89]:
doc_vecs = prepare_ranking(df=coventry_all_2)
doc_vecs

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,ID
1,-0.004995,0.145549,-0.034017,0.059591,-0.135661,0.157700,0.003092,-0.249349,-0.066030,-0.110738,...,0.062927,-0.138468,0.027649,0.030111,0.122528,0.026530,-0.037150,0.069295,0.086675,1
2,0.022522,-0.079918,0.035995,0.025421,0.027618,0.024967,0.107208,-0.074802,0.044128,-0.098450,...,-0.015661,-0.152939,0.086594,-0.089752,0.048096,0.095215,-0.024162,0.064499,0.040466,2
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,3
4,-0.037937,-0.024832,-0.023218,0.063184,-0.047705,0.057776,0.000638,-0.115820,0.038379,0.114746,...,0.115570,-0.063135,0.002841,0.117480,-0.059705,0.042377,-0.107935,0.098145,-0.098505,4
5,-0.181356,0.044922,-0.045186,0.107686,0.062350,0.107951,0.099264,-0.093320,0.035726,-0.056599,...,-0.031942,0.058922,0.080709,0.116130,-0.001582,-0.014516,-0.065694,0.179199,0.082611,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
678,0.062280,0.011563,0.048730,0.213477,-0.025598,0.067212,0.133105,-0.093530,0.051514,0.089893,...,0.031836,0.012695,-0.027441,0.049124,-0.033838,0.100439,-0.083398,0.181696,0.035620,678
679,0.133586,0.060221,0.068899,0.165365,0.050496,-0.005371,0.177653,-0.137044,0.068766,-0.005046,...,-0.110748,-0.045492,0.017721,0.051107,0.009928,0.089681,-0.021891,-0.004557,-0.285156,679
680,-0.007812,0.117554,-0.021118,0.103394,0.010925,0.085999,0.187927,-0.206055,0.109253,-0.177490,...,0.095642,-0.137299,0.109314,0.003357,-0.052979,0.167908,-0.046844,0.260986,0.230835,680
681,-0.028483,0.090603,0.035319,0.136081,-0.021756,0.030965,0.196520,-0.060208,0.053752,0.002587,...,-0.046875,-0.014472,-0.046319,0.026260,0.168349,0.006951,-0.041292,0.089952,0.041477,681


##### Query processor
The final part of a search engine is a query processor which actually performs the search task. Given a query by user the processor should return list of relevant documents. There are multiple types of queries. We'll start with a simple "google-ish" query where assume the user looks for documents relevant to all words in the query. Therefore we transform the query to boolean by connecting all words with AND operator.

First, the processor preprocesses the query the same way as the indexer preprocessed the text. In other words, we normalize the query to match the format of text in the index. Next, the query is parsed to single words. We look into index if these words are part of the vocabulary. If a word is in index we retrieve its posting list. Finally, we look for intersection of all retrieved posting lists. The result is list of document IDs that the user asked for. However, we need to return something more useful than just a list of IDs. Therefore,we retrieve the information stored about the documents in the university database. Before printing the results we should also rank the documents. This ranking should be based on relevance to query.

To implement:
    Boolean query
    phrase matching

###### Normalize query¶
define an example query.

In [37]:
test = "Christopher Clarke"

In [38]:
print("User query: {}." .format(test))
test_norm = process_string(test)
print("Normalized query: {}." .format(test_norm))

User query: Christopher Clarke.
Normalized query: christopher clarke.


Now we split the query into words.

In [39]:
test_split = test_norm.split()

And we wrap this in function

In [40]:
def process_query(query):
    norm = process_string(query)
    return norm.split()

##### Retrieve from index¶
And we iterate over the words, looking if they're in the index vocabulary. If so then we retrieve the associated posting list.

In [41]:
retrieved = []
for word in test_split:
    if word in index.keys():
        retrieved.append(index[word])
        print(retrieved)

[[24, 37, 504]]
[[24, 37, 504], [93, 213]]


Now we look for the intersection of all posting lists

In [42]:
def lists_intersection(lists):
    intersect = list(set.intersection(*map(set, lists)))
    intersect.sort()
    return intersect
lists_intersection(retrieved)

[]

Let's wrap this part in a function before proceeding to formatting the results. The additional if statement is for cases when there's nothing retrieved.

In [43]:
def search_googleish(query, index=idx):
    query_split = process_query(query)
    retrieved = []
    for word in query_split:
        if word in index.keys():
            retrieved.append(index[word])
    if len(retrieved)>0:
        result = lists_intersection(retrieved)
    else:
        result = ['No Information Found']
    return result

In [44]:
result_IDs = search_googleish("virus", index)
print(result_IDs)

['No Information Found']


TO DO: If there's no document retrieved, try removing one term and looking for simplified query + tell user that such document doesn't include term X.

### Retrieve from our database
Now we need to connect the retrieved IDs with some useful information stored in database that we first use to refine the results and then to print nice result to user.

In [45]:
#this is our database
#meta = coventry_all_2.drop(['text'], axis=1).copy()
meta = coventry_all_2a.copy()
meta.head()

Unnamed: 0,names,link,research_field,subfaculty,faculty,link_CU,research_interest,ID
0,Timothy Mason,https://scholar.google.co.uk/citations?hl=en&u...,"sonochemistry, ultrasound, chemistry, environm...",,,,,1
1,Gurnam Singh,https://scholar.google.co.uk/citations?hl=en&u...,"social work, race and racism, critical pedagog...",,,https://pureportal.coventry.ac.uk/en/persons/g...,Research Interests: Emancipatory research and ...,2
2,WD Li,https://scholar.google.co.uk/citations?hl=en&u...,,,,,,3
3,Dr. Mohammad M Ali,https://scholar.google.co.uk/citations?hl=en&u...,"Forecast Information Sharing, ARIMA Modelling,...",,,,,4
4,Petra Wark,https://scholar.google.co.uk/citations?hl=en&u...,"m/eHealth, epidemiology, primary prevention, d...",Faculty Research Centre for Intelligent Health...,Faculty of Health & Life Sciences,https://pureportal.coventry.ac.uk/en/persons/p...,Research Interests: Efficacy and effectiveness...,5


Query from database to get only rows of retrieved IDs

In [46]:
def connect_id_df(retrieved_id, df):
    return df[df.ID.isin(retrieved_id)].reset_index(drop=True)

In [88]:
result_meta = connect_id_df(result_IDs, meta)
result_meta.head(5)

Unnamed: 0,names,link,research_field,subfaculty,faculty,link_CU,research_interest,ID


#### Ranked retrieval
Now we return back to the word2vec vectors we computed after indexing the documents. We'l compute the vector for the query as well and then using a cosine similarity compare query to retrieved document relevance.

Compute vector for query

In [48]:
query_vec = average_vectors(word2vec, test_split)

Retrieve vectors of retrieve documents.

In [49]:
result_vecs = connect_id_df(result_IDs, doc_vecs)

Compute cosine similarity between retrieved documents and query

In [50]:
def cos_similarity(a, b):
    dot = np.dot(a, b)
    norma = np.linalg.norm(a)
    normb = np.linalg.norm(b)
    cos = dot / (norma * normb)
    return(cos)

In [51]:
cos_sim = []
for i in range(len(result_vecs)):
    doc_vec = result_vecs.loc[i,:].drop(['ID'])
    cos_sim.append(cos_similarity(doc_vec, query_vec))
result_meta['rank'] = cos_sim

Sort retrieved docs by cosine similarity which is proxi for relevance.

In [52]:
result_meta.sort_values('rank', axis=0)

Unnamed: 0,names,link,research_field,subfaculty,faculty,link_CU,research_interest,ID,rank


In [53]:
cov_dept_1 = pd.merge(coventry_all_2a, result_meta, how='right', left_on='ID', right_on='ID')
cov_dept_1 = cov_dept_1[cov_dept_1.columns.drop(list(cov_dept_1.filter(regex='_y')))]
cov_dept_1.columns = cov_dept_1.columns.str.replace('_x','')
#cov_dept_1

Unnamed: 0,names,link,research_field,subfaculty,faculty,link_CU,research_interest,ID,rank


Wrap this in function

In [54]:
def rank_results(query, results):
    query_norm = process_query(query)
    query_vec = average_vectors(word2vec, query_norm)
    result_vecs = connect_id_df(results.ID, doc_vecs)
    cos_sim = []
    for i in range(len(result_vecs)):
        doc_vec = result_vecs.loc[i,:].drop(['ID'])
        cos_sim.append(cos_similarity(doc_vec, query_vec))
    results['rank'] = cos_sim
    results = results.sort_values('rank', axis=0)
    return results

In [None]:
final_result = rank_results("Christopher Clarke", result_meta)


## Print results to user

In [85]:
def print_results(result_df):
    for i in range(len(result_df)):
        res = result_df.loc[i, :]
        print(res.names)
        print("research field: ", res.research_field)
        print('%.200s'  % res.research_interest )
        print("Subfaculty: ", res.subfaculty)
        print("Faculty: ", res.subfaculty)
        print("CU link: ", res.link_CU)
        print("GScolar link", res.link)
        #if i == len(result_df):
        #    print("Scholar link:", res.link)
        #else:
        #    print("{}\n" .format(res.link))
        print("------------------------------------")

### Put it all together

In [86]:
def search(query, dat=None):
    result = search_googleish(query)
    result = connect_id_df(result, meta)
    result = rank_results(query, result)
    print_results(result)

In [87]:
query = input("Search for:")
print('*******************')
search(query)

Search for:machine
*******************
Kevin Warwick
research field:  Biomedical Engineering, Machine Intelligence, Bioethics, Cybernetics, Cyborgs
Research Interests: Cyborgs; Control
; Robotics
; Biomedical systems 
; Artificial intelligence
Subfaculty:  Faculty Research Centre for Data Science
Faculty:  Faculty Research Centre for Data Science
CU link:  https://pureportal.coventry.ac.uk/en/persons/kevin-warwick
GScolar link https://scholar.google.co.uk/citations?hl=en&user=TfTyMZQAAAAJ
------------------------------------
Vasile Palade
research field:  Machine Learning, Data Science
Research Interests: Machine Learning and Applications; Deep learning; Image Processing
Subfaculty:  Faculty Research Centre for Data Science
Faculty:  Faculty Research Centre for Data Science
CU link:  https://pureportal.coventry.ac.uk/en/persons/vasile-palade
GScolar link https://scholar.google.co.uk/citations?hl=en&user=KTXoxysAAAAJ
------------------------------------
Chitta Saha
research field:  Ener