## COPA Enhanced Search Usability Testing

This script generates HTML pages with re-ranked search results for the COPA project's model test. The code changes the order of the search results by modifying the original search HTML pages. COPA's re-ranking model can then be tested by surveying several COPA members and letting them score the new search result panels.

### Import Packages

In [0]:
import pandas as pd
import bs4 as bs
import re
import requests
from selenium import webdriver
from tqdm import tqdm_notebook as tqdm
from time import sleep
from os import listdir
from os.path import isfile, join, isdir
import random
import itertools
from pywebcopy import save_webpage

### Generate HTML Test Panels

In [0]:
# load the existing table of test links; the panel-generation loop below
# matches rows on its '0' and '1' columns and fills in 'Panel Type'
test_links_path = './test_links.csv'
test_links = pd.read_csv(test_links_path)

In [0]:
# project directory that holds the ratio files and the per-query folders
mypath = '/Users/jeloretizo/Documents/MSBA/Practicum Project/COPA-Usability-Testing'

# keep only the regular files in that directory whose name contains 'ratio'
ratio_files = [
    entry
    for entry in listdir(mypath)
    if 'ratio' in entry and isfile(join(mypath, entry))
]

In [0]:
# the test search queries prescribed by the MIP, one per generated panel set
query_strings = [
    'brakes during taxi',
    'capture glide slope above',
    'stall under hood',
    'find flight instructor',
    'oxygen altitude',
    'file nasa report',
    'loose fairing noise',
    'loose seat belt',
    'power off landing',
    'music volume low',
]

In [0]:
# file names of the four test HTML panels: Panel1.html through Panel4.html
panel_labels = ['Panel{}.html'.format(number) for number in range(1, 5)]

In [0]:
def get_id(link):
    """Returns result id

    This function parses the id from the url link passed as argument. The id
    is the first occurrence of either two consecutive numeric path segments
    (e.g. '/123/456') or a single numeric segment at the very end of the link
    (e.g. '/789'). An IndexError is raised when the link contains neither.
    """

    candidates = re.findall(r'\/[0-9]+\/[0-9]+|\/[\d]+$', link)
    # only the first match is used; indexing fails if nothing matched
    return candidates[0]

In [0]:
def get_href_id(html):
    """Returns the id for a search result

    This function locates the first element with the search-link class inside
    the given html node, reads its href attribute and passes that value to the
    get_id helper, whose result is returned.
    """

    anchor = html.find(class_='search-link')
    target_url = anchor['href']
    return get_id(target_url)

In [0]:
def get_rerank_list(soup, rerank):
    """Returns reordered list of search results

    This function reranks search results according to the order prescribed by the model. It receives two inputs;
    soup refers to the parsed html page and rerank contains the new order of the search results (a DataFrame
    with a 'url-id' column, already sorted in the desired order).

    The return value is a list of the page's fps-result elements in the new order. Results whose id does not
    appear in rerank get no rank and are placed after all ranked results, keeping their original relative
    order. An empty list is returned when the page has no fps-result elements.
    """

    # get all html groups with class fps-result
    all_items = soup.find_all(class_='fps-result')
    if not all_items:
        # nothing to reorder; building the frame below would fail on column 3
        return []

    html_links = pd.DataFrame(all_items).reset_index()
    html_links = html_links.rename(columns={'index': 'old_rank'})
    # NOTE(review): column 3 is presumably the fourth child node of each
    # result element and is expected to contain the search-link anchor --
    # confirm against the page markup
    html_links['id'] = html_links[3].apply(get_href_id)

    # after the reset, the positional index of rerank is the new rank
    rerank = rerank.reset_index(drop=True)
    for ID in html_links['id']:
        # literal substring match; missing url-ids count as "no match"
        is_match = rerank['url-id'].str.contains(ID, regex=False, na=False)
        matches = rerank.loc[is_match].index
        new_rank = matches[0] if len(matches) else None
        html_links.loc[html_links['id'] == ID, 'New Rank'] = new_rank

    # stable sort so tied and unranked results keep their original order
    new_order = list(html_links.sort_values('New Rank',
                                            ascending=True,
                                            kind='mergesort')['old_rank'])
    final_order = [all_items[index]
                   for index in new_order]
    return final_order

In [0]:
# Build the survey panels for every test query: one control panel holding the
# untouched search page and one re-ranked panel per model ratio file. Which
# panel file each variant lands in is randomized per query and recorded in the
# 'Panel Type' column of test_links.

# the last shuffled label is reserved for the control panel, so extra ratio
# files would overwrite the control panel or run past the end of the labels
if len(ratio_files) > len(panel_labels) - 1:
    raise ValueError(
        'Found {} ratio files but only {} non-control panels are available'.format(
            len(ratio_files), len(panel_labels) - 1))

# load every model ratio table once instead of re-reading it for each query
ratio_tables = []
for ratio_file in ratio_files:
    ratio = pd.read_csv(join(mypath, ratio_file))
    ratio['url-id'] = ratio['url'].apply(get_id)
    ratio_tables.append((ratio_file, ratio))

# iterate through all query strings
for query in tqdm(query_strings):

    # randomize panel labels; the last one is used for the control panel
    random_panels = random.sample(panel_labels, len(panel_labels))
    control_panel = random_panels[-1]
    model_panels = random_panels[:-1]
    folder_name = query.title().replace(' ', '')
    query_path = join(mypath, folder_name)

    # the base search page is the only non-directory entry named after the query
    files = [f for f in listdir(query_path)
             if query in f and not isdir(join(query_path, f))]
    if len(files) != 1:
        print(query, ' -- Skipped: expected exactly one base HTML file, found',
              len(files))
        continue

    # read the original page once; every panel is parsed from this same text
    with open(join(query_path, files[0]), 'r') as open_file:
        base_html = open_file.read()

    # save the original search in a new html panel
    print(query, ' -- Control Panel Printed in:', control_panel)
    test_links.loc[(test_links['0'] == folder_name) &
                   (test_links['1'] == control_panel),
                   'Panel Type'] = 'Control'
    with open(join(query_path, control_panel), 'w') as output_file:
        output_file.write(str(bs.BeautifulSoup(base_html, 'html.parser')))

    # iterate through each model ratio file and generate its re-ranked panel
    for panel_label, (ratio_file, ratio) in zip(model_panels, ratio_tables):

        query_ratio = ratio.loc[ratio['Query'] == query]

        # rank against a fresh parse of the original page, so the result never
        # depends on the ordering produced for a previous model
        source_soup = bs.BeautifulSoup(base_html, 'html.parser')
        final_list = get_rerank_list(source_soup, query_ratio)

        # a second parse receives the elements; moving them within the tree
        # they came from would shift the remaining results
        panel_soup = bs.BeautifulSoup(base_html, 'html.parser')
        targets = panel_soup.find_all(class_='fps-result')
        for target, replacement in zip(targets, final_list):
            target.replace_with(replacement)

        # print out panel information and record the same on the experimental design document
        print(query, ' -- Model Ratio: ', ratio_file,
              ' -- Printed in: ', panel_label)
        test_links.loc[(test_links['0'] == folder_name) &
                       (test_links['1'] == panel_label),
                       'Panel Type'] = ratio_file

        # save the new test panel as html
        with open(join(query_path, panel_label), 'w') as output_file:
            output_file.write(str(panel_soup))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """Entry point for launching an IPython kernel.


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

brakes during taxi  -- Control Panel Printed in: Panel1.html
brakes during taxi  -- Model Ratio:  ratio_comb3.csv  -- Printed in:  Panel2.html
brakes during taxi  -- Model Ratio:  ratio_comb2.csv  -- Printed in:  Panel3.html
brakes during taxi  -- Model Ratio:  ratio_comb1.csv  -- Printed in:  Panel4.html
capture glide slope above  -- Control Panel Printed in: Panel2.html
capture glide slope above  -- Model Ratio:  ratio_comb3.csv  -- Printed in:  Panel3.html
capture glide slope above  -- Model Ratio:  ratio_comb2.csv  -- Printed in:  Panel4.html
capture glide slope above  -- Model Ratio:  ratio_comb1.csv  -- Printed in:  Panel1.html
stall under hood  -- Control Panel Printed in: Panel2.html
stall under hood  -- Model Ratio:  ratio_comb3.csv  -- Printed in:  Panel1.html
stall under hood  -- Model Ratio:  ratio_comb2.csv  -- Printed in:  Panel3.html
stall under hood  -- Model Ratio:  ratio_comb1.csv  -- Printed in:  Panel4.html
find flight instructor  -- Control Panel Printed in: Panel3

In [0]:
# write the experimental design (test_links with its filled-in 'Panel Type'
# column) to a csv file, leaving out the row index
design_path = './experimental_design.csv'
test_links.to_csv(design_path, index=False)