From d429651ac8031a773839c774f56d46df49a8369b Mon Sep 17 00:00:00 2001 From: "Md. Iftekhar Tanveer" Date: Fri, 24 Aug 2018 16:33:19 -0400 Subject: [PATCH] New Repository --- .gitignore | 103 +++++++++ CONTRIBUTING | 5 + LICENSE | 201 +++++++++++++++++ NOTICE | 16 ++ README.md | 77 +++++++ code/__init__.py | 1 + code/cluster_query.py | 319 ++++++++++++++++++++++++++ code/filter_query.py | 307 +++++++++++++++++++++++++ code/parse_query.py | 267 ++++++++++++++++++++++ code/syntaviz.py | 422 +++++++++++++++++++++++++++++++++++ code/templates/fullpage.html | 111 +++++++++ setup.cfg | 7 + setup.py | 37 +++ 13 files changed, 1873 insertions(+) create mode 100644 .gitignore create mode 100644 CONTRIBUTING create mode 100644 LICENSE create mode 100644 NOTICE create mode 100644 README.md create mode 100644 code/__init__.py create mode 100644 code/cluster_query.py create mode 100644 code/filter_query.py create mode 100644 code/parse_query.py create mode 100644 code/syntaviz.py create mode 100644 code/templates/fullpage.html create mode 100644 setup.cfg create mode 100644 setup.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d7f493d --- /dev/null +++ b/.gitignore @@ -0,0 +1,103 @@ +.idea + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# dotenv +.env + +# virtualenv +.venv +venv/ +ENV/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ diff --git a/CONTRIBUTING b/CONTRIBUTING new file mode 100644 index 0000000..87eca50 --- /dev/null +++ b/CONTRIBUTING @@ -0,0 +1,5 @@ +If you would like to contribute code to this project you can do so through GitHub by forking the repository and sending a pull request. + +Before Comcast merges your code into the project you must sign the Comcast Contributor License Agreement (CLA). + +If you haven't previously signed a Comcast CLA, you'll automatically be asked to when you open a pull request. Alternatively, we can e-mail you a PDF that you can sign and scan back to us. Please send us an e-mail or create a new GitHub issue to request a PDF version of the CLA. diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..261eeb9 --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. 
+ + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/NOTICE b/NOTICE
new file mode 100644
index 0000000..7f91e5c
--- /dev/null
+++ b/NOTICE
@@ -0,0 +1,16 @@
+SyntaViz
+Copyright 2017 Comcast Cable Communications Management, LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+This product includes software developed at Comcast (http://www.comcast.com/).
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..def3993
--- /dev/null
+++ b/README.md
@@ -0,0 +1,77 @@
+This repository contains the code for SyntaViz, which Md. Iftekhar Tanveer worked on in Summer 2017.
+
+Outline of the code
+===================
+
+- cluster_query.py: Called by the main program to build the hierarchical clusters from the dependency-parsed queries. It also provides functions for navigating the clusters and showing their contents.
+- filter_query.py: One of the earliest modules; it implements the functions needed to process the data from the production pipeline (vrex_log.queries) into smaller, more manageable files (e.g. non_titles.queries). It has functions for filtering and sorting the queries based on language-model scores.
+- parse_query.py: Self-running module that reads the list of queries and creates the dependency parse trees (dependency_syntaxnet_jsonified). It assumes a tensorflow/syntaxnet environment.
+- syntaviz.py: Self-running module that reads the hierarchical clusters from file and serves them in a web interface.
+- session_handler.py: Contains the code to extract the "actions" (i.e. how the intent-resolution system responded to a given query) and to save them in the non_titles_actionlist.queries file. It also contains some experimental code that uses the voice_watch_pair and flat_sessions data.
+- templates/: Contains the HTML skeleton for the SyntaViz server.
+
+Logical sequence of the code
+=============================
+
+    filter_query.py
+    [for preparing data]
+         |
+         |
+         |
+         v
+    parse_query.py
+    [for parsing queries]
+         |
+         |
+         |
+         v
+    cluster_query.py
+    [for creating clusters]
+         |
+         |
+         |
+         v
+    syntaviz.py
+    [for creating server]
+
+Running SyntaViz
+================
+
+Define variables:
+```
+DATADIR=/data/syntaviz
+CODEDIR=/code/SyntaViz/code
+PORT=5678
+```
+
+### Running SyntaViz on a corpus of queries
+
+#### 0. Start a container with syntaxnet:
+`docker run --rm --name syntaviz-parser -it -e CODEDIR=$CODEDIR -e DATADIR=$DATADIR -v $CODEDIR:$CODEDIR -v $DATADIR:$DATADIR -p 9030:8888 tensorflow/syntaxnet /bin/bash`
+
+#### 1. Prepare data in the following format
+ - cx.queries: A text file with each line representing one query in the following format: `ID\tquery\tlogProb\tlogFreq\tCount`
+
+e.g.,
+
+```
+0	i wanna change my plans its to high	1.0	1.0	1
+1	please email me an alarm certificate showing that our services are current and active.	1.0	1.0	1
+2	cant send outgoing email	1.0	1.0	1
+```
+ - cx-actions.pkl: A pickle file that contains a single mapping (dict object) with `key=query value=action`
+
+#### 2.
Parse queries +``` +cd /opt/tensorflow/syntaxnet +mkdir $DATADIR/cx-parsed +python -u $CODEDIR/parse_query.py $DATADIR/cx.queries $DATADIR/cx-parsed/part >& a.log 2>&1 & +cat $DATADIR/cx-parsed/part* > $DATADIR/cx-parsed.txt +exit +``` + +#### 3. Start SyntaViz server +``` +cd $CODEDIR +python ./syntaviz.py $DATADIR/cx.queries $DATADIR/cx-parsed.txt $DATADIR/cx-actions.pkl $PORT +``` diff --git a/code/__init__.py b/code/__init__.py new file mode 100644 index 0000000..5becc17 --- /dev/null +++ b/code/__init__.py @@ -0,0 +1 @@ +__version__ = "1.0.0" diff --git a/code/cluster_query.py b/code/cluster_query.py new file mode 100644 index 0000000..aef4b65 --- /dev/null +++ b/code/cluster_query.py @@ -0,0 +1,319 @@ +import json +import numpy as np + + +def cluster_by_root(parsed_query_file='../data/dependency_syntaxnet_jsonified'): + ''' + This function clusters the queries based on the roots of dependency parse tree. + ''' + clusthash = {} + with open(parsed_query_file) as f: + for aline in f: + spltline = aline.strip().split('\t') + # Get the root + root = '_'.join(json.loads(spltline[1])[0].split()[:-1]) + query = spltline[0] + if root in clusthash: + clusthash[root].append(query) + else: + clusthash[root] = [query] + return clusthash + + +def cluster_counts(parsed_query_file='../data/dependency_syntaxnet_jsonified'): + ''' + This function creates a dictionary of counts of various patterns in + the dependency parse trees. + ''' + clust = {} + with open(parsed_query_file) as f: + for aline in f: + spltline = aline.strip().split('\t') + query = spltline[0] + jtree = json.loads(spltline[1]) + update_count(clust, jtree) + return clust + + +def get_queries_and_freq(original_query_file='../data/all-queries-raw.txt'): + ''' + Returns the query list and the frequency list read from the files. + ''' + original_query_list = [] + original_freq_list = [] + with open(original_query_file) as f: + for aline in f: + spltline = aline.strip().split('\t') + original_query_list.append(spltline[1]) + original_freq_list.append(int(spltline[-1])) + return original_query_list, original_freq_list + + +def cluster_counts_and_queries(parsed_query_file='../data/dependency_syntaxnet_jsonified', + original_query_file='../data/all-queries-raw.txt', get_freq=False): + ''' + This function creates a dictionary of counts of various patterns and + associates the corresponding query IDs. + If get_freq is True, it returns the cluster, the list of original queries, and the list + of query frequencies + If get_freq is false, it returns only the first two. + ''' + clust = {} + if get_freq: + original_query_list, original_freq_list = get_queries_and_freq(original_query_file) + else: + original_query_list, _ = get_queries_and_freq(original_query_file) + with open(parsed_query_file) as f: + # i is the position of the query in parsed_query_file + # qid is the position in original_query_file + # These two positions donot match + for i, aline in enumerate(f): + spltline = aline.strip().split('\t') + # Original Query Index + qid = int(spltline[3]) + # parsed tree in json format + try: + jtree = json.loads(spltline[1]) + except: + print("Skipping corrupt line %d" % i) + continue + update_count_and_query(clust, jtree, qid) + if not get_freq: + return clust, original_query_list + else: + return clust, original_query_list, original_freq_list + + +def update_count(clust, jtree): + ''' + This function captures the counts of all the dependency grammer + starting from the root of the dependency tree. 
+ ''' + for anode in jtree: + if type(anode) is unicode: + if anode in clust: + clust[anode][0] += 1 + else: + clust[anode] = [1, {}] + last_named_node = anode + elif type(anode) is list: + update_count(clust[last_named_node][1], anode) + + +def update_count_and_query(clust, jtree, qID, currlevel=0, maxlevel=np.inf): + ''' + This function captures the counts of all the dependency grammer + starting from the root of the dependency tree. In addition, it + stores the indices of the corresponding queries. Note that it needs + a lot of memories to store the qID's. + ''' + if currlevel > maxlevel: + return + for anode in jtree: + if type(anode) is unicode: + if anode in clust: + clust[anode][0] += 1 + clust[anode][2].append(qID) + else: + # Position#0 = Number of unique queries in this cluster (redundant) + # Position#1 = Dictionary representing the subclusters + # Position#2 = List of all the unique queries falling in this cluster + clust[anode] = [1, {}, [qID]] + last_named_node = anode + elif type(anode) is list: + # Recursively parse the subtrees + update_count_and_query(clust[last_named_node][1], anode, qID, currlevel + 1, maxlevel) + + +def cd(clust, key): + ''' + If the key is given in a nested format, this function changes the clust to the + level before the last key and returns the last key. + ''' + if '|' in key: + keys = key.split('|') + for akey in keys[:-1]: + clust = clust[akey][1] + key = keys[-1] + return clust, key + + +def show_keys(clust, key='', st_idx=0, en_idx=100): + ''' + This is similar to get_keys but it prints the results + ''' + for i, akey, count in get_keys(clust, key, st_idx, en_idx): + print str(i) + ': ' + akey + '(' + str(count) + ')' + + +def get_keys(clust, key='', st_idx=0, en_idx=100, freq_list=None, sortby=0): + ''' + This function shows the keys (of the dictionary created by cluster_counts_and_queries + function) sorted in descending order of unique counts. + :param clust: The dictionary for which we want to see the keys + :param key: Optional key. If a key is provided, the dictionary is changed to that + specific sub-dictionary before showing the keys. + :param st_idx: start index. The keys will be skipped upto the start index. + :param en_idx: end index. All the keys after end index will be skipped. + :param freq_list: if the frequency list is provided (get it from cluster_counts_and_queries + by setting the get_freq flag to True), this function will also return + the total non-unique counts of the queries under each cluster + :param sortby: If it is set to 0, the clusters will be sorted by unique counts. If set to + 1, then the clusters will be sorted by total non-unique counts. This + parameter will be ignored if freq_list is set to None. + ''' + if key: + clust, key = cd(clust, key) + clust = clust[key][1] + if not freq_list: + # Sort the keys based on unique counts + allkeys = sorted([(clust[akey][0], akey) for akey in clust], key=lambda x: -1 * x[0]) + # No need to send the total non-unique counts + for i, (unique_count, akey) in enumerate(allkeys): + if i > en_idx: + break + if i < st_idx: + continue + yield i, akey, unique_count + else: + # Since we did not store the non-unique counts in the cluster, we need + # to calculate that for every cluster and subclusters. This process + # would make it slower than the other option. 
+ # Sort the keys based on either unique counts or non-unique counts + allkeys = sorted([(clust[akey][0], \ + sum([freq_list[aqid] for aqid in clust[akey][2]]), \ + akey) for akey in clust], key=lambda x: -1 * x[sortby]) + # providing the frequency list implies that the user + # wants the total non-unique counts. + for i, (unique_count, non_unique_count, akey) in enumerate(allkeys): + if i > en_idx: + break + if i < st_idx: + continue + yield i, akey, unique_count, non_unique_count + + +def show_queries(clust, key, query_list, st_idx=0, en_idx=100): + ''' + Similar to get_queries, but prints the data instead of yielding + ''' + for i, qid, akey in get_queries(clust, key, query_list, st_idx, en_idx): + print str(i) + ': ' + str(qid) + ' -- ' + akey + + +def get_queries(clust, key, query_list, st_idx=0, en_idx=100, freq_list=None): + ''' + This function prints the first n queries for a specific key. + :param clust: The cluster obtained from the function cluster_counts_and_queries + :param key: The key of the cluster for which we are looking for the queries. It is possible + to nest the keys by seperating them with a slash (/). + :param query_list: The query list obtained from the function cluster_counts_and_queries + :param freq_list: if the frequency list is provided (get it from cluster_counts_and_queries + by setting the get_freq flag to True), the queries will be sorted by frequency + ''' + clust, key = cd(clust, key) + qid_list = clust[key][2] + if freq_list: + rank, qid_list = zip(*sorted([(freq_list[aqid], aqid) for aqid in qid_list], key=lambda x: -1 * x[0])) + else: + qid_list = sorted(qid_list) + for i, qid in enumerate(qid_list): + if i > en_idx: + break + if i < st_idx: + continue + yield i, qid, query_list[qid] + + +def get_statistics(clust, key, freq_list): + ''' + Returns the following counts for a cluster + 1. Count of all the unique queries in the current cluster + 2. Count of the total queries (non-unique) in the current cluster + 3. Unique count of "non-dependent" queries. That is, the unique queries in the current + cluster, which are not available in any of the sub-clusters. + 4. Total (Non-Unique) count of "non-dependent" queries. + 5. a dictionary, mapping qids (key) to a list of all the immediate subclusters + where that qid is available + + ''' + clust, key = cd(clust, key) + queries = {aqid: True for aqid in clust[key][2]} + qid_to_subclust = {} + for a_sub_clust in clust[key][1]: + for aqid in clust[key][1][a_sub_clust][2]: + # Delete the qid from queries to trace out the + # queries having no dependencies + if aqid in queries: + del queries[aqid] + # Add the subcluster name in qid_to_subclust + if not aqid in qid_to_subclust: + qid_to_subclust[aqid] = [a_sub_clust] + else: + qid_to_subclust[aqid].append(a_sub_clust) + return clust[key][0], sum([freq_list[aquery] for aquery in clust[key][2]]), \ + len(queries), sum([freq_list[aquery] for aquery in queries]), qid_to_subclust + + +# def show_query_actions(clust,key,session_map,session_list,st_idx=0,en_idx=100,actualcount=False): +# ''' +# Shows a probability distribution of the actions taken for the +# ''' +# pass + +#################### Logical Operations over clusters ########################## +def get_all_queries(clust): + ''' + Get a list of all the query id's + ''' + allqid = [] + for akey in clust: + allqid.extend(clust[akey][2]) + return allqid + + +def get_query_IDs(clust, key): + ''' + Similar to show_queries, but instead of printing the queries, it returns all the query ID's. 
+ ''' + clust, key = cd(clust, key) + return clust[key][2] + + +def query_or(key1, key2, clust): + ''' + Returns a union of queries + ''' + qid1 = set(get_query_IDs(clust, key1)) + qid2 = set(get_query_IDs(clust, key2)) + return list(qid1.union(qid2)) + + +def query_and(key1, key2, clust): + ''' + Returns an intersection of queries + ''' + qid1 = set(get_query_IDs(clust, key1)) + qid2 = set(get_query_IDs(clust, key2)) + return list(qid1.intersection(qid2)) + + +def query_subtract(key1, key2, clust): + ''' + Returns the queries which are present in the first cluster + but not present in the second + ''' + qid1 = set(get_query_IDs(clust, key1)) + qid2 = set(get_query_IDs(clust, key2)) + return list(qid1 - qid2) + + +def show_roots(clusthash, n=100): + ''' + This function shows the keys of the dictionary constructed by "cluster_by_root" function. + ''' + allkeys = sorted([(len(clusthash[akey]), akey) for akey in clusthash], key=lambda x: -1 * x[0]) + for (i, (count, akeys)) in enumerate(allkeys): + if i > n: + break + print '(' + str(count) + ') ' + akeys diff --git a/code/filter_query.py b/code/filter_query.py new file mode 100644 index 0000000..9a3c9f0 --- /dev/null +++ b/code/filter_query.py @@ -0,0 +1,307 @@ +import re +import json +import nltk +import cPickle as cp +import numpy as np +from collections import OrderedDict + +__author__ = 'mtanve200' + + +def filter_by_re(inp='../data/vrex_1week.queries', + outp='../data/vrex_1week_long_text_filter_by_re.queries', + minlen=4): + """ + Filter the queries by regular expression. + This method extracts all the queries that starts with wh/h words (what, how, why etc.) + It puts an additional constraint that the query must be of length $minlen + """ + with open(inp) as f: + with open(outp, 'wb') as fout: + for i, aline in enumerate(f): + txt = aline.decode('utf8') + jdat = json.loads(txt) + q = jdat['text'].lower() + test = re.match( \ + "who|who's|what|what's|where|where's|when|when's|why|why's|how|how's|define|definition of", q) + if i % 10000 == 0: + print(i), 'queries processed' + if test and len(test.string.split()) >= minlen: + fout.write(test.string.encode('utf8') + '\n') + fout.flush() + + +def filter_unique(inp='../data/vrex_1week_long_text_filter_by_re.queries', + outp='../data/vrex_1week_long_text_filter_unique.queries'): + """ + Filters the queries to keep only the unique ones and associates + a count. It reads from $inp and writes in $outp + """ + with open(inp) as f: + with open(outp, 'wb') as fout: + uniq_lines = OrderedDict() + for i, aline in enumerate(f): + txt = aline.decode('utf8') + if i % 10000 == 0: + print(i) + if not uniq_lines.get(txt): + uniq_lines[txt] = 1 + else: + uniq_lines[txt] += 1 + for i, uqlines in enumerate(uniq_lines): + fout.write(str(i) + '\t' + uqlines.strip().encode('utf8') + '\t' + str(uniq_lines[uqlines]) + '\n') + fout.flush() + + +def filter_titles(inp='../data/vrex_1week_with_probability_plus_logfrequency_sorted.query', + outp='../data/non_titles.queries', query_col=1): + """ + Filter out queries that are just the titles of some movie or tv series. This operation + is not case or punctuation sensitive. Everything other than alphaneumeric characters + are ignored from both. 
+ """ + print('Loading Titles ...') + alltitles = cp.load(open('../data/alltitles.pickle'))['alltitles'] + print('done') + with open(outp, 'wb') as fout: + with open(inp) as f: + for i, aline in enumerate(f): + title = aline.split('\t')[query_col] + title = re.sub('[^a-z0-9\s]+', '', title.lower()) + title = ' '.join(title.split()) + if not alltitles.get(title): + fout.write(aline) + if i % 100000 == 0: + print(i) + + +def trigram_freqdist(inp='../data/combined_corpus', outp='../data/fdist_kn.pickle'): + """ + It calculates the trigram frequency distributions for the + parliament speech dataset. This distribution is important + for calculating the trigram probabilities with kneser-ney + smoothing. The distribution is saved in a pickle file. + """ + with open(inp) as f: + alltrigrams = [] + for i, aline in enumerate(f): + aline = aline.strip().decode('utf8') + aline = aline.encode('ascii', 'ignore') + aline = aline.lower() + tokens = [''] + aline.split() + [''] + alltrigrams += [(x, y, z) for x, y, z in nltk.trigrams(tokens)] + if i % 10000 == 0: + print(i) + fdist = nltk.FreqDist(alltrigrams) + cp.dump({'fdist': fdist}, open(outp, 'wb')) + + +def kn_logprob(inp='../data/vrex_1week_long_text.queries', + outp='../data/vrex_1week_with_probability.queries', + fdfile='../data/fdist_kn.pickle', + minlen=4, + length_normalized=True): + """ + Calculates the log probability of every query from the input file according + to the trigram distributions. It uses Kneser Ney smoothing. + It produces a tab delimited file with the queries and the logprobabilities. + :params fdfile: Trigram frequency distribution file (pickled) + """ + print('Loading Trigram Distribution') + fdist = cp.load(open(fdfile))['fdist'] + print('Trigram Distribution Loaded') + kn_pd = nltk.probability.KneserNeyProbDist(fdist) + print('Kneser Ney Loaded') + with open(inp) as f: + with open(outp, 'wb') as fout: + for i, aline in enumerate(f): + jdat = json.loads(aline.strip()) + q = jdat['text'].lower().encode('ascii', 'ignore') + tokens = [''] + nltk.word_tokenize(q) + [''] + if len(tokens) < minlen + 2: + continue + logplist = [] + for x, y, z in nltk.trigrams(tokens): + lgp = kn_pd.logprob((x, y, z)) + # OOV cases + if lgp == -1e300: + logplist.append(-50) + else: + logplist.append(lgp) + # Length Normalization: Add points for longer sentences + if length_normalized: + len_score = len(set(tokens)) * 8.5 + else: + len_score = 0 + + logpsum = sum(logplist) + len_score + fout.write(q + '\t' + str(logpsum) + '\n') + fout.flush() + if i % 100000 == 0: + print(i) + + +def sort_by_logprob(inp='../data/vrex_1week_with_probability.queries', + outp='../data/vrex_1week_with_probability_sorted.queries', + sort_column=-1, query_column=0, tag_columns=[], ascending=False): + """ + Sorts the queries by logprobability. It assumes that the input + is a tab-delimited file where the last column is logprobability. + You may change the default parameter values for customized behavior. + :params sort_column: Index of the column upon which the sorting will be done. + :params query_column: Index of the column where the queries are located. + :params tag_columns: A list of indices of columns which we want to augment + into the output file. + :params ascending: Sort in an ascending order instead of descending. 
+ """ + with open(inp) as f: + allqueries = [] + allprob = [] + tagcols = [] + for i, aline in enumerate(f): + cols = aline.strip().split('\t') + logprob = float(cols[sort_column]) + allqueries.append(cols[query_column]) + allprob.append(logprob) + if tag_columns: + tagcols.append('\t'.join([cols[m] for m in tag_columns])) + with open(outp, 'wb') as fout: + if not ascending: + idx = np.argsort(allprob)[::-1] + else: + idx = np.argsort(allprob) + for m, i in enumerate(idx): + if tagcols: + fout.write(str(m) + '\t' + allqueries[i] + '\t' + str(allprob[i]) + '\t' + tagcols[i] + '\n') + else: + fout.write(str(m) + '\t' + allqueries[i] + '\t' + str(allprob[i]) + '\n') + fout.flush() + + +def add_logfrequency(inp='../data/vrex_1week_with_probability_unique.queries', + outp='../data/vrex_1week_with_probability_plus_logfrequency.query'): + """ + Adds the log of query-frequency with the (normalized) logprobability + values and creates a new column with this score. This score might be + a better query ranking metric than the normalized logprobability. + It assumes the last column is the query frequency and the column before + the last one is the normalized logprobability. + """ + with open(inp) as f: + with open(outp, 'wb') as fout: + for i, aline in enumerate(f): + if i % 100000 == 0: + print(i) + aline = aline.strip() + cols = aline.split('\t') + logprob = float(cols[-2]) + logfreq = np.log(float(cols[-1])) + fout.write(aline + '\t' + str(logprob + logfreq) + '\n') + fout.flush() + + +def get_natural_queries(filename='../data/non_titles.queries'): + """ + get a hash of all the natural queries from our natural + query dataset. + """ + natqueries = {} + with open(filename) as f: + for aline in f: + spltaline = aline.strip().split('\t') + natqueries[spltaline[1].lower()] = int(spltaline[0]) + return natqueries + + +def save_na_queries(natqueries, allqfilename='../data/vrex_log.queries', + outfilename='../data/NAqueries.query'): + """ + Search for na queries in natural query database and save it + vrex_log.queries is the dump of the following hdfs file to local filesystem: + /user/fture/vrex/sessions/201702.22-28/vrex-log-201702_22-28.queries + """ + with open(outfilename, 'wb') as fout: + with open(allqfilename) as f: + for i, aline in enumerate(f): + if i % 10000 == 0: + print(i) + jsonx = json.loads(aline.strip().lower()) + if jsonx['action'] == 'na' and \ + len(jsonx['text'].split()) > 4 and \ + natqueries.get(jsonx['text']): + fout.write(str(natqueries[jsonx['text']]) + '\t' + jsonx['text'] + '\n') + + +def save_uniq_sorted_na_queries(inp='../data/NAqueries.query', + outp='../data/NAqueries_uniq_sorted.query'): + """ + Saves the unique queries that got an action "NA" and saves in a sorted order. + You may get the input by running save_na_queries. + The output file preserves the original indices in the 3rd column + """ + filter_unique(inp, outp='../data/NAqueries_uniq.query') + sort_by_logprob(inp='../data/NAqueries_uniq.query', outp=outp, sort_column=1, + query_column=2, tag_columns=[3], ascending=True) + + +def combine_corpus(inp1='../data/imdb_corpus_processed', + inp2='../data/eng_voc.txt', + outp='../data/combined_corpus'): + """ + The trigram frequencies were calculated from two corpuses: + imdb movie comment dataset and parliament speech dataset. + This function combines the two corpuses for calcualting trigram probabilities. + This combining process involves some preprocessing of the data. 
+ """ + with open(inp1) as f1: + with open(inp2) as f2: + with open(outp, 'wb') as fout: + # Parliament Speech corpus + txt2 = f2.read().decode('unicode_escape') + fout.write(txt2.encode('utf8')) + fout.flush() + # IMDB corpus. It needs sentence tokenization and word tokenization. + txt1 = f1.read() + txt1 = txt1.decode('utf8') + txt1 = '\n'.join([' '.join(nltk.word_tokenize(asent)) \ + for asent in nltk.sent_tokenize(txt1)]) + '\n' + fout.write(txt1.encode('utf8')) + fout.flush() + + +def pipeline_query_ranking(initialize=False): + """ + The full pipeline of loading and calculating the trigram frequencies to + ranking the queries based on our naturalness score. Please note that the + trigram_freqdist() needs to be done only the first time. + Output of this pipeline is saved in the following file: + non_titles.queries + """ + if initialize: + # Building the language model + trigram_freqdist() + # Get probability + kn_logprob() + # Get unique queries and compute frequencies + filter_unique(inp='../data/vrex_1week_with_probability.queries', + outp='../data/vrex_1week_with_probability_unique.queries') + # Add the logfrequency with the logprobability to calculate the query ranking + add_logfrequency() + # Sort the queries based on rankings + sort_by_logprob(inp='../data/vrex_1week_with_probability_plus_logfrequency.query', + outp='../data/vrex_1week_with_probability_plus_logfrequency_sorted.query', query_column=1, + tag_columns=[2, 3]) + filter_titles() + + +def pipeline_sort_by_frequency(): + """ + This pipeline sorts the queries based on frequency (not log-frequency). + Output file is: vrex_1week_long_unique_sorted.queries + """ + filter_unique(inp='../data/vrex_1week_with_probability.queries', + outp='../data/vrex_1week_long_unique.queries') + sort_by_logprob(inp='../data/vrex_1week_long_unique.queries', + outp='../data/vrex_1week_long_unique_sorted.queries', query_column=1) + filter_titles(inp='../data/vrex_1week_long_unique_sorted.queries', + outp='../data/non_titles_sorted_by_freq.queries', query_col=0) diff --git a/code/parse_query.py b/code/parse_query.py new file mode 100644 index 0000000..eb38056 --- /dev/null +++ b/code/parse_query.py @@ -0,0 +1,267 @@ +import os +import sys +import stat +import numpy as np +import json +import time +from itertools import izip +from multiprocessing import Process +from os import path + +__author__ = 'mtanve200' + + +def parse_query_with_syntaxnet(query_generator, + start_index=0, + end_index=np.inf, + shellname='syntaxnet/demo.sh'): + ''' + Parses a query using syntaxnet. It breaks the input stream into mini batches of 1000 + queries and passes through the syntaxnet. It extracts two different styles of the parse + tree. It reads the whole input and returns the trees as a list. Do do not feed an + infinitely long stream because that will overflow the memory. + It requires to be called from the /root/models/syntaxnet/ + folder of the syntaxnet docker + :param query_generator: An iterator of (index, query) tuples + :param start_index: From which item it will start parsing. Note, it does not consider the index + of the data in the query file. It considers the idex of the data as they arrive + :param end_index: Where it will stop reading. Same convention to start_index applies. + + Note: It is utmost important to remove the last element (a blank line) + from the output of parse_query_with_syntaxnet function, before passing + it to the segment_gen function. 
Otherwise, it will create an invalid + tree for the last element of the batch due to the blank line. + ''' + assert start_index < end_index, 'Start index cannot be greater than end index' + allparsetreelist = [] + orig_idx_list = [] + container = [] + idx_container = [] + + def process_single(idx_container, container, orig_idx, aquery): + container.append(aquery) + idx_container.append(orig_idx) + + def write_to_disk(idx_container, container): + argtxt_ = '\n'.join(container) + output = os.popen('echo ' + '"' + argtxt_ + '"' + ' | ' + shellname).read().split('\n') + allparsetreelist.extend(output) + orig_idx_list.extend(idx_container) + + # Iterate over the queries and the save the parsed tree + for i, (orig_idx, aquery) in enumerate(query_generator): + if i < start_index: + continue + elif start_index <= i <= end_index: + # put the first query in a container + process_single(idx_container, container, orig_idx, aquery) + if len(container) % 1000 == 999: + # The container is full. Process the queries within the container + write_to_disk(idx_container, container) + container = [] + idx_container = [] + elif i > end_index: + break + if len(container) > 0: + write_to_disk(idx_container, container) + return allparsetreelist, orig_idx_list + + +def make_new_shell(): + # Prepare a shell for conll style dependency tree + # This function must run within the docker image of syntaxnet + st = os.stat('syntaxnet/demo.sh') + os.chmod('syntaxnet/demo.sh', 0o777) + with open('syntaxnet/demo.sh') as f: + txt = f.read() + with open('syntaxnet/demo_conll.sh', 'wb') as f: + f.write(txt[:-107] + '\n') + st = os.stat('syntaxnet/demo_conll.sh') + os.chmod('syntaxnet/demo_conll.sh', st.st_mode | stat.S_IEXEC) + + +def query_gen(inpfile): + ''' + Construct an iterator to feed the queries from the inpfile + ''' + # Start reading and yieling the queries + with open(inpfile) as f: + for aline in f: + spltline = aline.strip().split('\t') + yield int(spltline[0]), spltline[1] + + +def abstract_query_gen(inpfile): + ''' + It is similar to query_gen. The only difference is, it reads from the abstract query file, + thus processes and transmits the abstract queries accordingly. + ''' + # Start reading and yieling the queries + with open(inpfile) as f: + for aline in f: + spltline = aline.strip().split('\t') + yield int(spltline[0]), spltline[2] + + +def input_gen(filename): + ''' + Makes an iterator from the file. This is useful when the output + of the syntaxnet is saved as a file and you take that file to + pass through the segment_gen function as an iterator. + ''' + with open(filename) as f: + for aline in f: + if aline[0] == ' ': + yield aline[1:].rstrip() + else: + yield aline.rstrip() + + +def segment_gen(inp_gen): + ''' + Segments the stream from the output of syntaxnet into + one input and one parse tree at a time. The parse tree + is given in a json format. + :param inp_gen: input iterator + ''' + retval = '' + parsetree = '' + currlevel = -1 + count = 0 + for inp in inp_gen: + # Transforming the tree with states "Input", "Parse", and + # the normal tree parsing. 
+ if inp.startswith('Input'): + # if there is something in retval from previous iterations + if retval: + # Close off the tree + retval += parsetree + ']' * (currlevel + 2) + yield retval + # Reset the value + retval = '' + # There is nothing from previous iterations, so start making + retval += inp[6:].strip() + '\t' + elif inp.startswith('Parse'): + # start of the parse tree + parsetree = '' + elif not inp: + # if the input is empty, just skip it + continue + else: + parse_out, currlevel = jsonify_tree(inp, currlevel) + # Debug + # print inp,parse_out,currlevel + parsetree += parse_out + if retval and parsetree: + # Close off the last tree + retval += parsetree + ']' * (currlevel + 2) + yield retval + + +def segment_gen_conll(inp_gen): + ''' + similar to segment_gen, but works on conll style parse tree + ''' + aparse = [] + for inp in inp_gen: + if not inp: + yield json.dumps(aparse) + aparse = [] + else: + aparse.append(inp.split('\t')[1:]) + + +def jsonify_tree(inp, currlevel): + ''' + Converts from syntaxnet tree structure to json tree structure. + ''' + nxtlevel = inp.find('+--') / 4 + if nxtlevel == -1: + # Root Node + return '[ ' + '"' + inp + '"', -1 + elif nxtlevel == currlevel + 1: + # Subtree of previous node + return ', [ ' + '"' + inp[nxtlevel * 4 + 4:].strip() + '"', nxtlevel + elif nxtlevel == currlevel: + # Another node in the same level of the tree + return ', ' + '"' + inp[nxtlevel * 4 + 4:].strip() + '"', nxtlevel + elif nxtlevel < currlevel: + # At least one subtree finished + leveljump = currlevel - nxtlevel + return ']' * leveljump + ',' + '"' + inp[nxtlevel * 4 + 4:].strip() + '"', nxtlevel + else: + # nxtlevel>currlevel+1 + # Impossible situation. Something is wrong + raise IOError('More than one level jump forward. At least one tree node must be missing.') + + +def pipeline(inpfile, outfile, + start_idx=0, + end_idx=np.inf, + stream_generator_function=query_gen): + ''' + This is the complete pipeline for parsing the (raw or abstract) queries from the queryfile + (query_analysis/data/non_titles.queries) using syntaxnet and producing the outfile. + :param outfile: The name (with path) of the file where the output will be written + :param start_idx: Element in the stream where the parsing should start + :param end_idx: Element in the stream where the parsing should stop + :param stream_generator_function: Determines whether the queries or the abstract queries + would be processed for parsing. As the formats of the query and abstract files are different, + the generator functions automatically reads the corresponding file formats. 
+ This argument takes only the following two generator functions: + a) query_gen + b) abstract_query_gen + ''' + # Normal parse tree + qgen1 = stream_generator_function(inpfile) + output_tree, orig_idx_list = parse_query_with_syntaxnet(qgen1, start_index=start_idx, end_index=end_idx) + tree_gen = segment_gen(output_tree) + + # Conll style parse tree + qgen2 = stream_generator_function(inpfile) + output_conll, orig_idx_list = parse_query_with_syntaxnet(qgen2, start_index=start_idx, end_index=end_idx, + shellname='syntaxnet/demo_conll.sh') + conll_gen = segment_gen_conll(output_conll) + + # Save to file + with open(outfile, 'wb') as f: + for (i, tree, conll) in izip(orig_idx_list, tree_gen, conll_gen): + f.write(tree + '\t' + conll + '\t' + str(i) + '\n') + f.flush() + + +if __name__ == '__main__': + make_new_shell() + + abstract = False + + inpfile = sys.argv[1] + outfile = sys.argv[2] + + outdir = path.dirname(os.path.abspath(outfile)) + if not os.path.exists(outdir): + raise OSError("Output directory does not exist: %s" % outdir) + + def file_len(fname): + i = -1 + with open(fname) as f: + for i, l in enumerate(f): + pass + return i + 1 + + + line_cnt = file_len(inpfile) + + print(sys.argv) + print(line_cnt) + + if abstract: + for i in range(0, line_cnt, 1000): + p = Process(target=pipeline, args=(inpfile, outfile + str(i), i, i + 999, abstract_query_gen)) + p.start() + time.sleep(5) + else: + for i in range(0, line_cnt, 1000): + p = Process(target=pipeline, args=(inpfile, outfile + str(i), i, i + 999)) + p.start() + time.sleep(5) diff --git a/code/syntaviz.py b/code/syntaviz.py new file mode 100644 index 0000000..c177f95 --- /dev/null +++ b/code/syntaviz.py @@ -0,0 +1,422 @@ +# Copyright 2017 Comcast Cable Communications Management, LLC +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pdb + +from flask import Flask, abort, render_template, url_for, request +import cluster_query +import pickle as cp +import numpy as np +import urllib +import json +import sys +import base64 +from io import BytesIO +import matplotlib +matplotlib.use('Agg') +import matplotlib.pyplot as plt + +''' +This is a quick working prototype of a visualizer that would +facilitate the exploring of syntactic patterns of queries and +the statistical distribution of the actions currently taken for +these patterns. + +Note: This server needs the following files: +1. '../data/all-queries-raw.txt': This is a tab delimited list + of original queries. It also contains +2. '../data/dependency_syntaxnet_jsonified': This is a tab + delimited list of tokenized queries, dependency parse in + two different formats, and the query ID (index to the original + query list). +3. '../data/qaction.pickle': It contains a dictionary named qaction + which returns the actions taken (values) for the queries (keys). 
+ +''' + +inpfile = sys.argv[1] +outfile = sys.argv[2] +query2actionfile = sys.argv[3] +PORT=5678 +if len(sys.argv) > 4: + PORT = int(sys.argv[4]) + +################## Load the pre-requisites #################### +print("Loading cluster data ...") +clust_head, queries, freq_list = cluster_query.cluster_counts_and_queries( + original_query_file=inpfile, + parsed_query_file=outfile, + get_freq=True) +clust = clust_head +tot_uniq = sum([clust[akey][0] for akey in clust]) +tot_nonuniq = sum([freq_list[aqid] for akey in clust for aqid in clust[akey][2]]) +print("Done clustering.") + +print("Loading list of actions performed for each query ...") +qaction = cp.load(open(query2actionfile)) +print("Done loading actions.") + +############################################################### + +# The SyntaViz server +app = Flask('SyntaViz') + + +@app.route('/keys/') +@app.route('/keys///') +@app.route('/keys///') +def get_keys_json(key='', st_idx=0, en_idx=50): + ''' + Make a list of keys + ''' + key = urllib.unquote(urllib.unquote(key)) + allkeys = [] + try: + for i, akey, count in cluster_query.get_keys(clust, key, st_idx, en_idx): + if key: + akey = key + '|' + akey + allkeys.append((i, count, akey)) + except KeyError: + print('Key Not Found:', key) + return abort(404) + return json.dumps(allkeys) + + +@app.route('/queries/') +@app.route('/queries///') +def get_queries_json(key='', st_idx=0, en_idx=50): + ''' + Make a list of queries + ''' + key = urllib.unquote(urllib.unquote(key)) + clust = clust_head + allqueries = [] + try: + for i, qid, aquery in cluster_query.get_queries(clust, key, queries, st_idx, en_idx): + allqueries.append((i, qid, aquery)) + except: + print('Key Not Found:', key) + return abort(404) + return json.dumps(allqueries) + + +def get_action_hist(key, qaction): + ''' + Returns the frequency of various actions taken (in response to + the queries of a key) as well as the list of actions + ''' + action_hist = {} + clust = clust_head + for i, qid, aquery in cluster_query.get_queries(clust, key, queries, en_idx=float('inf')): + aquery = aquery.lower() + if aquery in qaction: + if qaction[aquery] in action_hist: + action_hist[qaction[aquery]] += 1 + else: + action_hist[qaction[aquery]] = 1 + return action_hist + + +def get_plot(adict): + ''' + Plot the action dictionary + ''' + img_format = \ + '' + count, labels = zip(*sorted([(adict[akey], akey) for akey in adict], + key=lambda x: -1 * x[0])) + # total is the number of queries having an action + # m is the number of different actions + m = len(count) + total = sum(count) + # Plot + if m > 30: + plt.figure(num=1, figsize=(12, 8), fontsize=24) + plt.clf() + plt.bar(np.arange(30), count[:30]) + plt.xticks(np.arange(30) + 0.4, labels[:30], rotation='vertical', fontsize=24) + plt.xlabel('Name of Actions (Only top 30 among {0})'.format(m)) + plt.ylabel('Count of Actions') + plt.title('Total unique queries containing an action (Including NA) = {0}'.format(total)) + try: + plt.tight_layout() + except: + print("matplotlib error") + pass + # pdb.set_trace() + else: + plt.figure(num=1, figsize=(12, 8), fontsize=24) + plt.clf() + plt.bar(np.arange(m), count) + plt.xticks(np.arange(m) + 0.4, labels, rotation='vertical', fontsize=24) + plt.xlabel('Name of Actions (all)') + plt.ylabel('Count of Actions') + plt.title('Total unique queries containing an action (Including NA) = {0}'.format(total)) + try: + plt.tight_layout() + except: + print("matplotlib error") + pass + # pdb.set_trace() + # Convert to html + figfile = BytesIO() + 
plt.savefig(figfile, format='png') + figfile.seek(0) + figfile_png = base64.b64encode(figfile.getvalue()) + return img_format.format(figfile_png) + + +@app.route('/') +@app.route('/both') +def both(): + ''' + Show the page + ''' + # Parsing arguments + key_k = urllib.unquote(urllib.unquote(request.args.get('key_k', ''))) + st_idx_k = int(request.args.get('st_idx_k', 0)) + en_idx_k = int(request.args.get('en_idx_k', 500)) + st_idx_q = int(request.args.get('st_idx_q', 0)) + en_idx_q = int(request.args.get('en_idx_q', 1000)) + sort_key_by = int(request.args.get('sort_key_by', 0)) # sort by unique count(0), total count(1) + sort_query_by = int(request.args.get('sort_query_by', 0)) # sort by query frequency(0), or qid(1) + + try: + # Build the list of keys + clust = clust_head + allkeys = [] + for i, akey, count, nucount in cluster_query.get_keys( \ + clust, + key_k, + st_idx_k, + en_idx_k, + freq_list=freq_list, + sortby=sort_key_by): + if key_k: + fullkey = key_k + '|' + akey + else: + fullkey = akey + encodedkey = urllib.quote(fullkey) + # Link to go to a specific cluster + keylink = url_for('both', + key_k=encodedkey, + st_idx_k=st_idx_k, + en_idx_k=en_idx_k, + st_idx_q=st_idx_q, + en_idx_q=en_idx_q, + sort_key_by=sort_key_by, + sort_query_by=sort_query_by) + allkeys.append((i, count, akey, keylink, nucount, + '{0:0.2f}'.format(float(count) / float(tot_uniq) * 100.), + '{0:0.2f}'.format(float(nucount) / float(tot_nonuniq) * 100.))) + except KeyError: + print('Key Not Found:', key_k) + return abort(404) + + # Build the left pane navigations + navformat = '{1}' + diff = en_idx_k - st_idx_k + if st_idx_k > 0: + left_prev_code = navformat.format(url_for('both', + key_k=key_k, + st_idx_k=max(0, st_idx_k - diff), + en_idx_k=st_idx_k, + st_idx_q=st_idx_q, + en_idx_q=en_idx_q, + sort_key_by=sort_key_by, + sort_query_by=sort_query_by), '<<') + else: + left_prev_code = '<<' + left_next_code = navformat.format(url_for('both', + key_k=key_k, + st_idx_k=en_idx_k, + en_idx_k=en_idx_k + diff, + st_idx_q=st_idx_q, + en_idx_q=en_idx_q, + sort_key_by=sort_key_by, + sort_query_by=sort_query_by), '>>') + + # Build the breadcrumb + if not key_k: + currentkey = '< None >' + else: + if '|' in key_k: + currentkey = '' + spltkey = key_k.split('|') + for i in range(len(spltkey)): + encodedkey = urllib.quote('|'.join(spltkey[:i + 1])) + currentkey += '{0}|'.format(spltkey[i], url_for('both', + key_k=encodedkey, + st_idx_k=0, + en_idx_k=en_idx_k - st_idx_k, + st_idx_q=0, + en_idx_q=en_idx_q - st_idx_q, + sort_key_by=sort_key_by, + sort_query_by=sort_query_by)) + if currentkey[-1] == '|': + currentkey = currentkey[:-1] + else: + currentkey = key_k + + # Build the links for sorting the keys + unique_link = url_for('both', + key_k=key_k, + st_idx_k=st_idx_k, + en_idx_k=en_idx_k, + st_idx_q=st_idx_q, + en_idx_q=en_idx_q, + sort_key_by=0, + sort_query_by=sort_query_by) + total_link = url_for('both', + key_k=key_k, + st_idx_k=st_idx_k, + en_idx_k=en_idx_k, + st_idx_q=st_idx_q, + en_idx_q=en_idx_q, + sort_key_by=1, + sort_query_by=sort_query_by) + + # If no key is selected, just send out the root keys + if not key_k: + return render_template('fullpage.html', + total_count=tot_nonuniq, + uniq_count=tot_uniq, + left_prev_code=left_prev_code, + left_next_code=left_next_code, + allkeys=allkeys, + currentkey=currentkey, + header_unique_link=unique_link, + header_total_link=total_link) + + try: + # Build the list of queries + allqueries = [] + clust = clust_head + if sort_query_by == 0: + # sort by query frequency + frequency_list 
= freq_list + else: + # sort by qid + frequency_list = None + # Accumulate the queries + for i, qid, aquery in cluster_query.get_queries(clust, + key_k, + queries, + st_idx_q, + en_idx_q, + freq_list=frequency_list): + if aquery.lower() in qaction: + query_action = qaction[aquery.lower()] + else: + query_action = '[Not Found]' + allqueries.append((i, + qid, + aquery, + query_action, + freq_list[qid], + '{0:0.3f}'.format(float(freq_list[qid]) / tot_nonuniq * 100.))) + + except KeyError: + print('Key Not Found:', key_k) + return abort(404) + + # Build the right pane navigations + diff = en_idx_q - st_idx_q + if st_idx_q > 0: + right_prev_code = navformat.format(url_for('both', + key_k=key_k, + st_idx_k=st_idx_k, + en_idx_k=en_idx_k, + st_idx_q=max(0, st_idx_q - diff), + en_idx_q=st_idx_q, + sort_key_by=sort_key_by, + sort_query_by=sort_query_by), '<<') + else: + right_prev_code = '<<' + right_next_code = navformat.format(url_for('both', + key_k=key_k, + st_idx_k=st_idx_k, + en_idx_k=en_idx_k, + st_idx_q=en_idx_q, + en_idx_q=en_idx_q + diff, + sort_key_by=sort_key_by, + sort_query_by=sort_query_by), '>>') + + # Build the links for sorting the queries + # By frequency + header_freq_link = url_for('both', + key_k=key_k, + st_idx_k=st_idx_k, + en_idx_k=en_idx_k, + st_idx_q=st_idx_q, + en_idx_q=en_idx_q, + sort_key_by=sort_key_by, + sort_query_by=0) + # By QID + header_qid_link = url_for('both', + key_k=key_k, + st_idx_k=st_idx_k, + en_idx_k=en_idx_k, + st_idx_q=st_idx_q, + en_idx_q=en_idx_q, + sort_key_by=sort_key_by, + sort_query_by=1) + + # Calculate the cluster statistics + clust_stats = cluster_query.get_statistics(clust, key_k, freq_list) + + # Build the visualization on the right pane + action_freq = get_action_hist(key_k, qaction) + image_src = get_plot(action_freq) + + # Send all the data with visualization if there are queries + if len(action_freq.keys()) > 0: + # When the plot exists + return render_template('fullpage.html', + total_count=tot_nonuniq, + uniq_count=tot_uniq, + left_prev_code=left_prev_code, + left_next_code=left_next_code, + allkeys=allkeys, + currentkey=currentkey, + header_unique_link=unique_link, + header_total_link=total_link, + right_prev_code=right_prev_code, + right_next_code=right_next_code, + allqueries=allqueries, + clust_stats=clust_stats, + image_src=image_src, + header_freq_link=header_freq_link, + header_qid_link=header_qid_link) + else: + # When the plot does not exist + return render_template('fullpage.html', + total_count=tot_nonuniq, + uniq_count=tot_uniq, + left_prev_code=left_prev_code, + left_next_code=left_next_code, + allkeys=allkeys, + currentkey=currentkey, + header_unique_link=unique_link, + header_total_link=total_link, + right_prev_code=right_prev_code, + right_next_code=right_next_code, + allqueries=allqueries, + clust_stats=clust_stats, + header_freq_link=header_freq_link, + header_qid_link=header_qid_link) + + +# Run the server +if __name__ == '__main__': + app.debug = False + app.run(host='0.0.0.0', port=PORT) diff --git a/code/templates/fullpage.html b/code/templates/fullpage.html new file mode 100644 index 0000000..40a372a --- /dev/null +++ b/code/templates/fullpage.html @@ -0,0 +1,111 @@ + + + +

SyntaViz: Syntax-driven Query Visualizer

+ Total {{total_count}} queries loaded ({{uniq_count}} unique)
+
+ +
+ +
+

Hierarchical Dependency Clusters

+
+ Current Cluster:
{{currentkey|safe}} +
+
+ {{left_prev_code|safe}} +
+
+ {{left_next_code|safe}} +
+
+
+ +
+ <table>
+   {# Markup sketch: tags are assumed; the header text, cell expressions, and loop below are from the original template. #}
+   <tr>
+     <th>idx</th>
+     <th>Cluster</th>
+     <th><a href="{{header_unique_link}}">Unique</a></th>
+     <th><a href="{{header_total_link}}">Total</a></th>
+   </tr>
+   {% for i,count,akey,encodedkey,nucount,uniq_perc,nonuniq_perc in allkeys %}
+   <tr>
+     <td>{{i}}</td>
+     <td><a href="{{encodedkey}}">{{akey}}</a></td>
+     <td>{{count}} ({{uniq_perc}}%)</td>
+     <td>{{nucount}} ({{nonuniq_perc}}%)</td>
+   </tr>
+   {% endfor %}
+ </table>
+
+
+ + {% if allqueries is defined %} + +
+ +
+

Current Cluster Statistics:

+ The current cluster contains:
+ {% set unq_cnt,tot_cnt, unq_nondep,tot_nondep,qid_to_subclust = clust_stats %} + {{tot_cnt}} queries in total
+ {{unq_cnt}} unique queries
+ Number of queries not belonging to any subcluster:
+ {{unq_nondep}} unique queries
+ {{tot_nondep}} total queries
+
+ + {% if image_src is defined %} +
+

Histogram of Actions in the cluster

+ {{image_src|safe}} +
+ {% endif %} +
+ +
+
+ {{right_prev_code|safe}} +
+
+ {{right_next_code|safe}} +
+
+
+ +
+ <table>
+   {# Markup sketch: tags are assumed; the header text, cell expressions, and loops below are from the original template. #}
+   <tr>
+     <th>idx</th>
+     <th>Query</th>
+     <th>Action</th>
+     <th>Subclusters</th>
+     <th><a href="{{header_freq_link}}">Frequency(%ofTotal)</a></th>
+     <th><a href="{{header_qid_link}}">QID</a></th>
+   </tr>
+   {% for i,qid,aquery,action,qfreq,qfreq_perc in allqueries %}
+   <tr>
+     <td>{{i}}</td>
+     <td>{{aquery}}</td>
+     <td>{{action}}</td>
+     <td>
+       {% for asubclust in qid_to_subclust[qid] %}
+       {{asubclust}},
+       {% endfor %}
+     </td>
+     <td>{{qfreq}} ({{qfreq_perc}}%)</td>
+     <td>{{qid}}</td>
+   </tr>
+   {% endfor %}
+ </table>
+
+
+ {% endif %} +
+
+ + \ No newline at end of file diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..3d7aca6 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,7 @@ +[aliases] +test=pytest + +[tool:pytest] +addopts = + --cov-report term-missing + --cov=syntaviz diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..271b244 --- /dev/null +++ b/setup.py @@ -0,0 +1,37 @@ +from setuptools import setup, find_packages +import re + + +def get_version(): + """ + Extract the version from the module's root __init__.py file + """ + root_init_file = open("syntaviz/__init__.py").read() + match = re.search("__version__[ ]+=[ ]+[\"'](.+)[\"']", root_init_file) + return match.group(1) if match is not None else "unknown" + + +setup( + name="syntaviz", + version=get_version(), + description="SyntaViz", + + packages=find_packages(), + + package_data={}, + + python_requires='>=2.7, <3', + + install_requires=["Flask==0.12.2", + "matplotlib==2.0.2", + "numpy==1.8.2", + "scikit-learn==0.18.2", + "scipy==0.19.1", + "ipython==5.1.0", + "bokeh==0.12.5", + "nltk==3.2.3", + "pandas==0.20.2", + "torch"], + setup_requires=['pytest-runner'], + tests_require=['pytest', 'pytest-cov'], +)
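
SyntaViz needs two input files before the parsing and server steps in the README above can run: `cx.queries` (one query per line in the format `ID\tquery\tlogProb\tlogFreq\tCount`) and `cx-actions.pkl` (a pickled dict mapping each query to the action taken for it). The sketch below shows one way to produce both from a raw log. It is a minimal illustration only: the input file `raw_queries.txt` and its `query<TAB>action` layout are assumptions, not part of this patch, and the logProb/logFreq columns are filled with the dummy `1.0` values used in the README example.

```python
# Sketch (Python 2.7, matching the rest of the repository): build cx.queries
# and cx-actions.pkl from a hypothetical tab-separated raw_queries.txt.
import pickle
from collections import Counter

counts = Counter()
actions = {}
with open('raw_queries.txt') as f:
    for line in f:
        parts = line.rstrip('\n').split('\t')
        if len(parts) < 2:
            continue
        # syntaviz.py looks up actions by the lowercased query text.
        query, action = parts[0].strip().lower(), parts[1].strip()
        counts[query] += 1
        actions[query] = action

# cx.queries: ID \t query \t logProb \t logFreq \t Count
with open('cx.queries', 'w') as fout:
    for qid, (query, cnt) in enumerate(counts.most_common()):
        fout.write('%d\t%s\t1.0\t1.0\t%d\n' % (qid, query, cnt))

# cx-actions.pkl: a single dict mapping query -> action, as syntaviz.py expects.
with open('cx-actions.pkl', 'wb') as fout:
    pickle.dump(actions, fout)
```

With these two files in `$DATADIR`, steps 2 and 3 of the README (parsing with `parse_query.py`, then starting `syntaviz.py`) can be run unchanged.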