From d429651ac8031a773839c774f56d46df49a8369b Mon Sep 17 00:00:00 2001 From: "Md. Iftekhar Tanveer" Date: Fri, 24 Aug 2018 16:33:19 -0400 Subject: [PATCH] New Repository --- .gitignore | 103 +++++++++ CONTRIBUTING | 5 + LICENSE | 201 +++++++++++++++++ NOTICE | 16 ++ README.md | 77 +++++++ code/__init__.py | 1 + code/cluster_query.py | 319 ++++++++++++++++++++++++++ code/filter_query.py | 307 +++++++++++++++++++++++++ code/parse_query.py | 267 ++++++++++++++++++++++ code/syntaviz.py | 422 +++++++++++++++++++++++++++++++++++ code/templates/fullpage.html | 111 +++++++++ setup.cfg | 7 + setup.py | 37 +++ 13 files changed, 1873 insertions(+) create mode 100644 .gitignore create mode 100644 CONTRIBUTING create mode 100644 LICENSE create mode 100644 NOTICE create mode 100644 README.md create mode 100644 code/__init__.py create mode 100644 code/cluster_query.py create mode 100644 code/filter_query.py create mode 100644 code/parse_query.py create mode 100644 code/syntaviz.py create mode 100644 code/templates/fullpage.html create mode 100644 setup.cfg create mode 100644 setup.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d7f493d --- /dev/null +++ b/.gitignore @@ -0,0 +1,103 @@ +.idea + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# dotenv +.env + +# virtualenv +.venv +venv/ +ENV/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ diff --git a/CONTRIBUTING b/CONTRIBUTING new file mode 100644 index 0000000..87eca50 --- /dev/null +++ b/CONTRIBUTING @@ -0,0 +1,5 @@ +If you would like to contribute code to this project you can do so through GitHub by forking the repository and sending a pull request. + +Before Comcast merges your code into the project you must sign the Comcast Contributor License Agreement (CLA). + +If you haven't previously signed a Comcast CLA, you'll automatically be asked to when you open a pull request. Alternatively, we can e-mail you a PDF that you can sign and scan back to us. Please send us an e-mail or create a new GitHub issue to request a PDF version of the CLA. diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..261eeb9 --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. 
+ + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/NOTICE b/NOTICE
new file mode 100644
index 0000000..7f91e5c
--- /dev/null
+++ b/NOTICE
@@ -0,0 +1,16 @@
+SyntaViz
+Copyright 2017 Comcast Cable Communications Management, LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+This product includes software developed at Comcast (http://www.comcast.com/).
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..def3993
--- /dev/null
+++ b/README.md
@@ -0,0 +1,77 @@
+This repository contains the code for SyntaViz, which Md. Iftekhar Tanveer worked on in Summer 2017.
+
+Outline of the code
+===================
+
+- cluster_query.py: Called by the main program to build the hierarchical clusters from the dependency-parsed queries. It also provides functions for navigating the clusters and showing their contents.
+- filter_query.py: One of the earliest modules; it implements the functions needed to process the data from the production pipeline (vrex_log.queries) into smaller, more manageable files (e.g. non_titles.queries). It has functions for filtering and sorting the queries based on language-model scores.
+- parse_query.py: Self-running module that reads the list of queries and creates the dependency parse trees (dependency_syntaxnet_jsonified). It assumes a tensorflow/syntaxnet environment.
+- syntaviz.py: Self-running module that reads the hierarchical clusters from file and serves them in a web interface.
+- session_handler.py: Contains the code to extract the "actions" (i.e. how the intent-resolution system responded to a given query) and to save them in the non_titles_actionlist.queries file. It also contains some experimental code that uses the voice_watch_pair and flat_sessions data.
+- templates/: Contains the HTML skeleton for the SyntaViz server.
+
+Logical sequence of the code
+=============================
+
+    filter_query.py
+    [for preparing data]
+         |
+         |
+         |
+         v
+    parse_query.py
+    [for parsing queries]
+         |
+         |
+         |
+         v
+    cluster_query.py
+    [for creating clusters]
+         |
+         |
+         |
+         v
+    syntaviz.py
+    [for creating server]
+
+Running SyntaViz
+================
+
+Define variables:
+```
+DATADIR=/data/syntaviz
+CODEDIR=/code/SyntaViz/code
+PORT=5678
+```
+
+### Running SyntaViz on a corpus of queries
+
+#### 0. Start a container with syntaxnet:
+`docker run --rm --name syntaviz-parser -it -e CODEDIR=$CODEDIR -e DATADIR=$DATADIR -v $CODEDIR:$CODEDIR -v $DATADIR:$DATADIR -p 9030:8888 tensorflow/syntaxnet /bin/bash`
+
+#### 1. Prepare data in the following format
+ - cx.queries: A text file with each line representing one query in the following format: `ID\tquery\tlogProb\tlogFreq\tCount`
+
+e.g.,
+
+```
+0	i wanna change my plans its to high	1.0	1.0	1
+1	please email me an alarm certificate showing that our services are current and active.	1.0	1.0	1
+2	cant send outgoing email	1.0	1.0	1
+```
+ - cx-actions.pkl: A pickle file that contains a single mapping (dict object) with `key=query value=action`
+
+#### 2.
Parse queries +``` +cd /opt/tensorflow/syntaxnet +mkdir $DATADIR/cx-parsed +python -u $CODEDIR/parse_query.py $DATADIR/cx.queries $DATADIR/cx-parsed/part >& a.log 2>&1 & +cat $DATADIR/cx-parsed/part* > $DATADIR/cx-parsed.txt +exit +``` + +#### 3. Start SyntaViz server +``` +cd $CODEDIR +python ./syntaviz.py $DATADIR/cx.queries $DATADIR/cx-parsed.txt $DATADIR/cx-actions.pkl $PORT +``` diff --git a/code/__init__.py b/code/__init__.py new file mode 100644 index 0000000..5becc17 --- /dev/null +++ b/code/__init__.py @@ -0,0 +1 @@ +__version__ = "1.0.0" diff --git a/code/cluster_query.py b/code/cluster_query.py new file mode 100644 index 0000000..aef4b65 --- /dev/null +++ b/code/cluster_query.py @@ -0,0 +1,319 @@ +import json +import numpy as np + + +def cluster_by_root(parsed_query_file='../data/dependency_syntaxnet_jsonified'): + ''' + This function clusters the queries based on the roots of dependency parse tree. + ''' + clusthash = {} + with open(parsed_query_file) as f: + for aline in f: + spltline = aline.strip().split('\t') + # Get the root + root = '_'.join(json.loads(spltline[1])[0].split()[:-1]) + query = spltline[0] + if root in clusthash: + clusthash[root].append(query) + else: + clusthash[root] = [query] + return clusthash + + +def cluster_counts(parsed_query_file='../data/dependency_syntaxnet_jsonified'): + ''' + This function creates a dictionary of counts of various patterns in + the dependency parse trees. + ''' + clust = {} + with open(parsed_query_file) as f: + for aline in f: + spltline = aline.strip().split('\t') + query = spltline[0] + jtree = json.loads(spltline[1]) + update_count(clust, jtree) + return clust + + +def get_queries_and_freq(original_query_file='../data/all-queries-raw.txt'): + ''' + Returns the query list and the frequency list read from the files. + ''' + original_query_list = [] + original_freq_list = [] + with open(original_query_file) as f: + for aline in f: + spltline = aline.strip().split('\t') + original_query_list.append(spltline[1]) + original_freq_list.append(int(spltline[-1])) + return original_query_list, original_freq_list + + +def cluster_counts_and_queries(parsed_query_file='../data/dependency_syntaxnet_jsonified', + original_query_file='../data/all-queries-raw.txt', get_freq=False): + ''' + This function creates a dictionary of counts of various patterns and + associates the corresponding query IDs. + If get_freq is True, it returns the cluster, the list of original queries, and the list + of query frequencies + If get_freq is false, it returns only the first two. + ''' + clust = {} + if get_freq: + original_query_list, original_freq_list = get_queries_and_freq(original_query_file) + else: + original_query_list, _ = get_queries_and_freq(original_query_file) + with open(parsed_query_file) as f: + # i is the position of the query in parsed_query_file + # qid is the position in original_query_file + # These two positions donot match + for i, aline in enumerate(f): + spltline = aline.strip().split('\t') + # Original Query Index + qid = int(spltline[3]) + # parsed tree in json format + try: + jtree = json.loads(spltline[1]) + except: + print("Skipping corrupt line %d" % i) + continue + update_count_and_query(clust, jtree, qid) + if not get_freq: + return clust, original_query_list + else: + return clust, original_query_list, original_freq_list + + +def update_count(clust, jtree): + ''' + This function captures the counts of all the dependency grammer + starting from the root of the dependency tree. 
+ ''' + for anode in jtree: + if type(anode) is unicode: + if anode in clust: + clust[anode][0] += 1 + else: + clust[anode] = [1, {}] + last_named_node = anode + elif type(anode) is list: + update_count(clust[last_named_node][1], anode) + + +def update_count_and_query(clust, jtree, qID, currlevel=0, maxlevel=np.inf): + ''' + This function captures the counts of all the dependency grammer + starting from the root of the dependency tree. In addition, it + stores the indices of the corresponding queries. Note that it needs + a lot of memories to store the qID's. + ''' + if currlevel > maxlevel: + return + for anode in jtree: + if type(anode) is unicode: + if anode in clust: + clust[anode][0] += 1 + clust[anode][2].append(qID) + else: + # Position#0 = Number of unique queries in this cluster (redundant) + # Position#1 = Dictionary representing the subclusters + # Position#2 = List of all the unique queries falling in this cluster + clust[anode] = [1, {}, [qID]] + last_named_node = anode + elif type(anode) is list: + # Recursively parse the subtrees + update_count_and_query(clust[last_named_node][1], anode, qID, currlevel + 1, maxlevel) + + +def cd(clust, key): + ''' + If the key is given in a nested format, this function changes the clust to the + level before the last key and returns the last key. + ''' + if '|' in key: + keys = key.split('|') + for akey in keys[:-1]: + clust = clust[akey][1] + key = keys[-1] + return clust, key + + +def show_keys(clust, key='', st_idx=0, en_idx=100): + ''' + This is similar to get_keys but it prints the results + ''' + for i, akey, count in get_keys(clust, key, st_idx, en_idx): + print str(i) + ': ' + akey + '(' + str(count) + ')' + + +def get_keys(clust, key='', st_idx=0, en_idx=100, freq_list=None, sortby=0): + ''' + This function shows the keys (of the dictionary created by cluster_counts_and_queries + function) sorted in descending order of unique counts. + :param clust: The dictionary for which we want to see the keys + :param key: Optional key. If a key is provided, the dictionary is changed to that + specific sub-dictionary before showing the keys. + :param st_idx: start index. The keys will be skipped upto the start index. + :param en_idx: end index. All the keys after end index will be skipped. + :param freq_list: if the frequency list is provided (get it from cluster_counts_and_queries + by setting the get_freq flag to True), this function will also return + the total non-unique counts of the queries under each cluster + :param sortby: If it is set to 0, the clusters will be sorted by unique counts. If set to + 1, then the clusters will be sorted by total non-unique counts. This + parameter will be ignored if freq_list is set to None. + ''' + if key: + clust, key = cd(clust, key) + clust = clust[key][1] + if not freq_list: + # Sort the keys based on unique counts + allkeys = sorted([(clust[akey][0], akey) for akey in clust], key=lambda x: -1 * x[0]) + # No need to send the total non-unique counts + for i, (unique_count, akey) in enumerate(allkeys): + if i > en_idx: + break + if i < st_idx: + continue + yield i, akey, unique_count + else: + # Since we did not store the non-unique counts in the cluster, we need + # to calculate that for every cluster and subclusters. This process + # would make it slower than the other option. 
+ # Sort the keys based on either unique counts or non-unique counts + allkeys = sorted([(clust[akey][0], \ + sum([freq_list[aqid] for aqid in clust[akey][2]]), \ + akey) for akey in clust], key=lambda x: -1 * x[sortby]) + # providing the frequency list implies that the user + # wants the total non-unique counts. + for i, (unique_count, non_unique_count, akey) in enumerate(allkeys): + if i > en_idx: + break + if i < st_idx: + continue + yield i, akey, unique_count, non_unique_count + + +def show_queries(clust, key, query_list, st_idx=0, en_idx=100): + ''' + Similar to get_queries, but prints the data instead of yielding + ''' + for i, qid, akey in get_queries(clust, key, query_list, st_idx, en_idx): + print str(i) + ': ' + str(qid) + ' -- ' + akey + + +def get_queries(clust, key, query_list, st_idx=0, en_idx=100, freq_list=None): + ''' + This function prints the first n queries for a specific key. + :param clust: The cluster obtained from the function cluster_counts_and_queries + :param key: The key of the cluster for which we are looking for the queries. It is possible + to nest the keys by seperating them with a slash (/). + :param query_list: The query list obtained from the function cluster_counts_and_queries + :param freq_list: if the frequency list is provided (get it from cluster_counts_and_queries + by setting the get_freq flag to True), the queries will be sorted by frequency + ''' + clust, key = cd(clust, key) + qid_list = clust[key][2] + if freq_list: + rank, qid_list = zip(*sorted([(freq_list[aqid], aqid) for aqid in qid_list], key=lambda x: -1 * x[0])) + else: + qid_list = sorted(qid_list) + for i, qid in enumerate(qid_list): + if i > en_idx: + break + if i < st_idx: + continue + yield i, qid, query_list[qid] + + +def get_statistics(clust, key, freq_list): + ''' + Returns the following counts for a cluster + 1. Count of all the unique queries in the current cluster + 2. Count of the total queries (non-unique) in the current cluster + 3. Unique count of "non-dependent" queries. That is, the unique queries in the current + cluster, which are not available in any of the sub-clusters. + 4. Total (Non-Unique) count of "non-dependent" queries. + 5. a dictionary, mapping qids (key) to a list of all the immediate subclusters + where that qid is available + + ''' + clust, key = cd(clust, key) + queries = {aqid: True for aqid in clust[key][2]} + qid_to_subclust = {} + for a_sub_clust in clust[key][1]: + for aqid in clust[key][1][a_sub_clust][2]: + # Delete the qid from queries to trace out the + # queries having no dependencies + if aqid in queries: + del queries[aqid] + # Add the subcluster name in qid_to_subclust + if not aqid in qid_to_subclust: + qid_to_subclust[aqid] = [a_sub_clust] + else: + qid_to_subclust[aqid].append(a_sub_clust) + return clust[key][0], sum([freq_list[aquery] for aquery in clust[key][2]]), \ + len(queries), sum([freq_list[aquery] for aquery in queries]), qid_to_subclust + + +# def show_query_actions(clust,key,session_map,session_list,st_idx=0,en_idx=100,actualcount=False): +# ''' +# Shows a probability distribution of the actions taken for the +# ''' +# pass + +#################### Logical Operations over clusters ########################## +def get_all_queries(clust): + ''' + Get a list of all the query id's + ''' + allqid = [] + for akey in clust: + allqid.extend(clust[akey][2]) + return allqid + + +def get_query_IDs(clust, key): + ''' + Similar to show_queries, but instead of printing the queries, it returns all the query ID's. 
+ ''' + clust, key = cd(clust, key) + return clust[key][2] + + +def query_or(key1, key2, clust): + ''' + Returns a union of queries + ''' + qid1 = set(get_query_IDs(clust, key1)) + qid2 = set(get_query_IDs(clust, key2)) + return list(qid1.union(qid2)) + + +def query_and(key1, key2, clust): + ''' + Returns an intersection of queries + ''' + qid1 = set(get_query_IDs(clust, key1)) + qid2 = set(get_query_IDs(clust, key2)) + return list(qid1.intersection(qid2)) + + +def query_subtract(key1, key2, clust): + ''' + Returns the queries which are present in the first cluster + but not present in the second + ''' + qid1 = set(get_query_IDs(clust, key1)) + qid2 = set(get_query_IDs(clust, key2)) + return list(qid1 - qid2) + + +def show_roots(clusthash, n=100): + ''' + This function shows the keys of the dictionary constructed by "cluster_by_root" function. + ''' + allkeys = sorted([(len(clusthash[akey]), akey) for akey in clusthash], key=lambda x: -1 * x[0]) + for (i, (count, akeys)) in enumerate(allkeys): + if i > n: + break + print '(' + str(count) + ') ' + akeys diff --git a/code/filter_query.py b/code/filter_query.py new file mode 100644 index 0000000..9a3c9f0 --- /dev/null +++ b/code/filter_query.py @@ -0,0 +1,307 @@ +import re +import json +import nltk +import cPickle as cp +import numpy as np +from collections import OrderedDict + +__author__ = 'mtanve200' + + +def filter_by_re(inp='../data/vrex_1week.queries', + outp='../data/vrex_1week_long_text_filter_by_re.queries', + minlen=4): + """ + Filter the queries by regular expression. + This method extracts all the queries that starts with wh/h words (what, how, why etc.) + It puts an additional constraint that the query must be of length $minlen + """ + with open(inp) as f: + with open(outp, 'wb') as fout: + for i, aline in enumerate(f): + txt = aline.decode('utf8') + jdat = json.loads(txt) + q = jdat['text'].lower() + test = re.match( \ + "who|who's|what|what's|where|where's|when|when's|why|why's|how|how's|define|definition of", q) + if i % 10000 == 0: + print(i), 'queries processed' + if test and len(test.string.split()) >= minlen: + fout.write(test.string.encode('utf8') + '\n') + fout.flush() + + +def filter_unique(inp='../data/vrex_1week_long_text_filter_by_re.queries', + outp='../data/vrex_1week_long_text_filter_unique.queries'): + """ + Filters the queries to keep only the unique ones and associates + a count. It reads from $inp and writes in $outp + """ + with open(inp) as f: + with open(outp, 'wb') as fout: + uniq_lines = OrderedDict() + for i, aline in enumerate(f): + txt = aline.decode('utf8') + if i % 10000 == 0: + print(i) + if not uniq_lines.get(txt): + uniq_lines[txt] = 1 + else: + uniq_lines[txt] += 1 + for i, uqlines in enumerate(uniq_lines): + fout.write(str(i) + '\t' + uqlines.strip().encode('utf8') + '\t' + str(uniq_lines[uqlines]) + '\n') + fout.flush() + + +def filter_titles(inp='../data/vrex_1week_with_probability_plus_logfrequency_sorted.query', + outp='../data/non_titles.queries', query_col=1): + """ + Filter out queries that are just the titles of some movie or tv series. This operation + is not case or punctuation sensitive. Everything other than alphaneumeric characters + are ignored from both. 
+ """ + print('Loading Titles ...') + alltitles = cp.load(open('../data/alltitles.pickle'))['alltitles'] + print('done') + with open(outp, 'wb') as fout: + with open(inp) as f: + for i, aline in enumerate(f): + title = aline.split('\t')[query_col] + title = re.sub('[^a-z0-9\s]+', '', title.lower()) + title = ' '.join(title.split()) + if not alltitles.get(title): + fout.write(aline) + if i % 100000 == 0: + print(i) + + +def trigram_freqdist(inp='../data/combined_corpus', outp='../data/fdist_kn.pickle'): + """ + It calculates the trigram frequency distributions for the + parliament speech dataset. This distribution is important + for calculating the trigram probabilities with kneser-ney + smoothing. The distribution is saved in a pickle file. + """ + with open(inp) as f: + alltrigrams = [] + for i, aline in enumerate(f): + aline = aline.strip().decode('utf8') + aline = aline.encode('ascii', 'ignore') + aline = aline.lower() + tokens = [''] + aline.split() + [''] + alltrigrams += [(x, y, z) for x, y, z in nltk.trigrams(tokens)] + if i % 10000 == 0: + print(i) + fdist = nltk.FreqDist(alltrigrams) + cp.dump({'fdist': fdist}, open(outp, 'wb')) + + +def kn_logprob(inp='../data/vrex_1week_long_text.queries', + outp='../data/vrex_1week_with_probability.queries', + fdfile='../data/fdist_kn.pickle', + minlen=4, + length_normalized=True): + """ + Calculates the log probability of every query from the input file according + to the trigram distributions. It uses Kneser Ney smoothing. + It produces a tab delimited file with the queries and the logprobabilities. + :params fdfile: Trigram frequency distribution file (pickled) + """ + print('Loading Trigram Distribution') + fdist = cp.load(open(fdfile))['fdist'] + print('Trigram Distribution Loaded') + kn_pd = nltk.probability.KneserNeyProbDist(fdist) + print('Kneser Ney Loaded') + with open(inp) as f: + with open(outp, 'wb') as fout: + for i, aline in enumerate(f): + jdat = json.loads(aline.strip()) + q = jdat['text'].lower().encode('ascii', 'ignore') + tokens = [''] + nltk.word_tokenize(q) + [''] + if len(tokens) < minlen + 2: + continue + logplist = [] + for x, y, z in nltk.trigrams(tokens): + lgp = kn_pd.logprob((x, y, z)) + # OOV cases + if lgp == -1e300: + logplist.append(-50) + else: + logplist.append(lgp) + # Length Normalization: Add points for longer sentences + if length_normalized: + len_score = len(set(tokens)) * 8.5 + else: + len_score = 0 + + logpsum = sum(logplist) + len_score + fout.write(q + '\t' + str(logpsum) + '\n') + fout.flush() + if i % 100000 == 0: + print(i) + + +def sort_by_logprob(inp='../data/vrex_1week_with_probability.queries', + outp='../data/vrex_1week_with_probability_sorted.queries', + sort_column=-1, query_column=0, tag_columns=[], ascending=False): + """ + Sorts the queries by logprobability. It assumes that the input + is a tab-delimited file where the last column is logprobability. + You may change the default parameter values for customized behavior. + :params sort_column: Index of the column upon which the sorting will be done. + :params query_column: Index of the column where the queries are located. + :params tag_columns: A list of indices of columns which we want to augment + into the output file. + :params ascending: Sort in an ascending order instead of descending. 
+ """ + with open(inp) as f: + allqueries = [] + allprob = [] + tagcols = [] + for i, aline in enumerate(f): + cols = aline.strip().split('\t') + logprob = float(cols[sort_column]) + allqueries.append(cols[query_column]) + allprob.append(logprob) + if tag_columns: + tagcols.append('\t'.join([cols[m] for m in tag_columns])) + with open(outp, 'wb') as fout: + if not ascending: + idx = np.argsort(allprob)[::-1] + else: + idx = np.argsort(allprob) + for m, i in enumerate(idx): + if tagcols: + fout.write(str(m) + '\t' + allqueries[i] + '\t' + str(allprob[i]) + '\t' + tagcols[i] + '\n') + else: + fout.write(str(m) + '\t' + allqueries[i] + '\t' + str(allprob[i]) + '\n') + fout.flush() + + +def add_logfrequency(inp='../data/vrex_1week_with_probability_unique.queries', + outp='../data/vrex_1week_with_probability_plus_logfrequency.query'): + """ + Adds the log of query-frequency with the (normalized) logprobability + values and creates a new column with this score. This score might be + a better query ranking metric than the normalized logprobability. + It assumes the last column is the query frequency and the column before + the last one is the normalized logprobability. + """ + with open(inp) as f: + with open(outp, 'wb') as fout: + for i, aline in enumerate(f): + if i % 100000 == 0: + print(i) + aline = aline.strip() + cols = aline.split('\t') + logprob = float(cols[-2]) + logfreq = np.log(float(cols[-1])) + fout.write(aline + '\t' + str(logprob + logfreq) + '\n') + fout.flush() + + +def get_natural_queries(filename='../data/non_titles.queries'): + """ + get a hash of all the natural queries from our natural + query dataset. + """ + natqueries = {} + with open(filename) as f: + for aline in f: + spltaline = aline.strip().split('\t') + natqueries[spltaline[1].lower()] = int(spltaline[0]) + return natqueries + + +def save_na_queries(natqueries, allqfilename='../data/vrex_log.queries', + outfilename='../data/NAqueries.query'): + """ + Search for na queries in natural query database and save it + vrex_log.queries is the dump of the following hdfs file to local filesystem: + /user/fture/vrex/sessions/201702.22-28/vrex-log-201702_22-28.queries + """ + with open(outfilename, 'wb') as fout: + with open(allqfilename) as f: + for i, aline in enumerate(f): + if i % 10000 == 0: + print(i) + jsonx = json.loads(aline.strip().lower()) + if jsonx['action'] == 'na' and \ + len(jsonx['text'].split()) > 4 and \ + natqueries.get(jsonx['text']): + fout.write(str(natqueries[jsonx['text']]) + '\t' + jsonx['text'] + '\n') + + +def save_uniq_sorted_na_queries(inp='../data/NAqueries.query', + outp='../data/NAqueries_uniq_sorted.query'): + """ + Saves the unique queries that got an action "NA" and saves in a sorted order. + You may get the input by running save_na_queries. + The output file preserves the original indices in the 3rd column + """ + filter_unique(inp, outp='../data/NAqueries_uniq.query') + sort_by_logprob(inp='../data/NAqueries_uniq.query', outp=outp, sort_column=1, + query_column=2, tag_columns=[3], ascending=True) + + +def combine_corpus(inp1='../data/imdb_corpus_processed', + inp2='../data/eng_voc.txt', + outp='../data/combined_corpus'): + """ + The trigram frequencies were calculated from two corpuses: + imdb movie comment dataset and parliament speech dataset. + This function combines the two corpuses for calcualting trigram probabilities. + This combining process involves some preprocessing of the data. 
+ """ + with open(inp1) as f1: + with open(inp2) as f2: + with open(outp, 'wb') as fout: + # Parliament Speech corpus + txt2 = f2.read().decode('unicode_escape') + fout.write(txt2.encode('utf8')) + fout.flush() + # IMDB corpus. It needs sentence tokenization and word tokenization. + txt1 = f1.read() + txt1 = txt1.decode('utf8') + txt1 = '\n'.join([' '.join(nltk.word_tokenize(asent)) \ + for asent in nltk.sent_tokenize(txt1)]) + '\n' + fout.write(txt1.encode('utf8')) + fout.flush() + + +def pipeline_query_ranking(initialize=False): + """ + The full pipeline of loading and calculating the trigram frequencies to + ranking the queries based on our naturalness score. Please note that the + trigram_freqdist() needs to be done only the first time. + Output of this pipeline is saved in the following file: + non_titles.queries + """ + if initialize: + # Building the language model + trigram_freqdist() + # Get probability + kn_logprob() + # Get unique queries and compute frequencies + filter_unique(inp='../data/vrex_1week_with_probability.queries', + outp='../data/vrex_1week_with_probability_unique.queries') + # Add the logfrequency with the logprobability to calculate the query ranking + add_logfrequency() + # Sort the queries based on rankings + sort_by_logprob(inp='../data/vrex_1week_with_probability_plus_logfrequency.query', + outp='../data/vrex_1week_with_probability_plus_logfrequency_sorted.query', query_column=1, + tag_columns=[2, 3]) + filter_titles() + + +def pipeline_sort_by_frequency(): + """ + This pipeline sorts the queries based on frequency (not log-frequency). + Output file is: vrex_1week_long_unique_sorted.queries + """ + filter_unique(inp='../data/vrex_1week_with_probability.queries', + outp='../data/vrex_1week_long_unique.queries') + sort_by_logprob(inp='../data/vrex_1week_long_unique.queries', + outp='../data/vrex_1week_long_unique_sorted.queries', query_column=1) + filter_titles(inp='../data/vrex_1week_long_unique_sorted.queries', + outp='../data/non_titles_sorted_by_freq.queries', query_col=0) diff --git a/code/parse_query.py b/code/parse_query.py new file mode 100644 index 0000000..eb38056 --- /dev/null +++ b/code/parse_query.py @@ -0,0 +1,267 @@ +import os +import sys +import stat +import numpy as np +import json +import time +from itertools import izip +from multiprocessing import Process +from os import path + +__author__ = 'mtanve200' + + +def parse_query_with_syntaxnet(query_generator, + start_index=0, + end_index=np.inf, + shellname='syntaxnet/demo.sh'): + ''' + Parses a query using syntaxnet. It breaks the input stream into mini batches of 1000 + queries and passes through the syntaxnet. It extracts two different styles of the parse + tree. It reads the whole input and returns the trees as a list. Do do not feed an + infinitely long stream because that will overflow the memory. + It requires to be called from the /root/models/syntaxnet/ + folder of the syntaxnet docker + :param query_generator: An iterator of (index, query) tuples + :param start_index: From which item it will start parsing. Note, it does not consider the index + of the data in the query file. It considers the idex of the data as they arrive + :param end_index: Where it will stop reading. Same convention to start_index applies. + + Note: It is utmost important to remove the last element (a blank line) + from the output of parse_query_with_syntaxnet function, before passing + it to the segment_gen function. 
Otherwise, it will create an invalid + tree for the last element of the batch due to the blank line. + ''' + assert start_index < end_index, 'Start index cannot be greater than end index' + allparsetreelist = [] + orig_idx_list = [] + container = [] + idx_container = [] + + def process_single(idx_container, container, orig_idx, aquery): + container.append(aquery) + idx_container.append(orig_idx) + + def write_to_disk(idx_container, container): + argtxt_ = '\n'.join(container) + output = os.popen('echo ' + '"' + argtxt_ + '"' + ' | ' + shellname).read().split('\n') + allparsetreelist.extend(output) + orig_idx_list.extend(idx_container) + + # Iterate over the queries and the save the parsed tree + for i, (orig_idx, aquery) in enumerate(query_generator): + if i < start_index: + continue + elif start_index <= i <= end_index: + # put the first query in a container + process_single(idx_container, container, orig_idx, aquery) + if len(container) % 1000 == 999: + # The container is full. Process the queries within the container + write_to_disk(idx_container, container) + container = [] + idx_container = [] + elif i > end_index: + break + if len(container) > 0: + write_to_disk(idx_container, container) + return allparsetreelist, orig_idx_list + + +def make_new_shell(): + # Prepare a shell for conll style dependency tree + # This function must run within the docker image of syntaxnet + st = os.stat('syntaxnet/demo.sh') + os.chmod('syntaxnet/demo.sh', 0o777) + with open('syntaxnet/demo.sh') as f: + txt = f.read() + with open('syntaxnet/demo_conll.sh', 'wb') as f: + f.write(txt[:-107] + '\n') + st = os.stat('syntaxnet/demo_conll.sh') + os.chmod('syntaxnet/demo_conll.sh', st.st_mode | stat.S_IEXEC) + + +def query_gen(inpfile): + ''' + Construct an iterator to feed the queries from the inpfile + ''' + # Start reading and yieling the queries + with open(inpfile) as f: + for aline in f: + spltline = aline.strip().split('\t') + yield int(spltline[0]), spltline[1] + + +def abstract_query_gen(inpfile): + ''' + It is similar to query_gen. The only difference is, it reads from the abstract query file, + thus processes and transmits the abstract queries accordingly. + ''' + # Start reading and yieling the queries + with open(inpfile) as f: + for aline in f: + spltline = aline.strip().split('\t') + yield int(spltline[0]), spltline[2] + + +def input_gen(filename): + ''' + Makes an iterator from the file. This is useful when the output + of the syntaxnet is saved as a file and you take that file to + pass through the segment_gen function as an iterator. + ''' + with open(filename) as f: + for aline in f: + if aline[0] == ' ': + yield aline[1:].rstrip() + else: + yield aline.rstrip() + + +def segment_gen(inp_gen): + ''' + Segments the stream from the output of syntaxnet into + one input and one parse tree at a time. The parse tree + is given in a json format. + :param inp_gen: input iterator + ''' + retval = '' + parsetree = '' + currlevel = -1 + count = 0 + for inp in inp_gen: + # Transforming the tree with states "Input", "Parse", and + # the normal tree parsing. 
+ if inp.startswith('Input'): + # if there is something in retval from previous iterations + if retval: + # Close off the tree + retval += parsetree + ']' * (currlevel + 2) + yield retval + # Reset the value + retval = '' + # There is nothing from previous iterations, so start making + retval += inp[6:].strip() + '\t' + elif inp.startswith('Parse'): + # start of the parse tree + parsetree = '' + elif not inp: + # if the input is empty, just skip it + continue + else: + parse_out, currlevel = jsonify_tree(inp, currlevel) + # Debug + # print inp,parse_out,currlevel + parsetree += parse_out + if retval and parsetree: + # Close off the last tree + retval += parsetree + ']' * (currlevel + 2) + yield retval + + +def segment_gen_conll(inp_gen): + ''' + similar to segment_gen, but works on conll style parse tree + ''' + aparse = [] + for inp in inp_gen: + if not inp: + yield json.dumps(aparse) + aparse = [] + else: + aparse.append(inp.split('\t')[1:]) + + +def jsonify_tree(inp, currlevel): + ''' + Converts from syntaxnet tree structure to json tree structure. + ''' + nxtlevel = inp.find('+--') / 4 + if nxtlevel == -1: + # Root Node + return '[ ' + '"' + inp + '"', -1 + elif nxtlevel == currlevel + 1: + # Subtree of previous node + return ', [ ' + '"' + inp[nxtlevel * 4 + 4:].strip() + '"', nxtlevel + elif nxtlevel == currlevel: + # Another node in the same level of the tree + return ', ' + '"' + inp[nxtlevel * 4 + 4:].strip() + '"', nxtlevel + elif nxtlevel < currlevel: + # At least one subtree finished + leveljump = currlevel - nxtlevel + return ']' * leveljump + ',' + '"' + inp[nxtlevel * 4 + 4:].strip() + '"', nxtlevel + else: + # nxtlevel>currlevel+1 + # Impossible situation. Something is wrong + raise IOError('More than one level jump forward. At least one tree node must be missing.') + + +def pipeline(inpfile, outfile, + start_idx=0, + end_idx=np.inf, + stream_generator_function=query_gen): + ''' + This is the complete pipeline for parsing the (raw or abstract) queries from the queryfile + (query_analysis/data/non_titles.queries) using syntaxnet and producing the outfile. + :param outfile: The name (with path) of the file where the output will be written + :param start_idx: Element in the stream where the parsing should start + :param end_idx: Element in the stream where the parsing should stop + :param stream_generator_function: Determines whether the queries or the abstract queries + would be processed for parsing. As the formats of the query and abstract files are different, + the generator functions automatically reads the corresponding file formats. 
+ This argument takes only the following two generator functions: + a) query_gen + b) abstract_query_gen + ''' + # Normal parse tree + qgen1 = stream_generator_function(inpfile) + output_tree, orig_idx_list = parse_query_with_syntaxnet(qgen1, start_index=start_idx, end_index=end_idx) + tree_gen = segment_gen(output_tree) + + # Conll style parse tree + qgen2 = stream_generator_function(inpfile) + output_conll, orig_idx_list = parse_query_with_syntaxnet(qgen2, start_index=start_idx, end_index=end_idx, + shellname='syntaxnet/demo_conll.sh') + conll_gen = segment_gen_conll(output_conll) + + # Save to file + with open(outfile, 'wb') as f: + for (i, tree, conll) in izip(orig_idx_list, tree_gen, conll_gen): + f.write(tree + '\t' + conll + '\t' + str(i) + '\n') + f.flush() + + +if __name__ == '__main__': + make_new_shell() + + abstract = False + + inpfile = sys.argv[1] + outfile = sys.argv[2] + + outdir = path.dirname(os.path.abspath(outfile)) + if not os.path.exists(outdir): + raise OSError("Output directory does not exist: %s" % outdir) + + def file_len(fname): + i = -1 + with open(fname) as f: + for i, l in enumerate(f): + pass + return i + 1 + + + line_cnt = file_len(inpfile) + + print(sys.argv) + print(line_cnt) + + if abstract: + for i in range(0, line_cnt, 1000): + p = Process(target=pipeline, args=(inpfile, outfile + str(i), i, i + 999, abstract_query_gen)) + p.start() + time.sleep(5) + else: + for i in range(0, line_cnt, 1000): + p = Process(target=pipeline, args=(inpfile, outfile + str(i), i, i + 999)) + p.start() + time.sleep(5) diff --git a/code/syntaviz.py b/code/syntaviz.py new file mode 100644 index 0000000..c177f95 --- /dev/null +++ b/code/syntaviz.py @@ -0,0 +1,422 @@ +# Copyright 2017 Comcast Cable Communications Management, LLC +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pdb + +from flask import Flask, abort, render_template, url_for, request +import cluster_query +import pickle as cp +import numpy as np +import urllib +import json +import sys +import base64 +from io import BytesIO +import matplotlib +matplotlib.use('Agg') +import matplotlib.pyplot as plt + +''' +This is a quick working prototype of a visualizer that would +facilitate the exploring of syntactic patterns of queries and +the statistical distribution of the actions currently taken for +these patterns. + +Note: This server needs the following files: +1. '../data/all-queries-raw.txt': This is a tab delimited list + of original queries. It also contains +2. '../data/dependency_syntaxnet_jsonified': This is a tab + delimited list of tokenized queries, dependency parse in + two different formats, and the query ID (index to the original + query list). +3. '../data/qaction.pickle': It contains a dictionary named qaction + which returns the actions taken (values) for the queries (keys). 
+ +''' + +inpfile = sys.argv[1] +outfile = sys.argv[2] +query2actionfile = sys.argv[3] +PORT=5678 +if len(sys.argv) > 4: + PORT = int(sys.argv[4]) + +################## Load the pre-requisites #################### +print("Loading cluster data ...") +clust_head, queries, freq_list = cluster_query.cluster_counts_and_queries( + original_query_file=inpfile, + parsed_query_file=outfile, + get_freq=True) +clust = clust_head +tot_uniq = sum([clust[akey][0] for akey in clust]) +tot_nonuniq = sum([freq_list[aqid] for akey in clust for aqid in clust[akey][2]]) +print("Done clustering.") + +print("Loading list of actions performed for each query ...") +qaction = cp.load(open(query2actionfile)) +print("Done loading actions.") + +############################################################### + +# The SyntaViz server +app = Flask('SyntaViz') + + +@app.route('/keys/') +@app.route('/keys///') +@app.route('/keys///') +def get_keys_json(key='', st_idx=0, en_idx=50): + ''' + Make a list of keys + ''' + key = urllib.unquote(urllib.unquote(key)) + allkeys = [] + try: + for i, akey, count in cluster_query.get_keys(clust, key, st_idx, en_idx): + if key: + akey = key + '|' + akey + allkeys.append((i, count, akey)) + except KeyError: + print('Key Not Found:', key) + return abort(404) + return json.dumps(allkeys) + + +@app.route('/queries/') +@app.route('/queries///') +def get_queries_json(key='', st_idx=0, en_idx=50): + ''' + Make a list of queries + ''' + key = urllib.unquote(urllib.unquote(key)) + clust = clust_head + allqueries = [] + try: + for i, qid, aquery in cluster_query.get_queries(clust, key, queries, st_idx, en_idx): + allqueries.append((i, qid, aquery)) + except: + print('Key Not Found:', key) + return abort(404) + return json.dumps(allqueries) + + +def get_action_hist(key, qaction): + ''' + Returns the frequency of various actions taken (in response to + the queries of a key) as well as the list of actions + ''' + action_hist = {} + clust = clust_head + for i, qid, aquery in cluster_query.get_queries(clust, key, queries, en_idx=float('inf')): + aquery = aquery.lower() + if aquery in qaction: + if qaction[aquery] in action_hist: + action_hist[qaction[aquery]] += 1 + else: + action_hist[qaction[aquery]] = 1 + return action_hist + + +def get_plot(adict): + ''' + Plot the action dictionary + ''' + img_format = \ + '' + count, labels = zip(*sorted([(adict[akey], akey) for akey in adict], + key=lambda x: -1 * x[0])) + # total is the number of queries having an action + # m is the number of different actions + m = len(count) + total = sum(count) + # Plot + if m > 30: + plt.figure(num=1, figsize=(12, 8), fontsize=24) + plt.clf() + plt.bar(np.arange(30), count[:30]) + plt.xticks(np.arange(30) + 0.4, labels[:30], rotation='vertical', fontsize=24) + plt.xlabel('Name of Actions (Only top 30 among {0})'.format(m)) + plt.ylabel('Count of Actions') + plt.title('Total unique queries containing an action (Including NA) = {0}'.format(total)) + try: + plt.tight_layout() + except: + print("matplotlib error") + pass + # pdb.set_trace() + else: + plt.figure(num=1, figsize=(12, 8), fontsize=24) + plt.clf() + plt.bar(np.arange(m), count) + plt.xticks(np.arange(m) + 0.4, labels, rotation='vertical', fontsize=24) + plt.xlabel('Name of Actions (all)') + plt.ylabel('Count of Actions') + plt.title('Total unique queries containing an action (Including NA) = {0}'.format(total)) + try: + plt.tight_layout() + except: + print("matplotlib error") + pass + # pdb.set_trace() + # Convert to html + figfile = BytesIO() + 
plt.savefig(figfile, format='png') + figfile.seek(0) + figfile_png = base64.b64encode(figfile.getvalue()) + return img_format.format(figfile_png) + + +@app.route('/') +@app.route('/both') +def both(): + ''' + Show the page + ''' + # Parsing arguments + key_k = urllib.unquote(urllib.unquote(request.args.get('key_k', ''))) + st_idx_k = int(request.args.get('st_idx_k', 0)) + en_idx_k = int(request.args.get('en_idx_k', 500)) + st_idx_q = int(request.args.get('st_idx_q', 0)) + en_idx_q = int(request.args.get('en_idx_q', 1000)) + sort_key_by = int(request.args.get('sort_key_by', 0)) # sort by unique count(0), total count(1) + sort_query_by = int(request.args.get('sort_query_by', 0)) # sort by query frequency(0), or qid(1) + + try: + # Build the list of keys + clust = clust_head + allkeys = [] + for i, akey, count, nucount in cluster_query.get_keys( \ + clust, + key_k, + st_idx_k, + en_idx_k, + freq_list=freq_list, + sortby=sort_key_by): + if key_k: + fullkey = key_k + '|' + akey + else: + fullkey = akey + encodedkey = urllib.quote(fullkey) + # Link to go to a specific cluster + keylink = url_for('both', + key_k=encodedkey, + st_idx_k=st_idx_k, + en_idx_k=en_idx_k, + st_idx_q=st_idx_q, + en_idx_q=en_idx_q, + sort_key_by=sort_key_by, + sort_query_by=sort_query_by) + allkeys.append((i, count, akey, keylink, nucount, + '{0:0.2f}'.format(float(count) / float(tot_uniq) * 100.), + '{0:0.2f}'.format(float(nucount) / float(tot_nonuniq) * 100.))) + except KeyError: + print('Key Not Found:', key_k) + return abort(404) + + # Build the left pane navigations + navformat = '{1}' + diff = en_idx_k - st_idx_k + if st_idx_k > 0: + left_prev_code = navformat.format(url_for('both', + key_k=key_k, + st_idx_k=max(0, st_idx_k - diff), + en_idx_k=st_idx_k, + st_idx_q=st_idx_q, + en_idx_q=en_idx_q, + sort_key_by=sort_key_by, + sort_query_by=sort_query_by), '<<') + else: + left_prev_code = '<<' + left_next_code = navformat.format(url_for('both', + key_k=key_k, + st_idx_k=en_idx_k, + en_idx_k=en_idx_k + diff, + st_idx_q=st_idx_q, + en_idx_q=en_idx_q, + sort_key_by=sort_key_by, + sort_query_by=sort_query_by), '>>') + + # Build the breadcrumb + if not key_k: + currentkey = '< None >' + else: + if '|' in key_k: + currentkey = '' + spltkey = key_k.split('|') + for i in range(len(spltkey)): + encodedkey = urllib.quote('|'.join(spltkey[:i + 1])) + currentkey += '{0}|'.format(spltkey[i], url_for('both', + key_k=encodedkey, + st_idx_k=0, + en_idx_k=en_idx_k - st_idx_k, + st_idx_q=0, + en_idx_q=en_idx_q - st_idx_q, + sort_key_by=sort_key_by, + sort_query_by=sort_query_by)) + if currentkey[-1] == '|': + currentkey = currentkey[:-1] + else: + currentkey = key_k + + # Build the links for sorting the keys + unique_link = url_for('both', + key_k=key_k, + st_idx_k=st_idx_k, + en_idx_k=en_idx_k, + st_idx_q=st_idx_q, + en_idx_q=en_idx_q, + sort_key_by=0, + sort_query_by=sort_query_by) + total_link = url_for('both', + key_k=key_k, + st_idx_k=st_idx_k, + en_idx_k=en_idx_k, + st_idx_q=st_idx_q, + en_idx_q=en_idx_q, + sort_key_by=1, + sort_query_by=sort_query_by) + + # If no key is selected, just send out the root keys + if not key_k: + return render_template('fullpage.html', + total_count=tot_nonuniq, + uniq_count=tot_uniq, + left_prev_code=left_prev_code, + left_next_code=left_next_code, + allkeys=allkeys, + currentkey=currentkey, + header_unique_link=unique_link, + header_total_link=total_link) + + try: + # Build the list of queries + allqueries = [] + clust = clust_head + if sort_query_by == 0: + # sort by query frequency + frequency_list 
= freq_list + else: + # sort by qid + frequency_list = None + # Accumulate the queries + for i, qid, aquery in cluster_query.get_queries(clust, + key_k, + queries, + st_idx_q, + en_idx_q, + freq_list=frequency_list): + if aquery.lower() in qaction: + query_action = qaction[aquery.lower()] + else: + query_action = '[Not Found]' + allqueries.append((i, + qid, + aquery, + query_action, + freq_list[qid], + '{0:0.3f}'.format(float(freq_list[qid]) / tot_nonuniq * 100.))) + + except KeyError: + print('Key Not Found:', key_k) + return abort(404) + + # Build the right pane navigations + diff = en_idx_q - st_idx_q + if st_idx_q > 0: + right_prev_code = navformat.format(url_for('both', + key_k=key_k, + st_idx_k=st_idx_k, + en_idx_k=en_idx_k, + st_idx_q=max(0, st_idx_q - diff), + en_idx_q=st_idx_q, + sort_key_by=sort_key_by, + sort_query_by=sort_query_by), '<<') + else: + right_prev_code = '<<' + right_next_code = navformat.format(url_for('both', + key_k=key_k, + st_idx_k=st_idx_k, + en_idx_k=en_idx_k, + st_idx_q=en_idx_q, + en_idx_q=en_idx_q + diff, + sort_key_by=sort_key_by, + sort_query_by=sort_query_by), '>>') + + # Build the links for sorting the queries + # By frequency + header_freq_link = url_for('both', + key_k=key_k, + st_idx_k=st_idx_k, + en_idx_k=en_idx_k, + st_idx_q=st_idx_q, + en_idx_q=en_idx_q, + sort_key_by=sort_key_by, + sort_query_by=0) + # By QID + header_qid_link = url_for('both', + key_k=key_k, + st_idx_k=st_idx_k, + en_idx_k=en_idx_k, + st_idx_q=st_idx_q, + en_idx_q=en_idx_q, + sort_key_by=sort_key_by, + sort_query_by=1) + + # Calculate the cluster statistics + clust_stats = cluster_query.get_statistics(clust, key_k, freq_list) + + # Build the visualization on the right pane + action_freq = get_action_hist(key_k, qaction) + image_src = get_plot(action_freq) + + # Send all the data with visualization if there are queries + if len(action_freq.keys()) > 0: + # When the plot exists + return render_template('fullpage.html', + total_count=tot_nonuniq, + uniq_count=tot_uniq, + left_prev_code=left_prev_code, + left_next_code=left_next_code, + allkeys=allkeys, + currentkey=currentkey, + header_unique_link=unique_link, + header_total_link=total_link, + right_prev_code=right_prev_code, + right_next_code=right_next_code, + allqueries=allqueries, + clust_stats=clust_stats, + image_src=image_src, + header_freq_link=header_freq_link, + header_qid_link=header_qid_link) + else: + # When the plot does not exist + return render_template('fullpage.html', + total_count=tot_nonuniq, + uniq_count=tot_uniq, + left_prev_code=left_prev_code, + left_next_code=left_next_code, + allkeys=allkeys, + currentkey=currentkey, + header_unique_link=unique_link, + header_total_link=total_link, + right_prev_code=right_prev_code, + right_next_code=right_next_code, + allqueries=allqueries, + clust_stats=clust_stats, + header_freq_link=header_freq_link, + header_qid_link=header_qid_link) + + +# Run the server +if __name__ == '__main__': + app.debug = False + app.run(host='0.0.0.0', port=PORT) diff --git a/code/templates/fullpage.html b/code/templates/fullpage.html new file mode 100644 index 0000000..40a372a --- /dev/null +++ b/code/templates/fullpage.html @@ -0,0 +1,111 @@ + + + +

SyntaViz: Syntax-driven Query Visualizer

+ Total {{total_count}} queries loaded ({{uniq_count}} unique)
+
+ +
+ +
+

Hierarchical Dependency Clusters

+
+ Current Cluster:
{{currentkey|safe}} +
+
+ {{left_prev_code|safe}} +
+
+ {{left_next_code|safe}} +
+
+
+ +
+ <table>
+   {# Markup sketch: tags are assumed; the header text, cell expressions, and loop below are from the original template. #}
+   <tr>
+     <th>idx</th>
+     <th>Cluster</th>
+     <th><a href="{{header_unique_link}}">Unique</a></th>
+     <th><a href="{{header_total_link}}">Total</a></th>
+   </tr>
+   {% for i,count,akey,encodedkey,nucount,uniq_perc,nonuniq_perc in allkeys %}
+   <tr>
+     <td>{{i}}</td>
+     <td><a href="{{encodedkey}}">{{akey}}</a></td>
+     <td>{{count}} ({{uniq_perc}}%)</td>
+     <td>{{nucount}} ({{nonuniq_perc}}%)</td>
+   </tr>
+   {% endfor %}
+ </table>
+
+
+ + {% if allqueries is defined %} + +
+ +
+

Current Cluster Statistics:

+ The current cluster contains:
+ {% set unq_cnt,tot_cnt, unq_nondep,tot_nondep,qid_to_subclust = clust_stats %} + {{tot_cnt}} queries in total
+ {{unq_cnt}} unique queries
+ Number of queries not belonging to any subcluster:
+ {{unq_nondep}} unique queries
+ {{tot_nondep}} total queries
+
+ + {% if image_src is defined %} +
+

Histogram of Actions in the cluster

+ {{image_src|safe}} +
+ {% endif %} +
+ +
+
+ {{right_prev_code|safe}} +
+
+ {{right_next_code|safe}} +
+
+
+ +
+ <table>
+   {# Markup sketch: tags are assumed; the header text, cell expressions, and loops below are from the original template. #}
+   <tr>
+     <th>idx</th>
+     <th>Query</th>
+     <th>Action</th>
+     <th>Subclusters</th>
+     <th><a href="{{header_freq_link}}">Frequency(%ofTotal)</a></th>
+     <th><a href="{{header_qid_link}}">QID</a></th>
+   </tr>
+   {% for i,qid,aquery,action,qfreq,qfreq_perc in allqueries %}
+   <tr>
+     <td>{{i}}</td>
+     <td>{{aquery}}</td>
+     <td>{{action}}</td>
+     <td>
+       {% for asubclust in qid_to_subclust[qid] %}
+       {{asubclust}},
+       {% endfor %}
+     </td>
+     <td>{{qfreq}} ({{qfreq_perc}}%)</td>
+     <td>{{qid}}</td>
+   </tr>
+   {% endfor %}
+ </table>
+
+
+ {% endif %} +
+
+ + \ No newline at end of file diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..3d7aca6 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,7 @@ +[aliases] +test=pytest + +[tool:pytest] +addopts = + --cov-report term-missing + --cov=syntaviz diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..271b244 --- /dev/null +++ b/setup.py @@ -0,0 +1,37 @@ +from setuptools import setup, find_packages +import re + + +def get_version(): + """ + Extract the version from the module's root __init__.py file + """ + root_init_file = open("syntaviz/__init__.py").read() + match = re.search("__version__[ ]+=[ ]+[\"'](.+)[\"']", root_init_file) + return match.group(1) if match is not None else "unknown" + + +setup( + name="syntaviz", + version=get_version(), + description="SyntaViz", + + packages=find_packages(), + + package_data={}, + + python_requires='>=2.7, <3', + + install_requires=["Flask==0.12.2", + "matplotlib==2.0.2", + "numpy==1.8.2", + "scikit-learn==0.18.2", + "scipy==0.19.1", + "ipython==5.1.0", + "bokeh==0.12.5", + "nltk==3.2.3", + "pandas==0.20.2", + "torch"], + setup_requires=['pytest-runner'], + tests_require=['pytest', 'pytest-cov'], +)
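
SyntaViz needs two input files before the parsing and server steps in the README above can run: `cx.queries` (one query per line in the format `ID\tquery\tlogProb\tlogFreq\tCount`) and `cx-actions.pkl` (a pickled dict mapping each query to the action taken for it). The sketch below shows one way to produce both from a raw log. It is a minimal illustration only: the input file `raw_queries.txt` and its `query<TAB>action` layout are assumptions, not part of this patch, and the logProb/logFreq columns are filled with the dummy `1.0` values used in the README example.

```python
# Sketch (Python 2.7, matching the rest of the repository): build cx.queries
# and cx-actions.pkl from a hypothetical tab-separated raw_queries.txt.
import pickle
from collections import Counter

counts = Counter()
actions = {}
with open('raw_queries.txt') as f:
    for line in f:
        parts = line.rstrip('\n').split('\t')
        if len(parts) < 2:
            continue
        # syntaviz.py looks up actions by the lowercased query text.
        query, action = parts[0].strip().lower(), parts[1].strip()
        counts[query] += 1
        actions[query] = action

# cx.queries: ID \t query \t logProb \t logFreq \t Count
with open('cx.queries', 'w') as fout:
    for qid, (query, cnt) in enumerate(counts.most_common()):
        fout.write('%d\t%s\t1.0\t1.0\t%d\n' % (qid, query, cnt))

# cx-actions.pkl: a single dict mapping query -> action, as syntaviz.py expects.
with open('cx-actions.pkl', 'wb') as fout:
    pickle.dump(actions, fout)
```

With these two files in `$DATADIR`, steps 2 and 3 of the README (parsing with `parse_query.py`, then starting `syntaviz.py`) can be run unchanged.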