#### [Cypher](https://neo4j.com/product/cypher-graph-query-language/)

Cypher is a graph-optimized query language that understands, and takes advantage of, data connections. It follows connections – in any direction – to reveal previously unknown relationships and clusters. Cypher queries are much easier to write than massive SQL joins. Compare this Cypher query to its equivalent in SQL.

#### Neo4j and Cypher Under the Hood

Cypher is an expressive language with advanced graph patterns and collection support. Under the hood, the Cypher processing pipeline first parses the query (unless it is already in the cache), then performs semantic verification and rewriting of the AST, then uses the available planners to find the cheapest execution plan (logical and physical) for all the operations, and finally executes the query.


In [1]:
import json, re
from nltk import word_tokenize

from pygments.lexers import get_lexer_by_name
from cypher_parser import CyqueryStatmentParser
# Look up the Pygments lexer by the name "py2neo.cypher".
# NOTE(review): presumably this lexer is provided by an installed py2neo
# package; the lookup fails if no lexer with that name is available -- confirm.
lexer = get_lexer_by_name("py2neo.cypher")

# Matches "t" followed by a digit 1-9, or any single lowercase letter.
# The pattern is unanchored, so it matches anywhere in the input.
alias_pattern = re.compile(r'(t[1-9]|[a-z])')
# Matches one back-quoted label reference such as :`concert_singer.stadium`.
# The label must start with an ASCII letter (the former class `[a-z|A-Z]`
# also accepted a literal "|") and runs only up to the next backtick, so
# several labels in the same query are matched separately instead of being
# merged by a greedy `.*`.
labels_pattern = re.compile(r':`[a-zA-Z][^`]*`')

from process_cypher import CLAUSE_KEYWORDS, CYPHER_OPERATORS, WHERE_OPS, UNIT_OPS, AGG_OPS, ORDER_OPS, TABLE_TYPE, DerivedFieldAliasError, DerivedTableAliasError, ParenthesesInConditionError, ValueListError


In [2]:
# Schema file path.
# NOTE(review): both paths below are absolute paths on the author's machine;
# point `fpath` at a local copy of schema.json before running this notebook.
# fpath = '/home/22842219/Desktop/openSource/UnifiedSKGG-Cypher/data/text2cypher/schema.json'
fpath ='/home/22842219/Desktop/phd/SemanticParser4Graph/semantic_parser/data/text2cypher/schema.json'
# Database id passed, together with `fpath`, to get_schema_from_json in the
# next cell.
db_id = 'concert_singer'


In [3]:
from process_cypher import Schema, get_schema_from_json
# Load the schema of `db_id` from the JSON file and wrap it in a Schema object.
schema = Schema(get_schema_from_json(fpath, db_id))
# Display the id map. In the output below its keys are '*', back-quoted
# `label`.property names and back-quoted labels; its values are integer ids.
schema.idMap

{'*': 0,
 '`concert_singer.stadium`.Name': 1,
 '`concert_singer.stadium`.Lowest': 2,
 '`concert_singer.stadium`.Stadium_ID': 3,
 '`concert_singer.stadium`.Capacity': 4,
 '`concert_singer.stadium`.Highest': 5,
 '`concert_singer.stadium`.Location': 6,
 '`concert_singer.stadium`.Average': 7,
 '`concert_singer.singer`.Country': 8,
 '`concert_singer.singer`.Age': 9,
 '`concert_singer.singer`.Name': 10,
 '`concert_singer.singer`.Song_Name': 11,
 '`concert_singer.singer`.Is_male': 12,
 '`concert_singer.singer`.Singer_ID': 13,
 '`concert_singer.singer`.Song_release_year': 14,
 '`concert_singer.concert`.Theme': 15,
 '`concert_singer.concert`.Stadium_ID': 16,
 '`concert_singer.concert`.concert_Name': 17,
 '`concert_singer.concert`.concert_ID': 18,
 '`concert_singer.concert`.Year': 19,
 '`concert_singer.stadium`': 0,
 '`concert_singer.singer`': 1,
 '`concert_singer.concert`': 2}

## Debugging the Cypher parser

In [4]:
from process_cypher import tokenize, scan_labels_with_alias

In [5]:
# Query used to exercise the parser: a MATCH over two labelled nodes, a WITH
# clause that introduces three aliases (Name, count, maxi), and a RETURN with
# ORDER BY on one of those aliases.
test_cypher = "MATCH (concert:`concert_singer.concert`)-[]-(T2:`concert_singer.stadium`)\nWITH T2.Name AS Name, count(*) AS count, max(T2.Name) as maxi \nRETURN Name,count, maxi ORDER BY count "
                   #"match (n:`concert_singer.singer`) return count(*)"
# Tokenize the query; the output below shows (token_type, token_text) pairs.
toks = tokenize(test_cypher)
print(f'toks: {toks}')

+++++++++++++++++++++++++++++++++tokenize++++++++++++++++++++++++++++++
raw queries: ['MATCH (concert:`concert_singer.concert`)-[]-(T2:`concert_singer.stadium`)\nWITH T2.Name AS Name, count(*) AS count, max(T2.Name) as maxi \nRETURN Name,count, maxi ORDER BY count '] <class 'list'>
toks: [('Token.Keyword', 'MATCH'), ('Token.Text.Whitespace', ' '), ('Token.Punctuation', '('), ('Token.Name.Variable', 'concert'), ('Token.Punctuation', ':'), ('Token.Name.Label', '`concert_singer.concert`'), ('Token.Punctuation', ')-['), ('Token.Punctuation', ']-('), ('Token.Name.Variable', 'T2'), ('Token.Punctuation', ':'), ('Token.Name.Label', '`concert_singer.stadium`'), ('Token.Punctuation', ')'), ('Token.Keyword', 'WITH'), ('Token.Text.Whitespace', ' '), ('Token.Name.Variable', 'T2'), ('Token.Operator', '.'), ('Token.Keyword', 'Name'), ('Token.Text.Whitespace', ' '), ('Token.Keyword', 'AS'), ('Token.Text.Whitespace', ' '), ('Token.Keyword', 'Name'), ('Token.Punctuation', ','), ('Token.Text.Whitespace',

In [6]:
# Test labels with alias: build the alias lookup from the token stream.
# In the output below, node variables map to their back-quoted label and
# WITH aliases map to a `label`.property key (or '*' for count(*)).
labels_with_alias = scan_labels_with_alias(toks)
print(labels_with_alias)

{'concert': '`concert_singer.concert`', 'T2': '`concert_singer.stadium`', 'Name': '`concert_singer.stadium`.Name', 'count': '*', 'maxi': '`concert_singer.stadium`.Name'}


In [7]:
# Notebook-level state for stepping through the parser by hand.
start_idx = 0
# test parse_cypher
isBlock = False  # indicates whether this is a block of cypher/sub-cypher
len_ = len(toks)
idx = start_idx
# Lower-cased token texts, index-aligned with `toks`.
toks_ = [tok[1].lower() for tok in toks]

# Accumulates the parsed clauses, keyed by clause name, in the cells below.
cypher = {}


In [8]:

from process_cypher import parse_cypher, parse_match, parse_with

In [9]:

# Parse the MATCH clause first in order to get the default tables (the node
# aliases later used to resolve unqualified column names).
match_end_idx, table_units, default_tables = parse_match(
    toks, start_idx, labels_with_alias, schema
)
cypher['match']={'table_units': table_units}

# Parse the 'with' clause, continuing from the index where MATCH parsing
# stopped.
idx = match_end_idx
idx, with_units = parse_with(toks, idx, labels_with_alias, schema)
cypher['with'] = with_units


In [10]:
print(f'labels_with_alias: {labels_with_alias}')
print(f'cypher: {cypher}')

labels_with_alias: {'concert': '`concert_singer.concert`', 'T2': '`concert_singer.stadium`', 'Name': '`concert_singer.stadium`.Name', 'count': '*', 'maxi': '`concert_singer.stadium`.Name'}
cypher: {'match': {'table_units': [('table_unit', 2), ('table_unit', 0)]}, 'with': [(0, 'Name', 1, False), (3, 'count', 0, False), (1, 'maxi', 1, False)]}


In [11]:
default_tables

['concert', 'T2']

In [12]:
from process_cypher import parse_return, parse_order_by
# Return clause: continue from the index left by the previous cell.
idx, return_col_units = parse_return(toks, idx, labels_with_alias, schema, default_tables)
cypher['return']=return_col_units
# Order-by clause: continue from where RETURN parsing stopped.
idx, order_col_units = parse_order_by(toks, idx, labels_with_alias, schema, default_tables)
cypher['orderBy'] = order_col_units

val_unit: (0, (0, 0, False), None)


In [13]:
cypher

{'match': {'table_units': [('table_unit', 2), ('table_unit', 0)]},
 'with': [(0, 'Name', 1, False),
  (3, 'count', 0, False),
  (1, 'maxi', 1, False)],
 'return': (False,
  [(0, (0, (0, 1, False), None)),
   (0, (0, (0, 0, False), None)),
   (0, (0, (0, 1, False), None))]),
 'orderBy': ('asc', [(0, (0, 0, False), None)])}

In [15]:
def parse_col(toks, start_idx, labels_with_alias, schema, default_tables=None):
    """
        Resolve the column reference that starts at ``toks[start_idx]``.

        One leading punctuation/whitespace token is skipped. The column is
        then resolved, in order, as ``*``, as an alias (optionally followed by
        ``.property``), or as a bare column name looked up in the default
        tables.

        :param toks: list of (token_type, token_text) pairs
        :param start_idx: index of the first token to look at
        :param labels_with_alias: maps an alias either to a back-quoted label
            or directly to a key of ``schema.idMap``
        :param schema: object exposing ``idMap`` (key -> id) and ``schema``
            (table name -> collection of column names)
        :param default_tables: aliases searched for an unqualified column
        :returns next idx, column id
        :raises AssertionError: if the column is unqualified and
            ``default_tables`` is missing/empty, or no default table has it
        :raises KeyError: if an alias or the resolved key is unknown
    """
    toks_ = [tok[1].lower() for tok in toks]
    idx = start_idx
    tok = toks_[idx]
    if tok == "*":
        return idx + 1, schema.idMap[tok]
    # Debug traces are kept on purpose: this cell exists to step through the
    # parser by hand.
    print(f'debugging parse_col: idx: {idx}')
    print('step 2: ', toks_[idx:])
    if toks[idx][0] in ['Token.Punctuation', 'Token.Text.Whitespace']:
        idx += 1
        # Re-read the current token: every check below must look at the token
        # after the skipped separator, not at the separator itself.
        tok = toks_[idx]
        if tok == "*":
            return idx + 1, schema.idMap[tok]
    print('idx:', idx)
    print(tok, labels_with_alias)
    print(toks[idx])

    if toks[idx][0] == 'Token.Name.Variable' or toks[idx][1] in labels_with_alias:
        key = labels_with_alias[toks[idx][1]]
        # Only a '.' directly after the alias makes this a `alias.property`
        # reference; a dot further along belongs to some other expression.
        if idx + 2 < len(toks) and toks_[idx + 1] == '.':
            schema_key = '{}.{}'.format(key, toks[idx + 2][1])
            idx += 3  # consume alias, '.', and the property name
        else:
            schema_key = key
            idx += 1
        return idx, schema.idMap[schema_key]

    # Raised explicitly (rather than via `assert`) so the check survives -O;
    # the exception type is unchanged for callers.
    if default_tables is None or len(default_tables) == 0:
        raise AssertionError("Default tables should not be None or empty")

    print('heyyy: tok:', tok)
    print(toks[idx])
    print(default_tables)
    # NOTE(review): the lookup uses the lower-cased token text; this assumes
    # schema.schema and schema.idMap store these names lower-cased -- confirm.
    for alias in default_tables:
        table = labels_with_alias[alias]
        # '`db.table`' -> 'table'
        table_original = table.strip('`').split('.')[-1]
        if tok in schema.schema[table_original]:
            key = table + "." + tok
            return idx + 1, schema.idMap[key]

    raise AssertionError("Error col: {}".format(tok))

def parse_col_unit(toks, start_idx, labels_with_alias, schema, default_tables=None):
    """
        Parse one column unit: either ``agg([distinct] col)`` or
        ``[distinct] col``. One leading punctuation/whitespace token is
        skipped before parsing starts.

        :returns next idx, (agg_op id, col_id, isDistinct)
    """
    lowered = [pair[1].lower() for pair in toks]
    total = len(toks)
    pos = start_idx
    distinct = False

    print(f'debugging parse_col_unit:  idx: {pos}')
    print("step 0:", lowered[pos:])
    print(toks[pos:])

    # Step over a single leading separator token, if there is one.
    if toks[pos][0] in ('Token.Punctuation', 'Token.Text.Whitespace'):
        pos += 1

    is_aggregate = toks[pos][0] == 'Token.Name.Function' and lowered[pos] in AGG_OPS

    if not is_aggregate:
        # Plain column, optionally preceded by DISTINCT.
        if lowered[pos] == "distinct":
            distinct = True
            pos += 1
        no_agg = AGG_OPS.index("none")
        pos, column = parse_col(toks, pos, labels_with_alias, schema, default_tables)
        return pos, (no_agg, column, distinct)

    # Aggregate call: name '(' [distinct] column ')'
    agg = AGG_OPS.index(lowered[pos])
    pos += 1
    assert pos < total and lowered[pos] == '('
    pos += 1
    if lowered[pos] == "distinct":
        distinct = True
        pos += 1

    pos, column = parse_col(toks, pos, labels_with_alias, schema, default_tables)
    assert pos < total and lowered[pos] == ')'
    pos += 1
    return pos, (agg, column, distinct)

def parse_val_unit(toks, start_idx, labels_with_alias, schema, default_tables=None):
    """
        Parse a value unit: one column unit, optionally followed by a binary
        unit operator and a second column unit.

        :param toks: list of (token_type, token_text) pairs
        :param start_idx: index of the first token of the value unit
        :param labels_with_alias: alias lookup passed through to parse_col_unit
        :param schema: schema object passed through to parse_col_unit
        :param default_tables: aliases used to resolve unqualified columns
        :returns next idx, (unit_op id, col_unit1, col_unit2); col_unit2 is
            None and unit_op is the id of 'none' when there is no operator
    """
    # Built locally so the debug trace below does not depend on a notebook
    # global of the same name.
    toks_ = [tok[1].lower() for tok in toks]
    idx = start_idx
    len_ = len(toks)

    col_unit1 = None
    col_unit2 = None
    unit_op = UNIT_OPS.index('none')
    print(f'debugging parse_val_unit idx: {idx}, {toks_[idx:]}')
    idx, col_unit1 = parse_col_unit(toks, idx, labels_with_alias, schema, default_tables)

    # Look ahead past whitespace for an operator without consuming anything
    # unless one is actually found, so the returned index is unchanged when
    # the value unit is a single column.
    op_idx = idx
    while op_idx < len_ and toks[op_idx][0] == 'Token.Text.Whitespace':
        op_idx += 1

    # Tokens are (type, text) pairs; compare the text. Comparing the whole
    # tuple against the operator strings could never match.
    if op_idx < len_ and toks[op_idx][1] in UNIT_OPS:
        unit_op = UNIT_OPS.index(toks[op_idx][1])
        idx = op_idx + 1
        idx, col_unit2 = parse_col_unit(toks, idx, labels_with_alias, schema, default_tables)

    return idx, (unit_op, col_unit1, col_unit2)


In [16]:
pred0 = "MATCH (t1:`concert_singer.singer`) \
    WHERE t1.Song_Name =~'.*[Hey]?.*' RETURN t1.Name,t1.Nation"

In [17]:
pred1 = ' | concert_singer.singer| :`concert_singer.singer.stadium.singer.stadium.singer.stadium.singer.stadium.singer.singer.stadium.singer.singer.stadium.singer.singer.singer.stadium.singer.stadium.singer.singer.stadium.sing'

In [18]:
pred2 = 'SELECT * FROMTV_Series_Has_TVshow.TV_Series,,,,,,,: `tvshow.TV_Series_Has_TVshow.TV_Series_Has_TVshow.TV_Series_Has_TVshow.TV_Series_Has_TVshow.TV_Series_Has_TVshow.TV_Series_Has_TVshow.TV_Series_Has_TV'
