#### [Cypher](https://neo4j.com/product/cypher-graph-query-language/)

Cypher is a graph-optimized query language that understands, and takes advantage of, data connections. It follows connections – in any direction – to reveal previously unknown relationships and clusters. Cypher queries are much easier to write than massive SQL joins. Compare this Cypher query to its equivalent in SQL.

Neo4j and Cypher Under the Hood

Cypher is an expressive language with advanced graph patterns and collection support. Under the hood, the Cypher processing pipeline first parses the query (unless it is already in the cache), then performs semantic verification and rewriting of the AST, then finds the cheapest execution plan (logical and physical) for all the operations using the available planners, and finally executes the query.


In [1]:
import json, re
from nltk import word_tokenize

from pygments.lexers import get_lexer_by_name
from cypher_parser import CyqueryStatmentParser
# Look up a Pygments lexer by its registered alias.
# NOTE(review): "py2neo.cypher" is presumably the alias registered by py2neo's
# Cypher lexer — confirm py2neo is installed in this environment.
lexer = get_lexer_by_name("py2neo.cypher")

# Alias shapes recognised in queries: ``t1``..``t9`` or one lowercase letter.
alias_pattern = re.compile(r'(t[1-9]|[a-z])')
# A colon followed by a back-quoted label, e.g. :`concert_singer.stadium`.
# The label must start with an ASCII letter. ``[^`]*`` stops at the closing
# back-quote so that several labels on one line are matched separately
# (a greedy ``.*`` would swallow everything up to the last back-quote), and
# the character class no longer contains a stray literal ``|``.
labels_pattern = re.compile(r':`[a-zA-Z][^`]*`')

from process_cypher import CLAUSE_KEYWORDS, CYPHER_OPERATORS, WHERE_OPS, UNIT_OPS, AGG_OPS, ORDER_OPS, TABLE_TYPE, DerivedFieldAliasError, DerivedTableAliasError, ParenthesesInConditionError, ValueListError


In [2]:
# Path of the schema JSON file that is handed to get_schema_from_json below.
# NOTE(review): this is an absolute, machine-specific path — adjust it before
# running the notebook on another machine.
# Alternative location, kept for reference:
# fpath = '/home/22842219/Desktop/openSource/UnifiedSKGG-Cypher/data/text2cypher/schema.json'
fpath ='/home/22842219/Desktop/phd/SemanticParser4Graph/semantic_parser/data/text2cypher/schema.json'
# Database id whose schema is loaded for the first test query.
db_id = 'concert_singer'


In [3]:
from process_cypher import Schema, get_schema_from_json
# Build the Schema wrapper for `db_id` from the schema JSON at `fpath`.
schema = Schema(get_schema_from_json(fpath, db_id))
# Display the name -> id mapping (recorded as the cell output).
schema.idMap

{'*': 0,
 '`concert_singer.stadium`.Name': 1,
 '`concert_singer.stadium`.Lowest': 2,
 '`concert_singer.stadium`.Stadium_ID': 3,
 '`concert_singer.stadium`.Capacity': 4,
 '`concert_singer.stadium`.Highest': 5,
 '`concert_singer.stadium`.Location': 6,
 '`concert_singer.stadium`.Average': 7,
 '`concert_singer.singer`.Country': 8,
 '`concert_singer.singer`.Age': 9,
 '`concert_singer.singer`.Name': 10,
 '`concert_singer.singer`.Song_Name': 11,
 '`concert_singer.singer`.Is_male': 12,
 '`concert_singer.singer`.Singer_ID': 13,
 '`concert_singer.singer`.Song_release_year': 14,
 '`concert_singer.concert`.Theme': 15,
 '`concert_singer.concert`.Stadium_ID': 16,
 '`concert_singer.concert`.concert_Name': 17,
 '`concert_singer.concert`.concert_ID': 18,
 '`concert_singer.concert`.Year': 19,
 '`concert_singer.stadium`': 0,
 '`concert_singer.singer`': 1,
 '`concert_singer.concert`': 2}

## Debugging the Cypher parser

In [4]:
from process_cypher import tokenize, scan_labels_with_alias

In [5]:
# Query under test, written one clause per source line; the adjacent string
# literals are concatenated by Python into a single query string.
test_cypher = (
    "MATCH (concert:`concert_singer.concert`)-[]-(T2:`concert_singer.stadium`)\n"
    "WITH T2.Name AS Name, count(*) AS count, max(T2.Name) as maxi \n"
    "RETURN Name,count, maxi ORDER BY count "
)
# Simpler alternative query:
# "match (n:`concert_singer.singer`) return count(*)"

# Tokenize the query and show the resulting token list.
toks = tokenize(test_cypher)
print(f'toks: {toks}')

+++++++++++++++++++++++++++++++++tokenize++++++++++++++++++++++++++++++
raw queries: ['MATCH (concert:`concert_singer.concert`)-[]-(T2:`concert_singer.stadium`)\nWITH T2.Name AS Name, count(*) AS count, max(T2.Name) as maxi \nRETURN Name,count, maxi ORDER BY count '] <class 'list'>
toks: [('Token.Keyword', 'MATCH'), ('Token.Text.Whitespace', ' '), ('Token.Punctuation', '('), ('Token.Name.Variable', 'concert'), ('Token.Punctuation', ':'), ('Token.Name.Label', '`concert_singer.concert`'), ('Token.Punctuation', ')-['), ('Token.Punctuation', ']-('), ('Token.Name.Variable', 'T2'), ('Token.Punctuation', ':'), ('Token.Name.Label', '`concert_singer.stadium`'), ('Token.Punctuation', ')'), ('Token.Keyword', 'WITH'), ('Token.Text.Whitespace', ' '), ('Token.Name.Variable', 'T2'), ('Token.Operator', '.'), ('Token.Keyword', 'Name'), ('Token.Text.Whitespace', ' '), ('Token.Keyword', 'AS'), ('Token.Text.Whitespace', ' '), ('Token.Keyword', 'Name'), ('Token.Punctuation', ','), ('Token.Text.Whitespace',

In [6]:
# Scan the token list for aliases and print the result
# (the recorded output is a dict keyed by alias).
labels_with_alias = scan_labels_with_alias(toks)
print(labels_with_alias)

{'concert': '`concert_singer.concert`', 'T2': '`concert_singer.stadium`', 'Name': '`concert_singer.stadium`.Name', 'count': '*', 'maxi': '`concert_singer.stadium`.Name'}


In [7]:
# Hand-rolled setup for stepping through the parse_cypher stages one by one.

# Result dictionary that the following cells fill in clause by clause.
cypher = {}

# Parsing starts at the first token; `idx` is the running cursor.
start_idx = idx = 0

# Flag indicating whether this is a block of cypher/subcypher.
isBlock = False

# Token count and the lower-cased token texts (second item of each token).
len_ = len(toks)
toks_ = [token[1].lower() for token in toks]


In [8]:

from process_cypher import parse_cypher, parse_match, parse_with

In [9]:

# MATCH clause: yields the index where the clause ends, the table units and
# the default tables used by the later clauses.
match_end_idx, table_units, default_tables = parse_match(
    toks,
    start_idx,
    labels_with_alias,
    schema,
)
cypher['match'] = {'table_units': table_units}

# WITH clause: resume right where MATCH stopped and keep the new cursor.
idx, with_units = parse_with(toks, match_end_idx, labels_with_alias, schema)
cypher['with'] = with_units


In [10]:
# Show the alias mapping and the partial parse result (MATCH and WITH so far).
print(f'labels_with_alias: {labels_with_alias}')
print(f'cypher: {cypher}')

labels_with_alias: {'concert': '`concert_singer.concert`', 'T2': '`concert_singer.stadium`', 'Name': '`concert_singer.stadium`.Name', 'count': '*', 'maxi': '`concert_singer.stadium`.Name'}
cypher: {'match': {'table_units': [('table_unit', 2), ('table_unit', 0)]}, 'with': [(0, 'Name', 1, False), (3, 'count', 0, False), (1, 'maxi', 1, False)]}


In [11]:
# Display the default tables returned by parse_match.
default_tables

['concert', 'T2']

In [12]:
from process_cypher import parse_return, parse_order_by
# RETURN clause: parse from the current `idx`; the call hands back the next
# index together with the parsed return units.
idx, return_col_units = parse_return(toks, idx, labels_with_alias, schema, default_tables)
cypher['return']=return_col_units
# ORDER BY clause: parsed from where the RETURN clause stopped.
idx, order_col_units = parse_order_by(toks, idx, labels_with_alias, schema, default_tables)
cypher['orderBy'] = order_col_units

val_unit: (0, (0, 0, False), None)


In [13]:
cypher

{'match': {'table_units': [('table_unit', 2), ('table_unit', 0)]},
 'with': [(0, 'Name', 1, False),
  (3, 'count', 0, False),
  (1, 'maxi', 1, False)],
 'return': (False,
  [(0, (0, (0, 1, False), None)),
   (0, (0, (0, 0, False), None)),
   (0, (0, (0, 1, False), None))]),
 'orderBy': ('asc', [(0, (0, 0, False), None)])}

####

In [14]:
# Test query 1 (WHERE with OR, aggregate RETURN), written one clause per line.
test1 = (
    "MATCH (singer:`concert_singer.singer`)\n"
    "WHERE singer.Country <> 'France' OR singer.Country = 'China' \n"
    "RETURN avg(singer.Age),min(singer.Age),max(singer.Age)"
)

In [15]:
# Test query 2 (range condition joined by `and`, plus LIMIT), one clause per line.
test2 = (
    "MATCH (department:`department_management.department`)\n"
    "WHERE department.Ranking>=10 and department.Ranking<=15.0\n"
    "RETURN avg(department.Num_Employees)\n"
    "LIMIT 10"
)

In [16]:
from process_cypher import Schema, get_schema_from_json
# Rebind `schema` to the 'department_management' database used by test2.
schema = Schema(get_schema_from_json(fpath, 'department_management'))
# Display the name -> id mapping (recorded as the cell output).
schema.idMap

{'*': 0,
 '`department_management.department`.Name': 1,
 '`department_management.department`.Creation': 2,
 '`department_management.department`.Department_ID': 3,
 '`department_management.department`.Ranking': 4,
 '`department_management.department`.Budget_in_Billions': 5,
 '`department_management.department`.Num_Employees': 6,
 '`department_management.head`.name': 7,
 '`department_management.head`.born_state': 8,
 '`department_management.head`.age': 9,
 '`department_management.head`.head_ID': 10,
 '`department_management.management`.temporary_acting': 11,
 '`department_management.department`': 0,
 '`department_management.head`': 1,
 '`department_management.management`': 2}

In [17]:

from process_cypher import tokenize, scan_labels_with_alias, parse_cypher

# Tokenize the second test query; this rebinds `toks` for the cells that follow.
toks = tokenize(test2)
print(f'toks: {toks}')

+++++++++++++++++++++++++++++++++tokenize++++++++++++++++++++++++++++++
raw queries: ['MATCH (department:`department_management.department`)\nWHERE department.Ranking>=10 and department.Ranking<=15.0\nRETURN avg(department.Num_Employees)\nLIMIT 10'] <class 'list'>
toks: [('Token.Keyword', 'MATCH'), ('Token.Text.Whitespace', ' '), ('Token.Punctuation', '('), ('Token.Name.Variable', 'department'), ('Token.Punctuation', ':'), ('Token.Name.Label', '`department_management.department`'), ('Token.Punctuation', ')'), ('Token.Keyword', 'WHERE'), ('Token.Text.Whitespace', ' '), ('Token.Name.Variable', 'department'), ('Token.Operator', '.'), ('Token.Name.Variable', 'Ranking'), ('Token.Operator', '>='), ('Token.Literal.Number.Integer', '10'), ('Token.Text.Whitespace', ' '), ('Token.Operator', 'and'), ('Token.Text.Whitespace', ' '), ('Token.Name.Variable', 'department'), ('Token.Operator', '.'), ('Token.Name.Variable', 'Ranking'), ('Token.Operator', '<='), ('Token.Literal.Number.Float', '15.0'), ('

In [18]:
# Scan the new token list for aliases and print the result.
labels_with_alias = scan_labels_with_alias(toks)
print(labels_with_alias)



{'department': '`department_management.department`'}


In [27]:
# Run the whole parser in a single call, starting at token index 0.
# NOTE(review): the recorded output for this cell is
# "ValueError: not enough values to unpack (expected 4, got 3)"; the cells
# after it step through the individual clause parsers instead.
idx, cypher = parse_cypher(toks, 0, labels_with_alias, schema)



ValueError: not enough values to unpack (expected 4, got 3)

In [20]:

from process_cypher import parse_cypher, parse_match, parse_with, parse_where
# Start this query from a clean state so the cell does not depend on values
# left over from the previous query: without the reset, `cypher` would still
# carry that query's 'return' and 'orderBy' entries.
start_idx = 0
cypher = {}

# Parse the MATCH clause first in order to get the default tables.
match_end_idx, table_units, default_tables = parse_match(
    toks, start_idx, labels_with_alias, schema
)
cypher['match']={'table_units': table_units}

# Parse the WITH clause, resuming where the MATCH clause ended.
idx = match_end_idx
idx, with_units = parse_with(toks, idx, labels_with_alias, schema)
cypher['with'] = with_units

# Show the alias mapping and the partial parse result.
print(f'labels_with_alias: {labels_with_alias}')
print(f'cypher: {cypher}')

labels_with_alias: {'department': '`department_management.department`'}
cypher: {'match': {'table_units': [('table_unit', 0)]}, 'with': []}


In [21]:
# WHERE clause: parse from the current `idx`; the call hands back the next
# index together with the parsed conditions.
idx, where_conds = parse_where(toks, idx, labels_with_alias, schema, default_tables)
cypher['where'] = where_conds
# Display the parse result accumulated so far.
cypher

heyy 12 (0, (0, 4, False), None)
[(False, 2, (0, (0, 4, False), None), '10')]
heyy 20 (0, (0, 4, False), None)
[(False, 2, (0, (0, 4, False), None), '10'), 'and', (False, 3, (0, (0, 4, False), None), '15.0')]


{'match': {'table_units': [('table_unit', 0)]},
 'with': [],
 'where': [(False, 2, (0, (0, 4, False), None), '10'),
  'and',
  (False, 3, (0, (0, 4, False), None), '15.0')]}

In [22]:
from process_cypher import parse_return, parse_order_by, parse_limit
# RETURN clause: parse from the current `idx` and store the parsed units.
idx, return_col_units = parse_return(toks, idx, labels_with_alias, schema, default_tables)
cypher['return']=return_col_units



In [25]:
# ORDER BY clause: parse from the current `idx`. The result is stored under
# 'orderBy' — the same key the first query's walkthrough uses — so both parse
# results share one layout (the key was previously spelled 'order by' here).
idx, order_col_units = parse_order_by(toks, idx, labels_with_alias, schema, default_tables)
cypher['orderBy'] = order_col_units

# LIMIT clause: parse from where ORDER BY stopped and store the limit value.
idx, limit_val = parse_limit(toks, idx)
cypher["limit"] = limit_val

In [24]:
# Display the final parse result for the second test query.
cypher

{'match': {'table_units': [('table_unit', 0)]},
 'with': [],
 'where': [(False, 2, (0, (0, 4, False), None), '10'),
  'and',
  (False, 3, (0, (0, 4, False), None), '15.0')],
 'return': (False, [(5, (0, (0, 6, False), None))]),
 'orderBy': [],
 'limit': 10}