Skip to content

Commit

Permalink
SqlFactExtractor and FragmentLocator are running and can be tested, b…
Browse files Browse the repository at this point in the history
…ut need refactoring.
  • Loading branch information
todeslord committed Nov 7, 2013
1 parent bc73f0a commit 1426b7d
Show file tree
Hide file tree
Showing 21 changed files with 2,322 additions and 0 deletions.
7 changes: 7 additions & 0 deletions technologies/SqlFactExtractor/Makefile
@@ -0,0 +1,7 @@
# Run the extractor on the example input and diff its output (with and
# without line numbers) against the checked-in expected results, then
# remove the generated files.  Whitespace differences are ignored.
test:
	python test.py example/createTable.sql
	diff --ignore-all-space example/testResult.json example/estimatedResult.json
	diff --ignore-all-space example/testResultWithLineNumbers.json example/estimatedResultWithLineNumbers.json

	rm example/testResult.json
	rm example/testResultWithLineNumbers.json
20 changes: 20 additions & 0 deletions technologies/SqlFactExtractor/Readme.md
@@ -0,0 +1,20 @@
# SqlFactExtractor

A fact extractor for SQL.

# Usage

The executable is "extractor.py".

The path of the SQL file to analyze is passed as the first command-line argument.

The extracted JSON facts are written to stdout.

# Testing

Test the tool with "make test".

See the Makefile for details.

The file example/createTable.sql is used as input.

4 changes: 4 additions & 0 deletions technologies/SqlFactExtractor/SQLClassifier.py
@@ -0,0 +1,4 @@
# Classifier labels used to tag each fragment kind in the extracted JSON
# fact model (root file, CREATE statement, table, column).
CLASSIFIER_FILE = "sql_file"
CLASSIFIER_CREATE = "create_statement"
CLASSIFIER_TABLE = "table"
CLASSIFIER_COLUMN = "column"
229 changes: 229 additions & 0 deletions technologies/SqlFactExtractor/SQLFactExtractor.py
@@ -0,0 +1,229 @@
import sqlparse
import sys
import json
import re
from SQLClassifier import *

class SQLFactExtractor(object):
    """Extract a JSON fact model from a SQL DDL file using sqlparse.

    The model is a nested dict of "fragments": a root file fragment
    containing one fragment per CREATE statement, each holding a table
    fragment with its column fragments.  ALTER TABLE ... ADD CONSTRAINT
    statements are folded into the matching column as foreign-key
    constraint entries.  When ``log_code`` is True, the raw source code
    and start/end line numbers are attached to every fragment.

    NOTE(review): this code uses Python-2-only idioms (``generator.next()``);
    it will not run unchanged on Python 3.
    """

    def enumerate_auto(*sequential, **named):
        # Poor-man's enum factory (predates the stdlib ``enum`` module):
        # maps each given name to an ascending integer and returns an
        # anonymous class carrying them as attributes.  It is invoked at
        # class-body evaluation time below, where it behaves as a plain
        # function (no ``self``).
        enums = dict(zip(sequential, range(len(sequential))), **named)
        return type('Enum', (), enums)

    # Kind of top-level SQL statement being processed.
    StatementType = enumerate_auto(
        'CREATE',
        'ALTER',
        'UNKNOWN'
    )

    # Parser state while walking the flattened tokens of a CREATE statement:
    # BEGIN -> (after '(') VARIABLE_NAME -> VARIABLE_TYPE -> NONE -> (after ',')
    # VARIABLE_NAME ...
    ExpectedToken = enumerate_auto(
        'NONE',
        'BEGIN',
        'VARIABLE_NAME',
        'VARIABLE_TYPE'
    )

    def __init__(self, file, log_code):
        # file: path of the SQL file to analyse ("sqlDatei" is German for
        # "SQL file").  log_code: when True, embed source code snippets and
        # line numbers in the emitted fragments.
        self.sqlDatei = file
        self.load_reserved_sql_keywords()
        self.log_code = log_code

    def load_reserved_sql_keywords(self):
        # Load the lowercase list of reserved SQL keywords from the
        # side-car file "sqlKeyWords.txt" (resolved relative to the current
        # working directory).  Tokens found in this list are not treated as
        # column names during CREATE parsing.
        # NOTE(review): the file handle is never closed explicitly.
        self.reserved_keywords = []
        for line in open("sqlKeyWords.txt", "r"):
            self.reserved_keywords.append(str.lower(line.rstrip()))

    def get_statement_type(self,statement):
        # Classify a sqlparse statement by scanning its top-level tokens
        # for a literal CREATE or ALTER keyword (case-insensitive).
        for my_token in statement.tokens:
            if str.upper(str(my_token)) == "CREATE":
                return self.StatementType.CREATE
            elif str.upper(str(my_token)) == "ALTER":
                return self.StatementType.ALTER
        return self.StatementType.UNKNOWN

    def miss_whitespace(self,tokens_generator_elem):
        # Skip whitespace tokens by advancing the generator element.
        # NOTE(review): ``.next()`` is Python 2 only, and this helper
        # appears unused within this class — candidates for removal.
        while tokens_generator_elem.is_whitespace():
            tokens_generator_elem = tokens_generator_elem.next();
        return tokens_generator_elem

    def extract_file(self):
        """Parse the whole SQL file and return the root fragment dict.

        CREATE statements become nested fragments immediately; ALTER
        statements are collected into ``constraints_list`` first and
        merged into the matching columns afterwards, so constraint order
        relative to table creation does not matter.
        """
        fragment_result = {"classifier": CLASSIFIER_FILE, "fragments": []}
        constraints_list = []

        for each in sqlparse.parse(open(self.sqlDatei).read()):
            if each.get_type() != "UNKNOWN":
                statement_type = self.get_statement_type(each)
                if self.StatementType.CREATE == statement_type:#create
                    self.extract_create_statement(each, fragment_result)
                elif self.StatementType.ALTER == statement_type:#alter
                    self.extract_alter_statement(each, constraints_list)

        self.add_contraints(fragment_result, constraints_list)
        if self.log_code:
            self.add_code_linenumbers(fragment_result)

        return fragment_result

    def extract_create_statement(self, each, fragment_result):
        # Walk one CREATE statement and append a
        # create_statement -> table -> column fragment tree to
        # ``fragment_result``.  A small state machine over the flattened
        # sub-tokens collects alternating column-name / column-type pairs
        # into ``item_list``; a name matching a reserved keyword (e.g.
        # "primary") aborts the current column expectation.
        subject_token = None
        item_list = []
        expected_token = self.ExpectedToken.BEGIN

        #get Relevant Data
        for token in each.tokens:
            #print(dir(token))
            #print(token.value)
            if str(token) not in ['create', 'table'] and not token.is_whitespace() and str(token) != ";":
                # First flattened token of this group is taken as the table
                # name.  NOTE(review): ``.next()`` is Python 2 only.
                subject_token = token.flatten().next()
                for sub_token in token.flatten():
                    if self.ExpectedToken.BEGIN == expected_token and '(' in str(sub_token):
                        expected_token = self.ExpectedToken.VARIABLE_NAME
                    elif self.ExpectedToken.VARIABLE_NAME == expected_token and not sub_token.is_whitespace():
                        if str.lower(str(sub_token)) in self.reserved_keywords:
                            expected_token = self.ExpectedToken.NONE
                        else:
                            item_list.append(str(sub_token))
                            expected_token = self.ExpectedToken.VARIABLE_TYPE
                    elif self.ExpectedToken.VARIABLE_TYPE == expected_token and not sub_token.is_whitespace():
                        item_list.append(str(sub_token))
                        expected_token = self.ExpectedToken.NONE
                    elif self.ExpectedToken.NONE == expected_token and ',' in str(sub_token):
                        expected_token = self.ExpectedToken.VARIABLE_NAME
        # create fragments as JSON
        statement_fragment = {"classifier": CLASSIFIER_CREATE, "fragments": []}
        table_fragment = {"classifier": CLASSIFIER_TABLE, "name": str(subject_token), "fragments": []}
        # item_list holds [name, type, name, type, ...]; consume it pairwise.
        i = 0
        while i < len(item_list):
            table_fragment["fragments"].append(
                {"classifier": CLASSIFIER_COLUMN, "name": item_list[i], "type": item_list[i + 1], "fragments": []})
            i += 2
        statement_fragment["fragments"].append(table_fragment)
        fragment_result["fragments"].append(statement_fragment)

        self.add_code_to_fragments(each, statement_fragment)


    def extract_alter_statement(self, each, constraints_list):
        # Pull table name, constraint name, foreign-key column, and
        # referenced table out of one
        # "alter table T add constraint N foreign key (C) references R"
        # statement and append them to ``constraints_list``.
        # NOTE(review): the hard-coded token offsets (+2, +4) assume that
        # exact token layout; any deviation raises IndexError or yields
        # wrong values.
        subject_table = None
        constraint_name = ""
        foreign_key_var = ""
        references = ""
        #get Relevant Data
        i = 0
        while str.lower(str(each.tokens[i])) in ['alter', 'table'] or each.tokens[i].is_whitespace():
            i += 1
        subject_table = str(each.tokens[i])
        while str.lower(str(each.tokens[i])) != 'constraint' or each.tokens[i].is_whitespace():
            i += 1
        constraint_name = str(each.tokens[i + 2])
        while (str.lower(str(each.tokens[i])) != 'foreign' and str.lower(str(each.tokens[i + 2])) != 'key') or \
                each.tokens[i].is_whitespace():
            i += 1
        # The key column arrives parenthesised, e.g. "(DEPT_ID)"; strip the
        # surrounding parentheses.
        foreign_key_var = str(each.tokens[i + 4])
        foreign_key_var = foreign_key_var[1:len(foreign_key_var) - 1]
        while str.lower(str(each.tokens[i])) != 'references' or each.tokens[i].is_whitespace():
            i += 1
        references = str(each.tokens[i + 2])

        constraints_list.append({
            "subject_table": subject_table,
            "constraint_name": constraint_name,
            "foreign_key_var": foreign_key_var,
            "references": references
        })

    def add_code_to_fragments(self, each, statement_fragment):
        # When code logging is enabled, attach the raw SQL text to the
        # statement fragment, the table fragment (minus the leading
        # "create"), and each column fragment (the comma-separated pieces
        # inside the parentheses).
        if self.log_code:
            statement_fragment["code"] = str(each)
            statement_fragment["fragments"][0]["code"] = self.delete_beginning_control_characters(self.remouve_beginning_create(str(each)))

            #filter column code
            long_string = str(each)
            long_string = long_string[long_string.find("(")+1:]
            column_code_list = self.format_end_of_column_generation( long_string.split(","))

            # NOTE(review): assumes column_code_list lines up 1:1 with the
            # parsed column fragments; extra commas (e.g. inside type
            # arguments) would break that alignment.
            for column_id in range(0, len(statement_fragment["fragments"][0]["fragments"])):
                statement_fragment["fragments"][0]["fragments"][column_id]["code"] = self.delete_beginning_control_characters(column_code_list[column_id])

    def remouve_beginning_create(self, code_string):
        # Drop everything before the "table" keyword.  NOTE(review): the
        # method name is misspelled ("remouve") but is kept as-is since it
        # is part of the class interface.
        return code_string[code_string.find("table"):]

    def format_end_of_column_generation(self, column_codes):
        # Trim the trailing ";" and ")" of the final column snippet so it
        # matches the shape of the other column snippets.
        column_codes[-1] = self.delete_last_char_if_equals(column_codes[-1], ";")
        column_codes[-1] = self.delete_last_char_if_equals(column_codes[-1], ")")

        return column_codes

    def delete_last_char_if_equals(self, string, char):
        # Remove the last character iff it equals ``char``.
        # NOTE(review): raises IndexError on an empty string.
        if char == string[-1]:
            string = string[:-1]
        return string

    def delete_beginning_control_characters(self, string):
        # Strip everything before the first word character.
        # NOTE(review): raises AttributeError if the string contains no
        # word character at all (re.search returns None).
        return string[re.search("\w",string).start() :]

    def add_contraints(self, fragment_result, constraints_list):
        # Merge the collected ALTER constraints into the matching column
        # fragments (matched by table name, then column name).  Columns
        # without a match are left untouched.  NOTE(review): the method
        # name is misspelled ("contraints") but kept for compatibility.
        for constraint in constraints_list:
            #find table
            for create in fragment_result["fragments"]:
                if create["fragments"][0]["name"] == constraint["subject_table"]:
                    #find column
                    for column in create["fragments"][0]["fragments"]:
                        if column["name"] == constraint["foreign_key_var"]:
                            column["constraints"] = [{
                                "type": "foreign_key",
                                "references": constraint["references"],
                                "name": constraint["constraint_name"]
                            }]

    def add_code_linenumbers(self, fragment_result):
        # Annotate the whole fragment tree with line_start/line_end values
        # derived from re-reading the source file.
        self.add_file_linenumbers(fragment_result)

        self.add_fragment_linenumbers(fragment_result["fragments"], 1, self.get_file_length(self.sqlDatei))

    def add_fragment_linenumbers(self, fragment_pointer, start, end):
        # Recursive walk: lists are iterated, dicts get line numbers derived
        # from their "code" text and then recurse into their "fragments".
        if type(fragment_pointer) == list:
            for frag_elem in fragment_pointer:
                self.add_fragment_linenumbers(frag_elem, start, end)
        elif type(fragment_pointer) == dict:
            self.derive_linenumber(fragment_pointer, start, end)
            self.add_fragment_linenumbers(fragment_pointer["fragments"], fragment_pointer["line_start"], fragment_pointer["line_end"])
        else:
            print("ERROR: add_code_linenumbers UNKNOWN type:"+str(type(fragment_pointer)))

    def derive_linenumber(self, fragment_pointer, start, end):
        # Locate the fragment's "code" text inside the source file by
        # char-by-char matching, starting at line ``start``, and record the
        # first and last line it spans.  A mismatch resets the match to its
        # beginning (naive substring search, no partial-match backtracking).
        # NOTE(review): ``end`` is accepted but never used, and the file
        # handle is never closed.
        open_file = open(self.sqlDatei, "r")

        self.go_to_line(open_file, start)

        char_pointer_fragment = 0
        line_counter = start
        start_line = start
        while char_pointer_fragment < len(fragment_pointer["code"]):
            char = open_file.read(1)
            if("\n" == char):
                line_counter += 1
            if char == fragment_pointer["code"][char_pointer_fragment]:
                if char_pointer_fragment == 0:
                    start_line = line_counter
                char_pointer_fragment += 1
            else:
                char_pointer_fragment = 0

        fragment_pointer["line_start"] = start_line
        fragment_pointer["line_end"] = line_counter

    def go_to_line(self, open_file, start):
        # Advance the file position to the beginning of 1-based line
        # ``start`` by consuming start-1 lines.
        for times in range(1,start):
            open_file.readline()

    def add_file_linenumbers(self, fragment_result):
        # The root file fragment spans the entire file.
        fragment_result["line_start"] = 1
        fragment_result["line_end"] = self.get_file_length(self.sqlDatei)

    def get_file_length(self, file_string):
        # Count the lines of the source file.
        # NOTE(review): the ``file_string`` parameter is ignored — the
        # method always reads ``self.sqlDatei`` — and the file handle is
        # never closed.
        linenumber = 0
        for line in open(self.sqlDatei):
            linenumber += 1
        return linenumber
35 changes: 35 additions & 0 deletions technologies/SqlFactExtractor/example/createTable.sql
@@ -0,0 +1,35 @@
create
table COMPANY
(
ID bigint generated by default as identity (start with 1),
name varchar(255),
primary key (ID)
);

create
table DEPARTMENT
(
ID bigint generated by default as identity (start with 1),
name varchar(255),
COMP_ID bigint,
DEPT_ID bigint,
primary key (ID)
);

create
table EMPLOYEE
(
ID bigint generated by default as identity (start with 1),
name varchar(255),
address varchar(255),
salary double,
manager bit,
MENTOR bigint,
DEPT_ID bigint,
primary key (ID)
);

alter table DEPARTMENT add constraint FK4F782F5255C77F64 foreign key (DEPT_ID) references DEPARTMENT;
alter table DEPARTMENT add constraint FK4F782F52C7CB872B foreign key (COMP_ID) references COMPANY;
alter table EMPLOYEE add constraint FK75C8D6AE55C77F64 foreign key (DEPT_ID) references DEPARTMENT;
alter table EMPLOYEE add constraint FK75C8D6AE800BE06C foreign key (MENTOR) references EMPLOYEE;
1 change: 1 addition & 0 deletions technologies/SqlFactExtractor/example/estimatedResult.json
@@ -0,0 +1 @@
{"fragments": [{"fragments": [{"fragments": [{"fragments": [], "type": "bigint", "classifier": "column", "name": "ID"}, {"fragments": [], "type": "varchar", "classifier": "column", "name": "name"}], "classifier": "table", "name": "COMPANY"}], "classifier": "create_statement"}, {"fragments": [{"fragments": [{"fragments": [], "type": "bigint", "classifier": "column", "name": "ID"}, {"fragments": [], "type": "varchar", "classifier": "column", "name": "name"}, {"fragments": [], "constraints": [{"references": "COMPANY", "type": "foreign_key", "name": "FK4F782F52C7CB872B"}], "type": "bigint", "classifier": "column", "name": "COMP_ID"}, {"fragments": [], "constraints": [{"references": "DEPARTMENT", "type": "foreign_key", "name": "FK4F782F5255C77F64"}], "type": "bigint", "classifier": "column", "name": "DEPT_ID"}], "classifier": "table", "name": "DEPARTMENT"}], "classifier": "create_statement"}, {"fragments": [{"fragments": [{"fragments": [], "type": "bigint", "classifier": "column", "name": "ID"}, {"fragments": [], "type": "varchar", "classifier": "column", "name": "name"}, {"fragments": [], "type": "varchar", "classifier": "column", "name": "address"}, {"fragments": [], "type": "double", "classifier": "column", "name": "salary"}, {"fragments": [], "type": "bit", "classifier": "column", "name": "manager"}, {"fragments": [], "constraints": [{"references": "EMPLOYEE", "type": "foreign_key", "name": "FK75C8D6AE800BE06C"}], "type": "bigint", "classifier": "column", "name": "MENTOR"}, {"fragments": [], "constraints": [{"references": "DEPARTMENT", "type": "foreign_key", "name": "FK75C8D6AE55C77F64"}], "type": "bigint", "classifier": "column", "name": "DEPT_ID"}], "classifier": "table", "name": "EMPLOYEE"}], "classifier": "create_statement"}], "classifier": "sql_file"}
@@ -0,0 +1 @@
{"fragments": [{"fragments": [{"code": "table COMPANY \n\t(\n\t\tID bigint generated by default as identity (start with 1), \n\t\tname varchar(255), \n\t\tprimary key (ID)\n\t);", "name": "COMPANY", "line_start": 2, "fragments": [{"code": "ID bigint generated by default as identity (start with 1)", "name": "ID", "line_end": 4, "line_start": 4, "fragments": [], "type": "bigint", "classifier": "column"}, {"code": "name varchar(255)", "name": "name", "line_end": 5, "line_start": 5, "fragments": [], "type": "varchar", "classifier": "column"}], "line_end": 7, "classifier": "table"}], "code": "create \n\ttable COMPANY \n\t(\n\t\tID bigint generated by default as identity (start with 1), \n\t\tname varchar(255), \n\t\tprimary key (ID)\n\t);", "line_end": 7, "classifier": "create_statement", "line_start": 1}, {"fragments": [{"code": "table DEPARTMENT \n\t(\n\t\tID bigint generated by default as identity (start with 1), \n\t\tname varchar(255), \n\t\tCOMP_ID bigint, \n\t\tDEPT_ID bigint, \n\t\tprimary key (ID)\n\t);", "name": "DEPARTMENT", "line_start": 10, "fragments": [{"code": "ID bigint generated by default as identity (start with 1)", "name": "ID", "line_end": 12, "line_start": 12, "fragments": [], "type": "bigint", "classifier": "column"}, {"code": "name varchar(255)", "name": "name", "line_end": 13, "line_start": 13, "fragments": [], "type": "varchar", "classifier": "column"}, {"code": "COMP_ID bigint", "name": "COMP_ID", "line_end": 14, "line_start": 14, "fragments": [], "type": "bigint", "classifier": "column", "constraints": [{"references": "COMPANY", "type": "foreign_key", "name": "FK4F782F52C7CB872B"}]}, {"code": "DEPT_ID bigint", "name": "DEPT_ID", "line_end": 15, "line_start": 15, "fragments": [], "type": "bigint", "classifier": "column", "constraints": [{"references": "DEPARTMENT", "type": "foreign_key", "name": "FK4F782F5255C77F64"}]}], "line_end": 17, "classifier": "table"}], "code": "\n\ncreate \n\ttable DEPARTMENT \n\t(\n\t\tID bigint generated by default 
as identity (start with 1), \n\t\tname varchar(255), \n\t\tCOMP_ID bigint, \n\t\tDEPT_ID bigint, \n\t\tprimary key (ID)\n\t);", "line_end": 17, "classifier": "create_statement", "line_start": 8}, {"fragments": [{"code": "table EMPLOYEE \n\t(\n\t\tID bigint generated by default as identity (start with 1),\n\t\tname varchar(255), \n\t\taddress varchar(255), \n\t\tsalary double, \n\t\tmanager bit, \n\t\tMENTOR bigint, \n\t\tDEPT_ID bigint, \n\t\tprimary key (ID)\n\t);", "name": "EMPLOYEE", "line_start": 20, "fragments": [{"code": "ID bigint generated by default as identity (start with 1)", "name": "ID", "line_end": 22, "line_start": 22, "fragments": [], "type": "bigint", "classifier": "column"}, {"code": "name varchar(255)", "name": "name", "line_end": 23, "line_start": 23, "fragments": [], "type": "varchar", "classifier": "column"}, {"code": "address varchar(255)", "name": "address", "line_end": 24, "line_start": 24, "fragments": [], "type": "varchar", "classifier": "column"}, {"code": "salary double", "name": "salary", "line_end": 25, "line_start": 25, "fragments": [], "type": "double", "classifier": "column"}, {"code": "manager bit", "name": "manager", "line_end": 26, "line_start": 26, "fragments": [], "type": "bit", "classifier": "column"}, {"code": "MENTOR bigint", "name": "MENTOR", "line_end": 27, "line_start": 27, "fragments": [], "type": "bigint", "classifier": "column", "constraints": [{"references": "EMPLOYEE", "type": "foreign_key", "name": "FK75C8D6AE800BE06C"}]}, {"code": "DEPT_ID bigint", "name": "DEPT_ID", "line_end": 28, "line_start": 28, "fragments": [], "type": "bigint", "classifier": "column", "constraints": [{"references": "DEPARTMENT", "type": "foreign_key", "name": "FK75C8D6AE55C77F64"}]}], "line_end": 30, "classifier": "table"}], "code": "\n\ncreate \n\ttable EMPLOYEE \n\t(\n\t\tID bigint generated by default as identity (start with 1),\n\t\tname varchar(255), \n\t\taddress varchar(255), \n\t\tsalary double, \n\t\tmanager bit, \n\t\tMENTOR 
bigint, \n\t\tDEPT_ID bigint, \n\t\tprimary key (ID)\n\t);", "line_end": 30, "classifier": "create_statement", "line_start": 18}], "line_end": 35, "classifier": "sql_file", "line_start": 1}
5 changes: 5 additions & 0 deletions technologies/SqlFactExtractor/extractor.py
@@ -0,0 +1,5 @@
#! /usr/bin/env python
"""Command-line entry point for the SQL fact extractor.

Usage: extractor.py <sql-file>

Parses the SQL file given as the first command-line argument and prints
the extracted JSON fact model to stdout.
"""
import json
import sys

# Previously a star import that implicitly re-exported sys/json from the
# module; import the class explicitly and the stdlib modules directly.
from SQLFactExtractor import SQLFactExtractor

if __name__ == "__main__":
    # False: do not embed source code snippets / line numbers in the output.
    extractor = SQLFactExtractor(sys.argv[1], False)
    print(json.dumps(extractor.extract_file()))

0 comments on commit 1426b7d

Please sign in to comment.