Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
SqlFactExtractor and FragmentLocator are running and can be tested, but need refactoring.
- Loading branch information
Showing
21 changed files
with
2,322 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
# Run the fact extractor on the example input, diff the produced JSON
# against the checked-in expected results, then remove the generated files.
# NOTE(review): this invokes test.py, while the README names extractor.py
# as the executable — confirm which entry point is intended.
#
# Declared .PHONY so the target still runs even if a file named "test" exists.
.PHONY: test
test:
	python test.py example/createTable.sql
	diff --ignore-all-space example/testResult.json example/estimatedResult.json
	diff --ignore-all-space example/testResultWithLineNumbers.json example/estimatedResultWithLineNumbers.json
	rm example/testResult.json
	rm example/testResultWithLineNumbers.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
# Headline | ||
|
||
A fact extractor for SQL. | ||
|
||
# Usage | ||
|
||
The executable is "extractor.py". | ||
|
||
The path of the SQL source file for fact extraction is passed as the first command-line argument. | ||
|
||
The extracted JSON facts are written to stdout. | ||
|
||
# Testing | ||
|
||
Test the tool with "make test". | ||
|
||
See the Makefile for details. | ||
|
||
The file example/createTable.sql is used as input. | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
# Classifier labels attached to each node of the extracted JSON fragment tree
# (see SQLFactExtractor): file -> create statement -> table -> column.
CLASSIFIER_FILE = "sql_file"            # root node: one whole .sql file
CLASSIFIER_CREATE = "create_statement"  # one CREATE TABLE statement
CLASSIFIER_TABLE = "table"              # a table definition inside a CREATE
CLASSIFIER_COLUMN = "column"            # a single column of a table
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,229 @@ | ||
import sqlparse | ||
import sys | ||
import json | ||
import re | ||
from SQLClassifier import * | ||
|
||
class SQLFactExtractor(object):
    """Extracts structural facts from a SQL file parsed with sqlparse.

    CREATE TABLE statements become nested "fragment" dicts (file -> create
    statement -> table -> column, labelled with the SQLClassifier constants);
    ALTER TABLE ... ADD CONSTRAINT statements are merged in as foreign-key
    constraints on the matching columns.  When ``log_code`` is set, the raw
    SQL source and line numbers of every fragment are attached as well.
    """

    def enumerate_auto(*sequential, **named):
        # Poor man's enum (pre-``enum``-module idiom): builds an anonymous
        # class whose attributes map each given name to its position.
        enums = dict(zip(sequential, range(len(sequential))), **named)
        return type('Enum', (), enums)

    # Kind of SQL statement, as classified by get_statement_type().
    StatementType = enumerate_auto(
        'CREATE',
        'ALTER',
        'UNKNOWN'
    )

    # States of the small state machine that scans a CREATE TABLE body in
    # extract_create_statement().
    ExpectedToken = enumerate_auto(
        'NONE',
        'BEGIN',
        'VARIABLE_NAME',
        'VARIABLE_TYPE'
    )

    def __init__(self, file, log_code):
        # file: path of the SQL file to analyse ("sqlDatei" is German for
        # "SQL file").  log_code: when True, source code and line numbers
        # are added to every fragment.
        self.sqlDatei = file
        self.load_reserved_sql_keywords()
        self.log_code = log_code

    def load_reserved_sql_keywords(self):
        """Read sqlKeyWords.txt into self.reserved_keywords, lower-cased.

        NOTE(review): the path is relative to the current working directory,
        not to this module — running from another directory will fail.
        """
        self.reserved_keywords = []
        for line in open("sqlKeyWords.txt", "r"):
            self.reserved_keywords.append(str.lower(line.rstrip()))

    def get_statement_type(self,statement):
        """Classify a sqlparse statement as CREATE, ALTER or UNKNOWN by
        scanning its top-level tokens for the leading keyword."""
        for my_token in statement.tokens:
            if str.upper(str(my_token)) == "CREATE":
                return self.StatementType.CREATE
            elif str.upper(str(my_token)) == "ALTER":
                return self.StatementType.ALTER
        return self.StatementType.UNKNOWN

    def miss_whitespace(self,tokens_generator_elem):
        """Skip whitespace tokens and return the first non-whitespace one.

        NOTE(review): ``is_whitespace()`` and ``.next()`` look like
        Python-2-era sqlparse API; in current sqlparse ``is_whitespace`` is
        a property and tokens have no ``next()`` — confirm this method is
        still reachable (no caller is visible in this file).
        """
        while tokens_generator_elem.is_whitespace():
            tokens_generator_elem = tokens_generator_elem.next();
        return tokens_generator_elem

    def extract_file(self):
        """Parse self.sqlDatei and return the root fragment dict."""
        fragment_result = {"classifier": CLASSIFIER_FILE, "fragments": []}
        constraints_list = []

        for each in sqlparse.parse(open(self.sqlDatei).read()):
            if each.get_type() != "UNKNOWN":
                statement_type = self.get_statement_type(each)
                if self.StatementType.CREATE == statement_type:#create
                    self.extract_create_statement(each, fragment_result)
                elif self.StatementType.ALTER == statement_type:#alter
                    self.extract_alter_statement(each, constraints_list)

        # Merge the ALTER-statement constraints into the column fragments,
        # then optionally annotate the tree with line numbers.
        self.add_contraints(fragment_result, constraints_list)
        if self.log_code:
            self.add_code_linenumbers(fragment_result)

        return fragment_result

    def extract_create_statement(self, each, fragment_result):
        """Turn one CREATE TABLE statement into a create_statement fragment
        holding a table fragment with one column fragment per column."""
        subject_token = None
        item_list = []
        expected_token = self.ExpectedToken.BEGIN

        # get relevant data: walk the flattened token stream with a state
        # machine, collecting alternating column-name / column-type tokens.
        for token in each.tokens:
            if str(token) not in ['create', 'table'] and not token.is_whitespace() and str(token) != ";":
                # First flattened token of the remaining group is taken as
                # the table name.
                # NOTE(review): ``.next()`` on the flatten() generator is
                # Python-2 API (Python 3 needs ``next(...)``) — confirm the
                # interpreter/sqlparse version this was written for.
                subject_token = token.flatten().next()
                for sub_token in token.flatten():
                    if self.ExpectedToken.BEGIN == expected_token and '(' in str(sub_token):
                        expected_token = self.ExpectedToken.VARIABLE_NAME
                    elif self.ExpectedToken.VARIABLE_NAME == expected_token and not sub_token.is_whitespace():
                        if str.lower(str(sub_token)) in self.reserved_keywords:
                            # Reserved word (e.g. "primary") => this entry is
                            # not a column definition; skip to the next ",".
                            expected_token = self.ExpectedToken.NONE
                        else:
                            item_list.append(str(sub_token))
                            expected_token = self.ExpectedToken.VARIABLE_TYPE
                    elif self.ExpectedToken.VARIABLE_TYPE == expected_token and not sub_token.is_whitespace():
                        item_list.append(str(sub_token))
                        expected_token = self.ExpectedToken.NONE
                    elif self.ExpectedToken.NONE == expected_token and ',' in str(sub_token):
                        expected_token = self.ExpectedToken.VARIABLE_NAME
        # create fragments as JSON: item_list holds (name, type) pairs.
        statement_fragment = {"classifier": CLASSIFIER_CREATE, "fragments": []}
        table_fragment = {"classifier": CLASSIFIER_TABLE, "name": str(subject_token), "fragments": []}
        i = 0
        while i < len(item_list):
            table_fragment["fragments"].append(
                {"classifier": CLASSIFIER_COLUMN, "name": item_list[i], "type": item_list[i + 1], "fragments": []})
            i += 2
        statement_fragment["fragments"].append(table_fragment)
        fragment_result["fragments"].append(statement_fragment)

        self.add_code_to_fragments(each, statement_fragment)

    def extract_alter_statement(self, each, constraints_list):
        """Parse an ``ALTER TABLE t ADD CONSTRAINT c FOREIGN KEY (col)
        REFERENCES other`` statement and append a constraint record to
        constraints_list (merged into columns later by add_contraints).

        NOTE(review): the positional ``tokens[i + k]`` offsets assume one
        exact statement shape; any deviation raises IndexError — confirm.
        """
        subject_table = None
        constraint_name = ""
        foreign_key_var = ""
        references = ""
        #get Relevant Data: scan the token list positionally
        i = 0
        while str.lower(str(each.tokens[i])) in ['alter', 'table'] or each.tokens[i].is_whitespace():
            i += 1
        subject_table = str(each.tokens[i])
        while str.lower(str(each.tokens[i])) != 'constraint' or each.tokens[i].is_whitespace():
            i += 1
        constraint_name = str(each.tokens[i + 2])
        while (str.lower(str(each.tokens[i])) != 'foreign' and str.lower(str(each.tokens[i + 2])) != 'key') or \
                each.tokens[i].is_whitespace():
            i += 1
        # Token i+4 is "(COLUMN)"; strip the surrounding parentheses.
        foreign_key_var = str(each.tokens[i + 4])
        foreign_key_var = foreign_key_var[1:len(foreign_key_var) - 1]
        while str.lower(str(each.tokens[i])) != 'references' or each.tokens[i].is_whitespace():
            i += 1
        references = str(each.tokens[i + 2])

        constraints_list.append({
            "subject_table": subject_table,
            "constraint_name": constraint_name,
            "foreign_key_var": foreign_key_var,
            "references": references
        })

    def add_code_to_fragments(self, each, statement_fragment):
        """If code logging is enabled, attach the raw SQL source of the
        statement, of its table and of each column to their fragments."""
        if self.log_code:
            statement_fragment["code"] = str(each)
            # Table code = statement code without the leading "create".
            statement_fragment["fragments"][0]["code"] = self.delete_beginning_control_characters(self.remouve_beginning_create(str(each)))

            #filter column code: everything after the first "(" split on ","
            long_string = str(each)
            long_string = long_string[long_string.find("(")+1:]
            column_code_list = self.format_end_of_column_generation( long_string.split(","))

            for column_id in range(0, len(statement_fragment["fragments"][0]["fragments"])):
                statement_fragment["fragments"][0]["fragments"][column_id]["code"] = self.delete_beginning_control_characters(column_code_list[column_id])

    def remouve_beginning_create(self, code_string):
        # Cut everything before the (lower-case) "table" keyword.
        # NOTE(review): method name is misspelled ("remouve") and only a
        # lower-case "table" is matched.
        return code_string[code_string.find("table"):]

    def format_end_of_column_generation(self, column_codes):
        """Strip the trailing ";" and ")" that the split on "," leaves on the
        last column snippet."""
        column_codes[-1] = self.delete_last_char_if_equals(column_codes[-1], ";")
        column_codes[-1] = self.delete_last_char_if_equals(column_codes[-1], ")")

        return column_codes

    def delete_last_char_if_equals(self, string, char):
        # Drop the final character of ``string`` when it equals ``char``.
        if char == string[-1]:
            string = string[:-1]
        return string

    def delete_beginning_control_characters(self, string):
        # Cut leading whitespace/control characters by jumping to the first
        # word character.
        # NOTE(review): the pattern should be a raw string r"\w", and
        # re.search returns None (-> AttributeError) when the string has no
        # word character — confirm inputs always contain one.
        return string[re.search("\w",string).start() :]

    def add_contraints(self, fragment_result, constraints_list):
        """Merge the collected ALTER-statement constraints into the matching
        column fragments (matched by table name, then column name).

        NOTE(review): method name is misspelled ("contraints").
        """
        for constraint in constraints_list:
            #find table
            for create in fragment_result["fragments"]:
                if create["fragments"][0]["name"] == constraint["subject_table"]:
                    #find column
                    for column in create["fragments"][0]["fragments"]:
                        if column["name"] == constraint["foreign_key_var"]:
                            column["constraints"] = [{
                                "type": "foreign_key",
                                "references": constraint["references"],
                                "name": constraint["constraint_name"]
                            }]

    def add_code_linenumbers(self, fragment_result):
        """Annotate the whole fragment tree with line_start/line_end."""
        self.add_file_linenumbers(fragment_result)

        self.add_fragment_linenumbers(fragment_result["fragments"], 1, self.get_file_length(self.sqlDatei))

    def add_fragment_linenumbers(self, fragment_pointer, start, end):
        """Recursively derive line numbers: lists are walked element-wise;
        dicts get their own numbers, then recurse into their "fragments"."""
        if type(fragment_pointer) == list:
            for frag_elem in fragment_pointer:
                self.add_fragment_linenumbers(frag_elem, start, end)
        elif type(fragment_pointer) == dict:
            self.derive_linenumber(fragment_pointer, start, end)
            self.add_fragment_linenumbers(fragment_pointer["fragments"], fragment_pointer["line_start"], fragment_pointer["line_end"])
        else:
            print("ERROR: add_code_linenumbers UNKNOWN type:"+str(type(fragment_pointer)))

    def derive_linenumber(self, fragment_pointer, start, end):
        """Locate fragment_pointer["code"] in the source file by matching it
        character by character from line ``start``, and record the fragment's
        line_start/line_end.

        NOTE(review): the ``end`` parameter is unused, the file handle is
        never closed, and an unmatched fragment loops until EOF — confirm.
        """
        open_file = open(self.sqlDatei, "r")

        self.go_to_line(open_file, start)

        char_pointer_fragment = 0
        line_counter = start
        start_line = start
        while char_pointer_fragment < len(fragment_pointer["code"]):
            char = open_file.read(1)
            if("\n" == char):
                line_counter += 1
            if char == fragment_pointer["code"][char_pointer_fragment]:
                if char_pointer_fragment == 0:
                    # Remember the line where the match began.
                    start_line = line_counter
                char_pointer_fragment += 1
            else:
                # Mismatch: restart the match from the fragment's first char.
                char_pointer_fragment = 0

        fragment_pointer["line_start"] = start_line
        fragment_pointer["line_end"] = line_counter

    def go_to_line(self, open_file, start):
        # Advance the handle so the next read starts at 1-based line ``start``.
        for times in range(1,start):
            open_file.readline()

    def add_file_linenumbers(self, fragment_result):
        # The root fragment spans the entire file.
        fragment_result["line_start"] = 1
        fragment_result["line_end"] = self.get_file_length(self.sqlDatei)

    def get_file_length(self, file_string):
        """Return the number of lines in self.sqlDatei.

        NOTE(review): the ``file_string`` parameter is ignored — the method
        always reads self.sqlDatei (and never closes the handle).
        """
        linenumber = 0
        for line in open(self.sqlDatei):
            linenumber += 1
        return linenumber
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
create | ||
table COMPANY | ||
( | ||
ID bigint generated by default as identity (start with 1), | ||
name varchar(255), | ||
primary key (ID) | ||
); | ||
|
||
create | ||
table DEPARTMENT | ||
( | ||
ID bigint generated by default as identity (start with 1), | ||
name varchar(255), | ||
COMP_ID bigint, | ||
DEPT_ID bigint, | ||
primary key (ID) | ||
); | ||
|
||
create | ||
table EMPLOYEE | ||
( | ||
ID bigint generated by default as identity (start with 1), | ||
name varchar(255), | ||
address varchar(255), | ||
salary double, | ||
manager bit, | ||
MENTOR bigint, | ||
DEPT_ID bigint, | ||
primary key (ID) | ||
); | ||
|
||
alter table DEPARTMENT add constraint FK4F782F5255C77F64 foreign key (DEPT_ID) references DEPARTMENT; | ||
alter table DEPARTMENT add constraint FK4F782F52C7CB872B foreign key (COMP_ID) references COMPANY; | ||
alter table EMPLOYEE add constraint FK75C8D6AE55C77F64 foreign key (DEPT_ID) references DEPARTMENT; | ||
alter table EMPLOYEE add constraint FK75C8D6AE800BE06C foreign key (MENTOR) references EMPLOYEE; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
{"fragments": [{"fragments": [{"fragments": [{"fragments": [], "type": "bigint", "classifier": "column", "name": "ID"}, {"fragments": [], "type": "varchar", "classifier": "column", "name": "name"}], "classifier": "table", "name": "COMPANY"}], "classifier": "create_statement"}, {"fragments": [{"fragments": [{"fragments": [], "type": "bigint", "classifier": "column", "name": "ID"}, {"fragments": [], "type": "varchar", "classifier": "column", "name": "name"}, {"fragments": [], "constraints": [{"references": "COMPANY", "type": "foreign_key", "name": "FK4F782F52C7CB872B"}], "type": "bigint", "classifier": "column", "name": "COMP_ID"}, {"fragments": [], "constraints": [{"references": "DEPARTMENT", "type": "foreign_key", "name": "FK4F782F5255C77F64"}], "type": "bigint", "classifier": "column", "name": "DEPT_ID"}], "classifier": "table", "name": "DEPARTMENT"}], "classifier": "create_statement"}, {"fragments": [{"fragments": [{"fragments": [], "type": "bigint", "classifier": "column", "name": "ID"}, {"fragments": [], "type": "varchar", "classifier": "column", "name": "name"}, {"fragments": [], "type": "varchar", "classifier": "column", "name": "address"}, {"fragments": [], "type": "double", "classifier": "column", "name": "salary"}, {"fragments": [], "type": "bit", "classifier": "column", "name": "manager"}, {"fragments": [], "constraints": [{"references": "EMPLOYEE", "type": "foreign_key", "name": "FK75C8D6AE800BE06C"}], "type": "bigint", "classifier": "column", "name": "MENTOR"}, {"fragments": [], "constraints": [{"references": "DEPARTMENT", "type": "foreign_key", "name": "FK75C8D6AE55C77F64"}], "type": "bigint", "classifier": "column", "name": "DEPT_ID"}], "classifier": "table", "name": "EMPLOYEE"}], "classifier": "create_statement"}], "classifier": "sql_file"} |
1 change: 1 addition & 0 deletions
1
technologies/SqlFactExtractor/example/estimatedResultWithLineNumbers.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
{"fragments": [{"fragments": [{"code": "table COMPANY \n\t(\n\t\tID bigint generated by default as identity (start with 1), \n\t\tname varchar(255), \n\t\tprimary key (ID)\n\t);", "name": "COMPANY", "line_start": 2, "fragments": [{"code": "ID bigint generated by default as identity (start with 1)", "name": "ID", "line_end": 4, "line_start": 4, "fragments": [], "type": "bigint", "classifier": "column"}, {"code": "name varchar(255)", "name": "name", "line_end": 5, "line_start": 5, "fragments": [], "type": "varchar", "classifier": "column"}], "line_end": 7, "classifier": "table"}], "code": "create \n\ttable COMPANY \n\t(\n\t\tID bigint generated by default as identity (start with 1), \n\t\tname varchar(255), \n\t\tprimary key (ID)\n\t);", "line_end": 7, "classifier": "create_statement", "line_start": 1}, {"fragments": [{"code": "table DEPARTMENT \n\t(\n\t\tID bigint generated by default as identity (start with 1), \n\t\tname varchar(255), \n\t\tCOMP_ID bigint, \n\t\tDEPT_ID bigint, \n\t\tprimary key (ID)\n\t);", "name": "DEPARTMENT", "line_start": 10, "fragments": [{"code": "ID bigint generated by default as identity (start with 1)", "name": "ID", "line_end": 12, "line_start": 12, "fragments": [], "type": "bigint", "classifier": "column"}, {"code": "name varchar(255)", "name": "name", "line_end": 13, "line_start": 13, "fragments": [], "type": "varchar", "classifier": "column"}, {"code": "COMP_ID bigint", "name": "COMP_ID", "line_end": 14, "line_start": 14, "fragments": [], "type": "bigint", "classifier": "column", "constraints": [{"references": "COMPANY", "type": "foreign_key", "name": "FK4F782F52C7CB872B"}]}, {"code": "DEPT_ID bigint", "name": "DEPT_ID", "line_end": 15, "line_start": 15, "fragments": [], "type": "bigint", "classifier": "column", "constraints": [{"references": "DEPARTMENT", "type": "foreign_key", "name": "FK4F782F5255C77F64"}]}], "line_end": 17, "classifier": "table"}], "code": "\n\ncreate \n\ttable DEPARTMENT \n\t(\n\t\tID bigint generated by default 
as identity (start with 1), \n\t\tname varchar(255), \n\t\tCOMP_ID bigint, \n\t\tDEPT_ID bigint, \n\t\tprimary key (ID)\n\t);", "line_end": 17, "classifier": "create_statement", "line_start": 8}, {"fragments": [{"code": "table EMPLOYEE \n\t(\n\t\tID bigint generated by default as identity (start with 1),\n\t\tname varchar(255), \n\t\taddress varchar(255), \n\t\tsalary double, \n\t\tmanager bit, \n\t\tMENTOR bigint, \n\t\tDEPT_ID bigint, \n\t\tprimary key (ID)\n\t);", "name": "EMPLOYEE", "line_start": 20, "fragments": [{"code": "ID bigint generated by default as identity (start with 1)", "name": "ID", "line_end": 22, "line_start": 22, "fragments": [], "type": "bigint", "classifier": "column"}, {"code": "name varchar(255)", "name": "name", "line_end": 23, "line_start": 23, "fragments": [], "type": "varchar", "classifier": "column"}, {"code": "address varchar(255)", "name": "address", "line_end": 24, "line_start": 24, "fragments": [], "type": "varchar", "classifier": "column"}, {"code": "salary double", "name": "salary", "line_end": 25, "line_start": 25, "fragments": [], "type": "double", "classifier": "column"}, {"code": "manager bit", "name": "manager", "line_end": 26, "line_start": 26, "fragments": [], "type": "bit", "classifier": "column"}, {"code": "MENTOR bigint", "name": "MENTOR", "line_end": 27, "line_start": 27, "fragments": [], "type": "bigint", "classifier": "column", "constraints": [{"references": "EMPLOYEE", "type": "foreign_key", "name": "FK75C8D6AE800BE06C"}]}, {"code": "DEPT_ID bigint", "name": "DEPT_ID", "line_end": 28, "line_start": 28, "fragments": [], "type": "bigint", "classifier": "column", "constraints": [{"references": "DEPARTMENT", "type": "foreign_key", "name": "FK75C8D6AE55C77F64"}]}], "line_end": 30, "classifier": "table"}], "code": "\n\ncreate \n\ttable EMPLOYEE \n\t(\n\t\tID bigint generated by default as identity (start with 1),\n\t\tname varchar(255), \n\t\taddress varchar(255), \n\t\tsalary double, \n\t\tmanager bit, \n\t\tMENTOR 
bigint, \n\t\tDEPT_ID bigint, \n\t\tprimary key (ID)\n\t);", "line_end": 30, "classifier": "create_statement", "line_start": 18}], "line_end": 35, "classifier": "sql_file", "line_start": 1} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
#! /usr/bin/env python
"""Command-line entry point: print the JSON facts extracted from the SQL
file whose path is given as the first command-line argument."""
import json
import sys

from SQLFactExtractor import SQLFactExtractor

# Fail with a usage hint instead of an opaque IndexError when no file
# argument is supplied.
if len(sys.argv) < 2:
    sys.stderr.write("usage: extractor.py <sql-file>\n")
    sys.exit(1)

# log_code=False: emit only the structural facts, no source snippets or
# line numbers.
extractor = SQLFactExtractor(sys.argv[1], False)
print(json.dumps(extractor.extract_file()))
Oops, something went wrong.