Skip to content
This repository was archived by the owner on Sep 30, 2020. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ You may have a schema in the target database which is stricter than the source d

- Only supports foreign keys that reference the `id` field of the parent table.
- If the schemas of the tables differ at all you'll get errors - you can use the post_copy_sql parameter to add SQL that fixes this, but it's still a manual process.
- Self relations aren't properly supported - so you need to make sure there aren't any self-relations using conditions
- Self relations aren't properly supported: tables with a self-relation are excluded from sampling. If you don't want such tables to be copied without sampling, add them to the exclude list.

## Recommendations

Expand Down
12 changes: 8 additions & 4 deletions lib/database_sampler.rb
Original file line number Diff line number Diff line change
Expand Up @@ -121,8 +121,7 @@ def get_foreign_keys(source=true)
ON (t.table_name = c.table_name)
WHERE t.table_schema = 'public'
AND t.table_name NOT ilike 'list_members_part_list_ids_%'
AND (c.parent_table IS NULL OR t.table_name != c.parent_table)
ORDER BY t.table_name} # To avoid loops we remove self-references for now
ORDER BY t.table_name}

if source
@foreign_keys_source ||= (@source_conn.exec(sql).to_a + @manual_links).reject{ |fk| @exclude_tables.include?(fk['table_name']) || @exclude_tables.include?(fk['parent_table'])}
Expand All @@ -135,7 +134,7 @@ def get_foreign_keys(source=true)

# Returns the names of the tables that hold a foreign key pointing at table_name.
# The foreign-key list is fetched via get_foreign_keys with source = !@use_fks_from_target.
def get_children(table_name)
foreign_keys = get_foreign_keys(!@use_fks_from_target)
# Pre-change line from the diff: every row whose parent is table_name is kept, self-references included.
foreign_keys.select{ |r| r['parent_table'] == table_name }.map{ |r| r['table_name'] }
# Post-change line: rows where a table references itself are dropped, so a table is never listed as its own child.
foreign_keys.select{ |r| r['parent_table'] == table_name and r['parent_table'] != r['table_name'] }.map{ |r| r['table_name'] }
end


Expand All @@ -146,7 +145,7 @@ def get_network
network = {}
foreign_keys.each do |row|
data = network[row['table_name']] || {parents_count: 0, parents_remaining: 0, parents: {}, children: []}
if row['parent_table']
if row['parent_table'] and row['parent_table'] != row['table_name']
data[:parents_count] += 1
data[:parents_remaining] += 1
data[:parents][row['parent_table']] = { source_column: row['column'], target_column: row['parent_column'] }
Expand Down Expand Up @@ -270,6 +269,11 @@ def get_max_id_for_table(table)
end

def make_sample_tables
# Reject self_relations from samples
foreign_keys = get_foreign_keys(!@use_fks_from_target)
self_relations = foreign_keys.select { |r| r['table_name'] == r['parent_table'] }.map { |r| r['table_name'] }
@samples.reject! { |s| self_relations.include? s['table'] }

@samples.each do |sample|
# Sampling a large table with ORDER BY RANDOM() is slow. So we use a faster method: generating a set of numbers in the range 1..max(id) for the table, and selecting those.
# To calculate the range and number of IDs we need, we need to know two things: the max_id and what proportion of the IDs between 1 and max(id) actually exist (the 'density')
Expand Down