Skip to content

Commit 4a30408

Browse files
committed
Improve cleanup algorithm
1 parent 66e5aa0 commit 4a30408

File tree

2 files changed

+77
-74
lines changed

2 files changed

+77
-74
lines changed

importer/src/hierarchyitem.cpp

Lines changed: 63 additions & 64 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,10 @@
11
#include "hierarchyitem.h"
22
#include "utils.h"
33

4+
#include <boost/algorithm/string/trim.hpp>
45
#include <fstream>
56
#include <iostream>
7+
#include <sstream>
68
#include <stdexcept>
79

810
std::set<std::string> HierarchyItem::s_priority_types;
@@ -27,26 +29,7 @@ HierarchyItem::HierarchyItem(const pqxx::row &row)
2729
m_data_extra = parse_to_map(row["extra"].as<std::string>(""));
2830

2931
set_names();
30-
}
31-
32-
// trim from start (in place)
33-
static inline void ltrim(std::string &s)
34-
{
35-
s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](int ch) { return !std::isspace(ch); }));
36-
}
37-
38-
// trim from end (in place)
39-
static inline void rtrim(std::string &s)
40-
{
41-
s.erase(std::find_if(s.rbegin(), s.rend(), [](int ch) { return !std::isspace(ch); }).base(),
42-
s.end());
43-
}
44-
45-
// trim from both ends (in place)
46-
static inline void trim(std::string &s)
47-
{
48-
ltrim(s);
49-
rtrim(s);
32+
m_key = key();
5033
}
5134

5235
static std::set<std::string> load_list(const std::string &fname)
@@ -65,7 +48,7 @@ static std::set<std::string> load_list(const std::string &fname)
6548

6649
while (std::getline(f, line))
6750
{
68-
trim(line);
51+
boost::algorithm::trim(line);
6952
if (!line.empty())
7053
d.insert(line);
7154
}
@@ -121,6 +104,25 @@ bool HierarchyItem::is_duplicate(std::shared_ptr<HierarchyItem> item) const
121104
return false;
122105
}
123106

107+
std::string HierarchyItem::key() const // Builds the string cleanup_children() uses to group duplicate children; the constructor also caches it in m_key.
108+
{
109+
std::stringstream ss;
110+
111+
ss << m_name << "-" << m_name_extra << "-" << m_postcode << "-"; // "<name>-<name_extra>-<postcode>-" prefix. NOTE(review): '-' is not escaped, so fields that themselves contain '-' could in principle yield the same key.
112+
113+
if (m_type.rfind("building", 0) == 0) // rfind(prefix, 0) == 0 is a starts-with test: a match can only begin at index 0
114+
ss << "building"; // every type starting with "building" contributes the same key component
115+
else if (m_type.rfind("highway", 0) == 0) // same starts-with test for "highway"
116+
ss << "highway"; // every type starting with "highway" contributes the same key component
117+
else
118+
ss << m_type; // any other type contributes its exact type string
119+
120+
if (s_priority_types.count(m_type) > 0) // m_type is listed in the static priority-type set
121+
ss << "-" << m_id; // append this item's id; presumably keeps priority-type items from sharing a key -- assumes m_id is unique per item, TODO confirm
122+
123+
return ss.str();
124+
}
125+
124126
void HierarchyItem::add_child(std::shared_ptr<HierarchyItem> child)
125127
{
126128
m_children.push_back(child);
@@ -161,58 +163,56 @@ void HierarchyItem::set_parent(hindex parent, bool force)
161163
// c->set_parent(m_id, force);
162164
}
163165

164-
void HierarchyItem::cleanup_children()
166+
void HierarchyItem::cleanup_children(bool duplicate_only)
165167
{
166168
// as a result of this run, children that are supposed to be kept are staying in children
167169
// property. all disposed ones are still pointed to via Hierarchy map, but should not be accessed
168170
// while moving along hierarchy for indexing or writing it
169-
{
170-
std::deque<std::shared_ptr<HierarchyItem> > children;
171-
for (auto item : m_children)
172-
{
173-
item->cleanup_children();
174-
if (item->keep())
175-
children.push_back(item);
176-
else
177-
children.insert(children.end(), item->m_children.begin(), item->m_children.end());
178-
}
179-
m_children = children;
180-
}
181-
182-
// check for duplicates
183-
bool had_duplicates = false;
184-
for (size_t child_index = 0; child_index < m_children.size(); ++child_index)
171+
if (!duplicate_only)
185172
{
186-
std::shared_ptr<HierarchyItem> item = m_children[child_index];
187173
std::deque<std::shared_ptr<HierarchyItem> > children;
188-
std::deque<std::shared_ptr<HierarchyItem> > duplicates;
189-
190-
children.insert(children.end(), m_children.begin(), m_children.begin() + child_index + 1);
191-
192-
for (size_t i = child_index + 1; i < m_children.size(); ++i)
193-
if (m_children[i]->is_duplicate(item))
194-
duplicates.push_back(m_children[i]);
195-
else
196-
children.push_back(m_children[i]);
197-
198-
// merge duplicates
199-
for (auto &i : duplicates)
174+
for (auto item : m_children)
200175
{
201-
had_duplicates = true;
202-
item->add_linked(i);
203-
item->m_children.insert(item->m_children.end(), i->m_children.begin(),
204-
i->m_children.end());
205-
for (auto &i_children : i->m_children)
206-
i_children->set_parent(item->m_id, true);
207-
i->drop();
176+
item->cleanup_children();
177+
if (item->keep())
178+
children.push_back(item);
179+
else
180+
children.insert(children.end(), item->m_children.begin(), item->m_children.end());
208181
}
182+
m_children = children;
183+
}
209184

210-
if (had_duplicates)
211-
item->cleanup_children();
185+
// print out items with huge amount of children
186+
if (m_children.size() > 10000)
187+
{
188+
print_item(0);
189+
m_children[0]->print_item(3);
190+
}
212191

213-
m_children = children;
192+
// check for duplicates
193+
std::map<std::string, std::shared_ptr<HierarchyItem> > children;
194+
for (std::shared_ptr<HierarchyItem> item : m_children)
195+
{
196+
std::string key = item->key();
197+
auto main_pair = children.find(key);
198+
if (main_pair != children.end())
199+
{
200+
std::shared_ptr<HierarchyItem> main = main_pair->second;
201+
main->m_children.insert(main->m_children.end(), item->m_children.begin(),
202+
item->m_children.end());
203+
for (auto &i_children : item->m_children)
204+
i_children->set_parent(main->m_id, true);
205+
item->drop();
206+
main->cleanup_children(true);
207+
}
208+
else
209+
children[key] = item;
214210
}
215211

212+
m_children.clear();
213+
for (auto &iter : children)
214+
m_children.push_back(iter.second);
215+
216216
// set parent, forced
217217
for (auto item : m_children)
218218
item->set_parent(m_id, true);
@@ -284,9 +284,8 @@ void HierarchyItem::print_item(unsigned int offset) const
284284
if (!m_housenumber.empty())
285285
std::cout << "house " << m_housenumber << " ";
286286
std::cout << m_name << " ";
287-
std::cout << "(" << m_my_index << " " << m_last_child_index << ": "
288-
<< m_last_child_index - m_my_index << ": " << m_parent_id << ", " << m_country
289-
<< ", osmid=" << m_osm_id << ")\n";
287+
std::cout << "(" << m_my_index << " " << m_last_child_index << ": " << m_children.size() << ": "
288+
<< m_parent_id << ", " << m_country << ", osmid=" << m_osm_id << ", " << m_key << ")\n";
290289
}
291290

292291
void HierarchyItem::print_branch(unsigned int offset) const

importer/src/hierarchyitem.h

Lines changed: 14 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -31,7 +31,7 @@ class HierarchyItem
3131
void add_child(std::shared_ptr<HierarchyItem> child);
3232
void add_linked(std::shared_ptr<HierarchyItem> linked);
3333
void set_parent(hindex parent, bool force = false);
34-
void cleanup_children();
34+
void cleanup_children(bool duplicate_only = false);
3535
sqlid index(sqlid idx, sqlid parent);
3636
void write(sqlite3pp::database &db) const;
3737

@@ -42,18 +42,22 @@ class HierarchyItem
4242
static void load_priority_list(const std::string &fname);
4343
static void load_skip_list(const std::string &fname);
4444

45+
static std::set<std::string> get_priority_list() { return s_priority_types; } // Accessor for the static priority-type set; returns by value, so each call hands back a copy.
46+
4547
protected:
46-
void set_names();
47-
bool is_duplicate(std::shared_ptr<HierarchyItem> item) const;
48+
void set_names();
49+
bool is_duplicate(std::shared_ptr<HierarchyItem> item) const;
50+
std::string key() const;
4851

4952
private:
50-
hindex m_id;
51-
hindex m_linked_id{ 0 };
52-
hindex m_parent_id;
53-
sqlid m_my_index{ 0 };
54-
sqlid m_parent_index{ 0 };
55-
sqlid m_last_child_index{ 0 };
56-
bool m_dropped{ false };
53+
hindex m_id;
54+
hindex m_linked_id{ 0 };
55+
hindex m_parent_id;
56+
sqlid m_my_index{ 0 };
57+
sqlid m_parent_index{ 0 };
58+
sqlid m_last_child_index{ 0 };
59+
bool m_dropped{ false };
60+
std::string m_key;
5761

5862
std::string m_type;
5963
float m_latitude;

0 commit comments

Comments
 (0)