From 14717f6132672a5d271832a68de0b323b73abb2a Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Mon, 23 Aug 2021 15:19:58 +0800 Subject: [PATCH 01/10] [pack #67] refactor --- git-odb/src/store/compound/init.rs | 2 ++ git-pack/src/data/output/count/mod.rs | 4 ++-- .../{iter_from_objects.rs => objects.rs} | 4 ++-- .../pack/data/output/count_and_entries.rs | 22 +++++++++---------- gitoxide-core/src/pack/create.rs | 10 ++++----- 5 files changed, 22 insertions(+), 20 deletions(-) rename git-pack/src/data/output/count/{iter_from_objects.rs => objects.rs} (99%) diff --git a/git-odb/src/store/compound/init.rs b/git-odb/src/store/compound/init.rs index 84b1eeb14a..5532f243d8 100644 --- a/git-odb/src/store/compound/init.rs +++ b/git-odb/src/store/compound/init.rs @@ -38,6 +38,8 @@ impl compound::Store { p.extension().unwrap_or_default() == "idx" && p.file_name().unwrap_or_default().to_string_lossy().starts_with("pack-") }) + // TODO: make this configurable, git for instance sorts by modification date + // https://github.com/libgit2/libgit2/blob/main/src/odb_pack.c#L41-L158 .map(|(p, md)| pack::Bundle::at(p).map(|b| (b, md.len()))) .collect::, _>>()?; packs_and_sizes.sort_by_key(|e| e.1); diff --git a/git-pack/src/data/output/count/mod.rs b/git-pack/src/data/output/count/mod.rs index f3cf5c2c69..6835a0ebf0 100644 --- a/git-pack/src/data/output/count/mod.rs +++ b/git-pack/src/data/output/count/mod.rs @@ -26,5 +26,5 @@ impl Count { } /// -pub mod iter_from_objects; -pub use iter_from_objects::{objects, objects_unthreaded}; +pub mod objects; +pub use objects::{objects, objects_unthreaded}; diff --git a/git-pack/src/data/output/count/iter_from_objects.rs b/git-pack/src/data/output/count/objects.rs similarity index 99% rename from git-pack/src/data/output/count/iter_from_objects.rs rename to git-pack/src/data/output/count/objects.rs index 15cb3603c6..104563bd04 100644 --- a/git-pack/src/data/output/count/iter_from_objects.rs +++ b/git-pack/src/data/output/count/objects.rs @@ -318,7 +318,7 @@ where mod tree { pub mod changes { - use crate::data::output::count::iter_from_objects::util::InsertImmutable; + use crate::data::output::count::objects::util::InsertImmutable; use git_diff::tree::{ visit::{Action, Change}, Visit, @@ -374,7 +374,7 @@ mod tree { } pub mod traverse { - use crate::data::output::count::iter_from_objects::util::InsertImmutable; + use crate::data::output::count::objects::util::InsertImmutable; use git_hash::ObjectId; use git_object::{bstr::BStr, immutable::tree::Entry}; use git_traverse::tree::visit::{Action, Visit}; diff --git a/git-pack/tests/pack/data/output/count_and_entries.rs b/git-pack/tests/pack/data/output/count_and_entries.rs index 09de2008ae..056ec01dbc 100644 --- a/git-pack/tests/pack/data/output/count_and_entries.rs +++ b/git-pack/tests/pack/data/output/count_and_entries.rs @@ -90,7 +90,7 @@ fn traversals() -> crate::Result { allow_thin_pack, ) in [ ( - count::iter_from_objects::ObjectExpansion::AsIs, + count::objects::ObjectExpansion::AsIs, Count { trees: 0, commits: 15, @@ -105,7 +105,7 @@ fn traversals() -> crate::Result { blobs: 0, tags: 1, }, - output::count::iter_from_objects::Outcome { + output::count::objects::Outcome { input_objects: 16, expanded_objects: 0, decoded_objects: 16, @@ -122,7 +122,7 @@ fn traversals() -> crate::Result { false, ), ( - count::iter_from_objects::ObjectExpansion::TreeAdditionsComparedToAncestor, + count::objects::ObjectExpansion::TreeAdditionsComparedToAncestor, Count { trees: 3, commits: 2, // todo: why more? @@ -137,7 +137,7 @@ fn traversals() -> crate::Result { blobs: 96, tags: 0, }, - output::count::iter_from_objects::Outcome { + output::count::objects::Outcome { input_objects: 1, expanded_objects: 102, decoded_objects: 18, @@ -154,7 +154,7 @@ fn traversals() -> crate::Result { true, ), ( - count::iter_from_objects::ObjectExpansion::TreeAdditionsComparedToAncestor, + count::objects::ObjectExpansion::TreeAdditionsComparedToAncestor, Count { trees: 5, commits: 2, // todo: why more? @@ -169,7 +169,7 @@ fn traversals() -> crate::Result { blobs: 96, tags: 0, }, - output::count::iter_from_objects::Outcome { + output::count::objects::Outcome { input_objects: 1, expanded_objects: 102, decoded_objects: 18, @@ -186,10 +186,10 @@ fn traversals() -> crate::Result { false, ), ( - count::iter_from_objects::ObjectExpansion::TreeContents, + count::objects::ObjectExpansion::TreeContents, whole_pack, whole_pack_obj_count, - output::count::iter_from_objects::Outcome { + output::count::objects::Outcome { input_objects: 16, expanded_objects: 852, decoded_objects: 57, @@ -206,10 +206,10 @@ fn traversals() -> crate::Result { false, ), ( - count::iter_from_objects::ObjectExpansion::TreeAdditionsComparedToAncestor, + count::objects::ObjectExpansion::TreeAdditionsComparedToAncestor, whole_pack, whole_pack_obj_count, - output::count::iter_from_objects::Outcome { + output::count::objects::Outcome { input_objects: 16, expanded_objects: 866, decoded_objects: 208, @@ -255,7 +255,7 @@ fn traversals() -> crate::Result { .map(Ok::<_, Infallible>), progress::Discard, &AtomicBool::new(false), - count::iter_from_objects::Options { + count::objects::Options { input_object_expansion: expansion_mode, thread_limit: deterministic_count_needs_single_thread, ..Default::default() diff --git a/gitoxide-core/src/pack/create.rs b/gitoxide-core/src/pack/create.rs index d019359679..45fba11b4c 100644 --- a/gitoxide-core/src/pack/create.rs +++ b/gitoxide-core/src/pack/create.rs @@ -49,9 +49,9 @@ impl FromStr for ObjectExpansion { } } -impl From for pack::data::output::count::iter_from_objects::ObjectExpansion { +impl From for pack::data::output::count::objects::ObjectExpansion { fn from(v: ObjectExpansion) -> Self { - use pack::data::output::count::iter_from_objects::ObjectExpansion::*; + use pack::data::output::count::objects::ObjectExpansion::*; match v { ObjectExpansion::None => AsIs, ObjectExpansion::TreeTraversal => TreeContents, @@ -190,7 +190,7 @@ where input, progress, &interrupt::IS_INTERRUPTED, - pack::data::output::count::iter_from_objects::Options { + pack::data::output::count::objects::Options { thread_limit, chunk_size, input_object_expansion, @@ -300,7 +300,7 @@ fn print(stats: Statistics, format: OutputFormat, out: impl std::io::Write) -> a fn human_output( Statistics { counts: - pack::data::output::count::iter_from_objects::Outcome { + pack::data::output::count::objects::Outcome { input_objects, expanded_objects, decoded_objects, @@ -343,7 +343,7 @@ fn human_output( #[derive(Default)] #[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))] struct Statistics { - counts: pack::data::output::count::iter_from_objects::Outcome, + counts: pack::data::output::count::objects::Outcome, entries: pack::data::output::entry::iter_from_counts::Outcome, } From 7fbc9617da97d4ba4bb3784f41d4163c0839c03c Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Mon, 23 Aug 2021 15:21:03 +0800 Subject: [PATCH 02/10] Release git-pack v0.9.0 --- Cargo.lock | 2 +- git-odb/Cargo.toml | 2 +- git-pack/Cargo.toml | 2 +- git-repository/Cargo.toml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 861ddddae5..6a636468ea 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1123,7 +1123,7 @@ dependencies = [ [[package]] name = "git-pack" -version = "0.8.2" +version = "0.9.0" dependencies = [ "bstr", "btoi", diff --git a/git-odb/Cargo.toml b/git-odb/Cargo.toml index 4d069403ae..3c71ce5b21 100644 --- a/git-odb/Cargo.toml +++ b/git-odb/Cargo.toml @@ -31,7 +31,7 @@ all-features = true git-features = { version = "^0.16.0", path = "../git-features", features = ["rustsha1", "walkdir", "zlib"] } git-hash = { version = "^0.5.0", path = "../git-hash" } git-object = { version ="0.12.0", path = "../git-object" } -git-pack = { version ="0.8.0", path = "../git-pack" } +git-pack = { version ="^0.9.0", path = "../git-pack" } btoi = "0.4.2" tempfile = "3.1.0" diff --git a/git-pack/Cargo.toml b/git-pack/Cargo.toml index 95224a4143..d52ede706c 100644 --- a/git-pack/Cargo.toml +++ b/git-pack/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "git-pack" -version = "0.8.2" +version = "0.9.0" repository = "https://github.com/Byron/gitoxide" authors = ["Sebastian Thiel "] license = "MIT/Apache-2.0" diff --git a/git-repository/Cargo.toml b/git-repository/Cargo.toml index 9b7ba8d90a..43c8af586d 100644 --- a/git-repository/Cargo.toml +++ b/git-repository/Cargo.toml @@ -42,7 +42,7 @@ git-odb = { version ="0.20.0", path = "../git-odb" } git-hash = { version = "^0.5.0", path = "../git-hash" } git-object = { version ="0.12.0", path = "../git-object" } git-actor = { version ="0.3.1", path = "../git-actor" } -git-pack = { version ="0.8.0", path = "../git-pack" } +git-pack = { version ="^0.9.0", path = "../git-pack" } git-url = { version = "0.3.0", path = "../git-url", optional = true } git-traverse = { version ="0.7.0", path = "../git-traverse", optional = true } From d08b6739d8e9294b795aba75e9c7f9f20645af2b Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Mon, 23 Aug 2021 15:33:54 +0800 Subject: [PATCH 03/10] [pack #67] Don't pre-fetch packed objects during counting This could be a real avenue, in single-threaded mode this probably shouldn't happen and the work should rather be postponed to entry generation, copying, which right now is ridiculously fast as it doesn't have to do much work anymore. > counting done 8.1M objects in 107.57s (75.5k objects/s) --- git-pack/src/data/output/count/objects.rs | 3 ++- gitoxide-core/src/pack/create.rs | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/git-pack/src/data/output/count/objects.rs b/git-pack/src/data/output/count/objects.rs index 104563bd04..09f75cc373 100644 --- a/git-pack/src/data/output/count/objects.rs +++ b/git-pack/src/data/output/count/objects.rs @@ -463,7 +463,8 @@ fn id_to_count( statistics.expanded_objects += 1; output::Count { id: id.to_owned(), - entry_pack_location: db.location_by_oid(id, buf), + // entry_pack_location: db.location_by_oid(id, buf), + entry_pack_location: None, } } diff --git a/gitoxide-core/src/pack/create.rs b/gitoxide-core/src/pack/create.rs index 45fba11b4c..eab0d74757 100644 --- a/gitoxide-core/src/pack/create.rs +++ b/gitoxide-core/src/pack/create.rs @@ -210,6 +210,7 @@ where counts.shrink_to_fit(); counts }; + todo!("figure out performance issues"); progress.inc(); let num_objects = counts.len(); From 811bb54991636f7e517087b62cf0c8c8cc2ad9e6 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Mon, 23 Aug 2021 15:35:59 +0800 Subject: [PATCH 04/10] Revert "[pack #67] Don't pre-fetch packed objects during counting" This reverts commit d08b6739d8e9294b795aba75e9c7f9f20645af2b. --- git-pack/src/data/output/count/objects.rs | 3 +-- gitoxide-core/src/pack/create.rs | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/git-pack/src/data/output/count/objects.rs b/git-pack/src/data/output/count/objects.rs index 09f75cc373..104563bd04 100644 --- a/git-pack/src/data/output/count/objects.rs +++ b/git-pack/src/data/output/count/objects.rs @@ -463,8 +463,7 @@ fn id_to_count( statistics.expanded_objects += 1; output::Count { id: id.to_owned(), - // entry_pack_location: db.location_by_oid(id, buf), - entry_pack_location: None, + entry_pack_location: db.location_by_oid(id, buf), } } diff --git a/gitoxide-core/src/pack/create.rs b/gitoxide-core/src/pack/create.rs index eab0d74757..45fba11b4c 100644 --- a/gitoxide-core/src/pack/create.rs +++ b/gitoxide-core/src/pack/create.rs @@ -210,7 +210,6 @@ where counts.shrink_to_fit(); counts }; - todo!("figure out performance issues"); progress.inc(); let num_objects = counts.len(); From 7b068296fe5ca10af212d8fe2662940188b7359c Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Mon, 23 Aug 2021 15:56:17 +0800 Subject: [PATCH 05/10] =?UTF-8?q?[pack=20#170]=20Don't=20double-lookup=20t?= =?UTF-8?q?rees=20during=20traversal=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …which will already be worth quite a bit. Next is delaying the pack access for another win. Ultimately it won't be enough to be as fast as git though. --- git-pack/src/data/output/count/objects.rs | 33 ++++++++++++++++------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/git-pack/src/data/output/count/objects.rs b/git-pack/src/data/output/count/objects.rs index 104563bd04..ad67272a1c 100644 --- a/git-pack/src/data/output/count/objects.rs +++ b/git-pack/src/data/output/count/objects.rs @@ -205,12 +205,20 @@ where &mut tree_traversal_state, |oid, buf| { stats.decoded_objects += 1; - db.find_existing_tree_iter(oid, buf, cache).ok() + match db.find_existing(oid, buf, cache).ok() { + Some(obj) => { + progress.inc(); + stats.expanded_objects += 1; + out.push(output::Count::from_data(oid, &obj)); + obj.into_tree_iter() + } + None => None, + } }, &mut traverse_delegate, ) .map_err(Error::TreeTraverse)?; - &traverse_delegate.objects + &traverse_delegate.non_trees } else { for commit_id in &parent_commit_ids { let parent_tree_id = { @@ -280,12 +288,20 @@ where &mut tree_traversal_state, |oid, buf| { stats.decoded_objects += 1; - db.find_existing_tree_iter(oid, buf, cache).ok() + match db.find_existing(oid, buf, cache).ok() { + Some(obj) => { + progress.inc(); + stats.expanded_objects += 1; + out.push(output::Count::from_data(oid, &obj)); + obj.into_tree_iter() + } + None => None, + } }, &mut traverse_delegate, ) .map_err(Error::TreeTraverse)?; - for id in traverse_delegate.objects.iter() { + for id in traverse_delegate.non_trees.iter() { out.push(id_to_count(db, buf1, id, progress, stats)); } break; @@ -380,7 +396,7 @@ mod tree { use git_traverse::tree::visit::{Action, Visit}; pub struct AllUnseen<'a, H> { - pub objects: Vec, + pub non_trees: Vec, all_seen: &'a H, } @@ -390,12 +406,12 @@ mod tree { { pub fn new(all_seen: &'a H) -> Self { AllUnseen { - objects: Default::default(), + non_trees: Default::default(), all_seen, } } pub fn clear(&mut self) { - self.objects.clear(); + self.non_trees.clear(); } } @@ -414,7 +430,6 @@ mod tree { fn visit_tree(&mut self, entry: &Entry<'_>) -> Action { let inserted = self.all_seen.insert(entry.oid.to_owned()); if inserted { - self.objects.push(entry.oid.to_owned()); Action::Continue } else { Action::Skip @@ -424,7 +439,7 @@ mod tree { fn visit_nontree(&mut self, entry: &Entry<'_>) -> Action { let inserted = self.all_seen.insert(entry.oid.to_owned()); if inserted { - self.objects.push(entry.oid.to_owned()); + self.non_trees.push(entry.oid.to_owned()); } Action::Continue } From f3c21f99594ab4080b8aa1ffed9ea8a33e18fabd Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Mon, 23 Aug 2021 19:32:17 +0800 Subject: [PATCH 06/10] =?UTF-8?q?[pack=20#170]=20first=20step=20towards=20?= =?UTF-8?q?resolving=20in=20multi-threaded=20mode=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …however, it requires parallel mutable iteration, and that requires unsafe. Maybe we just go rayon here? --- git-pack/src/data/output/count/mod.rs | 30 ++++++++++++++- git-pack/src/data/output/count/objects.rs | 17 +++++++-- .../src/data/output/entry/iter_from_counts.rs | 38 +++++++++++++++---- 3 files changed, 73 insertions(+), 12 deletions(-) diff --git a/git-pack/src/data/output/count/mod.rs b/git-pack/src/data/output/count/mod.rs index 6835a0ebf0..41073537e2 100644 --- a/git-pack/src/data/output/count/mod.rs +++ b/git-pack/src/data/output/count/mod.rs @@ -12,7 +12,33 @@ pub struct Count { /// The hash of the object to write pub id: ObjectId, /// A way to locate a pack entry in the object database, only available if the object is in a pack. - pub entry_pack_location: Option, + pub entry_pack_location: PackLocation, +} + +/// Specifies how the pack location was handled during counting +#[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone)] +pub enum PackLocation { + /// We did not lookup this object + NotLookedUp, + /// The object was looked up and there may be a location in a pack, along with enty information + LookedUp(Option), +} + +impl PackLocation { + /// Directly go through to LookedUp variant, panic otherwise + pub fn is_none(&self) -> bool { + match self { + PackLocation::LookedUp(opt) => opt.is_none(), + PackLocation::NotLookedUp => unreachable!("must have been resolved"), + } + } + /// Directly go through to LookedUp variant, panic otherwise + pub fn as_ref(&self) -> Option<&crate::bundle::Location> { + match self { + PackLocation::LookedUp(opt) => opt.as_ref(), + PackLocation::NotLookedUp => unreachable!("must have been resolved"), + } + } } impl Count { @@ -20,7 +46,7 @@ impl Count { pub fn from_data(oid: impl Into, obj: &data::Object<'_>) -> Self { Count { id: oid.into(), - entry_pack_location: obj.pack_location.clone(), + entry_pack_location: PackLocation::LookedUp(obj.pack_location.clone()), } } } diff --git a/git-pack/src/data/output/count/objects.rs b/git-pack/src/data/output/count/objects.rs index ad67272a1c..37242346df 100644 --- a/git-pack/src/data/output/count/objects.rs +++ b/git-pack/src/data/output/count/objects.rs @@ -91,6 +91,7 @@ where cache, progress, should_interrupt, + true, ) } }, @@ -125,6 +126,7 @@ where pack_cache, &mut progress, should_interrupt, + false, ) } @@ -139,6 +141,7 @@ fn expand_inner( cache: &mut impl crate::cache::DecodeEntry, progress: &mut impl Progress, should_interrupt: &AtomicBool, + allow_pack_lookups: bool, ) -> Result, IterErr> where Find: crate::Find + Send + Sync, @@ -267,7 +270,7 @@ where &changes_delegate.objects }; for id in objects.iter() { - out.push(id_to_count(db, buf2, id, progress, stats)); + out.push(id_to_count(db, buf2, id, progress, stats, allow_pack_lookups)); } break; } @@ -302,7 +305,7 @@ where ) .map_err(Error::TreeTraverse)?; for id in traverse_delegate.non_trees.iter() { - out.push(id_to_count(db, buf1, id, progress, stats)); + out.push(id_to_count(db, buf1, id, progress, stats, allow_pack_lookups)); } break; } @@ -447,6 +450,7 @@ mod tree { } } +#[inline] fn push_obj_count_unique( out: &mut Vec, all_seen: &impl util::InsertImmutable, @@ -467,18 +471,24 @@ fn push_obj_count_unique( } } +#[inline] fn id_to_count( db: &Find, buf: &mut Vec, id: &oid, progress: &mut impl Progress, statistics: &mut Outcome, + allow_pack_lookups: bool, ) -> output::Count { progress.inc(); statistics.expanded_objects += 1; output::Count { id: id.to_owned(), - entry_pack_location: db.location_by_oid(id, buf), + entry_pack_location: if allow_pack_lookups { + PackLocation::LookedUp(db.location_by_oid(id, buf)) + } else { + PackLocation::NotLookedUp + }, } } @@ -640,6 +650,7 @@ mod types { Interrupted, } } +use crate::data::output::count::PackLocation; use std::cell::RefCell; use std::collections::HashSet; pub use types::{Error, ObjectExpansion, Options, Outcome}; diff --git a/git-pack/src/data/output/entry/iter_from_counts.rs b/git-pack/src/data/output/entry/iter_from_counts.rs index 02b9315032..671c9f505d 100644 --- a/git-pack/src/data/output/entry/iter_from_counts.rs +++ b/git-pack/src/data/output/entry/iter_from_counts.rs @@ -57,20 +57,46 @@ where matches!(version, crate::data::Version::V2), "currently we can only write version 2" ); + let (chunk_size, thread_limit, _) = + parallel::optimize_chunk_size_and_thread_limit(chunk_size, Some(counts.len()), thread_limit, None); + let chunks = util::ChunkRanges::new(chunk_size, counts.len()).enumerate(); + // { + // let mut progress = progress.add_child("resolving"); + // progress.init(Some(counts.len()), git_features::progress::count("counts")); + // parallel::in_parallel_if( + // || counts.len() > 4_000, + // chunks.clone(), + // thread_limit, + // { + // let make_cache = make_cache.clone(); + // |_n| (Vec::::new(), make_cache()) + // }, + // |chunk_range, (buf, pack_cache)| { + // let chunk = &counts[chunk_range]; + // for count in chunk { + // let () = count; + // } + // Ok(()) + // }, + // parallel::reduce::IdentityWithResult::<_, _>::default(), + // ); + // } let counts_range_by_pack_id = match mode { Mode::PackCopyAndBaseObjects => { let mut progress = progress.add_child("sorting"); progress.init(Some(counts.len()), git_features::progress::count("counts")); let start = std::time::Instant::now(); + use crate::data::output::count::PackLocation::*; counts.sort_by(|lhs, rhs| match (&lhs.entry_pack_location, &rhs.entry_pack_location) { - (None, None) => Ordering::Equal, - (Some(_), None) => Ordering::Greater, - (None, Some(_)) => Ordering::Less, - (Some(lhs), Some(rhs)) => lhs + (LookedUp(None), LookedUp(None)) => Ordering::Equal, + (LookedUp(Some(_)), LookedUp(None)) => Ordering::Greater, + (LookedUp(None), LookedUp(Some(_))) => Ordering::Less, + (LookedUp(Some(lhs)), LookedUp(Some(rhs))) => lhs .pack_id .cmp(&rhs.pack_id) .then(lhs.pack_offset.cmp(&rhs.pack_offset)), + (_, _) => unreachable!("counts were resolved beforehand"), }); let mut index: Vec<(u32, std::ops::Range)> = Vec::new(); @@ -93,9 +119,6 @@ where } }; let counts = Arc::new(counts); - let (chunk_size, thread_limit, _) = - parallel::optimize_chunk_size_and_thread_limit(chunk_size, Some(counts.len()), thread_limit, None); - let chunks = util::ChunkRanges::new(chunk_size, counts.len()).enumerate(); let progress = Arc::new(parking_lot::Mutex::new(progress)); parallel::reduce::Stepwise::new( @@ -205,6 +228,7 @@ where } mod util { + #[derive(Clone)] pub struct ChunkRanges { cursor: usize, size: usize, From 7461f31f03d67ecc9fdf398adf3cb6d4eb365412 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Mon, 23 Aug 2021 19:52:12 +0800 Subject: [PATCH 07/10] [pack #170] Basic entry resolution without progress --- .../src/data/output/entry/iter_from_counts.rs | 57 +++++++++++-------- 1 file changed, 34 insertions(+), 23 deletions(-) diff --git a/git-pack/src/data/output/entry/iter_from_counts.rs b/git-pack/src/data/output/entry/iter_from_counts.rs index 671c9f505d..6d7d66d967 100644 --- a/git-pack/src/data/output/entry/iter_from_counts.rs +++ b/git-pack/src/data/output/entry/iter_from_counts.rs @@ -59,28 +59,39 @@ where ); let (chunk_size, thread_limit, _) = parallel::optimize_chunk_size_and_thread_limit(chunk_size, Some(counts.len()), thread_limit, None); - let chunks = util::ChunkRanges::new(chunk_size, counts.len()).enumerate(); - // { - // let mut progress = progress.add_child("resolving"); - // progress.init(Some(counts.len()), git_features::progress::count("counts")); - // parallel::in_parallel_if( - // || counts.len() > 4_000, - // chunks.clone(), - // thread_limit, - // { - // let make_cache = make_cache.clone(); - // |_n| (Vec::::new(), make_cache()) - // }, - // |chunk_range, (buf, pack_cache)| { - // let chunk = &counts[chunk_range]; - // for count in chunk { - // let () = count; - // } - // Ok(()) - // }, - // parallel::reduce::IdentityWithResult::<_, _>::default(), - // ); - // } + let chunks = util::ChunkRanges::new(chunk_size, counts.len()); + { + let mut progress = progress.add_child("resolving"); + progress.init(Some(counts.len()), git_features::progress::count("counts")); + let enough_counts_present = counts.len() > 4_000; + parallel::in_parallel_if( + || enough_counts_present, + chunks.clone(), + thread_limit, + |_n| Vec::::new(), + |chunk_range, buf| { + let chunk = { + let c = &counts[chunk_range]; + let mut_ptr = c.as_ptr() as *mut output::Count; + // SAFETY: We know that 'chunks' is only non-overlapping slices, and this function owns `counts`. + #[allow(unsafe_code)] + unsafe { + std::slice::from_raw_parts_mut(mut_ptr, c.len()) + } + }; + for count in chunk { + use crate::data::output::count::PackLocation::*; + match count.entry_pack_location { + LookedUp(_) => continue, + NotLookedUp => count.entry_pack_location = LookedUp(db.location_by_oid(count.id, buf)), + } + } + Ok::<_, ()>(()) + }, + parallel::reduce::IdentityWithResult::<(), ()>::default(), + ) + .expect("infallible - we ignore none-existing objects"); + } let counts_range_by_pack_id = match mode { Mode::PackCopyAndBaseObjects => { let mut progress = progress.add_child("sorting"); @@ -122,7 +133,7 @@ where let progress = Arc::new(parking_lot::Mutex::new(progress)); parallel::reduce::Stepwise::new( - chunks, + chunks.enumerate(), thread_limit, { let progress = Arc::clone(&progress); From ada0b96e3707c06d7d6f7e4002907e12b45f7419 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Mon, 23 Aug 2021 19:56:50 +0800 Subject: [PATCH 08/10] [pack #170] basic progress for resolution --- git-pack/src/data/output/count/mod.rs | 1 + .../src/data/output/entry/iter_from_counts.rs | 47 ++++++++++++------- 2 files changed, 30 insertions(+), 18 deletions(-) diff --git a/git-pack/src/data/output/count/mod.rs b/git-pack/src/data/output/count/mod.rs index 41073537e2..1706c5ae62 100644 --- a/git-pack/src/data/output/count/mod.rs +++ b/git-pack/src/data/output/count/mod.rs @@ -17,6 +17,7 @@ pub struct Count { /// Specifies how the pack location was handled during counting #[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone)] +#[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))] pub enum PackLocation { /// We did not lookup this object NotLookedUp, diff --git a/git-pack/src/data/output/entry/iter_from_counts.rs b/git-pack/src/data/output/entry/iter_from_counts.rs index 6d7d66d967..87ac7b20bd 100644 --- a/git-pack/src/data/output/entry/iter_from_counts.rs +++ b/git-pack/src/data/output/entry/iter_from_counts.rs @@ -61,36 +61,47 @@ where parallel::optimize_chunk_size_and_thread_limit(chunk_size, Some(counts.len()), thread_limit, None); let chunks = util::ChunkRanges::new(chunk_size, counts.len()); { - let mut progress = progress.add_child("resolving"); - progress.init(Some(counts.len()), git_features::progress::count("counts")); + let progress = Arc::new(parking_lot::Mutex::new(progress.add_child("resolving"))); + progress + .lock() + .init(Some(counts.len()), git_features::progress::count("counts")); let enough_counts_present = counts.len() > 4_000; + let start = std::time::Instant::now(); parallel::in_parallel_if( || enough_counts_present, chunks.clone(), thread_limit, |_n| Vec::::new(), - |chunk_range, buf| { - let chunk = { - let c = &counts[chunk_range]; - let mut_ptr = c.as_ptr() as *mut output::Count; - // SAFETY: We know that 'chunks' is only non-overlapping slices, and this function owns `counts`. - #[allow(unsafe_code)] - unsafe { - std::slice::from_raw_parts_mut(mut_ptr, c.len()) - } - }; - for count in chunk { - use crate::data::output::count::PackLocation::*; - match count.entry_pack_location { - LookedUp(_) => continue, - NotLookedUp => count.entry_pack_location = LookedUp(db.location_by_oid(count.id, buf)), + { + let progress = Arc::clone(&progress); + let counts = &counts; + let db = &db; + move |chunk_range, buf| { + let chunk = { + let c = &counts[chunk_range]; + let mut_ptr = c.as_ptr() as *mut output::Count; + // SAFETY: We know that 'chunks' is only non-overlapping slices, and this function owns `counts`. + #[allow(unsafe_code)] + unsafe { + std::slice::from_raw_parts_mut(mut_ptr, c.len()) + } + }; + let chunk_size = chunk.len(); + for count in chunk { + use crate::data::output::count::PackLocation::*; + match count.entry_pack_location { + LookedUp(_) => continue, + NotLookedUp => count.entry_pack_location = LookedUp(db.location_by_oid(count.id, buf)), + } } + progress.lock().inc_by(chunk_size); + Ok::<_, ()>(()) } - Ok::<_, ()>(()) }, parallel::reduce::IdentityWithResult::<(), ()>::default(), ) .expect("infallible - we ignore none-existing objects"); + progress.lock().show_throughput(start); } let counts_range_by_pack_id = match mode { Mode::PackCopyAndBaseObjects => { From 4d820d2f94dc3afc062bbd25e969c87410212c3a Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Mon, 23 Aug 2021 20:55:56 +0800 Subject: [PATCH 09/10] [pack #170] clru allows for free lists, reducing allocation pressure... ... and also some time. --- Cargo.lock | 7 +++ git-pack/Cargo.toml | 3 +- git-pack/src/cache.rs | 85 ++++++++++++++++++++++++++++++++ gitoxide-core/src/pack/create.rs | 2 +- 4 files changed, 95 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6a636468ea..235b999810 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -484,6 +484,12 @@ dependencies = [ "syn", ] +[[package]] +name = "clru" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "218d6bd3dde8e442a975fa1cd233c0e5fded7596bccfe39f58eca98d22421e0a" + [[package]] name = "cmake" version = "0.1.45" @@ -1129,6 +1135,7 @@ dependencies = [ "btoi", "byteorder", "bytesize", + "clru", "common_macros", "dashmap", "filebuffer", diff --git a/git-pack/Cargo.toml b/git-pack/Cargo.toml index d52ede706c..088bc40e28 100644 --- a/git-pack/Cargo.toml +++ b/git-pack/Cargo.toml @@ -13,7 +13,7 @@ doctest = false [features] pack-cache-lru-static = ["uluru"] -pack-cache-lru-dynamic = ["memory-lru"] +pack-cache-lru-dynamic = ["clru", "memory-lru"] serde1 = ["serde", "git-object/serde1"] internal-testing-git-features-parallel = ["git-features/parallel"] internal-testing-to-avoid-being-run-by-cargo-test-all = [] @@ -50,6 +50,7 @@ parking_lot = { version = "0.11.0", default-features = false } thiserror = "1.0.26" uluru = { version = "3.0.0", optional = true } memory-lru = { version = "0.1.0", optional = true } +clru = { version = "0.5.0", optional = true } dashmap = "4.0.2" [dev-dependencies] diff --git a/git-pack/src/cache.rs b/git-pack/src/cache.rs index 2b20f12917..a024094d35 100644 --- a/git-pack/src/cache.rs +++ b/git-pack/src/cache.rs @@ -104,6 +104,91 @@ pub mod lru { #[cfg(feature = "pack-cache-lru-dynamic")] pub use memory::MemoryCappedHashmap; + #[cfg(feature = "pack-cache-lru-dynamic")] + mod memory2 { + use super::DecodeEntry; + use clru::WeightScale; + use std::num::NonZeroUsize; + + struct Entry { + data: Vec, + kind: git_object::Kind, + compressed_size: usize, + } + + type Key = (u32, u64); + struct CustomScale; + + impl WeightScale for CustomScale { + fn weight(&self, _key: &Key, value: &Entry) -> usize { + value.data.len() + } + } + + /// An LRU cache with hash map backing and an eviction rule based on the memory usage for object data in bytes. + pub struct MemoryCappedHashmap2 { + inner: clru::CLruCache, + free_list: Vec>, + debug: git_features::cache::Debug, + } + + impl MemoryCappedHashmap2 { + /// Return a new instance which evicts least recently used items if it uses more than `memory_cap_in_bytes` + /// object data. + pub fn new(memory_cap_in_bytes: usize) -> MemoryCappedHashmap2 { + MemoryCappedHashmap2 { + inner: clru::CLruCache::with_config( + clru::CLruCacheConfig::new(NonZeroUsize::new(memory_cap_in_bytes).expect("non zero")) + .with_scale(CustomScale), + ), + free_list: Vec::new(), + debug: git_features::cache::Debug::new(format!("MemoryCappedHashmap({}B)", memory_cap_in_bytes)), + } + } + } + + impl DecodeEntry for MemoryCappedHashmap2 { + fn put(&mut self, pack_id: u32, offset: u64, data: &[u8], kind: git_object::Kind, compressed_size: usize) { + self.debug.put(); + if let Ok(Some(previous_entry)) = self.inner.put_with_weight( + (pack_id, offset), + Entry { + data: self + .free_list + .pop() + .map(|mut v| { + v.clear(); + v.resize(data.len(), 0); + v.copy_from_slice(data); + v + }) + .unwrap_or_else(|| Vec::from(data)), + kind, + compressed_size, + }, + ) { + self.free_list.push(previous_entry.data) + } + } + + fn get(&mut self, pack_id: u32, offset: u64, out: &mut Vec) -> Option<(git_object::Kind, usize)> { + let res = self.inner.get(&(pack_id, offset)).map(|e| { + out.resize(e.data.len(), 0); + out.copy_from_slice(&e.data); + (e.kind, e.compressed_size) + }); + if res.is_some() { + self.debug.hit() + } else { + self.debug.miss() + } + res + } + } + } + #[cfg(feature = "pack-cache-lru-dynamic")] + pub use memory2::MemoryCappedHashmap2; + #[cfg(feature = "pack-cache-lru-static")] mod _static { use super::DecodeEntry; diff --git a/gitoxide-core/src/pack/create.rs b/gitoxide-core/src/pack/create.rs index 45fba11b4c..582ba73309 100644 --- a/gitoxide-core/src/pack/create.rs +++ b/gitoxide-core/src/pack/create.rs @@ -176,7 +176,7 @@ where if may_use_multiple_threads { Box::new(pack::cache::Never) as Box } else { - Box::new(pack::cache::lru::MemoryCappedHashmap::new(512 * 1024 * 1024)) as Box + Box::new(pack::cache::lru::MemoryCappedHashmap2::new(512 * 1024 * 1024)) as Box // todo: Make that configurable } }; From dce4f97a84aa6a73e31e7397501cfce27241c5b8 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Mon, 23 Aug 2021 20:58:58 +0800 Subject: [PATCH 10/10] [pack #170] there can only be one --- Cargo.lock | 1 - git-pack/Cargo.toml | 3 +- git-pack/src/cache.rs | 75 +++----------------------------- gitoxide-core/src/pack/create.rs | 2 +- 4 files changed, 8 insertions(+), 73 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 235b999810..63c2c8bf61 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1149,7 +1149,6 @@ dependencies = [ "git-traverse", "hex", "itoa", - "memory-lru", "parking_lot 0.11.1 (registry+https://github.com/rust-lang/crates.io-index)", "serde", "smallvec", diff --git a/git-pack/Cargo.toml b/git-pack/Cargo.toml index 088bc40e28..3a8d201ba4 100644 --- a/git-pack/Cargo.toml +++ b/git-pack/Cargo.toml @@ -13,7 +13,7 @@ doctest = false [features] pack-cache-lru-static = ["uluru"] -pack-cache-lru-dynamic = ["clru", "memory-lru"] +pack-cache-lru-dynamic = ["clru"] serde1 = ["serde", "git-object/serde1"] internal-testing-git-features-parallel = ["git-features/parallel"] internal-testing-to-avoid-being-run-by-cargo-test-all = [] @@ -49,7 +49,6 @@ bytesize = "1.0.1" parking_lot = { version = "0.11.0", default-features = false } thiserror = "1.0.26" uluru = { version = "3.0.0", optional = true } -memory-lru = { version = "0.1.0", optional = true } clru = { version = "0.5.0", optional = true } dashmap = "4.0.2" diff --git a/git-pack/src/cache.rs b/git-pack/src/cache.rs index a024094d35..10cda434c5 100644 --- a/git-pack/src/cache.rs +++ b/git-pack/src/cache.rs @@ -43,69 +43,6 @@ pub mod lru { #[cfg(feature = "pack-cache-lru-dynamic")] mod memory { - use super::DecodeEntry; - struct Entry { - data: Vec, - kind: git_object::Kind, - compressed_size: usize, - } - - impl memory_lru::ResidentSize for Entry { - fn resident_size(&self) -> usize { - self.data.len() - } - } - - /// An LRU cache with hash map backing and an eviction rule based on the memory usage for object data in bytes. - pub struct MemoryCappedHashmap { - inner: memory_lru::MemoryLruCache<(u32, u64), Entry>, - debug: git_features::cache::Debug, - } - - impl MemoryCappedHashmap { - /// Return a new instance which evicts least recently used items if it uses more than `memory_cap_in_bytes` - /// object data. - pub fn new(memory_cap_in_bytes: usize) -> MemoryCappedHashmap { - MemoryCappedHashmap { - inner: memory_lru::MemoryLruCache::new(memory_cap_in_bytes), - debug: git_features::cache::Debug::new(format!("MemoryCappedHashmap({}B)", memory_cap_in_bytes)), - } - } - } - - impl DecodeEntry for MemoryCappedHashmap { - fn put(&mut self, pack_id: u32, offset: u64, data: &[u8], kind: git_object::Kind, compressed_size: usize) { - self.debug.put(); - self.inner.insert( - (pack_id, offset), - Entry { - data: Vec::from(data), - kind, - compressed_size, - }, - ) - } - - fn get(&mut self, pack_id: u32, offset: u64, out: &mut Vec) -> Option<(git_object::Kind, usize)> { - let res = self.inner.get(&(pack_id, offset)).map(|e| { - out.resize(e.data.len(), 0); - out.copy_from_slice(&e.data); - (e.kind, e.compressed_size) - }); - if res.is_some() { - self.debug.hit() - } else { - self.debug.miss() - } - res - } - } - } - #[cfg(feature = "pack-cache-lru-dynamic")] - pub use memory::MemoryCappedHashmap; - - #[cfg(feature = "pack-cache-lru-dynamic")] - mod memory2 { use super::DecodeEntry; use clru::WeightScale; use std::num::NonZeroUsize; @@ -126,17 +63,17 @@ pub mod lru { } /// An LRU cache with hash map backing and an eviction rule based on the memory usage for object data in bytes. - pub struct MemoryCappedHashmap2 { + pub struct MemoryCappedHashmap { inner: clru::CLruCache, free_list: Vec>, debug: git_features::cache::Debug, } - impl MemoryCappedHashmap2 { + impl MemoryCappedHashmap { /// Return a new instance which evicts least recently used items if it uses more than `memory_cap_in_bytes` /// object data. - pub fn new(memory_cap_in_bytes: usize) -> MemoryCappedHashmap2 { - MemoryCappedHashmap2 { + pub fn new(memory_cap_in_bytes: usize) -> MemoryCappedHashmap { + MemoryCappedHashmap { inner: clru::CLruCache::with_config( clru::CLruCacheConfig::new(NonZeroUsize::new(memory_cap_in_bytes).expect("non zero")) .with_scale(CustomScale), @@ -147,7 +84,7 @@ pub mod lru { } } - impl DecodeEntry for MemoryCappedHashmap2 { + impl DecodeEntry for MemoryCappedHashmap { fn put(&mut self, pack_id: u32, offset: u64, data: &[u8], kind: git_object::Kind, compressed_size: usize) { self.debug.put(); if let Ok(Some(previous_entry)) = self.inner.put_with_weight( @@ -187,7 +124,7 @@ pub mod lru { } } #[cfg(feature = "pack-cache-lru-dynamic")] - pub use memory2::MemoryCappedHashmap2; + pub use memory::MemoryCappedHashmap; #[cfg(feature = "pack-cache-lru-static")] mod _static { diff --git a/gitoxide-core/src/pack/create.rs b/gitoxide-core/src/pack/create.rs index 582ba73309..45fba11b4c 100644 --- a/gitoxide-core/src/pack/create.rs +++ b/gitoxide-core/src/pack/create.rs @@ -176,7 +176,7 @@ where if may_use_multiple_threads { Box::new(pack::cache::Never) as Box } else { - Box::new(pack::cache::lru::MemoryCappedHashmap2::new(512 * 1024 * 1024)) as Box + Box::new(pack::cache::lru::MemoryCappedHashmap::new(512 * 1024 * 1024)) as Box // todo: Make that configurable } };