diff --git a/Cargo.lock b/Cargo.lock index 861ddddae5..63c2c8bf61 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -484,6 +484,12 @@ dependencies = [ "syn", ] +[[package]] +name = "clru" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "218d6bd3dde8e442a975fa1cd233c0e5fded7596bccfe39f58eca98d22421e0a" + [[package]] name = "cmake" version = "0.1.45" @@ -1123,12 +1129,13 @@ dependencies = [ [[package]] name = "git-pack" -version = "0.8.2" +version = "0.9.0" dependencies = [ "bstr", "btoi", "byteorder", "bytesize", + "clru", "common_macros", "dashmap", "filebuffer", @@ -1142,7 +1149,6 @@ dependencies = [ "git-traverse", "hex", "itoa", - "memory-lru", "parking_lot 0.11.1 (registry+https://github.com/rust-lang/crates.io-index)", "serde", "smallvec", diff --git a/git-odb/Cargo.toml b/git-odb/Cargo.toml index 4d069403ae..3c71ce5b21 100644 --- a/git-odb/Cargo.toml +++ b/git-odb/Cargo.toml @@ -31,7 +31,7 @@ all-features = true git-features = { version = "^0.16.0", path = "../git-features", features = ["rustsha1", "walkdir", "zlib"] } git-hash = { version = "^0.5.0", path = "../git-hash" } git-object = { version ="0.12.0", path = "../git-object" } -git-pack = { version ="0.8.0", path = "../git-pack" } +git-pack = { version ="^0.9.0", path = "../git-pack" } btoi = "0.4.2" tempfile = "3.1.0" diff --git a/git-odb/src/store/compound/init.rs b/git-odb/src/store/compound/init.rs index 84b1eeb14a..5532f243d8 100644 --- a/git-odb/src/store/compound/init.rs +++ b/git-odb/src/store/compound/init.rs @@ -38,6 +38,8 @@ impl compound::Store { p.extension().unwrap_or_default() == "idx" && p.file_name().unwrap_or_default().to_string_lossy().starts_with("pack-") }) + // TODO: make this configurable, git for instance sorts by modification date + // https://github.com/libgit2/libgit2/blob/main/src/odb_pack.c#L41-L158 .map(|(p, md)| pack::Bundle::at(p).map(|b| (b, md.len()))) .collect::, _>>()?; packs_and_sizes.sort_by_key(|e| e.1); diff --git 
a/git-pack/Cargo.toml b/git-pack/Cargo.toml index 95224a4143..3a8d201ba4 100644 --- a/git-pack/Cargo.toml +++ b/git-pack/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "git-pack" -version = "0.8.2" +version = "0.9.0" repository = "https://github.com/Byron/gitoxide" authors = ["Sebastian Thiel "] license = "MIT/Apache-2.0" @@ -13,7 +13,7 @@ doctest = false [features] pack-cache-lru-static = ["uluru"] -pack-cache-lru-dynamic = ["memory-lru"] +pack-cache-lru-dynamic = ["clru"] serde1 = ["serde", "git-object/serde1"] internal-testing-git-features-parallel = ["git-features/parallel"] internal-testing-to-avoid-being-run-by-cargo-test-all = [] @@ -49,7 +49,7 @@ bytesize = "1.0.1" parking_lot = { version = "0.11.0", default-features = false } thiserror = "1.0.26" uluru = { version = "3.0.0", optional = true } -memory-lru = { version = "0.1.0", optional = true } +clru = { version = "0.5.0", optional = true } dashmap = "4.0.2" [dev-dependencies] diff --git a/git-pack/src/cache.rs b/git-pack/src/cache.rs index 2b20f12917..10cda434c5 100644 --- a/git-pack/src/cache.rs +++ b/git-pack/src/cache.rs @@ -44,21 +44,28 @@ pub mod lru { #[cfg(feature = "pack-cache-lru-dynamic")] mod memory { use super::DecodeEntry; + use clru::WeightScale; + use std::num::NonZeroUsize; + struct Entry { data: Vec, kind: git_object::Kind, compressed_size: usize, } - impl memory_lru::ResidentSize for Entry { - fn resident_size(&self) -> usize { - self.data.len() + type Key = (u32, u64); + struct CustomScale; + + impl WeightScale for CustomScale { + fn weight(&self, _key: &Key, value: &Entry) -> usize { + value.data.len() } } /// An LRU cache with hash map backing and an eviction rule based on the memory usage for object data in bytes. pub struct MemoryCappedHashmap { - inner: memory_lru::MemoryLruCache<(u32, u64), Entry>, + inner: clru::CLruCache, + free_list: Vec>, debug: git_features::cache::Debug, } @@ -67,7 +74,11 @@ pub mod lru { /// object data. 
pub fn new(memory_cap_in_bytes: usize) -> MemoryCappedHashmap { MemoryCappedHashmap { - inner: memory_lru::MemoryLruCache::new(memory_cap_in_bytes), + inner: clru::CLruCache::with_config( + clru::CLruCacheConfig::new(NonZeroUsize::new(memory_cap_in_bytes).expect("non zero")) + .with_scale(CustomScale), + ), + free_list: Vec::new(), debug: git_features::cache::Debug::new(format!("MemoryCappedHashmap({}B)", memory_cap_in_bytes)), } } @@ -76,14 +87,25 @@ pub mod lru { impl DecodeEntry for MemoryCappedHashmap { fn put(&mut self, pack_id: u32, offset: u64, data: &[u8], kind: git_object::Kind, compressed_size: usize) { self.debug.put(); - self.inner.insert( + if let Ok(Some(previous_entry)) = self.inner.put_with_weight( (pack_id, offset), Entry { - data: Vec::from(data), + data: self + .free_list + .pop() + .map(|mut v| { + v.clear(); + v.resize(data.len(), 0); + v.copy_from_slice(data); + v + }) + .unwrap_or_else(|| Vec::from(data)), kind, compressed_size, }, - ) + ) { + self.free_list.push(previous_entry.data) + } } fn get(&mut self, pack_id: u32, offset: u64, out: &mut Vec) -> Option<(git_object::Kind, usize)> { diff --git a/git-pack/src/data/output/count/mod.rs b/git-pack/src/data/output/count/mod.rs index f3cf5c2c69..1706c5ae62 100644 --- a/git-pack/src/data/output/count/mod.rs +++ b/git-pack/src/data/output/count/mod.rs @@ -12,7 +12,34 @@ pub struct Count { /// The hash of the object to write pub id: ObjectId, /// A way to locate a pack entry in the object database, only available if the object is in a pack. 
- pub entry_pack_location: Option, + pub entry_pack_location: PackLocation, +} + +/// Specifies how the pack location was handled during counting +#[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone)] +#[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))] +pub enum PackLocation { + /// We did not lookup this object + NotLookedUp, + /// The object was looked up and there may be a location in a pack, along with entry information + LookedUp(Option), +} + +impl PackLocation { + /// Return `true` if the `LookedUp` location is `None`, panic if the location is `NotLookedUp` + pub fn is_none(&self) -> bool { + match self { + PackLocation::LookedUp(opt) => opt.is_none(), + PackLocation::NotLookedUp => unreachable!("must have been resolved"), + } + } + /// Return a reference to the `LookedUp` location if there is one, panic if the location is `NotLookedUp` + pub fn as_ref(&self) -> Option<&crate::bundle::Location> { + match self { + PackLocation::LookedUp(opt) => opt.as_ref(), + PackLocation::NotLookedUp => unreachable!("must have been resolved"), + } + } } impl Count { @@ -20,11 +47,11 @@ impl Count { pub fn from_data(oid: impl Into, obj: &data::Object<'_>) -> Self { Count { id: oid.into(), - entry_pack_location: obj.pack_location.clone(), + entry_pack_location: PackLocation::LookedUp(obj.pack_location.clone()), } } } /// -pub mod iter_from_objects; -pub use iter_from_objects::{objects, objects_unthreaded}; +pub mod objects; +pub use objects::{objects, objects_unthreaded}; diff --git a/git-pack/src/data/output/count/iter_from_objects.rs b/git-pack/src/data/output/count/objects.rs similarity index 92% rename from git-pack/src/data/output/count/iter_from_objects.rs rename to git-pack/src/data/output/count/objects.rs index 15cb3603c6..37242346df 100644 --- a/git-pack/src/data/output/count/iter_from_objects.rs +++ b/git-pack/src/data/output/count/objects.rs @@ -91,6 +91,7 @@ where cache, progress, should_interrupt, + true, ) } }, @@ -125,6 +126,7 @@ where pack_cache, &mut progress, should_interrupt, + false, ) } @@ -139,6 
+141,7 @@ fn expand_inner( cache: &mut impl crate::cache::DecodeEntry, progress: &mut impl Progress, should_interrupt: &AtomicBool, + allow_pack_lookups: bool, ) -> Result, IterErr> where Find: crate::Find + Send + Sync, @@ -205,12 +208,20 @@ where &mut tree_traversal_state, |oid, buf| { stats.decoded_objects += 1; - db.find_existing_tree_iter(oid, buf, cache).ok() + match db.find_existing(oid, buf, cache).ok() { + Some(obj) => { + progress.inc(); + stats.expanded_objects += 1; + out.push(output::Count::from_data(oid, &obj)); + obj.into_tree_iter() + } + None => None, + } }, &mut traverse_delegate, ) .map_err(Error::TreeTraverse)?; - &traverse_delegate.objects + &traverse_delegate.non_trees } else { for commit_id in &parent_commit_ids { let parent_tree_id = { @@ -259,7 +270,7 @@ where &changes_delegate.objects }; for id in objects.iter() { - out.push(id_to_count(db, buf2, id, progress, stats)); + out.push(id_to_count(db, buf2, id, progress, stats, allow_pack_lookups)); } break; } @@ -280,13 +291,21 @@ where &mut tree_traversal_state, |oid, buf| { stats.decoded_objects += 1; - db.find_existing_tree_iter(oid, buf, cache).ok() + match db.find_existing(oid, buf, cache).ok() { + Some(obj) => { + progress.inc(); + stats.expanded_objects += 1; + out.push(output::Count::from_data(oid, &obj)); + obj.into_tree_iter() + } + None => None, + } }, &mut traverse_delegate, ) .map_err(Error::TreeTraverse)?; - for id in traverse_delegate.objects.iter() { - out.push(id_to_count(db, buf1, id, progress, stats)); + for id in traverse_delegate.non_trees.iter() { + out.push(id_to_count(db, buf1, id, progress, stats, allow_pack_lookups)); } break; } @@ -318,7 +337,7 @@ where mod tree { pub mod changes { - use crate::data::output::count::iter_from_objects::util::InsertImmutable; + use crate::data::output::count::objects::util::InsertImmutable; use git_diff::tree::{ visit::{Action, Change}, Visit, @@ -374,13 +393,13 @@ mod tree { } pub mod traverse { - use 
crate::data::output::count::iter_from_objects::util::InsertImmutable; + use crate::data::output::count::objects::util::InsertImmutable; use git_hash::ObjectId; use git_object::{bstr::BStr, immutable::tree::Entry}; use git_traverse::tree::visit::{Action, Visit}; pub struct AllUnseen<'a, H> { - pub objects: Vec, + pub non_trees: Vec, all_seen: &'a H, } @@ -390,12 +409,12 @@ mod tree { { pub fn new(all_seen: &'a H) -> Self { AllUnseen { - objects: Default::default(), + non_trees: Default::default(), all_seen, } } pub fn clear(&mut self) { - self.objects.clear(); + self.non_trees.clear(); } } @@ -414,7 +433,6 @@ mod tree { fn visit_tree(&mut self, entry: &Entry<'_>) -> Action { let inserted = self.all_seen.insert(entry.oid.to_owned()); if inserted { - self.objects.push(entry.oid.to_owned()); Action::Continue } else { Action::Skip @@ -424,7 +442,7 @@ mod tree { fn visit_nontree(&mut self, entry: &Entry<'_>) -> Action { let inserted = self.all_seen.insert(entry.oid.to_owned()); if inserted { - self.objects.push(entry.oid.to_owned()); + self.non_trees.push(entry.oid.to_owned()); } Action::Continue } @@ -432,6 +450,7 @@ mod tree { } } +#[inline] fn push_obj_count_unique( out: &mut Vec, all_seen: &impl util::InsertImmutable, @@ -452,18 +471,24 @@ fn push_obj_count_unique( } } +#[inline] fn id_to_count( db: &Find, buf: &mut Vec, id: &oid, progress: &mut impl Progress, statistics: &mut Outcome, + allow_pack_lookups: bool, ) -> output::Count { progress.inc(); statistics.expanded_objects += 1; output::Count { id: id.to_owned(), - entry_pack_location: db.location_by_oid(id, buf), + entry_pack_location: if allow_pack_lookups { + PackLocation::LookedUp(db.location_by_oid(id, buf)) + } else { + PackLocation::NotLookedUp + }, } } @@ -625,6 +650,7 @@ mod types { Interrupted, } } +use crate::data::output::count::PackLocation; use std::cell::RefCell; use std::collections::HashSet; pub use types::{Error, ObjectExpansion, Options, Outcome}; diff --git 
a/git-pack/src/data/output/entry/iter_from_counts.rs b/git-pack/src/data/output/entry/iter_from_counts.rs index 02b9315032..87ac7b20bd 100644 --- a/git-pack/src/data/output/entry/iter_from_counts.rs +++ b/git-pack/src/data/output/entry/iter_from_counts.rs @@ -57,20 +57,68 @@ where matches!(version, crate::data::Version::V2), "currently we can only write version 2" ); + let (chunk_size, thread_limit, _) = + parallel::optimize_chunk_size_and_thread_limit(chunk_size, Some(counts.len()), thread_limit, None); + let chunks = util::ChunkRanges::new(chunk_size, counts.len()); + { + let progress = Arc::new(parking_lot::Mutex::new(progress.add_child("resolving"))); + progress + .lock() + .init(Some(counts.len()), git_features::progress::count("counts")); + let enough_counts_present = counts.len() > 4_000; + let start = std::time::Instant::now(); + parallel::in_parallel_if( + || enough_counts_present, + chunks.clone(), + thread_limit, + |_n| Vec::::new(), + { + let progress = Arc::clone(&progress); + let counts = &counts; + let db = &db; + move |chunk_range, buf| { + let chunk = { + let c = &counts[chunk_range]; + let mut_ptr = c.as_ptr() as *mut output::Count; + // SAFETY: We know that 'chunks' is only non-overlapping slices, and this function owns `counts`. 
+ #[allow(unsafe_code)] + unsafe { + std::slice::from_raw_parts_mut(mut_ptr, c.len()) + } + }; + let chunk_size = chunk.len(); + for count in chunk { + use crate::data::output::count::PackLocation::*; + match count.entry_pack_location { + LookedUp(_) => continue, + NotLookedUp => count.entry_pack_location = LookedUp(db.location_by_oid(count.id, buf)), + } + } + progress.lock().inc_by(chunk_size); + Ok::<_, ()>(()) + } + }, + parallel::reduce::IdentityWithResult::<(), ()>::default(), + ) + .expect("infallible - we ignore none-existing objects"); + progress.lock().show_throughput(start); + } let counts_range_by_pack_id = match mode { Mode::PackCopyAndBaseObjects => { let mut progress = progress.add_child("sorting"); progress.init(Some(counts.len()), git_features::progress::count("counts")); let start = std::time::Instant::now(); + use crate::data::output::count::PackLocation::*; counts.sort_by(|lhs, rhs| match (&lhs.entry_pack_location, &rhs.entry_pack_location) { - (None, None) => Ordering::Equal, - (Some(_), None) => Ordering::Greater, - (None, Some(_)) => Ordering::Less, - (Some(lhs), Some(rhs)) => lhs + (LookedUp(None), LookedUp(None)) => Ordering::Equal, + (LookedUp(Some(_)), LookedUp(None)) => Ordering::Greater, + (LookedUp(None), LookedUp(Some(_))) => Ordering::Less, + (LookedUp(Some(lhs)), LookedUp(Some(rhs))) => lhs .pack_id .cmp(&rhs.pack_id) .then(lhs.pack_offset.cmp(&rhs.pack_offset)), + (_, _) => unreachable!("counts were resolved beforehand"), }); let mut index: Vec<(u32, std::ops::Range)> = Vec::new(); @@ -93,13 +141,10 @@ where } }; let counts = Arc::new(counts); - let (chunk_size, thread_limit, _) = - parallel::optimize_chunk_size_and_thread_limit(chunk_size, Some(counts.len()), thread_limit, None); - let chunks = util::ChunkRanges::new(chunk_size, counts.len()).enumerate(); let progress = Arc::new(parking_lot::Mutex::new(progress)); parallel::reduce::Stepwise::new( - chunks, + chunks.enumerate(), thread_limit, { let progress = Arc::clone(&progress); 
@@ -205,6 +250,7 @@ where } mod util { + #[derive(Clone)] pub struct ChunkRanges { cursor: usize, size: usize, diff --git a/git-pack/tests/pack/data/output/count_and_entries.rs b/git-pack/tests/pack/data/output/count_and_entries.rs index 09de2008ae..056ec01dbc 100644 --- a/git-pack/tests/pack/data/output/count_and_entries.rs +++ b/git-pack/tests/pack/data/output/count_and_entries.rs @@ -90,7 +90,7 @@ fn traversals() -> crate::Result { allow_thin_pack, ) in [ ( - count::iter_from_objects::ObjectExpansion::AsIs, + count::objects::ObjectExpansion::AsIs, Count { trees: 0, commits: 15, @@ -105,7 +105,7 @@ fn traversals() -> crate::Result { blobs: 0, tags: 1, }, - output::count::iter_from_objects::Outcome { + output::count::objects::Outcome { input_objects: 16, expanded_objects: 0, decoded_objects: 16, @@ -122,7 +122,7 @@ fn traversals() -> crate::Result { false, ), ( - count::iter_from_objects::ObjectExpansion::TreeAdditionsComparedToAncestor, + count::objects::ObjectExpansion::TreeAdditionsComparedToAncestor, Count { trees: 3, commits: 2, // todo: why more? @@ -137,7 +137,7 @@ fn traversals() -> crate::Result { blobs: 96, tags: 0, }, - output::count::iter_from_objects::Outcome { + output::count::objects::Outcome { input_objects: 1, expanded_objects: 102, decoded_objects: 18, @@ -154,7 +154,7 @@ fn traversals() -> crate::Result { true, ), ( - count::iter_from_objects::ObjectExpansion::TreeAdditionsComparedToAncestor, + count::objects::ObjectExpansion::TreeAdditionsComparedToAncestor, Count { trees: 5, commits: 2, // todo: why more? 
@@ -169,7 +169,7 @@ fn traversals() -> crate::Result { blobs: 96, tags: 0, }, - output::count::iter_from_objects::Outcome { + output::count::objects::Outcome { input_objects: 1, expanded_objects: 102, decoded_objects: 18, @@ -186,10 +186,10 @@ fn traversals() -> crate::Result { false, ), ( - count::iter_from_objects::ObjectExpansion::TreeContents, + count::objects::ObjectExpansion::TreeContents, whole_pack, whole_pack_obj_count, - output::count::iter_from_objects::Outcome { + output::count::objects::Outcome { input_objects: 16, expanded_objects: 852, decoded_objects: 57, @@ -206,10 +206,10 @@ fn traversals() -> crate::Result { false, ), ( - count::iter_from_objects::ObjectExpansion::TreeAdditionsComparedToAncestor, + count::objects::ObjectExpansion::TreeAdditionsComparedToAncestor, whole_pack, whole_pack_obj_count, - output::count::iter_from_objects::Outcome { + output::count::objects::Outcome { input_objects: 16, expanded_objects: 866, decoded_objects: 208, @@ -255,7 +255,7 @@ fn traversals() -> crate::Result { .map(Ok::<_, Infallible>), progress::Discard, &AtomicBool::new(false), - count::iter_from_objects::Options { + count::objects::Options { input_object_expansion: expansion_mode, thread_limit: deterministic_count_needs_single_thread, ..Default::default() diff --git a/git-repository/Cargo.toml b/git-repository/Cargo.toml index 9b7ba8d90a..43c8af586d 100644 --- a/git-repository/Cargo.toml +++ b/git-repository/Cargo.toml @@ -42,7 +42,7 @@ git-odb = { version ="0.20.0", path = "../git-odb" } git-hash = { version = "^0.5.0", path = "../git-hash" } git-object = { version ="0.12.0", path = "../git-object" } git-actor = { version ="0.3.1", path = "../git-actor" } -git-pack = { version ="0.8.0", path = "../git-pack" } +git-pack = { version ="^0.9.0", path = "../git-pack" } git-url = { version = "0.3.0", path = "../git-url", optional = true } git-traverse = { version ="0.7.0", path = "../git-traverse", optional = true } diff --git a/gitoxide-core/src/pack/create.rs 
b/gitoxide-core/src/pack/create.rs index d019359679..45fba11b4c 100644 --- a/gitoxide-core/src/pack/create.rs +++ b/gitoxide-core/src/pack/create.rs @@ -49,9 +49,9 @@ impl FromStr for ObjectExpansion { } } -impl From for pack::data::output::count::iter_from_objects::ObjectExpansion { +impl From for pack::data::output::count::objects::ObjectExpansion { fn from(v: ObjectExpansion) -> Self { - use pack::data::output::count::iter_from_objects::ObjectExpansion::*; + use pack::data::output::count::objects::ObjectExpansion::*; match v { ObjectExpansion::None => AsIs, ObjectExpansion::TreeTraversal => TreeContents, @@ -190,7 +190,7 @@ where input, progress, &interrupt::IS_INTERRUPTED, - pack::data::output::count::iter_from_objects::Options { + pack::data::output::count::objects::Options { thread_limit, chunk_size, input_object_expansion, @@ -300,7 +300,7 @@ fn print(stats: Statistics, format: OutputFormat, out: impl std::io::Write) -> a fn human_output( Statistics { counts: - pack::data::output::count::iter_from_objects::Outcome { + pack::data::output::count::objects::Outcome { input_objects, expanded_objects, decoded_objects, @@ -343,7 +343,7 @@ fn human_output( #[derive(Default)] #[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))] struct Statistics { - counts: pack::data::output::count::iter_from_objects::Outcome, + counts: pack::data::output::count::objects::Outcome, entries: pack::data::output::entry::iter_from_counts::Outcome, }