Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improving counting performance #170

Merged
merged 10 commits into from
Aug 23, 2021
10 changes: 8 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion git-odb/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ all-features = true
git-features = { version = "^0.16.0", path = "../git-features", features = ["rustsha1", "walkdir", "zlib"] }
git-hash = { version = "^0.5.0", path = "../git-hash" }
git-object = { version ="0.12.0", path = "../git-object" }
git-pack = { version ="0.8.0", path = "../git-pack" }
git-pack = { version ="^0.9.0", path = "../git-pack" }

btoi = "0.4.2"
tempfile = "3.1.0"
Expand Down
2 changes: 2 additions & 0 deletions git-odb/src/store/compound/init.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ impl compound::Store {
p.extension().unwrap_or_default() == "idx"
&& p.file_name().unwrap_or_default().to_string_lossy().starts_with("pack-")
})
// TODO: make this configurable, git for instance sorts by modification date
// https://github.com/libgit2/libgit2/blob/main/src/odb_pack.c#L41-L158
.map(|(p, md)| pack::Bundle::at(p).map(|b| (b, md.len())))
.collect::<Result<Vec<_>, _>>()?;
packs_and_sizes.sort_by_key(|e| e.1);
Expand Down
6 changes: 3 additions & 3 deletions git-pack/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "git-pack"
version = "0.8.2"
version = "0.9.0"
repository = "https://github.com/Byron/gitoxide"
authors = ["Sebastian Thiel <sebastian.thiel@icloud.com>"]
license = "MIT/Apache-2.0"
Expand All @@ -13,7 +13,7 @@ doctest = false

[features]
pack-cache-lru-static = ["uluru"]
pack-cache-lru-dynamic = ["memory-lru"]
pack-cache-lru-dynamic = ["clru"]
serde1 = ["serde", "git-object/serde1"]
internal-testing-git-features-parallel = ["git-features/parallel"]
internal-testing-to-avoid-being-run-by-cargo-test-all = []
Expand Down Expand Up @@ -49,7 +49,7 @@ bytesize = "1.0.1"
parking_lot = { version = "0.11.0", default-features = false }
thiserror = "1.0.26"
uluru = { version = "3.0.0", optional = true }
memory-lru = { version = "0.1.0", optional = true }
clru = { version = "0.5.0", optional = true }
dashmap = "4.0.2"

[dev-dependencies]
Expand Down
38 changes: 30 additions & 8 deletions git-pack/src/cache.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,21 +44,28 @@ pub mod lru {
#[cfg(feature = "pack-cache-lru-dynamic")]
mod memory {
use super::DecodeEntry;
use clru::WeightScale;
use std::num::NonZeroUsize;

struct Entry {
data: Vec<u8>,
kind: git_object::Kind,
compressed_size: usize,
}

impl memory_lru::ResidentSize for Entry {
fn resident_size(&self) -> usize {
self.data.len()
type Key = (u32, u64);
struct CustomScale;

impl WeightScale<Key, Entry> for CustomScale {
fn weight(&self, _key: &Key, value: &Entry) -> usize {
value.data.len()
}
}

/// An LRU cache with hash map backing and an eviction rule based on the memory usage for object data in bytes.
pub struct MemoryCappedHashmap {
inner: memory_lru::MemoryLruCache<(u32, u64), Entry>,
inner: clru::CLruCache<Key, Entry, std::collections::hash_map::RandomState, CustomScale>,
free_list: Vec<Vec<u8>>,
debug: git_features::cache::Debug,
}

Expand All @@ -67,7 +74,11 @@ pub mod lru {
/// object data.
pub fn new(memory_cap_in_bytes: usize) -> MemoryCappedHashmap {
MemoryCappedHashmap {
inner: memory_lru::MemoryLruCache::new(memory_cap_in_bytes),
inner: clru::CLruCache::with_config(
clru::CLruCacheConfig::new(NonZeroUsize::new(memory_cap_in_bytes).expect("non zero"))
.with_scale(CustomScale),
),
free_list: Vec::new(),
debug: git_features::cache::Debug::new(format!("MemoryCappedHashmap({}B)", memory_cap_in_bytes)),
}
}
Expand All @@ -76,14 +87,25 @@ pub mod lru {
impl DecodeEntry for MemoryCappedHashmap {
fn put(&mut self, pack_id: u32, offset: u64, data: &[u8], kind: git_object::Kind, compressed_size: usize) {
self.debug.put();
self.inner.insert(
if let Ok(Some(previous_entry)) = self.inner.put_with_weight(
(pack_id, offset),
Entry {
data: Vec::from(data),
data: self
.free_list
.pop()
.map(|mut v| {
v.clear();
v.resize(data.len(), 0);
v.copy_from_slice(data);
v
})
.unwrap_or_else(|| Vec::from(data)),
kind,
compressed_size,
},
)
) {
self.free_list.push(previous_entry.data)
}
}

fn get(&mut self, pack_id: u32, offset: u64, out: &mut Vec<u8>) -> Option<(git_object::Kind, usize)> {
Expand Down
35 changes: 31 additions & 4 deletions git-pack/src/data/output/count/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,19 +12,46 @@ pub struct Count {
/// The hash of the object to write
pub id: ObjectId,
/// A way to locate a pack entry in the object database, only available if the object is in a pack.
pub entry_pack_location: Option<crate::bundle::Location>,
pub entry_pack_location: PackLocation,
}

/// Specifies how the pack location was handled during counting
#[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone)]
#[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))]
pub enum PackLocation {
/// We did not lookup this object
NotLookedUp,
/// The object was looked up and there may be a location in a pack, along with enty information
LookedUp(Option<crate::bundle::Location>),
}

impl PackLocation {
/// Directly go through to LookedUp variant, panic otherwise
pub fn is_none(&self) -> bool {
match self {
PackLocation::LookedUp(opt) => opt.is_none(),
PackLocation::NotLookedUp => unreachable!("must have been resolved"),
}
}
/// Directly go through to LookedUp variant, panic otherwise
pub fn as_ref(&self) -> Option<&crate::bundle::Location> {
match self {
PackLocation::LookedUp(opt) => opt.as_ref(),
PackLocation::NotLookedUp => unreachable!("must have been resolved"),
}
}
}

impl Count {
/// Create a new instance from the given `oid` and its corresponding git `obj`ect data.
pub fn from_data(oid: impl Into<ObjectId>, obj: &data::Object<'_>) -> Self {
Count {
id: oid.into(),
entry_pack_location: obj.pack_location.clone(),
entry_pack_location: PackLocation::LookedUp(obj.pack_location.clone()),
}
}
}

///
pub mod iter_from_objects;
pub use iter_from_objects::{objects, objects_unthreaded};
pub mod objects;
pub use objects::{objects, objects_unthreaded};
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ where
cache,
progress,
should_interrupt,
true,
)
}
},
Expand Down Expand Up @@ -125,6 +126,7 @@ where
pack_cache,
&mut progress,
should_interrupt,
false,
)
}

Expand All @@ -139,6 +141,7 @@ fn expand_inner<Find, IterErr, Oid>(
cache: &mut impl crate::cache::DecodeEntry,
progress: &mut impl Progress,
should_interrupt: &AtomicBool,
allow_pack_lookups: bool,
) -> Result<find::existing::Error<Find::Error>, IterErr>
where
Find: crate::Find + Send + Sync,
Expand Down Expand Up @@ -205,12 +208,20 @@ where
&mut tree_traversal_state,
|oid, buf| {
stats.decoded_objects += 1;
db.find_existing_tree_iter(oid, buf, cache).ok()
match db.find_existing(oid, buf, cache).ok() {
Some(obj) => {
progress.inc();
stats.expanded_objects += 1;
out.push(output::Count::from_data(oid, &obj));
obj.into_tree_iter()
}
None => None,
}
},
&mut traverse_delegate,
)
.map_err(Error::TreeTraverse)?;
&traverse_delegate.objects
&traverse_delegate.non_trees
} else {
for commit_id in &parent_commit_ids {
let parent_tree_id = {
Expand Down Expand Up @@ -259,7 +270,7 @@ where
&changes_delegate.objects
};
for id in objects.iter() {
out.push(id_to_count(db, buf2, id, progress, stats));
out.push(id_to_count(db, buf2, id, progress, stats, allow_pack_lookups));
}
break;
}
Expand All @@ -280,13 +291,21 @@ where
&mut tree_traversal_state,
|oid, buf| {
stats.decoded_objects += 1;
db.find_existing_tree_iter(oid, buf, cache).ok()
match db.find_existing(oid, buf, cache).ok() {
Some(obj) => {
progress.inc();
stats.expanded_objects += 1;
out.push(output::Count::from_data(oid, &obj));
obj.into_tree_iter()
}
None => None,
}
},
&mut traverse_delegate,
)
.map_err(Error::TreeTraverse)?;
for id in traverse_delegate.objects.iter() {
out.push(id_to_count(db, buf1, id, progress, stats));
for id in traverse_delegate.non_trees.iter() {
out.push(id_to_count(db, buf1, id, progress, stats, allow_pack_lookups));
}
break;
}
Expand Down Expand Up @@ -318,7 +337,7 @@ where

mod tree {
pub mod changes {
use crate::data::output::count::iter_from_objects::util::InsertImmutable;
use crate::data::output::count::objects::util::InsertImmutable;
use git_diff::tree::{
visit::{Action, Change},
Visit,
Expand Down Expand Up @@ -374,13 +393,13 @@ mod tree {
}

pub mod traverse {
use crate::data::output::count::iter_from_objects::util::InsertImmutable;
use crate::data::output::count::objects::util::InsertImmutable;
use git_hash::ObjectId;
use git_object::{bstr::BStr, immutable::tree::Entry};
use git_traverse::tree::visit::{Action, Visit};

pub struct AllUnseen<'a, H> {
pub objects: Vec<ObjectId>,
pub non_trees: Vec<ObjectId>,
all_seen: &'a H,
}

Expand All @@ -390,12 +409,12 @@ mod tree {
{
pub fn new(all_seen: &'a H) -> Self {
AllUnseen {
objects: Default::default(),
non_trees: Default::default(),
all_seen,
}
}
pub fn clear(&mut self) {
self.objects.clear();
self.non_trees.clear();
}
}

Expand All @@ -414,7 +433,6 @@ mod tree {
fn visit_tree(&mut self, entry: &Entry<'_>) -> Action {
let inserted = self.all_seen.insert(entry.oid.to_owned());
if inserted {
self.objects.push(entry.oid.to_owned());
Action::Continue
} else {
Action::Skip
Expand All @@ -424,14 +442,15 @@ mod tree {
fn visit_nontree(&mut self, entry: &Entry<'_>) -> Action {
let inserted = self.all_seen.insert(entry.oid.to_owned());
if inserted {
self.objects.push(entry.oid.to_owned());
self.non_trees.push(entry.oid.to_owned());
}
Action::Continue
}
}
}
}

#[inline]
fn push_obj_count_unique(
out: &mut Vec<output::Count>,
all_seen: &impl util::InsertImmutable<ObjectId>,
Expand All @@ -452,18 +471,24 @@ fn push_obj_count_unique(
}
}

#[inline]
fn id_to_count<Find: crate::Find>(
db: &Find,
buf: &mut Vec<u8>,
id: &oid,
progress: &mut impl Progress,
statistics: &mut Outcome,
allow_pack_lookups: bool,
) -> output::Count {
progress.inc();
statistics.expanded_objects += 1;
output::Count {
id: id.to_owned(),
entry_pack_location: db.location_by_oid(id, buf),
entry_pack_location: if allow_pack_lookups {
PackLocation::LookedUp(db.location_by_oid(id, buf))
} else {
PackLocation::NotLookedUp
},
}
}

Expand Down Expand Up @@ -625,6 +650,7 @@ mod types {
Interrupted,
}
}
use crate::data::output::count::PackLocation;
use std::cell::RefCell;
use std::collections::HashSet;
pub use types::{Error, ObjectExpansion, Options, Outcome};
Expand Down