diff --git a/gix-object/src/data.rs b/gix-object/src/data.rs index abf5e1377f..e663603573 100644 --- a/gix-object/src/data.rs +++ b/gix-object/src/data.rs @@ -69,11 +69,7 @@ pub mod verify { /// hash of `self`. pub fn verify_checksum(&self, desired: impl AsRef) -> Result<(), Error> { let desired = desired.as_ref(); - let mut hasher = gix_features::hash::hasher(desired.kind()); - hasher.update(&crate::encode::loose_header(self.kind, self.data.len())); - hasher.update(self.data); - - let actual_id = gix_hash::ObjectId::from(hasher.digest()); + let actual_id = crate::compute_hash(desired.kind(), self.kind, self.data); if desired != actual_id { return Err(Error::ChecksumMismatch { desired: desired.into(), diff --git a/gix-object/src/lib.rs b/gix-object/src/lib.rs index 46aa7efd9a..5f6f9d33a9 100644 --- a/gix-object/src/lib.rs +++ b/gix-object/src/lib.rs @@ -375,3 +375,14 @@ pub mod decode { Ok((kind, size, size_end + 1)) } } + +/// A standalone function to compute a hash of kind `hash_kind` for an object of `object_kind` and its `data`. +pub fn compute_hash(hash_kind: gix_hash::Kind, object_kind: Kind, data: &[u8]) -> gix_hash::ObjectId { + let header = encode::loose_header(object_kind, data.len()); + + let mut hasher = gix_features::hash::hasher(hash_kind); + hasher.update(&header); + hasher.update(data); + + hasher.digest().into() +} diff --git a/gix-object/tests/loose.rs b/gix-object/tests/loose/mod.rs similarity index 100% rename from gix-object/tests/loose.rs rename to gix-object/tests/loose/mod.rs diff --git a/gix-object/tests/object.rs b/gix-object/tests/object.rs index f9cc6aa0c7..ed537f2339 100644 --- a/gix-object/tests/object.rs +++ b/gix-object/tests/object.rs @@ -4,6 +4,20 @@ use gix_hash::ObjectId; mod encode; mod immutable; +mod loose; + +#[test] +fn compute_hash() { + let hk = gix_hash::Kind::Sha1; + assert_eq!( + gix_object::compute_hash(hk, gix_object::Kind::Blob, &[]), + gix_hash::ObjectId::empty_blob(hk) + ); + assert_eq!( + gix_object::compute_hash(hk, gix_object::Kind::Tree, &[]), + gix_hash::ObjectId::empty_tree(hk) + ); +} type Result = std::result::Result>; diff --git a/gix-odb/src/lib.rs b/gix-odb/src/lib.rs index a63ea544f4..08b14238c7 100644 --- a/gix-odb/src/lib.rs +++ b/gix-odb/src/lib.rs @@ -65,8 +65,7 @@ pub fn sink(object_hash: gix_hash::Kind) -> Sink { } } -/// -pub mod sink; +mod sink; /// pub mod find; diff --git a/gix-odb/src/sink.rs b/gix-odb/src/sink.rs index 1befd6fdfb..44a4061517 100644 --- a/gix-odb/src/sink.rs +++ b/gix-odb/src/sink.rs @@ -30,7 +30,6 @@ impl crate::traits::Write for Sink { mut from: impl io::Read, ) -> Result { let mut size = size.try_into().expect("object size to fit into usize"); - use gix_features::hash::Sha1; let mut buf = [0u8; 8096]; let header = gix_object::encode::loose_header(kind, size); @@ -40,27 +39,24 @@ impl crate::traits::Write for Sink { } Ok(()) }; - match self.object_hash { - gix_hash::Kind::Sha1 => { - let mut hasher = Sha1::default(); - hasher.update(&header); - possibly_compress(&header)?; - while size != 0 { - let bytes = size.min(buf.len()); - from.read_exact(&mut buf[..bytes])?; - hasher.update(&buf[..bytes]); - possibly_compress(&buf[..bytes])?; - size -= bytes; - } - if let Some(compressor) = self.compressor.as_ref() { - let mut c = compressor.borrow_mut(); - c.flush()?; - c.reset(); - } + let mut hasher = gix_features::hash::hasher(self.object_hash); + hasher.update(&header); + possibly_compress(&header)?; - Ok(hasher.digest().into()) - } + while size != 0 { + let bytes = size.min(buf.len()); + from.read_exact(&mut buf[..bytes])?; + hasher.update(&buf[..bytes]); + possibly_compress(&buf[..bytes])?; + size -= bytes; + } + if let Some(compressor) = self.compressor.as_ref() { + let mut c = compressor.borrow_mut(); + c.flush()?; + c.reset(); } + + Ok(hasher.digest().into()) } } diff --git a/gix-odb/src/store_impls/loose/write.rs b/gix-odb/src/store_impls/loose/write.rs index e87462e4c3..912426bba1 100644 --- a/gix-odb/src/store_impls/loose/write.rs +++ b/gix-odb/src/store_impls/loose/write.rs @@ -98,6 +98,16 @@ impl crate::traits::Write for Store { type CompressedTempfile = deflate::Write; +/// Access +impl Store { + /// Return the path to the object with `id`. + /// + /// Note that is may not exist yet. + pub fn object_path(&self, id: &gix_hash::oid) -> PathBuf { + loose::hash_path(id, self.path.clone()) + } +} + impl Store { fn dest(&self) -> Result, Error> { Ok(hash::Write::new( @@ -126,7 +136,36 @@ impl Store { } } let file = file.into_inner(); - file.persist(&object_path).map_err(|err| Error::Persist { + let res = file.persist(&object_path); + // On windows, we assume that such errors are due to its special filesystem semantics, + // on any other platform that would be a legitimate error though. + #[cfg(windows)] + if let Err(err) = &res { + if err.error.kind() == std::io::ErrorKind::PermissionDenied + || err.error.kind() == std::io::ErrorKind::AlreadyExists + { + return Ok(id); + } + } + #[cfg(unix)] + if let Ok(mut perm) = object_path.metadata().map(|m| m.permissions()) { + use std::os::unix::fs::PermissionsExt; + /// For now we assume the default with standard umask. This can be more sophisticated, + /// but we have the bare minimum. + fn comp_mode(_mode: u32) -> u32 { + 0o444 + } + let new_mode = comp_mode(perm.mode()); + if (perm.mode() ^ new_mode) & !0o170000 != 0 { + perm.set_mode(new_mode); + std::fs::set_permissions(&object_path, perm).map_err(|err| Error::Io { + source: err, + message: "Failed to set permission bits", + path: object_path.clone(), + })?; + } + } + res.map_err(|err| Error::Persist { source: err, target: object_path, })?; diff --git a/gix-odb/tests/fixtures/generated-archives/.gitignore b/gix-odb/tests/fixtures/generated-archives/.gitignore new file mode 100644 index 0000000000..f6a0d3bb31 --- /dev/null +++ b/gix-odb/tests/fixtures/generated-archives/.gitignore @@ -0,0 +1 @@ +repo_with_loose_objects.tar.xz diff --git a/gix-odb/tests/fixtures/repo_with_loose_objects.sh b/gix-odb/tests/fixtures/repo_with_loose_objects.sh new file mode 100644 index 0000000000..cb055cb351 --- /dev/null +++ b/gix-odb/tests/fixtures/repo_with_loose_objects.sh @@ -0,0 +1,12 @@ +#!/bin/bash +set -eu -o pipefail + +git init -q + +git checkout -b main +touch this +git add this +git commit -q -m c1 +echo hello >> this +git commit -q -am c2 + diff --git a/gix-odb/tests/odb/store/loose.rs b/gix-odb/tests/odb/store/loose.rs index a8c9553481..d9eaba99a0 100644 --- a/gix-odb/tests/odb/store/loose.rs +++ b/gix-odb/tests/odb/store/loose.rs @@ -70,6 +70,62 @@ mod write { } Ok(()) } + + #[test] + #[cfg(unix)] + fn it_writes_objects_with_similar_permissions() -> crate::Result { + let hk = gix_hash::Kind::Sha1; + let git_store = loose::Store::at( + gix_testtools::scripted_fixture_read_only("repo_with_loose_objects.sh")?.join(".git/objects"), + hk, + ); + let expected_perm = git_store + .object_path(&gix_hash::ObjectId::empty_blob(hk)) + .metadata()? + .permissions(); + + let tmp = tempfile::TempDir::new()?; + let store = loose::Store::at(tmp.path(), hk); + store.write_buf(gix_object::Kind::Blob, &[])?; + let actual_perm = store + .object_path(&gix_hash::ObjectId::empty_blob(hk)) + .metadata()? + .permissions(); + assert_eq!( + actual_perm, expected_perm, + "we explicitly equalize permissions to be similar to what `git` would do" + ); + Ok(()) + } + + #[test] + fn collisions_do_not_cause_failure() -> crate::Result { + let dir = tempfile::tempdir()?; + + fn write_empty_trees(dir: &std::path::Path) { + let db = loose::Store::at(dir, gix_hash::Kind::Sha1); + let empty_tree = gix_object::Tree::empty(); + for _ in 0..2 { + let id = db.write(&empty_tree).expect("works"); + assert!(db.contains(id), "written objects are actually available"); + + let empty_blob = db.write_buf(gix_object::Kind::Blob, &[]).expect("works"); + assert!(db.contains(empty_blob), "written objects are actually available"); + let id = db + .write_stream(gix_object::Kind::Blob, 0, &mut [].as_slice()) + .expect("works"); + assert_eq!(id, empty_blob); + assert!(db.contains(empty_blob), "written objects are actually available"); + } + } + + gix_features::parallel::threads(|scope| { + scope.spawn(|| write_empty_trees(dir.path())); + scope.spawn(|| write_empty_trees(dir.path())); + }); + + Ok(()) + } } mod contains { diff --git a/gix-pack/src/index/traverse/mod.rs b/gix-pack/src/index/traverse/mod.rs index 42c820b0e1..68a502bcad 100644 --- a/gix-pack/src/index/traverse/mod.rs +++ b/gix-pack/src/index/traverse/mod.rs @@ -216,11 +216,7 @@ where E: std::error::Error + Send + Sync + 'static, { if check.object_checksum() { - let mut hasher = gix_features::hash::hasher(index_entry.oid.kind()); - hasher.update(&gix_object::encode::loose_header(object_kind, decompressed.len())); - hasher.update(decompressed); - - let actual_oid = gix_hash::ObjectId::from(hasher.digest()); + let actual_oid = gix_object::compute_hash(index_entry.oid.kind(), object_kind, decompressed); if actual_oid != index_entry.oid { return Err(Error::PackObjectMismatch { actual: actual_oid, diff --git a/gix-worktree/src/status/content.rs b/gix-worktree/src/status/content.rs index f68ab1640c..d47749ef8f 100644 --- a/gix-worktree/src/status/content.rs +++ b/gix-worktree/src/status/content.rs @@ -53,11 +53,7 @@ impl CompareBlobs for FastEq { return Ok(Some(())); } let blob = worktree_blob.read_data()?; - let header = loose_header(gix_object::Kind::Blob, blob.len()); - let mut hasher = hash::hasher(entry.id.kind()); - hasher.update(&header); - hasher.update(blob); - let file_hash: ObjectId = hasher.digest().into(); + let file_hash = gix_object::compute_hash(entry.id.kind(), gix_object::Kind::Blob, blob); Ok((entry.id != file_hash).then_some(())) } } diff --git a/gix/src/object/mod.rs b/gix/src/object/mod.rs index 206007889c..d0a37db6c7 100644 --- a/gix/src/object/mod.rs +++ b/gix/src/object/mod.rs @@ -90,6 +90,14 @@ impl<'repo> Object<'repo> { } } + /// Transform this object into a tag, or panic if it is none. + pub fn into_tag(self) -> Tag<'repo> { + match self.try_into() { + Ok(tag) => tag, + Err(this) => panic!("Tried to use {} as commit, but was {}", this.id, this.kind), + } + } + /// Transform this object into a commit, or return it as part of the `Err` if it is no commit. pub fn try_into_commit(self) -> Result, try_into::Error> { self.try_into().map_err(|this: Self| try_into::Error { diff --git a/gix/src/object/tag.rs b/gix/src/object/tag.rs index ce9d7360a6..77eaaa2594 100644 --- a/gix/src/object/tag.rs +++ b/gix/src/object/tag.rs @@ -1,6 +1,17 @@ use crate::{ext::ObjectIdExt, Tag}; impl<'repo> Tag<'repo> { + /// Decode the entire tag object and return it for accessing all tag information. + /// + /// This never allocates. + /// + /// Note that the returned commit object does make lookup easy and should be + /// used for successive calls to string-ish information to avoid decoding the object + /// more than once. + pub fn decode(&self) -> Result, gix_object::decode::Error> { + gix_object::TagRef::from_bytes(&self.data) + } + /// Decode this tag partially and return the id of its target. pub fn target_id(&self) -> Result, gix_object::decode::Error> { gix_object::TagRefIter::from_bytes(&self.data) diff --git a/gix/src/repository/object.rs b/gix/src/repository/object.rs index 60ceed4bf3..f4592475fb 100644 --- a/gix/src/repository/object.rs +++ b/gix/src/repository/object.rs @@ -1,5 +1,6 @@ #![allow(clippy::result_large_err)] use std::convert::TryInto; +use std::ops::DerefMut; use gix_hash::ObjectId; use gix_odb::{Find, FindExt, Write}; @@ -58,32 +59,71 @@ impl crate::Repository { } } + fn shared_empty_buf(&self) -> std::cell::RefMut<'_, Vec> { + let mut bufs = self.bufs.borrow_mut(); + if bufs.last().is_none() { + bufs.push(Vec::with_capacity(512)); + } + std::cell::RefMut::map(bufs, |bufs| { + let buf = bufs.last_mut().expect("we assure one is present"); + buf.clear(); + buf + }) + } + /// Write the given object into the object database and return its object id. + /// + /// Note that we hash the object in memory to avoid storing objects that are already present. That way, + /// we avoid writing duplicate objects using slow disks that will eventually have to be garbage collected. pub fn write_object(&self, object: impl gix_object::WriteTo) -> Result, object::write::Error> { + let mut buf = self.shared_empty_buf(); + object.write_to(buf.deref_mut())?; + + let oid = gix_object::compute_hash(self.object_hash(), object.kind(), &buf); + if self.objects.contains(oid) { + return Ok(oid.attach(self)); + } + self.objects - .write(object) + .write_buf(object.kind(), &buf) .map(|oid| oid.attach(self)) .map_err(Into::into) } /// Write a blob from the given `bytes`. + /// + /// We avoid writing duplicate objects to slow disks that will eventually have to be garbage collected by + /// pre-hashing the data, and checking if the object is already present. pub fn write_blob(&self, bytes: impl AsRef<[u8]>) -> Result, object::write::Error> { + let bytes = bytes.as_ref(); + let oid = gix_object::compute_hash(self.object_hash(), gix_object::Kind::Blob, bytes); + if self.objects.contains(oid) { + return Ok(oid.attach(self)); + } self.objects - .write_buf(gix_object::Kind::Blob, bytes.as_ref()) + .write_buf(gix_object::Kind::Blob, bytes) .map(|oid| oid.attach(self)) } /// Write a blob from the given `Read` implementation. + /// + /// Note that we hash the object in memory to avoid storing objects that are already present. That way, + /// we avoid writing duplicate objects using slow disks that will eventually have to be garbage collected. + /// + /// If that is prohibitive, use the object database directly. pub fn write_blob_stream( &self, mut bytes: impl std::io::Read + std::io::Seek, ) -> Result, object::write::Error> { - let current = bytes.stream_position()?; - let len = bytes.seek(std::io::SeekFrom::End(0))? - current; - bytes.seek(std::io::SeekFrom::Start(current))?; + let mut buf = self.shared_empty_buf(); + std::io::copy(&mut bytes, buf.deref_mut())?; + let oid = gix_object::compute_hash(self.object_hash(), gix_object::Kind::Blob, &buf); + if self.objects.contains(oid) { + return Ok(oid.attach(self)); + } self.objects - .write_stream(gix_object::Kind::Blob, len, bytes) + .write_buf(gix_object::Kind::Blob, &buf) .map(|oid| oid.attach(self)) } diff --git a/gix/tests/fixtures/generated-archives/make_packed_and_loose.tar.xz b/gix/tests/fixtures/generated-archives/make_packed_and_loose.tar.xz new file mode 100644 index 0000000000..8b9c2b871e --- /dev/null +++ b/gix/tests/fixtures/generated-archives/make_packed_and_loose.tar.xz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:930d562f57aa9c1a1894d715dbb3e4f70beeb27b356b9c88a9d39eb7d211dc6f +size 10872 diff --git a/gix/tests/fixtures/make_packed_and_loose.sh b/gix/tests/fixtures/make_packed_and_loose.sh new file mode 100644 index 0000000000..180cd79c27 --- /dev/null +++ b/gix/tests/fixtures/make_packed_and_loose.sh @@ -0,0 +1,17 @@ +#!/bin/bash +set -eu -o pipefail + +git init -q + +git checkout -b main +touch this +git add this +git commit -q -m c1 +echo hello >> this +git commit -q -am c2 + +git gc + +echo hello >> this +echo hello >> that +git add . && git commit -m "loose" diff --git a/gix/tests/repository/object.rs b/gix/tests/repository/object.rs index a651ae2af9..aeb3dd4036 100644 --- a/gix/tests/repository/object.rs +++ b/gix/tests/repository/object.rs @@ -54,6 +54,55 @@ mod write_blob { } } +#[test] +fn writes_avoid_io_using_duplicate_check() -> crate::Result { + let repo = crate::named_repo("make_packed_and_loose.sh")?; + let store = gix::odb::loose::Store::at(repo.git_dir().join("objects"), repo.object_hash()); + let loose_count = store.iter().count(); + assert_eq!(loose_count, 3, "there are some loose objects"); + assert_eq!( + repo.objects.iter()?.count() - loose_count, + 6, + "there is packed objects as well" + ); + + for id in repo.objects.iter()? { + let id = id?; + let obj = repo.find_object(id)?; + use gix_object::Kind::*; + match obj.kind { + Commit => { + let commit = obj.into_commit(); + let new_id = repo.write_object(commit.decode()?)?; + assert_eq!(new_id, id); + } + Tag => { + let tag = obj.into_tag(); + let new_id = repo.write_object(tag.decode()?)?; + assert_eq!(new_id, id); + } + Tree => { + let tree = obj.into_tree(); + let new_id = repo.write_object(tree.decode()?)?; + assert_eq!(new_id, id); + } + Blob => { + let new_id = repo.write_blob(&obj.data)?; + assert_eq!(new_id, id); + let new_id = repo.write_blob_stream(std::io::Cursor::new(&obj.data))?; + assert_eq!(new_id, id); + } + } + } + + assert_eq!( + store.iter().count(), + loose_count, + "no new object was written as all of them already existed" + ); + Ok(()) +} + mod find { use gix_pack::Find; diff --git a/src/plumbing/progress.rs b/src/plumbing/progress.rs index 807c49bebc..005a9b06be 100644 --- a/src/plumbing/progress.rs +++ b/src/plumbing/progress.rs @@ -214,6 +214,22 @@ static GIT_CONFIG: &[Record] = &[ config: "core.eol", usage: Planned {note: Some("needed for filters, but also for doing diffs correctly")} }, + Record { + config: "core.fsync", + usage: Planned {note: Some("more safety for disk write operations is a good thing, definitely on the server")} + }, + Record { + config: "core.fsyncMethod", + usage: Planned {note: Some("needed to support `core.fsync`")} + }, + Record { + config: "core.sharedRepository", + usage: NotPlanned {reason: "on demand"} + }, + Record { + config: "core.createObject", + usage: NotPlanned {reason: "it's valuable not to do writes unless needed on the lowest level, but we hope to avoid issues by not writing duplicate objects in the first place"} + }, Record { config: "clone.filterSubmodules,", usage: Planned {