Skip to content

Commit

Permalink
Improve RevisionGraph log parsing
Browse files Browse the repository at this point in the history
Prior code used a more complex approach and did a lot more string
allocation. This commit introduces a regex, and modifies the git
log format so that the only \0 character in the output is the one
that separates log entries.

We also know the dates are numeric, so we can parse them confidently
(without TryParse).

The file name code was unused and has been removed.

The state machine is no longer needed as each log item
is separated by a \0 and can therefore be processed in one
go via the regex.
  • Loading branch information
drewnoakes committed May 27, 2018
1 parent fb8358d commit 1f6b0ca
Showing 1 changed file with 57 additions and 121 deletions.
178 changes: 57 additions & 121 deletions GitCommands/RevisionGraph.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Threading;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using GitUI;
using GitUIPluginInterfaces;
Expand All @@ -28,11 +28,8 @@ public enum RefFilterOptions

public sealed class RevisionGraph : IDisposable
{
private static readonly char[] _hexChars = "0123456789ABCDEFabcdef".ToCharArray();
private static readonly char[] ShellGlobCharacters = { '?', '*', '[' };

private const string CommitBegin = "<(__BEGIN_COMMIT__)>"; // Something unlikely to show up in a comment

public event EventHandler Exited;
public event EventHandler<RevisionGraphUpdatedEventArgs> Updated;
public event EventHandler<AsyncErrorEventArgs> Error;
Expand All @@ -41,11 +38,9 @@ public sealed class RevisionGraph : IDisposable
private readonly GitModule _module;

[CanBeNull] private Dictionary<string, List<IGitRef>> _refs;
private ReadStep _nextStep = ReadStep.Commit;
private GitRevision _revision;
public RefFilterOptions RefsOptions = RefFilterOptions.All | RefFilterOptions.Boundary;
private string _selectedBranchName;
private string _previousFileName;

public string RevisionFilter { get; set; } = string.Empty;
public string PathFilter { get; set; } = string.Empty;
Expand Down Expand Up @@ -77,6 +72,24 @@ public void Execute()
});
}

/// <summary>
/// Matches one complete entry of the git log output (entries are separated by a NUL character
/// and handed to this regex one at a time).
/// </summary>
/// <remarks>
/// The header fields appear one per line, in this order: object id, parent ids, tree id,
/// author name, author email, author date, committer name, committer email, commit date and
/// message encoding. The raw commit message follows and is split into a subject (its first line)
/// and an optional body.
/// <para>
/// The subject and the name/email fields are allowed to be empty so that a commit with an empty
/// message or a blank identity field is still parsed instead of being dropped from the graph.
/// </para>
/// <para>
/// Only named groups capture. The body uses a character class rather than an alternation inside
/// a capturing group, which avoids recording one capture per character of the commit message.
/// </para>
/// </remarks>
private static readonly Regex _commitRegex = new Regex(@"
    ^
    (?<objectid>[0-9a-f]{40})\n
    (?:(?<parent>[0-9a-f]{40})\ ?)*\n   # note root commits have no parent
    (?<tree>[0-9a-f]{40})\n
    (?<authorname>[^\n]*)\n
    (?<authoremail>[^\n]*)\n
    (?<authordate>\d+)\n
    (?<committername>[^\n]*)\n
    (?<committeremail>[^\n]*)\n
    (?<commitdate>\d+)\n
    (?<encoding>[^\n]*)\n
    (?<subject>[^\n]*)                  # first line of the message, empty when there is no message
    (?:\n+(?<body>[\s\S]*))?            # everything after the blank line(s) following the subject
    $
    ",
    RegexOptions.Compiled | RegexOptions.IgnorePatternWhitespace);

private async Task ExecuteAsync()
{
ThreadHelper.ThrowIfNotOnUIThread();
Expand All @@ -91,7 +104,6 @@ private async Task ExecuteAsync()
_refs = GetRefs().ToDictionaryOfList(head => head.Guid);

const string fullFormat =
/* <COMMIT> */ CommitBegin + "%n" +
/* Hash */ "%H%n" +
/* Parents */ "%P%n" +
/* Tree */ "%T%n" +
Expand All @@ -100,14 +112,9 @@ private async Task ExecuteAsync()
/* Author Date */ "%at%n" +
/* Committer Name */ "%cN%n" +
/* Committer Email */ "%cE%n" +
/* Committer Date */ "%ct%n" +
/* Commit message encoding */ "%e%x00" + // there is a bug: git does not recode commit message when format is given
/* Commit Subject */ "%s%x00" +
/* Commit Body */ "%B%x00";

// NOTE:
// when called from FileHistory and FollowRenamesInFileHistory is enabled the "--name-only" argument is set.
// the filename is the next line after the commit-format defined above.
/* Commit Date */ "%ct%n" +
/* Commit message encoding */ "%e%n" + // there is a bug: git does not recode commit message when format is given
/* Commit Body */ "%B";

var arguments = new ArgumentBuilder
{
Expand Down Expand Up @@ -146,26 +153,20 @@ private async Task ExecuteAsync()
return;
}

_previousFileName = null;

_nextStep = ReadStep.Commit;
foreach (string data in p.StandardOutput.ReadNullTerminatedLines())
foreach (var logItem in p.StandardOutput.ReadNullTerminatedLines())
{
if (token.IsCancellationRequested)
{
break;
}

DataReceived(data);
ProcessLogItem(logItem);
}

await ThreadHelper.JoinableTaskFactory.SwitchToMainThreadAsync(token);

if (!token.IsCancellationRequested)
{
FinishRevision();
_previousFileName = null;

Exited?.Invoke(this, EventArgs.Empty);
}
}
Expand Down Expand Up @@ -197,122 +198,57 @@ private IReadOnlyList<IGitRef> GetRefs()
return result;
}

private void FinishRevision()
private void ProcessLogItem(string s)
{
if (_revision != null && _revision.Guid == null)
{
_revision = null;
}
s = GitModule.ReEncodeString(s, GitModule.LosslessEncoding, _module.LogOutputEncoding);

var match = _commitRegex.Match(s);

if (_revision != null)
if (!match.Success || match.Index != 0)
{
if (_revision.Name == null)
{
_revision.Name = _previousFileName;
}
else
{
_previousFileName = _revision.Name;
}
Debug.Fail("Commit regex did not match");
return;
}

if (_revision.Guid.Trim(_hexChars).Length == 0 &&
(RevisionPredicate == null || RevisionPredicate(_revision)))
{
// Remove full commit message to reduce memory consumption (28% for a repo with 69K commits)
// Full commit message is used in InMemFilter but later it's not needed
_revision.Body = null;
var encoding = match.Groups["encoding"].Value;

RevisionCount++;
Updated?.Invoke(this, new RevisionGraphUpdatedEventArgs(_revision));
}
}
}
_revision = new GitRevision(null)
{
// TODO use ObjectId (when merged) and parse directly from underlying string, avoiding copy
Guid = match.Groups["objectid"].Value,
ParentGuids = match.Groups["parent"].Captures.OfType<Capture>().Select(c => c.Value).ToArray(),
TreeGuid = match.Groups["tree"].Value,
Author = match.Groups["authorname"].Value,
AuthorEmail = match.Groups["authoremail"].Value,
AuthorDate = DateTimeUtils.ParseUnixTime(match.Groups["authordate"].Value),
Committer = match.Groups["committername"].Value,
CommitterEmail = match.Groups["committeremail"].Value,
CommitDate = DateTimeUtils.ParseUnixTime(match.Groups["commitdate"].Value),
MessageEncoding = encoding,
Subject = _module.ReEncodeCommitMessage(match.Groups["subject"].Value, encoding),
Body = _module.ReEncodeCommitMessage(match.Groups["body"].Value, encoding)
};

private void DataReceived(string data)
{
if (data.StartsWith(CommitBegin))
if (_refs.TryGetValue(_revision.Guid, out var gitRefs))
{
// a new commit finalizes the last revision
FinishRevision();
_nextStep = ReadStep.Commit;
_revision.Refs.AddRange(gitRefs);
}

switch (_nextStep)
if (RevisionPredicate == null || RevisionPredicate(_revision))
{
case ReadStep.Commit:
data = GitModule.ReEncodeString(data, GitModule.LosslessEncoding, _module.LogOutputEncoding);

string[] lines = data.Split('\n');
Debug.Assert(lines.Length == 11, "lines.Length == 11");
Debug.Assert(lines[0] == CommitBegin, "lines[0] == CommitBegin");

_revision = new GitRevision(lines[1]);
// Remove full commit message to reduce memory consumption (28% for a repo with 69K commits)
// Full commit message is used in InMemFilter but later it's not needed
_revision.Body = null;

if (_refs.TryGetValue(_revision.Guid, out var gitRefs))
{
_revision.Refs.AddRange(gitRefs);
}

// RemoveEmptyEntries is required for root commits. They should have empty list of parents.
_revision.ParentGuids = lines[2].Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
_revision.TreeGuid = lines[3];

_revision.Author = lines[4];
_revision.AuthorEmail = lines[5];
{
if (DateTimeUtils.TryParseUnixTime(lines[6], out var dateTime))
{
_revision.AuthorDate = dateTime;
}
}

_revision.Committer = lines[7];
_revision.CommitterEmail = lines[8];
{
if (DateTimeUtils.TryParseUnixTime(lines[9], out var dateTime))
{
_revision.CommitDate = dateTime;
}
}

_revision.MessageEncoding = lines[10];
break;

case ReadStep.CommitSubject:
_revision.Subject = _module.ReEncodeCommitMessage(data, _revision.MessageEncoding);
break;

case ReadStep.CommitBody:
_revision.Body = _module.ReEncodeCommitMessage(data, _revision.MessageEncoding);
break;

case ReadStep.FileName:
if (!string.IsNullOrEmpty(data))
{
// Git adds \n between the format string (ends with \0 in our case)
// and the first file name. So, we need to remove it from the file name.
data = GitModule.ReEncodeFileNameFromLossless(data);
_revision.Name = data.TrimStart('\n');
}

break;
RevisionCount++;
Updated?.Invoke(this, new RevisionGraphUpdatedEventArgs(_revision));
}

_nextStep++;
}

/// <summary>
/// Disposes the cancellation token sequence owned by this instance.
/// </summary>
public void Dispose() => _cancellationTokenSequence.Dispose();

private enum ReadStep
{
Commit,
CommitSubject,
CommitBody,
FileName,
}
}

public sealed class RevisionGraphUpdatedEventArgs : EventArgs
Expand Down

0 comments on commit 1f6b0ca

Please sign in to comment.