# Simple Docstrum v2
## Sources
- [_The Document Spectrum for Page Layout Analysis_](https://ieeexplore.ieee.org/document/244677), Lawrence O’Gorman
- https://inside.mines.edu/~whoff/courses/EENG510/projects/2015/Hoch.pdf
- https://en.wikipedia.org/wiki/Document_layout_analysis#Example_of_a_bottom_up_approach


- https://github.com/UglyToad/PdfPig

## Steps
0. Open pdf document, extract words and preprocess
1. Estimate within-line and between-line spacing
2. Get lines
3. Get Paragraphs

In [1]:
#r "nuget:PdfPig,0.1.2-alpha003"

In [2]:
using UglyToad.PdfPig;
using UglyToad.PdfPig.Content;
using UglyToad.PdfPig.Core;
using UglyToad.PdfPig.DocumentLayoutAnalysis;
using UglyToad.PdfPig.DocumentLayoutAnalysis.WordExtractor;
using UglyToad.PdfPig.DocumentLayoutAnalysis.ReadingOrderDetector;
using XPlot.Plotly;
using System.IO;
using System.Linq;

In [3]:
/// <summary>
/// The bounds for the angle between two words for them to have a certain type of relationship.
/// <para>Both bounds are expressed in degrees and are inclusive.</para>
/// </summary>
public readonly struct AngleBounds
{
    /// <summary>
    /// The lower bound in degrees (inclusive).
    /// </summary>
    public double Lower { get; }

    /// <summary>
    /// The upper bound in degrees (inclusive).
    /// </summary>
    public double Upper { get; }

    /// <summary>
    /// Create a new <see cref="AngleBounds"/>.
    /// </summary>
    /// <param name="lowerBound">The lower bound in degrees. Must be strictly smaller than <paramref name="upperBound"/>.</param>
    /// <param name="upperBound">The upper bound in degrees.</param>
    /// <exception cref="ArgumentException">
    /// Thrown when a bound is NaN, or when the lower bound is not strictly smaller than the upper bound.
    /// </exception>
    public AngleBounds(double lowerBound, double upperBound)
    {
        // Every comparison with NaN is false, so NaN would slip through the ordering
        // check below and yield bounds that can never contain any angle.
        if (double.IsNaN(lowerBound) || double.IsNaN(upperBound))
        {
            throw new ArgumentException("The bounds should not be NaN.");
        }

        if (lowerBound >= upperBound)
        {
            throw new ArgumentException("The lower bound should be smaller than the upper bound.");
        }

        Lower = lowerBound;
        Upper = upperBound;
    }

    /// <summary>
    /// Whether the bounds contain the angle (both ends included).
    /// </summary>
    /// <param name="angle">The angle to test, in degrees.</param>
    /// <returns><c>true</c> if Lower ≤ angle ≤ Upper; <c>false</c> otherwise (including when the angle is NaN).</returns>
    public bool Contains(double angle)
    {
        return angle >= Lower && angle <= Upper;
    }
}

#### Parameters

In [4]:
// --- Histogram settings ---------------------------------------------------

// Bin size of the within-line distances histogram.
var wlBinSize = 10;

// Bin size of the between-line distances histogram.
var blBinSize = 10;

// --- Distance thresholds --------------------------------------------------

// Text lines are blocked together only when their perpendicular distance is at most
// this multiplier times the between-line distance found by the analysis.
var betweenLineMultiplier = 1.3;

// Words are chained into a line only when their Euclidean distance is at most
// this multiplier times the within-line distance found by the analysis.
var withinLineMultiplier = 3.0;

// --- Angle constraints (degrees) ------------------------------------------

// Allowed angle for two words to be neighbours on the same line.
var wlBounds = new AngleBounds(-30, 30);

// Allowed angle for two words to be neighbours on separate lines.
var blBounds = new AngleBounds(45, 135);

// Allowed angular difference between two lines for them to end up in the same block,
// i.e. how parallel the two lines need to be.
var angularDifferenceBounds = new AngleBounds(-30, 30);

// --- Miscellaneous --------------------------------------------------------

// Tolerance used when testing equalities.
var epsilon = 1e-3;

// Separator inserted between words when building a line's text.
var wordSeparator = " ";

// Separator inserted between lines when building a block's text.
var lineSeparator = "\n";

### 0. Open pdf document, extract words and preprocess

In [5]:
// Other sample documents that can be used instead:
//string pdfName = @"doc/Random 2 Columns Lists Hyph.pdf";
//string pdfName = @"doc/Random 2 Columns Lists Hyph - Justified.pdf";
string pdfName = @"doc/complex rotated.pdf";

// Number of the page to analyse, as passed to GetPage.
int pageNo = 1;

// Page size and raw words, filled in while the document is open.
double width = 0;
double height = 0;
List<Word> wordsRaw = new List<Word>();

// The document is only needed to read the page; it is disposed right after.
using (PdfDocument document = PdfDocument.Open(pdfName))
{
    var page = document.GetPage(pageNo);
    wordsRaw = page.GetWords(NearestNeighbourWordExtractor.Instance).ToList();
    height = page.Height;
    width = page.Width;
}

// Drop the words whose text is empty or white space only.
var words = wordsRaw
    .Where(candidate => !string.IsNullOrWhiteSpace(candidate.Text.Trim()))
    .ToList();

#### Plot words

In [6]:
// One scatter trace per drawn element: the page outline, each word's bounding box,
// and three marker traces for the corners used later by the analysis.
var graphs = new List<Graph.Scatter>();

// Page outline. The Plotly scatter mode flag is "lines" ("line" is not a valid value).
graphs.Add(new Graph.Scatter()
{
    x = new[] { 0, 0, width, width, 0 },
    y = new[] { 0, height, height, 0, 0 },
    mode = "lines",
    name = "",
    text = "page",
    marker = new Graph.Marker() { color = "brown" }
});

// Word bounding boxes, drawn as closed polygons (the first corner is repeated at the end).
foreach (var w in words)
{
    graphs.Add(new Graph.Scatter()
    {
        x = new[] 
        { 
            w.BoundingBox.BottomLeft.X, 
            w.BoundingBox.BottomRight.X, 
            w.BoundingBox.TopRight.X, 
            w.BoundingBox.TopLeft.X,
            w.BoundingBox.BottomLeft.X 
        },
        y = new[]
        { 
            w.BoundingBox.BottomLeft.Y,
            w.BoundingBox.BottomRight.Y,
            w.BoundingBox.TopRight.Y,
            w.BoundingBox.TopLeft.Y,
            w.BoundingBox.BottomLeft.Y
        },
        mode = "lines",
        marker = new Graph.Marker() { color = "black" }
    });
}

// Top left corners, in blue.
var topLefts = new Graph.Scatter()
{
    x = words.Select(w => w.BoundingBox.TopLeft.X),
    y = words.Select(w => w.BoundingBox.TopLeft.Y),
    name = "",
    mode = "markers",
    text = words.Select(w => w.Text + " (tl)"),
    marker = new Graph.Marker() { color = "blue", size = 4 }
};
graphs.Add(topLefts);

// Bottom left corners, in yellow.
var bottomLefts = new Graph.Scatter()
{
    x = words.Select(w => w.BoundingBox.BottomLeft.X),
    y = words.Select(w => w.BoundingBox.BottomLeft.Y),
    name = "",
    mode = "markers",
    text = words.Select(w => w.Text + " (bl)"),
    marker = new Graph.Marker() { color = "yellow", size = 4 }
};
graphs.Add(bottomLefts);

// Bottom right corners, in red.
var bottomRights = new Graph.Scatter()
{
    x = words.Select(w => w.BoundingBox.BottomRight.X),
    y = words.Select(w => w.BoundingBox.BottomRight.Y),
    name = "",
    mode = "markers",
    text = words.Select(w => w.Text + " (br)"),
    marker = new Graph.Marker() { color = "red", size = 4 }

};
graphs.Add(bottomRights);

// The chart uses the page's own dimensions for its size and axis ranges.
var chart = Chart.Plot(graphs.ToArray());
chart.WithLayout(new Layout.Layout() 
                 { 
                     title = pdfName + " - Words",
                     showlegend = false,
                     hovermode = "closest",
                     width = width,
                     height = height,
                     xaxis = new Graph.Xaxis() {  range = new[] { 0, width } },
                     yaxis = new Graph.Yaxis() {  range = new[] { 0, height } },
                 });
chart.WithXTitle("X");
chart.WithYTitle("Y");
chart.Width = (int)width;
chart.Height = (int)height;
display(chart);

### 1.  Estimate within-line and between-line spacing

In [7]:
/// <summary>
/// Get the average distance value of the peak bucket of the histogram.
/// <para>The distances are grouped in bins of width <paramref name="binLength"/> starting at 0;
/// the bin holding the most values is the peak (the lowest bin wins in case of a tie).
/// The share of values in each bin, in percent, is also sent to the notebook via <c>display</c>.</para>
/// </summary>
/// <param name="distances">The set of distances to average. Values must be finite and non-negative.</param>
/// <param name="binLength">The width of a histogram bin. It is reduced to the rounded-up
/// maximum distance when it is larger than that maximum.</param>
/// <returns>The average of the distances in the peak bin, or <c>null</c> if there are no distances.</returns>
/// <exception cref="ArgumentException">Thrown when <paramref name="binLength"/> is not positive.</exception>
/// <exception cref="ArgumentOutOfRangeException">Thrown when a distance is negative, NaN or infinite.</exception>
private static double? GetPeakAverageDistance(IEnumerable<double> distances, int binLength = 1)
{
    // Materialise once: the values are scanned several times below.
    var values = distances.ToList();
    if (values.Count == 0)
    {
        return null;
    }

    if (binLength <= 0)
    {
        throw new ArgumentException("DocstrumBoundingBoxes: the bin length must be positive when commputing peak average distance.", nameof(binLength));
    }

    // Validate up front. Checking the bin index afterwards is not enough: when every
    // value is negative the bin length would be clamped to a negative maximum and the
    // resulting bin indexes would look valid.
    foreach (var distance in values)
    {
        if (double.IsNaN(distance) || double.IsInfinity(distance) || distance < 0)
        {
            throw new ArgumentOutOfRangeException(nameof(distances), "DocstrumBoundingBoxes: Negative or non-finite distance found while computing peak average distance.");
        }
    }

    var max = (int)Math.Ceiling(values.Max());
    if (max == 0)
    {
        max = binLength;
    }
    else
    {
        binLength = Math.Min(binLength, max);
    }

    // bins[k] holds the distances d with k * binLength <= d < (k + 1) * binLength.
    int binCount = (int)Math.Ceiling(max / (double)binLength) + 1;
    var bins = new List<double>[binCount];
    for (int k = 0; k < binCount; k++)
    {
        bins[k] = new List<double>();
    }

    foreach (var distance in values)
    {
        // Direct index access, instead of relying on the enumeration order of a dictionary.
        bins[(int)Math.Floor(distance / binLength)].Add(distance);
    }

    // Share of values per bin (in percent), keyed by the lower edge of the bin.
    display(Enumerable.Range(0, binCount)
        .ToDictionary(k => k * binLength, k => Math.Round(bins[k].Count / (double)values.Count * 100, 5)));

    // Peak bin: the first bin with the highest count. It cannot be empty as there is at least one value.
    var best = bins[0];
    for (int k = 1; k < binCount; k++)
    {
        if (bins[k].Count > best.Count)
        {
            best = bins[k];
        }
    }

    return best.Average();
}

#### 1.1 Within-line (between words) spacing
For within-line spacing, the distance between a word's bottom right point (in red) and its neighbour's bottom left point (in yellow) is computed.

In [8]:
/// <summary>
/// Within-line angle, in degrees, of the segment going from the pivot's bottom right point
/// to the candidate's bottom left point, measured relative to the pivot's rotation.
/// <para>The result is folded into -90 ≤ θ ≤ 90 (assuming <c>Distances.BoundAngle180</c>
/// returns a value within [-180, 180]).</para>
/// </summary>
/// <param name="pivot">The word the angle is measured from.</param>
/// <param name="candidate">The word the angle is measured to.</param>
private static double AngleWL(Word pivot, Word candidate)
{
    double absolute = Distances.Angle(pivot.BoundingBox.BottomRight, candidate.BoundingBox.BottomLeft);
    double relative = Distances.BoundAngle180(absolute - pivot.BoundingBox.Rotation);

    // Fold by half a turn so that overlapping words (candidate starting
    // before the pivot's end) are still handled.
    if (relative > 90)
    {
        return relative - 180;
    }

    if (relative < -90)
    {
        return relative + 180;
    }

    return relative;
}

In [9]:
// 1. Estimate within line and between line spacing
// The tree is indexed on the words' bottom left points; it is queried here with bottom right points.
KdTree<Word> kdTreeBottomLeft = new KdTree<Word>(words, w => w.BoundingBox.BottomLeft);

var withinLineDistList = new List<double>();

foreach (var pivot in words)
{
    // 1.1.1 Ask the tree for the 2 nearest neighbours (Euclidean distance) of the pivot's bottom right point.
    var nearest = kdTreeBottomLeft.FindNearestNeighbours(pivot, 2, w => w.BoundingBox.BottomRight, Distances.Euclidean);

    foreach (var neighbour in nearest)
    {
        var candidate = neighbour.Item1;

        // 1.1.2 Skip the neighbours that are not within the within-line angle bounds.
        if (!wlBounds.Contains(AngleWL(pivot, candidate)))
        {
            continue;
        }

        // Within-line distance: pivot's bottom right to candidate's bottom left.
        withinLineDistList.Add(Distances.Euclidean(pivot.BoundingBox.BottomRight, candidate.BoundingBox.BottomLeft));
    }
}

// Average of the values in the peak bin of the distribution.
double withinLinePeak = GetPeakAverageDistance(withinLineDistList, wlBinSize).Value;

// Histogram of the within-line distances, in percent.
var histoWl = new Graph.Histogram()
{
    x = withinLineDistList,
    xbins = new Graph.Xbins() { start = 0, end = Math.Ceiling(withinLineDistList.Max()), size = wlBinSize },
    text = "",
    name = "",
    histnorm = "percent"
};

// Single red marker on the x axis showing the peak average value.
var lineWlAvg = new Graph.Scatter()
{
    x = new[] { withinLinePeak },
    y = new[] { 0 },
    mode = "markers",
    name = "wl distance",
    marker = new Graph.Marker() { color = "red", size = 10 }
};

var plotWl = Chart.Plot(new Graph.Trace[] { histoWl, lineWlAvg });
plotWl.WithLayout(new Layout.Layout()
{
    title = "Distribution of within-line distances",
    showlegend = false,
    hovermode = "closest"
});
plotWl.WithXTitle("distance");
plotWl.WithYTitle("%");
display(plotWl);

key,value
0,85.3598
10,4.96278
20,2.97767
30,5.21092
40,1.48883
50,0.0


#### 1.2 Between-line spacing
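For between-line spacing, neighbours are searched from a word's top left point (in blue) to the other words' bottom left points (in yellow). The between-line distance is then the perpendicular distance between the two words' centroids, less half of each word's height.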

In [10]:
/// <summary>
/// Between-line angle, in degrees, of the segment going from the pivot's centroid
/// to the candidate's centroid, measured relative to the pivot's rotation.
/// <para>Negative angles are shifted by half a turn, giving 0 ≤ θ ≤ 180 (assuming
/// <c>Distances.BoundAngle180</c> returns a value within [-180, 180]).</para>
/// </summary>
/// <param name="pivot">The word the angle is measured from.</param>
/// <param name="candidate">The word the angle is measured to.</param>
private static double AngleBL(Word pivot, Word candidate)
{
    double absolute = Distances.Angle(pivot.BoundingBox.Centroid, candidate.BoundingBox.Centroid);
    double relative = Distances.BoundAngle180(absolute - pivot.BoundingBox.Rotation);

    // A candidate below the pivot gets the same angle as one above it.
    return relative < 0 ? relative + 180 : relative;
}

In [11]:
var betweenLineDistList = new List<double>();

foreach (var pivot in words)
{
    // 1.2.1 Ask the tree (indexed on bottom left points) for the 2 nearest neighbours
    // (Euclidean distance) of the pivot's top left point.
    var nearest = kdTreeBottomLeft.FindNearestNeighbours(pivot, 2, w => w.BoundingBox.TopLeft, Distances.Euclidean);

    foreach (var neighbour in nearest)
    {
        var candidate = neighbour.Item1;

        // 1.2.2 Skip the neighbours that are not within the between-line angle bounds.
        var angle = AngleBL(pivot, candidate);
        if (!blBounds.Contains(angle))
        {
            continue;
        }

        // 1.2.3 Perpendicular (between-line) distance: the centroid-to-centroid distance
        // projected with the angle, less half of each word's height.
        double hypotenuse = Distances.Euclidean(pivot.BoundingBox.Centroid, candidate.BoundingBox.Centroid);

        // Angle is brought back within [-90, 90].
        if (angle > 90)
        {
            angle -= 180;
        }

        var dist = Math.Abs(hypotenuse * Math.Cos((90 - angle) * Math.PI / 180))
            - pivot.BoundingBox.Height / 2.0 - candidate.BoundingBox.Height / 2.0;

        // The subtractions can make the distance negative (e.g. overlapping words);
        // such values are left out.
        if (dist >= 0)
        {
            betweenLineDistList.Add(dist);
        }
    }
}

// Average of the values in the peak bin of the distribution.
var betweenLinePeak = GetPeakAverageDistance(betweenLineDistList, blBinSize).Value;

// Histogram of the between-line distances, in percent.
var histoBl = new Graph.Histogram()
{
    x = betweenLineDistList,
    xbins = new Graph.Xbins() { start = 0, end = Math.Ceiling(betweenLineDistList.Max()), size = blBinSize },
    text = "",
    name = "",
    histnorm = "percent"
};

// Single red marker on the x axis showing the peak average value.
var lineBlAvg = new Graph.Scatter()
{
    x = new[] { betweenLinePeak },
    y = new[] { 0 },
    mode = "markers",
    name = "bl distance",
    marker = new Graph.Marker() { color = "red", size = 10 }
};

var plotBl = Chart.Plot(new Graph.Trace[] { histoBl, lineBlAvg });
plotBl.WithLayout(new Layout.Layout()
{
    title = "Distribution of between-line distances",
    showlegend = false,
    hovermode = "closest"
});
plotBl.WithXTitle("distance");
plotBl.WithYTitle("%");
display(plotBl);

key,value
0,71.85629
10,23.35329
20,4.79042
30,0.0


In [12]:
// Maximum distances used for clustering: each peak value scaled by its multiplier.
// Alternative for the within-line one: Math.Min(3.0 * wlDist, 1.4142 * blDist); sqrt(2) * blDist
double maxWLDistance = withinLineMultiplier * withinLinePeak;
double maxBlDistance = betweenLinePeak * betweenLineMultiplier;

display($"maxWLDistance={Math.Round(maxWLDistance, 3)}");
display($"maxBlDistance={Math.Round(maxBlDistance, 3)}");

maxWLDistance=15.177

maxBlDistance=7.897

### 2. Get lines

In [13]:
// Cluster the words into lines. Distances are Euclidean, measured from the pivot's
// bottom right point to the candidate's bottom left point, and capped at maxWLDistance.
var groupedWords = Clustering.NearestNeighbours(
    words,
    2,
    Distances.Euclidean,
    (_, __) => maxWLDistance,
    pivot => pivot.BoundingBox.BottomRight,
    candidate => candidate.BoundingBox.BottomLeft,
    _ => true,                                                            // every word is accepted by this filter
    (pivot, candidate) => wlBounds.Contains(AngleWL(pivot, candidate)),   // keep pairs within the within-line angle bounds
    -1).ToList();

// One text line per group, its words put in reading order and joined with the word separator.
var lines = groupedWords
    .Select(g => new TextLine(g.OrderByReadingOrder(), wordSeparator))
    .ToList();

#### Plot lines

In [14]:
// One scatter trace for the page outline and one per text line's bounding box.
var graphs = new List<Graph.Scatter>();

// Page outline. The Plotly scatter mode flag is "lines" ("line" is not a valid value).
graphs.Add(new Graph.Scatter()
{
    x = new[] { 0, 0, width, width, 0 },
    y = new[] { 0, height, height, 0, 0 },
    mode = "lines",
    name = "",
    text = "page",
    marker = new Graph.Marker() { color = "brown" }
});

// Line bounding boxes, drawn as closed polygons (the first corner is repeated at the end).
foreach (var p in lines)
{
    graphs.Add(new Graph.Scatter()
    {
        x = new[]
        {
            p.BoundingBox.BottomLeft.X, 
            p.BoundingBox.BottomRight.X, 
            p.BoundingBox.TopRight.X, 
            p.BoundingBox.TopLeft.X,
            p.BoundingBox.BottomLeft.X 
        },
        y = new[]
        {
            p.BoundingBox.BottomLeft.Y,
            p.BoundingBox.BottomRight.Y,
            p.BoundingBox.TopRight.Y,
            p.BoundingBox.TopLeft.Y,
            p.BoundingBox.BottomLeft.Y
        },
        // Hover text: the line's text, cut to its first 25 characters when longer.
        text = p.Text.Length <= 25 ? p.Text : p.Text.Substring(0, 25) + "...",
        mode = "lines",
        marker = new Graph.Marker() { color = "red" }
    });
}

// The chart uses the page's own dimensions for its size and axis ranges.
var chart = Chart.Plot(graphs.ToArray());
chart.WithLayout(new Layout.Layout() 
                 { 
                     title = pdfName + " - Lines",
                     showlegend = false,
                     hovermode = "closest",
                     width = width,
                     height = height,
                     xaxis = new Graph.Xaxis() {  range = new[] { 0, width } },
                     yaxis = new Graph.Yaxis() {  range = new[] { 0, height } },
                 });
chart.WithXTitle("X");
chart.WithYTitle("Y");
chart.Width = (int)width;
chart.Height = (int)height;
display(chart);

### 3. Get paragraph blocks

In [15]:

/// <summary>
/// Perpendicular overlapping distance between two lines.
/// <para>Returns the absolute perpendicular distance when the lines overlap and their
/// angular difference is within the bounds; <see cref="double.PositiveInfinity"/> otherwise.</para>
/// </summary>
/// <param name="line1">The first line.</param>
/// <param name="line2">The second line.</param>
/// <param name="angularDifferenceBounds">The bounds the angular difference (folded into [-90, 90] degrees) must be within.</param>
/// <param name="epsilon">The precision passed on to <c>GetStructuralBlockingParameters</c>.</param>
private static double PerpendicularOverlappingDistance(PdfLine line1, PdfLine line2, AngleBounds angularDifferenceBounds, double epsilon)
{
    bool overlapped = GetStructuralBlockingParameters(line1, line2, epsilon,
        out double angularDifference, out double normalisedOverlap, out double signedDistance);

    if (!overlapped)
    {
        // nonoverlapped
        return double.PositiveInfinity;
    }

    // Fold the angle into [-90;90]
    double folded = angularDifference;
    if (folded > 90)
    {
        folded -= 180;
    }
    else if (folded < -90)
    {
        folded += 180;
    }

    // Lines that are not parallel enough are excluded.
    return angularDifferenceBounds.Contains(folded)
        ? Math.Abs(signedDistance)
        : double.PositiveInfinity;
}

/// <summary>
/// Get the structural blocking parameters of two line segments: the angle between them,
/// how much the translation of segment i onto line j overlaps segment j, and the signed
/// perpendicular distance from line i to the middle of that overlap.
/// </summary>
/// <param name="i">The first line segment; its end points are translated onto line j.</param>
/// <param name="j">The second line segment; the overlap is normalised by its length.</param>
/// <param name="epsilon">The precision used when testing the lines for equality and values for zero.</param>
/// <param name="angularDifference">The angle between the 2 lines.<para>-180 ≤ θ ≤ 180</para></param>
/// <param name="normalisedOverlap">Overlap of segment i onto j. Positive value if overlapped, negative value if nonoverlapped.<para>[-1, 1]?</para>
/// Set to <see cref="double.NaN"/> when the end points of i cannot be translated onto j.</param>
/// <param name="perpendicularDistance">Signed perpendicular distance.
/// Set to <see cref="double.NaN"/> when the end points of i cannot be translated onto j.</param>
/// <returns>Return true if overlapped, false if nonoverlapped.</returns>
public static bool GetStructuralBlockingParameters(PdfLine i, PdfLine j, double epsilon,
    out double angularDifference, out double normalisedOverlap, out double perpendicularDistance)
{
    // Shortcut: lines with (almost) the same end points fully overlap at distance 0.
    if (AlmostEquals(i, j, epsilon))
    {
        angularDifference = 0;
        normalisedOverlap = 1;
        perpendicularDistance = 0;
        return true;
    }

    // Direction vectors of both segments (Point1 -> Point2).
    double dXi = i.Point2.X - i.Point1.X;
    double dYi = i.Point2.Y - i.Point1.Y;
    double dXj = j.Point2.X - j.Point1.X;
    double dYj = j.Point2.Y - j.Point1.Y;

    // Difference of the two direction angles, converted from radians to degrees and then bounded.
    angularDifference = Distances.BoundAngle180((Math.Atan2(dYj, dXj) - Math.Atan2(dYi, dXi)) * 180 / Math.PI);

    // Aj and Bj are the end points of i translated onto line j.
    PdfPoint? Aj = GetTranslatedPoint(i.Point1.X, i.Point1.Y, j.Point1.X, j.Point1.Y, dXi, dYi, dXj, dYj, epsilon);
    PdfPoint? Bj = GetTranslatedPoint(i.Point2.X, i.Point2.Y, j.Point2.X, j.Point2.Y, dXi, dYi, dXj, dYj, epsilon);

    if (!Aj.HasValue || !Bj.HasValue)
    {
        // Might happen because lines are perpendicular
        // or have too small lengths
        normalisedOverlap = double.NaN;
        perpendicularDistance = double.NaN;
        return false;
    }

    // Get middle points
    // The 4 points are sorted along line j; the 2 in the middle (Cj, Dj) delimit
    // the overlap if there is one, or the gap between the segments otherwise.
    var ps = new[] { j.Point1, j.Point2, Aj.Value, Bj.Value };

    // NOTE(review): exact comparisons to 0 here, whereas the rest of the method uses epsilon.
    // If both dXj and dYj are exactly 0, the points are left unsorted.
    if (dXj != 0)
    {
        ps = ps.OrderBy(p => p.X).ThenBy(p => p.Y).ToArray();
    }
    else if (dYj != 0)
    {
        ps = ps.OrderBy(p => p.Y).ToArray();
    }

    PdfPoint Cj = ps[1];
    PdfPoint Dj = ps[2];

    bool overlap = true;
    // Cj and Dj should be contained within both j and [Aj,Bj] if overlapped
    if (!PointInLine(j.Point1, j.Point2, Cj) || !PointInLine(j.Point1, j.Point2, Dj) ||
        !PointInLine(Aj.Value, Bj.Value, Cj) || !PointInLine(Aj.Value, Bj.Value, Dj))
    {
        // nonoverlapped
        overlap = false;
    }

    // Length of the overlap (or of the gap when nonoverlapped).
    double pj = Distances.Euclidean(Cj, Dj);

    // The sign carries the overlapped / nonoverlapped information.
    normalisedOverlap = (overlap ? pj : -pj) / j.Length;

    // Middle point of [Cj, Dj]; the perpendicular distance is measured from line i to this point.
    double xMj = (Cj.X + Dj.X) / 2.0;
    double yMj = (Cj.Y + Dj.Y) / 2.0;

    if (!dXi.AlmostEqualsToZero(epsilon) && !dYi.AlmostEqualsToZero(epsilon))
    {
        // General case: point-to-line distance with both numerator and denominator divided by dYi.
        perpendicularDistance = ((xMj - i.Point1.X) - (yMj - i.Point1.Y) * dXi / dYi) / Math.Sqrt(dXi * dXi / (dYi * dYi) + 1);
    }
    else if (dXi.AlmostEqualsToZero(epsilon))
    {
        // Line i is (almost) vertical: the distance is the difference in X.
        perpendicularDistance = xMj - i.Point1.X;
    }
    else
    {
        // Line i is (almost) horizontal: the distance is the difference in Y.
        perpendicularDistance = yMj - i.Point1.Y;
    }

    return overlap;
}


/// <summary>
/// Translate the point Pi onto line j: get the point Aj of the line going through Pj with
/// direction (dXj, dYj) such that the vector from Pi to Aj is perpendicular to (dXi, dYi).
/// </summary>
/// <param name="xPi">X coordinate of the point to translate.</param>
/// <param name="yPi">Y coordinate of the point to translate.</param>
/// <param name="xPj">X coordinate of a point of line j.</param>
/// <param name="yPj">Y coordinate of a point of line j.</param>
/// <param name="dXi">X component of line i's direction.</param>
/// <param name="dYi">Y component of line i's direction.</param>
/// <param name="dXj">X component of line j's direction.</param>
/// <param name="dYj">Y component of line j's direction.</param>
/// <param name="epsilon">The precision used when testing values for zero.</param>
/// <returns>The translated point, or <c>null</c> when the dot product of the two
/// directions is almost zero (perpendicular lines, or very short ones).</returns>
private static PdfPoint? GetTranslatedPoint(double xPi, double yPi, double xPj, double yPj, double dXi, double dYi, double dXj, double dYj, double epsilon)
{
    double yProduct = dYi * dYj;
    double xProduct = dXi * dXj;

    // Dot product of the two directions: 0 means the lines are perpendicular,
    // in which case no point can be translated.
    double dot = yProduct + xProduct;
    if (dot.AlmostEqualsToZero(epsilon))
    {
        return null;
    }

    if (dXj.AlmostEqualsToZero(epsilon))
    {
        // Line j is (almost) vertical: the Y coordinate is computed directly
        // and the X coordinate is the one of line j.
        double yVertical = (yPi * yProduct + yPj * xProduct + dYj * dXi * (xPi - xPj)) / dot;
        return new PdfPoint(xPj, yVertical);
    }

    // General case: the X coordinate is computed first, then the Y coordinate
    // follows from line j's slope.
    double x = (xPi * xProduct + xPj * yProduct + dXj * dYi * (yPi - yPj)) / dot;
    double y = dYj / dXj * (x - xPj) + yPj;
    return new PdfPoint(x, y);
}

/// <summary>
/// Helper function to check if the point belongs to the line segment [pl1, pl2].
/// <para>The three points are assumed to be aligned; only the position along the segment is tested.</para>
/// </summary>
/// <param name="pl1">Line's first point.</param>
/// <param name="pl2">Line's second point.</param>
/// <param name="point">The point to check.</param>
/// <returns><c>true</c> if the projection of the point falls between the two end points (included).</returns>
private static bool PointInLine(PdfPoint pl1, PdfPoint pl2, PdfPoint point)
{
    // /!\ Alignment is not verified (be careful)
    double segmentX = pl2.X - pl1.X;
    double segmentY = pl2.Y - pl1.Y;
    double toPointX = point.X - pl1.X;
    double toPointY = point.Y - pl1.Y;

    // Projection of (pl1 -> point) on (pl1 -> pl2): negative means before pl1,
    // more than the squared segment length means after pl2.
    double projection = toPointX * segmentX + toPointY * segmentY;
    double squaredLength = segmentX * segmentX + segmentY * segmentY;

    return projection >= 0 && projection <= squaredLength;
}

/// <summary>
/// Helper function to check if 2 lines are equal, i.e. if the coordinates of their
/// first points and of their second points all differ by (almost) zero.
/// </summary>
/// <param name="line1">The first line.</param>
/// <param name="line2">The second line.</param>
/// <param name="epsilon">The precision used when testing the differences for zero.</param>
private static bool AlmostEquals(PdfLine line1, PdfLine line2, double epsilon)
{
    // First points.
    if (!(line1.Point1.X - line2.Point1.X).AlmostEqualsToZero(epsilon))
    {
        return false;
    }

    if (!(line1.Point1.Y - line2.Point1.Y).AlmostEqualsToZero(epsilon))
    {
        return false;
    }

    // Second points.
    if (!(line1.Point2.X - line2.Point2.X).AlmostEqualsToZero(epsilon))
    {
        return false;
    }

    return (line1.Point2.Y - line2.Point2.Y).AlmostEqualsToZero(epsilon);
}

In [16]:
// Cluster the lines into blocks. The distance between two lines is the perpendicular
// overlapping distance, measured between the pivot's bottom edge and the candidate's
// top edge, and capped at maxBlDistance.
var groupedLines = Clustering.NearestNeighbours(
    lines,
    (l1, l2) => PerpendicularOverlappingDistance(l1, l2, angularDifferenceBounds, epsilon),
    (_, __) => maxBlDistance,
    pivot => new PdfLine(pivot.BoundingBox.BottomLeft, pivot.BoundingBox.BottomRight),
    candidate => new PdfLine(candidate.BoundingBox.TopLeft, candidate.BoundingBox.TopRight),
    _ => true,          // every line is accepted by this filter
    (_, __) => true,    // every pair is accepted by this filter
    -1).ToList();

// One text block per group, its lines put in reading order and joined with the line separator.
var paragraphs = groupedLines
    .Select(g => new TextBlock(g.OrderByReadingOrder(), lineSeparator))
    .ToList();

#### Plot paragraphs

In [17]:
// One scatter trace for the page outline and one per paragraph's bounding box.
var graphs = new List<Graph.Scatter>();

// Page outline. The Plotly scatter mode flag is "lines" ("line" is not a valid value).
graphs.Add(new Graph.Scatter()
{
    x = new[] { 0, 0, width, width, 0 },
    y = new[] { 0, height, height, 0, 0 },
    mode = "lines",
    name = "",
    text = "page",
    marker = new Graph.Marker() { color = "brown" }
});

// Paragraph bounding boxes, drawn as closed polygons (the first corner is repeated at the end).
foreach (var p in paragraphs)
{
    graphs.Add(new Graph.Scatter()
    {
        x = new[] 
        { 
            p.BoundingBox.BottomLeft.X, 
            p.BoundingBox.BottomRight.X, 
            p.BoundingBox.TopRight.X, 
            p.BoundingBox.TopLeft.X,
            p.BoundingBox.BottomLeft.X 
        },
        y = new[]
        { 
            p.BoundingBox.BottomLeft.Y,
            p.BoundingBox.BottomRight.Y,
            p.BoundingBox.TopRight.Y,
            p.BoundingBox.TopLeft.Y,
            p.BoundingBox.BottomLeft.Y
        },
        mode = "lines",
        // Hover text: the paragraph's text, cut to its first 25 characters when longer.
        text = p.Text.Length <= 25 ? p.Text : p.Text.Substring(0, 25) + "...",
        marker = new Graph.Marker() { color = "red" }
    });
}

// The chart uses the page's own dimensions for its size and axis ranges.
var chart = Chart.Plot(graphs.ToArray());
chart.WithLayout(new Layout.Layout() 
                 { 
                     title = pdfName + " - Paragraphs",
                     showlegend = false,
                     hovermode = "closest",
                     width = width,
                     height = height,
                     xaxis = new Graph.Xaxis() {  range = new[] { 0, width } },
                     yaxis = new Graph.Yaxis() {  range = new[] { 0, height } },
                 });
chart.WithXTitle("X");
chart.WithYTitle("Y");
chart.Width = (int)width;
chart.Height = (int)height;
display(chart);