Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

version 0.1.0-alpha001 #12

Merged
merged 1 commit into from
Sep 21, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions Tabula.Csv/Tabula.Csv.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,14 @@

<PropertyGroup>
<TargetFrameworks>netcoreapp3.1;netstandard2.0;net45;net451;net452;net46;net461;net462;net47</TargetFrameworks>
<Description>Extract tables from PDF files (port of tabula-java using PdfPig). Csv and Tsv writers.</Description>
<PackageProjectUrl>https://github.com/BobLd/tabula-sharp</PackageProjectUrl>
<Version>0.1.0-alpha001</Version>
</PropertyGroup>

<PropertyGroup Condition="'$(Configuration)|$(TargetFramework)|$(Platform)'=='Release|netcoreapp3.1|AnyCPU'">
<!-- Project-relative path: the XML documentation file is written next to the .csproj on any machine, instead of relying on a developer-specific absolute D:\ path. -->
<DocumentationFile>Tabula.Csv.xml</DocumentationFile>
<WarningLevel>3</WarningLevel>
</PropertyGroup>

<ItemGroup>
Expand Down
8 changes: 8 additions & 0 deletions Tabula.Csv/Tabula.Csv.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 7 additions & 0 deletions Tabula.Json/Tabula.Json.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,13 @@

<PropertyGroup>
<TargetFrameworks>netcoreapp3.1;netstandard2.0;net45;net451;net452;net46;net461;net462;net47</TargetFrameworks>
<Description>Extract tables from PDF files (port of tabula-java using PdfPig). Json writer.</Description>
<PackageProjectUrl>https://github.com/BobLd/tabula-sharp</PackageProjectUrl>
<Version>0.1.0-alpha001</Version>
</PropertyGroup>

<PropertyGroup Condition="'$(Configuration)|$(TargetFramework)|$(Platform)'=='Release|netcoreapp3.1|AnyCPU'">
<!-- Project-relative path: the XML documentation file is written next to the .csproj on any machine, instead of relying on a developer-specific absolute D:\ path. -->
<DocumentationFile>Tabula.Json.xml</DocumentationFile>
</PropertyGroup>

<ItemGroup>
Expand Down
8 changes: 8 additions & 0 deletions Tabula.Json/Tabula.Json.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

16 changes: 12 additions & 4 deletions Tabula/Cell.cs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ namespace Tabula
* ** tabula/Cell.java **
*/
/// <summary>
///
/// A cell in a table.
/// </summary>
public class Cell : RectangularTextContainer<TextChunk>
{
Expand All @@ -19,7 +19,7 @@ public class Cell : RectangularTextContainer<TextChunk>
public static Cell EMPTY => new Cell(new PdfRectangle());

/// <summary>
///
/// Create a cell in a table.
/// </summary>
/// <param name="pdfRectangle"></param>
public Cell(PdfRectangle pdfRectangle)
Expand All @@ -31,7 +31,7 @@ public Cell(PdfRectangle pdfRectangle)
}

/// <summary>
///
/// Create a cell in a table.
/// </summary>
/// <param name="chunk"></param>
public Cell(TextChunk chunk)
Expand All @@ -41,7 +41,7 @@ public Cell(TextChunk chunk)
}

/// <summary>
///
/// Create a cell in a table.
/// </summary>
/// <param name="topLeft"></param>
/// <param name="bottomRight"></param>
Expand All @@ -59,6 +59,10 @@ public Cell(PdfPoint topLeft, PdfPoint bottomRight)
}
}

/// <summary>
/// Gets the cell's text.
/// </summary>
/// <param name="useLineReturns"></param>
public override string GetText(bool useLineReturns)
{
if (base.textElements.Count == 0)
Expand All @@ -81,6 +85,9 @@ public override string GetText(bool useLineReturns)
return sb.ToString().Trim();
}

/// <summary>
/// Gets the cell's text.
/// </summary>
public override string GetText()
{
return GetText(true);
Expand All @@ -100,6 +107,7 @@ public void SetPlaceholder(bool placeholder)
this.IsPlaceholder = placeholder;
}

/// <inheritdoc/>
public override string ToString()
{
return GetText();
Expand Down
10 changes: 9 additions & 1 deletion Tabula/Detectors/IDetectionAlgorithm.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,20 @@
namespace Tabula.Detectors
{
// https://github.com/tabulapdf/tabula-java/blob/master/src/main/java/technology/
/**
/*
* ** tabula/detectors/DetectionAlgorithm.java **
* Created by matt on 2015-12-14.
*/

/// <summary>
/// Table detection algorithm.
/// </summary>
public interface IDetectionAlgorithm
{
/// <summary>
/// Detects the tables in the given page.
/// </summary>
/// <param name="page">The page in which to detect tables.</param>
/// <returns>A list of <see cref="TableRectangle"/> describing the detected tables.</returns>
List<TableRectangle> Detect(PageArea page);
}
}
6 changes: 5 additions & 1 deletion Tabula/Detectors/NurminenDetectionAlgorithm.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,18 @@

namespace Tabula.Detectors
{
/**
/*
* ** tabula/detectors/NurminenDetectionAlgorithm.java **
* Created by matt on 2015-12-17.
* <p>
* Attempt at an implementation of the table finding algorithm described by
* Anssi Nurminen's master's thesis:
* http://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3
*/

/// <summary>
/// Nurminen detection algorithm.
/// </summary>
public class NurminenDetectionAlgorithm : IDetectionAlgorithm
{
private static int GRAYSCALE_INTENSITY_THRESHOLD = 25;
Expand Down
4 changes: 4 additions & 0 deletions Tabula/Detectors/SpreadsheetDetectionAlgorithm.cs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@ namespace Tabula.Detectors
/// </summary>
public class SpreadsheetDetectionAlgorithm : IDetectionAlgorithm
{
/// <summary>
/// Detects the tables in the page.
/// </summary>
/// <param name="page">The page where to detect the tables.</param>
public List<TableRectangle> Detect(PageArea page)
{
List<Cell> cells = SpreadsheetExtractionAlgorithm.FindCells(page.HorizontalRulings, page.VerticalRulings);
Expand Down
19 changes: 15 additions & 4 deletions Tabula/Extractors/BasicExtractionAlgorithm.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,27 +4,33 @@
namespace Tabula.Extractors
{
/// <summary>
/// stream
/// Stream extraction algorithm.
/// </summary>
public class BasicExtractionAlgorithm : IExtractionAlgorithm
{
private IReadOnlyList<Ruling> verticalRulings;

/// <summary>
/// stream
/// Stream extraction algorithm.
/// </summary>
public BasicExtractionAlgorithm() { } // no rulings supplied: the verticalRulings field keeps its default value

/// <summary>
/// stream
/// Stream extraction algorithm.
/// </summary>
/// <param name="verticalRulings">List of vertical rulings.</param>
public BasicExtractionAlgorithm(IReadOnlyList<Ruling> verticalRulings)
    => this.verticalRulings = verticalRulings; // store the caller-supplied rulings in the instance field

/// <summary>
/// Extracts the tables in the page.
/// </summary>
/// <param name="page">The page where to extract the tables.</param>
/// <param name="verticalRulingPositions">List of vertical rulings, indicated by their x position.</param>
public List<Table> Extract(PageArea page, IReadOnlyList<float> verticalRulingPositions)
{
List<Ruling> verticalRulings = new List<Ruling>(verticalRulingPositions.Count);
Expand All @@ -36,6 +42,10 @@ public List<Table> Extract(PageArea page, IReadOnlyList<float> verticalRulingPos
return this.Extract(page);
}

/// <summary>
/// Extracts the tables in the page.
/// </summary>
/// <param name="page">The page where to extract the tables.</param>
public List<Table> Extract(PageArea page)
{
List<TextElement> textElements = page.GetText();
Expand Down Expand Up @@ -113,13 +123,14 @@ public List<Table> Extract(PageArea page)
return new Table[] { table }.ToList();
}

/// <inheritdoc/>
public override string ToString() => "stream"; // fixed textual name of this extraction algorithm

/// <summary>
///
/// Gets the column positions.
/// </summary>
/// <param name="lines">Must be an array of lines sorted by their +top+ attribute.</param>
/// <returns>a list of column boundaries (x axis).</returns>
Expand Down
7 changes: 7 additions & 0 deletions Tabula/Extractors/IExtractionAlgorithm.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,15 @@

namespace Tabula.Extractors
{
/// <summary>
/// Table extraction algorithm.
/// </summary>
public interface IExtractionAlgorithm
{
/// <summary>
/// Extracts the tables in the given page.
/// </summary>
/// <param name="page">The page from which to extract tables.</param>
/// <returns>The list of <see cref="Table"/> extracted from the page.</returns>
List<Table> Extract(PageArea page);
}
}
33 changes: 24 additions & 9 deletions Tabula/Extractors/SpreadsheetExtractionAlgorithm.cs
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,12 @@ namespace Tabula.Extractors
{
//https://github.com/tabulapdf/tabula-java/blob/master/src/main/java/technology/tabula/extractors/SpreadsheetExtractionAlgorithm.java
/// <summary>
/// lattice
/// Lattice extraction algorithm.
/// </summary>
public class SpreadsheetExtractionAlgorithm : IExtractionAlgorithm
{
/// <summary>
/// lattice
/// Lattice extraction algorithm.
/// </summary>
public SpreadsheetExtractionAlgorithm()
{
Expand Down Expand Up @@ -81,13 +81,17 @@ public int Compare(PdfPoint arg0, PdfPoint arg1)
}
}

/// <summary>
/// Extracts the tables in the page.
/// </summary>
/// <param name="page">The page where to extract the tables.</param>
public List<Table> Extract(PageArea page)
{
    // Delegate to the rulings-based overload, using the rulings reported by the page itself.
    var pageRulings = page.GetRulings();
    return Extract(page, pageRulings);
}

/// <summary>
/// Extract a list of Table from page using rulings as separators
/// Extracts the tables in the page using rulings as separators.
/// </summary>
/// <param name="page"></param>
/// <param name="rulings"></param>
Expand Down Expand Up @@ -159,6 +163,10 @@ public List<Table> Extract(PageArea page, IReadOnlyList<Ruling> rulings)
return spreadsheets;
}

/// <summary>
/// Heuristically checks whether the content of the page is tabular.
/// </summary>
/// <param name="page"></param>
public bool IsTabular(PageArea page)
{
// if there's no text at all on the page, it's not a table
Expand All @@ -177,6 +185,7 @@ public bool IsTabular(PageArea page)
{
return false;
}

Table table = tables[0];
int rowsDefinedByLines = table.RowCount;
int colsDefinedByLines = table.ColumnCount;
Expand All @@ -187,6 +196,7 @@ public bool IsTabular(PageArea page)
// TODO WHAT DO WE DO HERE?
System.Diagnostics.Debug.Write("SpreadsheetExtractionAlgorithm.isTabular(): no table found.");
}

table = tables[0];
int rowsDefinedWithoutLines = table.RowCount;
int colsDefinedWithoutLines = table.ColumnCount;
Expand All @@ -196,6 +206,11 @@ public bool IsTabular(PageArea page)
return ratio > MAGIC_HEURISTIC_NUMBER && ratio < (1 / MAGIC_HEURISTIC_NUMBER);
}

/// <summary>
/// Find cells from horizontal and vertical ruling lines.
/// </summary>
/// <param name="horizontalRulingLines"></param>
/// <param name="verticalRulingLines"></param>
public static List<Cell> FindCells(IReadOnlyList<Ruling> horizontalRulingLines, IReadOnlyList<Ruling> verticalRulingLines)
{
List<Cell> cellsFound = new List<Cell>();
Expand Down Expand Up @@ -266,6 +281,11 @@ public static List<Cell> FindCells(IReadOnlyList<Ruling> horizontalRulingLines,
return cellsFound;
}

/// <summary>
/// Find spreadsheet areas from cells.
/// <para>Based on O'Rourke's `Uniqueness of orthogonal connect-the-dots`.</para>
/// </summary>
/// <param name="cells"></param>
public static List<TableRectangle> FindSpreadsheetsFromCells(List<TableRectangle> cells)
{
// via: http://stackoverflow.com/questions/13746284/merging-multiple-adjacent-rectangles-into-one-polygon
Expand Down Expand Up @@ -390,6 +410,7 @@ public static List<TableRectangle> FindSpreadsheetsFromCells(List<TableRectangle
return rectangles;
}

/// <inheritdoc/>
public override string ToString()
{
return "lattice";
Expand Down Expand Up @@ -419,12 +440,6 @@ public override bool Equals(object other)
return this.point.Equals(o.point);
}
return false;
/*
if (this == other)
return true;
if (!(other is PolygonVertex)) return false;
return this.point.Equals(((PolygonVertex)other).point);
*/
}

public override int GetHashCode()
Expand Down
Loading