# JSON accuracy example

### Create copies of ground truth

In [1]:
from json_accuracy import *
from pathlib import Path
import pandas as pd
# Repository root, two directory levels above the notebook's working directory.
root_dir = Path.cwd().parent.parent
gt_json_path = root_dir / "data" / "ground-truth" / "json" / "gt_kbaa-p100.json"

# Raw ground truth, then the version normalized by `filter_expected_columns`.
gt_df_raw = load_json_safely(gt_json_path)
gt_df = filter_expected_columns(gt_df_raw)

# Untouched snapshot, used later to check that comparisons leave `gt_df` unchanged.
gt_df_original = gt_df.copy()

# Baseline "prediction": an exact copy of the ground truth.
pred_df = gt_df.copy()

# Variant 1: three edited cells in rows 2 and 3.
pred_df_cell_mismatch = pred_df.copy()
pred_df_cell_mismatch.loc[2, "description"] = "no description"
pred_df_cell_mismatch.loc[3, "description"] = "Civil War soldier and governor of Illinoi."  # Very similar, should pass Jaro-Winkler
pred_df_cell_mismatch.loc[3, "publishyear"] = 9999

# Variants 2-4: rows removed by index label, then renumbered from 0.
# `drop` returns a new frame, so `pred_df` itself is never modified.
pred_df_top_row_mismatch = pred_df.drop(index=0).reset_index(drop=True)
pred_df_bottom_row_mismatch = pred_df.drop(index=19).reset_index(drop=True)
pred_df_two_rows_mismatch = pred_df.drop(index=[0, 1]).reset_index(drop=True)

# Variant 5: two expected columns missing entirely.
pred_df_removed_col = pred_df.drop(columns=["index", "description"]).reset_index(drop=True)

- Notice that before running `filter_expected_columns`, empty values are populated as `NaN` and all values keep their original capitalization.

In [2]:
gt_df_raw

Unnamed: 0,lastname,firstname,title,city,publisher,publishyear,pagecount,library,description,index,birthyear,deathyear,maidenname
0,Field,Joseph E.,Three years in Texas...,Boston,Abel Tompkins,1836.0,47.0,DLC,His experiences in the War with Mexico.,1920.0,,,
1,Field,Richard,Richard Field.,"Lexington, Mo.",,1930.0,38.0,MoK,Missouri lawyer and judge tells also of his yo...,1921.0,1843.0,,
2,Field,Stephen Johnson,Personal reminiscences of early days in Califo...,San Francisco?,,1880.0,248.0,WHi,By a judge and member of the state legislature.,1922.0,1816.0,1899.0,
3,Fifer,Joseph Wilson,"""Private Joe"" Fifer. Memories of war & peace.","Bloomington, Ill.",Pantagraph pr. co.,1936.0,104.0,WHi,Civil War soldier and governor of Illinois.,1923.0,1840.0,,
4,Finch,Edwin Ward,"The frontier, army and professional life of Ed...",N.Y.,"Press of Simmonds, Manning & Dawson",1909.0,119.0,DLC,A New York physician tells of his boyhood on a...,1924.0,1831.0,,
5,Finck,Henry Theophilos,My adventures in the golden age of music.,N.Y.,Funk & Wagnalls,1926.0,462.0,WU,Music critic.,1925.0,1854.0,1926.0,
6,Finerty,John Frederick,War-path and bivouac... the Big Horn and Yello...,Chicago,,1890.0,460.0,DLC,"Newspaper correspondent with Crook, 1866-67, i...",1926.0,1846.0,1908.0,
7,Finley,James Bradley,"Autobiography of Rev. James B. Finley; or, pio...",Cinc.,"pr. at the Methodist book concern, for the author",1853.0,455.0,WHi,Methodist in the old Northwest.,1927.0,1781.0,1856.0,
8,Finley,James Bradley,Life among the Indians...,Cinc.,Hitchcock & Walden,1868.0,507.0,WHi,An enlarged version of the previous item.,1928.0,1781.0,1856.0,
9,Finn,Francis James,"Father Finn, S.J.; the story of his life told ...",N.Y.,Benziger bros.,1929.0,236.0,NN,Teacher in St. Louis and Cincinnati. A pioneer...,1929.0,1859.0,1928.0,


- After running `filter_expected_columns`:
  - empty values are converted to the specified default value for each column:
    - "" (empty string) for strings
    - 0 for numbers
  - non-existent columns are added with default values
  - all string values are lower-cased

In [3]:
gt_df

Unnamed: 0,lastname,firstname,maidenname,birthyear,deathyear,title,city,publisher,publishyear,pagecount,library,description,index
0,field,joseph e.,,0.0,0.0,three years in texas...,boston,abel tompkins,1836.0,47.0,dlc,his experiences in the war with mexico.,1920.0
1,field,richard,,1843.0,0.0,richard field.,"lexington, mo.",,1930.0,38.0,mok,missouri lawyer and judge tells also of his yo...,1921.0
2,field,stephen johnson,,1816.0,1899.0,personal reminiscences of early days in califo...,san francisco?,,1880.0,248.0,whi,by a judge and member of the state legislature.,1922.0
3,fifer,joseph wilson,,1840.0,0.0,"""private joe"" fifer. memories of war & peace.","bloomington, ill.",pantagraph pr. co.,1936.0,104.0,whi,civil war soldier and governor of illinois.,1923.0
4,finch,edwin ward,,1831.0,0.0,"the frontier, army and professional life of ed...",n.y.,"press of simmonds, manning & dawson",1909.0,119.0,dlc,a new york physician tells of his boyhood on a...,1924.0
5,finck,henry theophilos,,1854.0,1926.0,my adventures in the golden age of music.,n.y.,funk & wagnalls,1926.0,462.0,wu,music critic.,1925.0
6,finerty,john frederick,,1846.0,1908.0,war-path and bivouac... the big horn and yello...,chicago,,1890.0,460.0,dlc,"newspaper correspondent with crook, 1866-67, i...",1926.0
7,finley,james bradley,,1781.0,1856.0,"autobiography of rev. james b. finley; or, pio...",cinc.,"pr. at the methodist book concern, for the author",1853.0,455.0,whi,methodist in the old northwest.,1927.0
8,finley,james bradley,,1781.0,1856.0,life among the indians...,cinc.,hitchcock & walden,1868.0,507.0,whi,an enlarged version of the previous item.,1928.0
9,finn,francis james,,1859.0,1928.0,"father finn, s.j.; the story of his life told ...",n.y.,benziger bros.,1929.0,236.0,nn,teacher in st. louis and cincinnati. a pioneer...,1929.0


In [4]:
pred_df

Unnamed: 0,lastname,firstname,maidenname,birthyear,deathyear,title,city,publisher,publishyear,pagecount,library,description,index
0,field,joseph e.,,0.0,0.0,three years in texas...,boston,abel tompkins,1836.0,47.0,dlc,his experiences in the war with mexico.,1920.0
1,field,richard,,1843.0,0.0,richard field.,"lexington, mo.",,1930.0,38.0,mok,missouri lawyer and judge tells also of his yo...,1921.0
2,field,stephen johnson,,1816.0,1899.0,personal reminiscences of early days in califo...,san francisco?,,1880.0,248.0,whi,by a judge and member of the state legislature.,1922.0
3,fifer,joseph wilson,,1840.0,0.0,"""private joe"" fifer. memories of war & peace.","bloomington, ill.",pantagraph pr. co.,1936.0,104.0,whi,civil war soldier and governor of illinois.,1923.0
4,finch,edwin ward,,1831.0,0.0,"the frontier, army and professional life of ed...",n.y.,"press of simmonds, manning & dawson",1909.0,119.0,dlc,a new york physician tells of his boyhood on a...,1924.0
5,finck,henry theophilos,,1854.0,1926.0,my adventures in the golden age of music.,n.y.,funk & wagnalls,1926.0,462.0,wu,music critic.,1925.0
6,finerty,john frederick,,1846.0,1908.0,war-path and bivouac... the big horn and yello...,chicago,,1890.0,460.0,dlc,"newspaper correspondent with crook, 1866-67, i...",1926.0
7,finley,james bradley,,1781.0,1856.0,"autobiography of rev. james b. finley; or, pio...",cinc.,"pr. at the methodist book concern, for the author",1853.0,455.0,whi,methodist in the old northwest.,1927.0
8,finley,james bradley,,1781.0,1856.0,life among the indians...,cinc.,hitchcock & walden,1868.0,507.0,whi,an enlarged version of the previous item.,1928.0
9,finn,francis james,,1859.0,1928.0,"father finn, s.j.; the story of his life told ...",n.y.,benziger bros.,1929.0,236.0,nn,teacher in st. louis and cincinnati. a pioneer...,1929.0


## Verify exact match works

In [5]:
# Compare the ground truth against an identical copy in "exact" mode.
# In the recorded output every per-column count is 20 and matches == total == 260.
compare_dataframes(gt_df, pred_df, "exact")

{'__COL__:lastname': np.int64(20),
 '__COL__:firstname': np.int64(20),
 '__COL__:maidenname': np.int64(20),
 '__COL__:birthyear': np.int64(20),
 '__COL__:deathyear': np.int64(20),
 '__COL__:title': np.int64(20),
 '__COL__:city': np.int64(20),
 '__COL__:publisher': np.int64(20),
 '__COL__:publishyear': np.int64(20),
 '__COL__:pagecount': np.int64(20),
 '__COL__:library': np.int64(20),
 '__COL__:description': np.int64(20),
 '__COL__:index': np.int64(20),
 'matches': np.int64(260),
 'total': 260,
 'mismatch_bool': False,
 'pred_nrows': 20,
 'pred_adj_nrows': 20,
 'gt_nrows': 20}

### Verify that compare_dataframes does not modify dataframes

In [6]:
# Count the cells that are still equal to the pre-comparison snapshot.
# `.sum()` adds up the True values; `.count()` would only count non-null cells
# and report 260 even if values had changed. Expect 13 columns x 20 rows = 260.
(gt_df == gt_df_original).sum().sum()

np.int64(260)

### Check more detailed results

In [7]:
# `compare_dataframes_core` returns a dict; the cells below read its
# "results", "match_df" and "pred_df_adj" entries.
detailed_results = compare_dataframes_core(gt_df, pred_df, "exact")

In [8]:
detailed_results["match_df"]

Unnamed: 0,lastname,firstname,maidenname,birthyear,deathyear,title,city,publisher,publishyear,pagecount,library,description,index
0,True,True,True,True,True,True,True,True,True,True,True,True,True
1,True,True,True,True,True,True,True,True,True,True,True,True,True
2,True,True,True,True,True,True,True,True,True,True,True,True,True
3,True,True,True,True,True,True,True,True,True,True,True,True,True
4,True,True,True,True,True,True,True,True,True,True,True,True,True
5,True,True,True,True,True,True,True,True,True,True,True,True,True
6,True,True,True,True,True,True,True,True,True,True,True,True,True
7,True,True,True,True,True,True,True,True,True,True,True,True,True
8,True,True,True,True,True,True,True,True,True,True,True,True,True
9,True,True,True,True,True,True,True,True,True,True,True,True,True


## Ensure cell mismatches are handled properly

In [9]:
pred_df_cell_mismatch.head()

Unnamed: 0,lastname,firstname,maidenname,birthyear,deathyear,title,city,publisher,publishyear,pagecount,library,description,index
0,field,joseph e.,,0.0,0.0,three years in texas...,boston,abel tompkins,1836.0,47.0,dlc,his experiences in the war with mexico.,1920.0
1,field,richard,,1843.0,0.0,richard field.,"lexington, mo.",,1930.0,38.0,mok,missouri lawyer and judge tells also of his yo...,1921.0
2,field,stephen johnson,,1816.0,1899.0,personal reminiscences of early days in califo...,san francisco?,,1880.0,248.0,whi,no description,1922.0
3,fifer,joseph wilson,,1840.0,0.0,"""private joe"" fifer. memories of war & peace.","bloomington, ill.",pantagraph pr. co.,9999.0,104.0,whi,Civil War soldier and governor of Illinoi.,1923.0
4,finch,edwin ward,,1831.0,0.0,"the frontier, army and professional life of ed...",n.y.,"press of simmonds, manning & dawson",1909.0,119.0,dlc,a new york physician tells of his boyhood on a...,1924.0


In [10]:
# Exact mode: the three edited cells (description in rows 2 and 3, publishyear
# in row 3) should be the only False entries, i.e. 257 of 260 matches.
detailed_results = compare_dataframes_core(gt_df, pred_df_cell_mismatch, "exact")
print(detailed_results["results"])
detailed_results["match_df"]

{'__COL__:lastname': np.int64(20), '__COL__:firstname': np.int64(20), '__COL__:maidenname': np.int64(20), '__COL__:birthyear': np.int64(20), '__COL__:deathyear': np.int64(20), '__COL__:title': np.int64(20), '__COL__:city': np.int64(20), '__COL__:publisher': np.int64(20), '__COL__:publishyear': np.int64(19), '__COL__:pagecount': np.int64(20), '__COL__:library': np.int64(20), '__COL__:description': np.int64(18), '__COL__:index': np.int64(20), 'matches': np.int64(257), 'total': 260, 'mismatch_bool': False, 'pred_nrows': 20, 'pred_adj_nrows': 20, 'gt_nrows': 20}


Unnamed: 0,lastname,firstname,maidenname,birthyear,deathyear,title,city,publisher,publishyear,pagecount,library,description,index
0,True,True,True,True,True,True,True,True,True,True,True,True,True
1,True,True,True,True,True,True,True,True,True,True,True,True,True
2,True,True,True,True,True,True,True,True,True,True,True,False,True
3,True,True,True,True,True,True,True,True,False,True,True,False,True
4,True,True,True,True,True,True,True,True,True,True,True,True,True
5,True,True,True,True,True,True,True,True,True,True,True,True,True
6,True,True,True,True,True,True,True,True,True,True,True,True,True
7,True,True,True,True,True,True,True,True,True,True,True,True,True
8,True,True,True,True,True,True,True,True,True,True,True,True,True
9,True,True,True,True,True,True,True,True,True,True,True,True,True


In [11]:
# Fuzzy mode: the near-identical description in row 3 should now count as a match,
# leaving one False description (row 2) instead of two. The publishyear edit in
# row 3 still mismatches in the recorded output.
detailed_results = compare_dataframes_core(gt_df, pred_df_cell_mismatch, "fuzzy")
print(detailed_results["results"])
detailed_results["match_df"]

{'__COL__:lastname': np.int64(20), '__COL__:firstname': np.int64(20), '__COL__:maidenname': np.int64(20), '__COL__:birthyear': np.int64(20), '__COL__:deathyear': np.int64(20), '__COL__:title': np.int64(20), '__COL__:city': np.int64(20), '__COL__:publisher': np.int64(20), '__COL__:publishyear': np.int64(19), '__COL__:pagecount': np.int64(20), '__COL__:library': np.int64(20), '__COL__:description': np.int64(19), '__COL__:index': np.int64(20), 'matches': np.int64(258), 'total': 260, 'mismatch_bool': False, 'pred_nrows': 20, 'pred_adj_nrows': 20, 'gt_nrows': 20}


Unnamed: 0,lastname,firstname,maidenname,birthyear,deathyear,title,city,publisher,publishyear,pagecount,library,description,index
0,True,True,True,True,True,True,True,True,True,True,True,True,True
1,True,True,True,True,True,True,True,True,True,True,True,True,True
2,True,True,True,True,True,True,True,True,True,True,True,False,True
3,True,True,True,True,True,True,True,True,False,True,True,True,True
4,True,True,True,True,True,True,True,True,True,True,True,True,True
5,True,True,True,True,True,True,True,True,True,True,True,True,True
6,True,True,True,True,True,True,True,True,True,True,True,True,True
7,True,True,True,True,True,True,True,True,True,True,True,True,True
8,True,True,True,True,True,True,True,True,True,True,True,True,True
9,True,True,True,True,True,True,True,True,True,True,True,True,True


## Ensure top row mismatches are handled properly
- The comparison should pad the prediction with one default-valued row so that it has as many rows as the ground truth.

In [12]:
pred_df_top_row_mismatch.head()

Unnamed: 0,lastname,firstname,maidenname,birthyear,deathyear,title,city,publisher,publishyear,pagecount,library,description,index
0,field,richard,,1843.0,0.0,richard field.,"lexington, mo.",,1930.0,38.0,mok,missouri lawyer and judge tells also of his yo...,1921.0
1,field,stephen johnson,,1816.0,1899.0,personal reminiscences of early days in califo...,san francisco?,,1880.0,248.0,whi,by a judge and member of the state legislature.,1922.0
2,fifer,joseph wilson,,1840.0,0.0,"""private joe"" fifer. memories of war & peace.","bloomington, ill.",pantagraph pr. co.,1936.0,104.0,whi,civil war soldier and governor of illinois.,1923.0
3,finch,edwin ward,,1831.0,0.0,"the frontier, army and professional life of ed...",n.y.,"press of simmonds, manning & dawson",1909.0,119.0,dlc,a new york physician tells of his boyhood on a...,1924.0
4,finck,henry theophilos,,1854.0,1926.0,my adventures in the golden age of music.,n.y.,funk & wagnalls,1926.0,462.0,wu,music critic.,1925.0


In [13]:
# Prediction is missing the first ground-truth row (19 rows vs. 20).
# The recorded results show pred_nrows 19 and pred_adj_nrows 20.
detailed_results = compare_dataframes_core(gt_df, pred_df_top_row_mismatch, "exact")
print(detailed_results["results"])
detailed_results["match_df"]

{'__COL__:lastname': np.int64(19), '__COL__:firstname': np.int64(19), '__COL__:maidenname': np.int64(20), '__COL__:birthyear': np.int64(20), '__COL__:deathyear': np.int64(20), '__COL__:title': np.int64(19), '__COL__:city': np.int64(19), '__COL__:publisher': np.int64(19), '__COL__:publishyear': np.int64(19), '__COL__:pagecount': np.int64(19), '__COL__:library': np.int64(19), '__COL__:description': np.int64(19), '__COL__:index': np.int64(19), 'matches': np.int64(250), 'total': 260, 'mismatch_bool': False, 'pred_nrows': 19, 'pred_adj_nrows': 20, 'gt_nrows': 20}


Unnamed: 0,lastname,firstname,maidenname,birthyear,deathyear,title,city,publisher,publishyear,pagecount,library,description,index
0,False,False,True,True,True,False,False,False,False,False,False,False,False
1,True,True,True,True,True,True,True,True,True,True,True,True,True
2,True,True,True,True,True,True,True,True,True,True,True,True,True
3,True,True,True,True,True,True,True,True,True,True,True,True,True
4,True,True,True,True,True,True,True,True,True,True,True,True,True
5,True,True,True,True,True,True,True,True,True,True,True,True,True
6,True,True,True,True,True,True,True,True,True,True,True,True,True
7,True,True,True,True,True,True,True,True,True,True,True,True,True
8,True,True,True,True,True,True,True,True,True,True,True,True,True
9,True,True,True,True,True,True,True,True,True,True,True,True,True


Note that above, the maiden name, birth year, and death year are empty in the original, so the top row is True for these three columns only and False otherwise.

In [14]:
detailed_results["pred_df_adj"]

Unnamed: 0,lastname,firstname,maidenname,birthyear,deathyear,title,city,publisher,publishyear,pagecount,library,description,index
0,,,,0.0,0.0,,,,0.0,0.0,,,0.0
1,field,richard,,1843.0,0.0,richard field.,"lexington, mo.",,1930.0,38.0,mok,missouri lawyer and judge tells also of his yo...,1921.0
2,field,stephen johnson,,1816.0,1899.0,personal reminiscences of early days in califo...,san francisco?,,1880.0,248.0,whi,by a judge and member of the state legislature.,1922.0
3,fifer,joseph wilson,,1840.0,0.0,"""private joe"" fifer. memories of war & peace.","bloomington, ill.",pantagraph pr. co.,1936.0,104.0,whi,civil war soldier and governor of illinois.,1923.0
4,finch,edwin ward,,1831.0,0.0,"the frontier, army and professional life of ed...",n.y.,"press of simmonds, manning & dawson",1909.0,119.0,dlc,a new york physician tells of his boyhood on a...,1924.0
5,finck,henry theophilos,,1854.0,1926.0,my adventures in the golden age of music.,n.y.,funk & wagnalls,1926.0,462.0,wu,music critic.,1925.0
6,finerty,john frederick,,1846.0,1908.0,war-path and bivouac... the big horn and yello...,chicago,,1890.0,460.0,dlc,"newspaper correspondent with crook, 1866-67, i...",1926.0
7,finley,james bradley,,1781.0,1856.0,"autobiography of rev. james b. finley; or, pio...",cinc.,"pr. at the methodist book concern, for the author",1853.0,455.0,whi,methodist in the old northwest.,1927.0
8,finley,james bradley,,1781.0,1856.0,life among the indians...,cinc.,hitchcock & walden,1868.0,507.0,whi,an enlarged version of the previous item.,1928.0
9,finn,francis james,,1859.0,1928.0,"father finn, s.j.; the story of his life told ...",n.y.,benziger bros.,1929.0,236.0,nn,teacher in st. louis and cincinnati. a pioneer...,1929.0


The inserted top row contains only default values: empty strings for text columns and 0 for numeric columns.

## Ensure bottom row mismatches are handled properly

In [15]:
pred_df_bottom_row_mismatch.tail()

Unnamed: 0,lastname,firstname,maidenname,birthyear,deathyear,title,city,publisher,publishyear,pagecount,library,description,index
14,fischer,augusta catherine,,1891.0,0.0,searchlight; an autobiography.,seattle,,1937.0,233.0,nn,the story of the causes leading to the author'...,1934.0
15,fisher,clara,,0.0,0.0,,,,0.0,0.0,,"see maeder, mrs. clara (fisher).",0.0
16,fisher,daniel webster,,1838.0,1913.0,a human life...,n.y.,revell,1909.0,325.0,pu,presbyterian clergyman who was also president ...,1935.0
17,fisher,elizabeth,munro,1759.0,0.0,"memoirs, of mrs. elizabeth fisher, of the city...",n.y.,,1810.0,48.0,dlc,by an american tory.,1936.0
18,fisher,george adams,,1835.0,0.0,"the yankee conscript, or eighteen months in di...",phila.,j. w. daughaday,1864.0,251.0,wu,by a union soldier.,1937.0


In [16]:
# Prediction is missing the last ground-truth row (19 rows vs. 20).
# The recorded results show only 50 of 260 matches for this case.
detailed_results = compare_dataframes_core(gt_df, pred_df_bottom_row_mismatch, "exact")
print(detailed_results["results"])
detailed_results["match_df"]

{'__COL__:lastname': np.int64(9), '__COL__:firstname': np.int64(2), '__COL__:maidenname': np.int64(18), '__COL__:birthyear': np.int64(2), '__COL__:deathyear': np.int64(10), '__COL__:title': np.int64(0), '__COL__:city': np.int64(3), '__COL__:publisher': np.int64(2), '__COL__:publishyear': np.int64(0), '__COL__:pagecount': np.int64(0), '__COL__:library': np.int64(4), '__COL__:description': np.int64(0), '__COL__:index': np.int64(0), 'matches': np.int64(50), 'total': 260, 'mismatch_bool': False, 'pred_nrows': 19, 'pred_adj_nrows': 20, 'gt_nrows': 20}


Unnamed: 0,lastname,firstname,maidenname,birthyear,deathyear,title,city,publisher,publishyear,pagecount,library,description,index
0,False,False,True,True,True,False,False,False,False,False,False,False,False
1,True,False,True,False,True,False,False,False,False,False,False,False,False
2,True,False,True,False,False,False,False,True,False,False,False,False,False
3,False,False,True,False,False,False,False,False,False,False,True,False,False
4,False,False,True,False,True,False,False,False,False,False,False,False,False
5,False,False,True,False,False,False,True,False,False,False,False,False,False
6,False,False,True,False,False,False,False,False,False,False,False,False,False
7,False,False,True,False,False,False,False,False,False,False,False,False,False
8,True,True,True,True,True,False,True,False,False,False,True,False,False
9,False,False,True,False,False,False,False,False,False,False,False,False,False


In [17]:
detailed_results["pred_df_adj"]

Unnamed: 0,lastname,firstname,maidenname,birthyear,deathyear,title,city,publisher,publishyear,pagecount,library,description,index
0,,,,0.0,0.0,,,,0.0,0.0,,,0.0
1,field,joseph e.,,0.0,0.0,three years in texas...,boston,abel tompkins,1836.0,47.0,dlc,his experiences in the war with mexico.,1920.0
2,field,richard,,1843.0,0.0,richard field.,"lexington, mo.",,1930.0,38.0,mok,missouri lawyer and judge tells also of his yo...,1921.0
3,field,stephen johnson,,1816.0,1899.0,personal reminiscences of early days in califo...,san francisco?,,1880.0,248.0,whi,by a judge and member of the state legislature.,1922.0
4,fifer,joseph wilson,,1840.0,0.0,"""private joe"" fifer. memories of war & peace.","bloomington, ill.",pantagraph pr. co.,1936.0,104.0,whi,civil war soldier and governor of illinois.,1923.0
5,finch,edwin ward,,1831.0,0.0,"the frontier, army and professional life of ed...",n.y.,"press of simmonds, manning & dawson",1909.0,119.0,dlc,a new york physician tells of his boyhood on a...,1924.0
6,finck,henry theophilos,,1854.0,1926.0,my adventures in the golden age of music.,n.y.,funk & wagnalls,1926.0,462.0,wu,music critic.,1925.0
7,finerty,john frederick,,1846.0,1908.0,war-path and bivouac... the big horn and yello...,chicago,,1890.0,460.0,dlc,"newspaper correspondent with crook, 1866-67, i...",1926.0
8,finley,james bradley,,1781.0,1856.0,"autobiography of rev. james b. finley; or, pio...",cinc.,"pr. at the methodist book concern, for the author",1853.0,455.0,whi,methodist in the old northwest.,1927.0
9,finley,james bradley,,1781.0,1856.0,life among the indians...,cinc.,hitchcock & walden,1868.0,507.0,whi,an enlarged version of the previous item.,1928.0


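The padding row is inserted at the top of the prediction (row 0 above) even though the missing row was at the bottom, so every prediction row is shifted down by one relative to the ground truth. Most cells being `False` is therefore expected.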

The only exceptions are cells that happen to coincide after the shift, e.g. consecutive rows that share a last name, or sparse rows whose empty cells equal the default values.

## Ensure mismatches of more than 1 row are not compared

In [18]:
pred_df_two_rows_mismatch.head()

Unnamed: 0,lastname,firstname,maidenname,birthyear,deathyear,title,city,publisher,publishyear,pagecount,library,description,index
0,field,stephen johnson,,1816.0,1899.0,personal reminiscences of early days in califo...,san francisco?,,1880.0,248.0,whi,by a judge and member of the state legislature.,1922.0
1,fifer,joseph wilson,,1840.0,0.0,"""private joe"" fifer. memories of war & peace.","bloomington, ill.",pantagraph pr. co.,1936.0,104.0,whi,civil war soldier and governor of illinois.,1923.0
2,finch,edwin ward,,1831.0,0.0,"the frontier, army and professional life of ed...",n.y.,"press of simmonds, manning & dawson",1909.0,119.0,dlc,a new york physician tells of his boyhood on a...,1924.0
3,finck,henry theophilos,,1854.0,1926.0,my adventures in the golden age of music.,n.y.,funk & wagnalls,1926.0,462.0,wu,music critic.,1925.0
4,finerty,john frederick,,1846.0,1908.0,war-path and bivouac... the big horn and yello...,chicago,,1890.0,460.0,dlc,"newspaper correspondent with crook, 1866-67, i...",1926.0


In [19]:
# Prediction is two rows short (18 vs. 20). In the recorded output no cell
# comparison is made: mismatch_bool is True and matches/total are NaN.
detailed_results = compare_dataframes_core(gt_df, pred_df_two_rows_mismatch, "exact")
detailed_results["results"]

{'matches': nan,
 'total': nan,
 'mismatch_bool': True,
 'pred_nrows': 18,
 'pred_adj_nrows': 18,
 'gt_nrows': 20}

## Test how missing columns are handled

In [20]:
pred_df_removed_col

Unnamed: 0,lastname,firstname,maidenname,birthyear,deathyear,title,city,publisher,publishyear,pagecount,library
0,field,joseph e.,,0.0,0.0,three years in texas...,boston,abel tompkins,1836.0,47.0,dlc
1,field,richard,,1843.0,0.0,richard field.,"lexington, mo.",,1930.0,38.0,mok
2,field,stephen johnson,,1816.0,1899.0,personal reminiscences of early days in califo...,san francisco?,,1880.0,248.0,whi
3,fifer,joseph wilson,,1840.0,0.0,"""private joe"" fifer. memories of war & peace.","bloomington, ill.",pantagraph pr. co.,1936.0,104.0,whi
4,finch,edwin ward,,1831.0,0.0,"the frontier, army and professional life of ed...",n.y.,"press of simmonds, manning & dawson",1909.0,119.0,dlc
5,finck,henry theophilos,,1854.0,1926.0,my adventures in the golden age of music.,n.y.,funk & wagnalls,1926.0,462.0,wu
6,finerty,john frederick,,1846.0,1908.0,war-path and bivouac... the big horn and yello...,chicago,,1890.0,460.0,dlc
7,finley,james bradley,,1781.0,1856.0,"autobiography of rev. james b. finley; or, pio...",cinc.,"pr. at the methodist book concern, for the author",1853.0,455.0,whi
8,finley,james bradley,,1781.0,1856.0,life among the indians...,cinc.,hitchcock & walden,1868.0,507.0,whi
9,finn,francis james,,1859.0,1928.0,"father finn, s.j.; the story of his life told ...",n.y.,benziger bros.,1929.0,236.0,nn


In [21]:
# Re-run the column filter on the frame that lacks "index" and "description".
# The displayed result shows both columns present again.
filter_expected_columns(pred_df_removed_col)

  df = df.fillna(EXPECTED_COLUMNS_DEFAULTS)


Unnamed: 0,lastname,firstname,maidenname,birthyear,deathyear,title,city,publisher,publishyear,pagecount,library,description,index
0,field,joseph e.,,0.0,0.0,three years in texas...,boston,abel tompkins,1836.0,47.0,dlc,,0
1,field,richard,,1843.0,0.0,richard field.,"lexington, mo.",,1930.0,38.0,mok,,0
2,field,stephen johnson,,1816.0,1899.0,personal reminiscences of early days in califo...,san francisco?,,1880.0,248.0,whi,,0
3,fifer,joseph wilson,,1840.0,0.0,"""private joe"" fifer. memories of war & peace.","bloomington, ill.",pantagraph pr. co.,1936.0,104.0,whi,,0
4,finch,edwin ward,,1831.0,0.0,"the frontier, army and professional life of ed...",n.y.,"press of simmonds, manning & dawson",1909.0,119.0,dlc,,0
5,finck,henry theophilos,,1854.0,1926.0,my adventures in the golden age of music.,n.y.,funk & wagnalls,1926.0,462.0,wu,,0
6,finerty,john frederick,,1846.0,1908.0,war-path and bivouac... the big horn and yello...,chicago,,1890.0,460.0,dlc,,0
7,finley,james bradley,,1781.0,1856.0,"autobiography of rev. james b. finley; or, pio...",cinc.,"pr. at the methodist book concern, for the author",1853.0,455.0,whi,,0
8,finley,james bradley,,1781.0,1856.0,life among the indians...,cinc.,hitchcock & walden,1868.0,507.0,whi,,0
9,finn,francis james,,1859.0,1928.0,"father finn, s.j.; the story of his life told ...",n.y.,benziger bros.,1929.0,236.0,nn,,0


The dropped columns are added back and filled with their default values (`""` for `description`, `0` for `index`), showing that the code handles non-existent columns properly.