In [1]:
!ls "input/1850/GNA_bk_1850.txt"

input/1850/GNA_bk_1850.txt


In [2]:
#load data
import glob # import glob to retrieve files/pathnames matching a specified pattern
file_path = "input/1850/GNA_bk_1850.txt"
with open(file_path, 'r', encoding="cp1252") as f: # The encoding was different from the previous 2
  data = f.readlines()  

In [3]:
import pandas as pd
pd.DataFrame(data).shape

(17089, 1)

There are 17,089 rows

In [4]:
data[0:50]

['HE ARNES’\n',
 'BROOKLYN CITY\n',
 'DIRECTORY.\n',
 '1850-1851.\n',
 'NOTE.—Names having a * are the names of colored people\n',
 'Abbreviationsh. stands for house, n, for near, c. for corner, op. for\n',
 'opposite,b. for between, The precise location of residences thus de-\n',
 'scribed, may be ascertained by reference to the SHEET DIREOTORY.\n',
 'A\n',
 'Abberly Richard, shoemaker 168 Hudson av\n',
 'Abberly Samuel, mason Dean n Vanderbilt av\n',
 'Abbey A C, carpenter 82 Poplar\n',
 'ABBES HORATIO G, Columbia Institute for the education of\n',
 'boys 75 Columbia\n',
 'Abbey W S, merchant NYh Union n Court\n',
 'Abbott Abraham, marble sawyer Pacific n Boerum\n',
 'Abbott Daniel, 25 Front\t[Court\n',
 'Abbott Francis H, merchant 141 Front NYh Joralemon n\n',
 'Abbott H B, milkman Kent av north\n',
 'Abbott James, laborer Bond c State\t[Hoyt\n',
 'Abbott John D, merchant 295.5 Pearl NYh 362 Atlantic n\n',
 'Abbott Moses, furrier 74 Concord\n',
 'Abbott Samuel D, butcher Myrtle c Ja

## Clean Data

In [5]:
# Remove first 8 lines of the Entry
data = data[8:17089]

In [6]:
# Remove Headers from data
header = ['BROOKLYN DIRECTORY.\n', 'BROOKLYN DIRECTORY,\n', 'BROOKLYN DIR-ECTORY,\n', 'BROOKLYN DIRECTOE.Y.\n', 'BROOKLYN DIBECTORY.\n', 'BROOKLYN DIR-ECTORY\n', 'BROOKLYN BIRECTORY,\n', 'BROOK.LYN DIRECTORY.', '160 BROOKLYN DIRECTORY.', 'BROOKLYN HIRECTORY.', 'BROOKLYN DIRECTOHY.', 'BROOKLYN tlKECTORY.', 'HIIOOKLYN DIRECTORY.', 'BROOKLYN DIKECTOKY.', 'BH00KL1N DIRECTORY.']
data = [entry for entry in data if entry not in header]

# Remove \n
data = [entry.replace('\n', '') for entry in data]

# Remove \t
data = [entry.replace('\t', ' ') for entry in data]

# Remove Blanks
data = [entry for entry in data if entry != ' ']

# Remove Page Numbers
import re
data = [entry for entry in data if not re.search('^[0-9]+$', entry)]

In [7]:
# Remove any BROOKLYN DIRECTORY. Page number entries
import re
for index in range(0, len(data)):
    clear = re.findall(r'^BROOKLYN DIRECTORY. [0-9]+$', data[index])
    if clear:
        data[index] = ""
    else:
        pass

In [8]:
# Change 'IFF' to N
for index in range(0, len(data)):
    n = re.findall('^Iff$', data[index])
    if n:
        data[index] = "N"
    else:
        pass

In [9]:
len(data)

16071

### Moving the [... down to the next row

In [10]:
import re
for index in range(0, len(data)):
    match = re.findall(r'\[.*$', data[index])
    if match:
        data[index + 1] = data[index + 1] + ' ' + str(match[0]).strip('[')
        data[index] = data[index].replace(str(match[0]), '')
    else:
        pass

In [11]:
len(data)

16071

### Issues with Paranthesis
- Adams J W, civil engineer Hoyt n State (Classon av
- Addy Edward 50 Middagh (tionary 262 Fulton

#### Resolve By:
- If the length of the line before is longer than the length of the line after, add the (... to the line before
    - vice versa

In [12]:
for index in range(0, len(data)):
    match = re.findall(r'\(.*$', data[index])
    if match:
        if len(data[index + 1]) > len(data[index - 1]):
            data[index + 1] = data[index + 1] + ' ' + str(match[0]).strip('(')
            data[index] = data[index].replace(str(match[0]), '')
        else:
            data[index - 1] = data[index - 1] + ' ' + str(match[0]).strip('(')
            data[index] = data[index].replace(str(match[0]), '')
    else:
        pass    

In [13]:
# Remove those Sta- ionary types
for index in range(0, len(data)):
    data[index] = data[index].replace('- ', '')

In [14]:
len(data)

16071

### Occupations need to be separated from Address by ,

In [15]:
# Deal with one word Occupations
pattern1 = ', [a-z]+ [0-9A-Z]' # If occupation is 1 word
for index in range(0, len(data)):
    if re.findall(pattern1, data[index]):
        x = re.findall(pattern1, data[index])[0]
        y = x.replace(' ', ', ')
        z = y.replace(',,', ',')
        data[index] = data[index].replace(x, z)
    else:
        pass

In [16]:
# Deal with two word Occupations
pattern2 = ', [a-z]+ [a-z]* [0-9A-Z]' # If occupation is 2 words
for index in range(0, len(data)):
    if re.findall("widow", data[index]): # Ignore widow cases
        pass
    else:
        if re.findall(pattern2, data[index]):
            x = re.findall(pattern2, data[index])[0]
            y = re.findall(' [0-9A-Z]', x)[0]
            z = re.sub(y, ',' + y, x)
            data[index] = data[index].replace(x, z)
        else:
            pass

In [17]:
# Deal with 3 word Occupations
pattern3 = ', [a-z]+ [a-z]* [a-z]* [0-9A-Z]' # If occupation is 3 words
for index in range(0, len(data)):
    if re.findall("widow", data[index]): # Ignore widow cases
        pass
    else:
        if re.findall(pattern3, data[index]):
            x = re.findall(pattern3, data[index])[0]
            y = re.findall(' [0-9A-Z]', x)[0]
            z = re.sub(y, ',' + y, x)
            data[index] = data[index].replace(x, z)
        else:
            pass

In [18]:
# Deal with 4 word Occupations
pattern4 = ', [a-z]+ [a-z]* [a-z]* [a-z]* [0-9A-Z]' # If occupation is 4 words
for index in range(0, len(data)):
    if re.findall("widow", data[index]): # Ignore widow cases
        pass
    else:
        if re.findall(pattern4, data[index]):
            x = re.findall(pattern4, data[index])[0]
            y = re.findall(' [0-9A-Z]', x)[0]
            z = re.sub(y, ',' + y, x)
            data[index] = data[index].replace(x, z)
        else:
            pass

In [19]:
# Deal with 5 word Occupations
pattern5 = ', [a-z]+ [a-z]* [a-z]* [a-z]* [a-z]* [0-9A-Z]' # If occupation is 5 words
for index in range(0, len(data)):
    if re.findall("widow", data[index]): # Ignore widow cases
        pass
    else:
        if re.findall(pattern5, data[index]):
            x = re.findall(pattern5, data[index])[0]
            y = re.findall(' [0-9A-Z]', x)[0]
            z = re.sub(y, ',' + y, x)
            data[index] = data[index].replace(x, z)
        else:
            pass

In [20]:
# Deal with 6 word Occupations
pattern6 = ', [a-z]+ [a-z]* [a-z]* [a-z]* [a-z]* [a-z]* [0-9A-Z]' # If occupation is 6 words
for index in range(0, len(data)):
    if re.findall("widow", data[index]): # Ignore widow cases
        pass
    else:
        if re.findall(pattern6, data[index]):
            x = re.findall(pattern6, data[index])[0]
            y = re.findall(' [0-9A-Z]', x)[0]
            z = re.sub(y, ',' + y, x)
            data[index] = data[index].replace(x, z)
        else:
            pass

In [21]:
# Deal with 7 word Occupations
pattern7 = ', [a-z]+ [a-z]* [a-z]* [a-z]* [a-z]* [a-z]* [a-z]* [0-9A-Z]' # If occupation is 7 words
for index in range(0, len(data)):
    if re.findall("widow", data[index]): # Ignore widow cases
        pass
    else:
        if re.findall(pattern7, data[index]):
            x = re.findall(pattern7, data[index])[0]
            y = re.findall(' [0-9A-Z]', x)[0]
            z = re.sub(y, ',' + y, x)
            data[index] = data[index].replace(x, z)
        else:
            pass

In [22]:
for index in range(0, len(data)):
    print(data[index])

A
Abberly Richard, shoemaker, 168 Hudson av
Abberly Samuel, mason, Dean n Vanderbilt av
Abbey A C, carpenter, 82 Poplar
ABBES HORATIO G, Columbia Institute for the education of
boys 75 Columbia
Abbey W S, merchant, NYh Union n Court
Abbott Abraham, marble sawyer, Pacific n Boerum
Abbott Daniel, 25 Front 
Abbott Francis H, merchant, 141 Front NYh Joralemon n Court
Abbott H B, milkman, Kent av north
Abbott James, laborer, Bond c State 
Abbott John D, merchant, 295.5 Pearl NYh 362 Atlantic n Hoyt
Abbott Moses, furrier, 74 Concord
Abbott Samuel D, butcher, Myrtle c Jay h 115 Johnson
Abburty John, shoemaker, 261 Marshall
Abel E L, accountant, 301 Gold
Aber Hiram, sash and blind maker, 33 Prince
Aber Smith M, sashmaker, 288 Hudson av
Abercrombie George, shoemaker, Navy c Sands
Abercrombie John, shoemaker, 116 Concord
Aborn Robert W, merchant, NYh 180 Henry
Abraham David, capmaker, 5 York
Abrahams Joseph, laborer, 3 Kelsey’s alley
Abrahams Jane, widow of James, 95 Concord
*Abrahams Thompson, 

Buck John B, lithographer, Skillman n Myrtle av
Buck Josephus, carman, 1-16 Nassau c Stanton 
Buck Richard P, merchant, 29 South N Y h 97 State c Clinton
Buck Mary, widow, 109 Front
Euckbee Susan, widow, 51 Main
Buckland William, mason, Court n Sackett
Buckelew Alfred, painter, 14 Prince 
Bukelew Ira C, stair case maker, Raymond n Myrtle h 61 Carlton n Myrtle av
Buckenbergen Anthony, cabinet maker, Kent n Myrtle av
Buckley James, machinist, 132 Water
Buckley Jeremiah, carman, Butler n Bond
Buckley John, laborer, 212 Front
Buckley Amon, merchant, NYh Clinton n Harrison
Buckley Elizabeth, Pearl c Concord
Buckley John, carman, Butler n Bond
Buckley John, tavern, Myrtle av n Adelphi
Buckley John, porterhouse, Park n Vanderbilt av
Buckley Michael, laborer, Butler n Bond
Buckley T, boat builder, 43 Carlton av
Buckley William, laborer, Penny Bridge rd Gowanus
Bucklin Thomas P, merchant, N Y h 36 Willow
Buckey William,jeweller 145 Concord
Budd , lieutcntant, U s N 41 Butler
Budden Frederick P,

Cronk Alexander, engineer, 62 Pacific n Smith
Cronk Daniel, 62 Pacific n Powers
Crook Gabriel B, ship joiner, 47 High 
Crook George, fishing tackle manufacturer, 50 Fulton N Y h 31 Front
Crook John H, refectory, 19 Fulton marketN Yh 174 Adams
Crook Phebe, widow of John, 126 Pearl
Crook Richard L, mer. 22 Pine N Y h Clinton n Fulton av
Crook Robert, coachman, 256 Hudson av
Crook Samuel H, dining saloon, 27 Fulton NYh 191 Front
Crook Samuel, refectory, N Y h 113 High
Crook Thomas, finding store, 132 Atlantic n Henry 
Crooke Philip S, attorney and counsellor, 307 Fulton h Flatbush
Crooker A C, seaman, 156 Atlantic
Crooker Catherinc, school, 109 Livingston 
Crooker Geo. R, lock manf. Atlantic n Clinton h Hicks n President Livingston
Crooker Henry, pump maker, 12 Plymouth 
Crooker Zenas, lock manufacturer, Atlantic n Clinton h 109
Crooker Z B, nail factory c, Water & Jay office Atlantic n
Clinton h Court n Degraw
Crooker’s Brooklyn lock company Atlantic n Clinton
Croos Helena, 142 Sands
Cro

Farrell Thomas, grocer, 19th st n 4th av Gowanus
Farrelly Catherine, widow fancy store 60 Atlantic
Farren Daniel, stone cutter, 184 York
Farren Dennis, grocer, Furman c Joralemon
Farren James, mason, 42 Main
Farren James, stone cutter, 260 Marshall
Farren John, 1st W'ard Hotel Furman n Doughty
Farren John, laborer, John c Gold
Farren John, engineer, 9S Gold
Farren John, laborer, 110 York
Farren Michael, sawyer, Raymond n Tillary
Farren Michael, 38 Little
Farren M, engineer, 33 Hudson av
Farren Neil, teamster, Raymond n Tillary
Farren Richard, machinist, Graham n Flushing av
Fairer Henry, grocer, Charles c Prospect
Farrigan Owen, grocer, 64 Hudson av 
F'arrington Hiram, backsmith, Flushing n Kent av & FrankJin n Myrtle av
Farrington J M, butcher, 79 Bridge
Farrington L, cooper, Hartt’s Alley n Gold 
Farwell John W, salesman, 10 Cottage row Columbia n Joralemon
Fassold John, musician, Navy n York
Faulkner George, broker, NYh 139 Duffield 
Faulkner James T, cap maker, 145 Water N Y h 81 W

Holdridge Daniel L M, tailor, 131 Pearl
Holdsworth John, hatter, 74 Fulton NYh Clinton n Myrtle av
Holin John C, shoemaker, Fulton c Hudson av
Hollahan Thomas, contractor, Pacific n Smith
Hollahan Tnomas, laborer, Bergen c Smith
Holland Sarah, widow, 193 Pearl
Holland Stephen, printer, 2 Prince
Hollely Joseph, brass turner, 250 Marshall
Hollely Thomas C, brass finisherr, 141 Front
Hollely Thomas, turner, 134 Hudson av
Hollis Edward, clerk, 67 Lawrence
Hollis Sihs, shoemaker, Myrtle n Carlton av
Hollister Nathan, grocer, 73 Atlantic h 98 Harrison
Hollister W H, agent union life, Ins Co 37 Wall NYh 279 Jay
Holly John, grocer, Hamilton av n Atlantic Dock 
Holly William, carpenter, 138 W'illoughby
Holly Lawrence K, mason, 269 Gold
Holman Harman, grocer, Myrtle av c Raymond
Holmes Charles, grocer, 298 Gold b Willoughby & Myrtle av
Holmes Henry A, ship master, Carlton n Park av
Hol mes John, clerk, Mill road Gowanus
Hol mes Miss Lucy, milliner, 253 Fulton c Johnson
Holmes Nathaniel, lithogra

Magbey James, japanner, Smith n Hamilton av
Magie James K, printer, 149 Carll n Myrtle av
Magiff Michael, laborer, 30 Amity n Columbia
Magill A H, physician, Myrtle av b Raymond & Canton
Magner Trebius, 202 Columbia n State
Magushen Patrick, laborer, Amity n Columbia
Mal,ady James, laborer, 80 Prince
Mahady John, laborer, 6 Hart’s alley n Bridge
Mahady William, laborer, Prince n Tillary
Mahan Edward, Kent n Myrtle av
'Mahan Henry,jeweller 138 Carll
Mahan Michael, laborer, Harris buildings Plymouth n Little
Mahar Daniel, cooper, NYh Kent n Park av
Maher Daniel, laborer, Pearl b Plymouth and Water
Maher Edward, rigger, 3 Howard’s Court Main st
BROOKLYN BIEECTOKY.
Maher Edward, laborer, Jay c Harper court
Maher James, rigger, 206 High
Maher John F, printer, 150 York
Maher P, grocery, Columbia c Doughty
Maher Patrick, laborer, Pacific n Powers
Maher Thomas, carman, 4 Tiffany place n Harrison
Mahon John, morocco f'actory 17 & 19 Adams
Mahon Julia, widow, Amity n Columbia
Mahon Patrick, tail

NEVIN JAMES, book store, 252 Fulton h 267 Washington
Nevin John, grocer, 108 Court
BE00KLYN D1KECT0EY.
NEVIN MICHAEL, book store, 176 Fulton n Orange
Nevins William, sash and blind maker, 168 Johnson 
Nevins Edward M, sash and blind maker, Adelphi n Myrtle av
Newbould J A, com. mer. N Y h 92 Columbia
Newbrook B, tailor, Flushing av op Spencer
Newcomb Harvey C, 11 Tillary
Newcomb James W, builder, 88 Duffield
Newcomb James, carpenter, 14 Prince
Newell Daniel, comb maker, 59 Tillary
Newell John, engineer, 214 Pearl
Newell William, machinist, Navy c DeKalb av
Newell W M, shoe dealer, 111 Pearl N Y h 4 Sydney place
Newett John, clerk, 212 Jay
Newey Samuel, 42 State
Newham James, Willow c Poplar
Newland James W, machinist, 58 Adams
Newlin Edward, bookeeper, NYh Hamilton av n Court st Sands
Newman Christopher, Skillman n DeKalb av 
Newman Clement D, att’y & coun. 80 Nassau N Y h 51
Newman 'Christopher, laborer, Hudson av c Bolivar
Newman Franklin, com. mer. 50 Broad NYh Clinton n Myrtle av
N

Schultz John D, importer, 39 John NYh Dean n Powers
SCHULTZ & STRIKER, lumber yard, Front b Bridge & Gold
Schuyler William, 86 Middagh
*Schuyler Joseph, 101 Nassau
Scoles Frederick, Kent av north op Navy yard
Scoles Maria, widow, Kent av north op Navy yard
Scott Alexander, carpenter, 212 Jay
Scott Bernard, laborer, Court n 4th Place
Scott George, miller, 316 Atlantic
Scott Gould, upholsterer, 154 Atlantic
Scott Hugh, grocer, 75 Furman
Scott James, carpenter, Schermerhorn n Powers
,Scott James, lighterman, 157 Furman
Scott J L, druggist, 7 Hamilton av
Scott James L, Columbia n President
Scott John, laborer, Adams b Front and Water
Scott John, printer, 60 Stanton
Scott John, grocer, Dean c Underbill av
Scott John, carpenter, Bondc State
Scott John, mason, Johnson c Gold
Scott John H, hardware, NY h 89 Henry
Scott Mary, dressmaker, Pearl c Concord
Scott Sands, Washington st Shades 259 Washington
Scott Thomas P, book agent, 26 John NYh Navy n DeKalb av
Scott YVilliam, baker, 114 Navy 
Scot

Van Brunt Nicholas R, coal yard, S3 Myrtle av h Fulton av op Bridge
Van Brunt Nicholas, Pacific n Hoyt 
Van Brunt Robert B, carpenter, 1U7 Navy
Van Brunt Rulif, staircaser, 30 Stanton
Van Brunt Tunis, oyst,er saloon 23 Fulton
Van Brunt Thomas H, gold pen maker, N Y h 88 Duffield
Van Brunt William B, undertakerand sexton, South Presby-
terian Church h 133 Atlantic
Van Brunt William, silver plater, Cumberland n Myrtle av
Van Buren M B, 6 Cottage pl Columbia n Degraw
VanP,uren Thomas W, carpenter, 144 York
Van Buskirk Abraham, merchant, N Y’ h 126 Hicks
Van Cleef Cornelius, Jamaica rd n Clinton av
Van Cleef John, fancy dry goods, 259 Fu]ton
Van Cleef Rulif S, ferrymaster, 156 Joralemon n Court
Van Cott Ann, widow boarding house 13 Atlantic
Van Cott Catherine, widow, 3 Howards ex pl Watcr c Main
Van Cott J M, attorney at law, NYh Henry c Montague pl
Van Cott & Silliman, coffee and spice mills, 141 Adams
Van Doren C W, dry goods store, 264 Fulton c Pierrepont
Van Duren P, laborer, Concord n

### Divide records into different sections based on first letter

In [23]:
d = {}
value = []
i = 0
for line in data:
  line = line.strip()
  if line:
    if len(line) == 1: 
      if re.match("^[A-Z]$", line):
        # save previous value
        if value:
          if value[0] == "'":
            value = value[1:]
          d[key] = value
        
        key = line
        value = []
    else:
      value.append(line)

In [24]:
# Check if function above works
len(d)

24

## Combline lines that belong to the same record

Criteria:

1) First letter of next line not equal to the initial letter in this section

2) Special cases in the start of next line: B’klyn, B'way, E., W.

In [25]:
for key,value in d.items():
  initial = key[0]
  new_value = []
  n = len(value)
  i = 0
  
  while i < n:
    if i < n-1:
      if (value[i+1][0] != initial and value[i+1][0] != "*") or (value[i+1][0:2] in ["B’", "E.", "W."]) or (value[i+1][0:2] == ". "):
        if value[i][-1] == "-":
          s = value[i][:-1] + value[i+1]
        else:
          s = value[i] + ' ' + value[i+1]
        new_value.append(s)
        i += 2
      else:
        new_value.append(value[i])
        i += 1 
    else:
      if value[i][0] == initial:
        new_value.append(value[i])
      i += 1
  
  d[key] = new_value      

## Identify business vs person records

Made an update on 27/03/20 to change the index notation.

* The first letter of Company names were being removed by the function
    * ZUCCA BROTHERS
    * ARNAULT PETER
    * APPELL JACOB
    * APPLETON’S AMERICAN CYCLOPG5DIA
    * BURTON A. B 

In [26]:
# add an index to each record
# in another dictionary, for each index specify whether it's business or person record
index = 1
d_bp = []
for key,value in d.items():
  n = len(value)
  for i in range(n):
    ls = value[i].split()
    if ls:
      if ls[0][0] == "*" or (len(ls[0]) > 1 and ls[0].isupper()): # Upper case as well
        value[i] = value[i][0:] # Changed from [0:] to [1:]
        d_bp.append([str(index), "b"])
      else:
        d_bp.append([str(index), "p"])
    
      value[i] = str(index) + "\t" + value[i]
      index += 1

## Save files

In [27]:
file = open("input/1850/nypl_1850_bk_starred_clean.txt", "w")
for key,value in d.items():
  file.write(key + '\n')
  for record in value:
    file.write(record + '\n')
  file.write('\n')

file.close()

file2 = open("input/1850/bk_index_business_person.txt", "w")
for item in d_bp:
  file2.write(item[0] + ' ' + item[1] + '\n')
  
file2.close()