In [95]:
!ls "input/1850/GNA_bk_1850.txt"

input/1850/GNA_bk_1850.txt


In [96]:
#load data
import glob # import glob to retrieve files/pathnames matching a specified pattern
file_path = "input/1850/GNA_bk_1850.txt"
with open(file_path, 'r', encoding="cp1252") as f: # The encoding was different from the previous 2
  data = f.readlines()  

In [97]:
import pandas as pd
pd.DataFrame(data).shape

(17089, 1)

There are 17,089 rows

In [98]:
data[0:10]

['HE ARNES’\n',
 'BROOKLYN CITY\n',
 'DIRECTORY.\n',
 '1850-1851.\n',
 'NOTE.—Names having a * are the names of colored people\n',
 'Abbreviationsh. stands for house, n, for near, c. for corner, op. for\n',
 'opposite,b. for between, The precise location of residences thus de-\n',
 'scribed, may be ascertained by reference to the SHEET DIREOTORY.\n',
 'A\n',
 'Abberly Richard, shoemaker 168 Hudson av\n']

## Clean Data

In [99]:
# Remove Headers from data
header = ['BROOKLYN DIRECTORY.\n', 'BROOKLYN DIRECTORY,\n', 'BROOKLYN DIR-ECTORY,\n', 'BROOKLYN DIRECTOE.Y.\n', 'BROOKLYN DIBECTORY.\n', 'BROOKLYN DIR-ECTORY\n']
data = [entry for entry in data if entry not in header]

# Remove \n
data = [entry.replace('\n', '') for entry in data]

# Remove \t
data = [entry.replace('\t', ' ') for entry in data]

# Remove Blanks
data = [entry for entry in data if entry != ' ']

# Remove Page Numbers
import re
data = [entry for entry in data if not re.search('^[0-9]+$', entry)]


In [100]:
len(data)

16081

### Moving the [... down to the next row

In [101]:
import re
for index in range(0, len(data)):
    match = re.findall(r'\[.*$', data[index])
    if match:
        data[index + 1] = data[index + 1] + ' ' + str(match[0]).strip('[')
        data[index] = data[index].replace(str(match[0]), '')
    else:
        pass

### Issues with Paranthesis
- Adams J W, civil engineer Hoyt n State (Classon av
- Addy Edward 50 Middagh (tionary 262 Fulton

Abbreviations h. stands for house, n, for near, c. for corner, op. for\n',
 'opposite,b. for between

In [102]:
data[0:7]

['HE ARNES’',
 'BROOKLYN CITY',
 'DIRECTORY.',
 '1850-1851.',
 'NOTE.—Names having a * are the names of colored people',
 'Abbreviationsh. stands for house, n, for near, c. for corner, op. for',
 'opposite,b. for between, The precise location of residences thus de-']

In [103]:
# Remove first 8 lines of the Entry
data = data[8:16081]

### Occupations need to be separated from Address by ,

- Make a list of Occupations from 1850 MN dataset

In [195]:
pattern = ', [A-Za-z]+ [0-9A-Z]'
for index in range(0, len(data)):
    if len(data[index]) <3:
        pass
    else:
        if re.findall(pattern, data[index]):
            x = str(re.findall(pattern, data[index]))
            y = str(x).replace(' ', ', ')
            z = y.replace(',,', ',')
            data[index] = data[index].replace(str(x), str(z))

## Divide records into different sections based on first letter

In [88]:
import re

d = {}
value = []
i = 0
for line in data:
  line = line.strip()
  if line:
    if len(line) < 3: 
      # 1850 BK data is not split by '***' instead it is split by 'B, C, D'
      if bool(re.match("([A-Z])", line)) == True: # Removed \. because this dataset does not have that
        # save previous value
        if value:
          if value[0] == "'":
            value = value[1:]
          d[key] = value
        
        key = line
        value = []
    else:
      value.append(line)

In [89]:
# Check if function above works
len(d)

25

## Combline lines that belong to the same record

Criteria:

1) First letter of next line not equal to the initial letter in this section

2) Special cases in the start of next line: B’klyn, B'way, E., W.

In [90]:
for key,value in d.items():
  initial = key[0]
  new_value = []
  n = len(value)
  i = 0
  
  while i < n:
    if i < n-1:
      if (value[i+1][0] != initial and value[i+1][0] != "*") or (value[i+1][0:2] in ["B’", "E.", "W."]) or (value[i+1][0:2] == ". "):
        if value[i][-1] == "-":
          s = value[i][:-1] + value[i+1]
        else:
          s = value[i] + ' ' + value[i+1]
        new_value.append(s)
        i += 2
      else:
        new_value.append(value[i])
        i += 1 
    else:
      if value[i][0] == initial:
        new_value.append(value[i])
      i += 1
  
  d[key] = new_value      

In [91]:
# remove "(see adv...."
for key,value in d.items():
  n = len(value)
  for i in range(n):
    ls = value[i].split()
    n2 = len(ls)
    for j in range(n2):
      if "see" in ls[j]:
        value[i] = ' '.join(ls[:j])
        break

## Identify business vs person records

Made an update on 27/03/20 to change the index notation.

* The first letter of Company names were being removed by the function
    * ZUCCA BROTHERS
    * ARNAULT PETER
    * APPELL JACOB
    * APPLETON’S AMERICAN CYCLOPG5DIA
    * BURTON A. B 

In [93]:
# add an index to each record
# in another dictionary, for each index specify whether it's business or person record
index = 1
d_bp = []
for key,value in d.items():
  n = len(value)
  for i in range(n):
    ls = value[i].split()
    if ls:
      if ls[0][0] == "*" or (len(ls[0]) > 1 and ls[0].isupper()): # Upper case as well
        value[i] = value[i][0:] # Changed from [0:] to [1:]
        d_bp.append([str(index), "b"])
      else:
        d_bp.append([str(index), "p"])
    
      value[i] = str(index) + "\t" + value[i]
      index += 1

## Save files

In [94]:
file = open("input/1850/nypl_1850_bk_starred_clean.txt", "w")
for key,value in d.items():
  file.write(key + '\n')
  for record in value:
    file.write(record + '\n')
  file.write('\n')

file.close()

file2 = open("input/1850/bk_index_business_person.txt", "w")
for item in d_bp:
  file2.write(item[0] + ' ' + item[1] + '\n')
  
file2.close()