In [None]:
import pandas as pd
import os

from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

### Classifying Scientists
Aim: To classify Scientists into those with links to Germany and those without links to Germany.

<b> First Loading in each xlsx into 1 dataframe </b>

Loading the excel sheets into dataframes, fixing the column names and adding a column to indicate the source file.

[ The Ocr_Index is a guess - it's not defined for manually inserted rows so am assuming it's from the OCR process(?) - won't matter either way. ]

In [None]:
cols = ['Ocr_Index', 'Page', 'Surname', 'Other_Names', 'Affiliation', 'Field', 'Full_Text', 'Indicator', 'sheet_name', 'EMPTY', 'Cleaning_Comments']
# get all xls in the folder
sheet_paths = [f for f in os.listdir(
    '.') if os.path.isfile(f) and f.endswith('.xls')]
dfs = []
for sheet_path in sheet_paths:
    df = pd.read_excel(sheet_path)

    df.loc[-1] = df.columns.tolist() # the sheets have no header, so pandas is using the first row as header
    df.index = df.index + 1  # so we have to re-add the columns as the first row
    df.sort_index(inplace=True)

    df_cols = cols[:len(df.columns)] # then fill in the correct column names
    df.columns = df_cols

    dfs.append(df)
# concat all dataframes
df = pd.concat(dfs, ignore_index=True)
df.head(3)

Unnamed: 0,Ocr_Index,Page,Surname,Other_Names,Affiliation,Field,Full_Text,Indicator,sheet_name,EMPTY,Cleaning_Comments
0,22968.0,1200,Rogers,Prof. Charles E(dwin),33 Concord St,CIVIL ENGINEERING,"Saratoga Co, N. Y, June 5, 74. C.E, Rensselaer...",0.0,6_1200_1400,Unnamed: 9,Unnamed: 10
1,22969.0,1200,Rogers,Charles F(letcher),University Farm,BIOCHEMISTRY,"Denver, Colo, June 15, 02. A.B, Nebr. Wesleyan...",0.0,6_1200_1400,,
2,22970.0,1200,Rogers,Prof. C(harles) G(ardner),378 Reamer Place,PHYSIOLOGY,"Perry, N. Y, March 4, 75. A.B, Syracuse, 97, A...",0.0,6_1200_1400,,


and just fixing str types

In [None]:
# Converting NaNs and other to empty strings
df = df.fillna('')

## First Approach: Just search for 'Germany', 'German' and 'Deutschland' etc
This is easy, and should be a good start.  There are likely unlabeled Germans in the dataset but unsure how many so far. Doing this will yield a list of institutions which can be searched for in futher steps

e.g: if a "Gortmund, Germany" is found, then scientists with just "Gortmund" later on can be classified as German

In [None]:
search_terms = ["German", "Germany", "Deutschland"]

In [None]:
def is_de(full_text, search_terms):
    global counter
    counter+=1
    if(counter%5000==0):
        print(f"Processed {counter} Rows")
    matched_terms = []
    for term in search_terms:
        if term in full_text:
            matched_terms.append(term)
    return matched_terms

In [None]:
# Filter for rows where full_text contains any of the search terms
counter = 0
df['matched_terms'] = df.apply(lambda x: is_de(x['Full_Text'], search_terms), axis=1)
df['de'] = df.apply(lambda x: len(x['matched_terms']) > 0, axis=1)
df['de'].value_counts()

Processed 5000 Rows
Processed 10000 Rows
Processed 15000 Rows
Processed 20000 Rows
Processed 25000 Rows


False    27294
True       436
Name: de, dtype: int64

Here we've only matched 436 of the 27,000 - around 16% which could be an undercount.
However, if we compare term frequencies in matched rows with non-matched rows we can gain an insight into other terms to consider.

We can do this by using term frequency-inverse document frequency (tf-idf) which is a measure of how important a term is to a document in a collection or corpus.  If we find tf-idf for each full-text row, we can compare the tf-idf of the matched rows with the non-matched rows.

In [None]:
# Find term counts in de=True rows
# Calculate tf–idf for rows with German and non-German matches
v = TfidfVectorizer()
x = v.fit_transform(df['full_text'])

In [15]:
436/27294

0.015974206785374076

##### Constructing the Term List

The German government has a (modern and incomplete) list of some cities <a href="https://raw.githubusercontent.com/robbi5/german-gov-domains/master/data/domains.cities.csv"> here <a>

In [48]:
de_cities_df = pd.read_csv("https://raw.githubusercontent.com/robbi5/german-gov-domains/master/data/domains.cities.csv")

In [53]:
# Get ascii version of the cities
de_cities_df['ascii_cities']=de_cities_df.City.apply(lambda x: x.encode('ascii', 'ignore').decode('ascii'))
# Append the ascii version and the standard version to the search terms
search_terms.extend(de_cities_df.ascii_cities.tolist())
search_terms.extend(de_cities_df.City.tolist())

In [69]:
# Filter for rows where full_text contains any of the search terms
counter = 0
df['de'] = df.apply(lambda x: is_de(x['Full_Text']), axis=1)

2500
5000
7500
10000
12500
15000
17500
20000
22500
25000
27500


An issue here is with German city names that are also English words, e.g: "March" or "Bell"
Limiting the usefullness of searching across the whole Full_Text

In [80]:
# Filter for rows where de isn't empty
df[df['de'].map(lambda d: len(d)) > 0]

Unnamed: 0,Ocr_Index,Page,Surname,Other_Names,Affiliation,Field,Full_Text,Indicator,sheet_name,EMPTY,Cleaning_Comments,de
2,22970.0,1200,Rogers,Prof. C(harles) G(ardner),378 Reamer Place,PHYSIOLOGY,"Perry, N. Y, March 4, 75. A.B, Syracuse, 97, A...",0.0,6_1200_1400,,,"[March, March]"
4,22972.0,1200,Rogers,Prof. Charles (Herbert),University of Minnesota,CHEMISTRY,"Belleville, 111, May 28, 89. Ph.C, St. Louis C...",0.0,6_1200_1400,,,"[Bell, Bell, Bel, Bell, Bell]"
6,22974.0,1200,Rogers,David Banks,Museum of Natural History,"ANTHROPOLOGY, PALEONTOLOGY","Pawpaw, 111, Sept. 13, 68. Kansas; Nebraska. T...",0.0,6_1200_1400,,,"[Nebra, Nebra]"
10,22978.0,1200,Rogers,Dr. Donald P(hillip),Oregon State College,BOTANY,"Toledo, Ohio, Feb. 5, 08. A.B, Oberlin Col, 29...",0.0,6_1200_1400,,,"[Nebra, Nebra]"
16,22984.0,1200,Rogers,Pres. H(arry) S(tanley),Polytechnic Institute of Brooklyn,ENGINEERING,"Detroit, Mich, Aug. 7, 90. B.S, Wyoming, 14, C...",0.0,6_1200_1400,,,"[Au, Au]"
...,...,...,...,...,...,...,...,...,...,...,...,...
27716,11995.0,599,Harper,Dr. W(illiam) E(dmund),Dominion Astro- physical Observatory,ASTROPHYSICS,"Bruce Co, Ont, March 20, 78. B.A, Toronto, 06,...",0.0,6_200_600,,,"[March, March]"
27717,11996.0,599,Harradon,H(arry) D(urward),1234 Hamilton St,GEOPHYSICS,"Auburn. Maine, Sept. 22, 83. A.B, Bates Col, 0...",0.0,6_200_600,,,"[Au, Berlin, Aub, Au, Berlin, Aub]"
27718,11998.0,599,Harrah,Prof. E(zra) C(Iarence),Colorado State College of Education,ZOOLOGY,"Douglass, Kans, March 9, 89. A.B, Southwestern...",0.0,6_200_600,,,"[March, March]"
27725,12005.0,599,Harries,Dr. F(ord) H(owell),Box 1100,ENTOMOLOGY,"Salt Lake City, Utah, Aug. 2, 03. A.B, Utah, 2...",0.0,6_200_600,,,"[Au, Au]"


In [47]:
df[df.Full_Text.str.contains('German')]

'. Berlin, Germany, Dec. 27, 76. B.S, Michigan, 97; Ph.D, Chicago, 16. Instr. psychol, Chicago, 16- 20; biometrist, Juvenile Psychopathic Inst, 17-20; asst. Prof. psychol, Kansas, 20-22, assoc, Prof. 22-27; biometrist, Inst. Child Guidance, 27-32. Fel. Orthopsychiat. Asn. Biometry; abnormal and clinical psychology.'

In [None]:
german

In [46]:
len(df)

27730

In [31]:
df

Unnamed: 0,Ocr_Index,Page,Surname,Other_Names,Affiliation,Field,Full-Text,Indicator,sheet_name,EMPTY,Cleaning_Comments
0,22968.0,1200,Rogers,Prof. Charles E(dwin),33 Concord St,CIVIL ENGINEERING,"Saratoga Co, N. Y, June 5, 74. C.E, Rensselaer...",0.0,6_1200_1400,Unnamed: 9,Unnamed: 10
1,22969.0,1200,Rogers,Charles F(letcher),University Farm,BIOCHEMISTRY,"Denver, Colo, June 15, 02. A.B, Nebr. Wesleyan...",0.0,6_1200_1400,,
2,22970.0,1200,Rogers,Prof. C(harles) G(ardner),378 Reamer Place,PHYSIOLOGY,"Perry, N. Y, March 4, 75. A.B, Syracuse, 97, A...",0.0,6_1200_1400,,
3,22971.0,1200,Rogers,Charles H(enry),,,,0.0,6_1200_1400,,
4,22972.0,1200,Rogers,Prof. Charles (Herbert),University of Minnesota,CHEMISTRY,"Belleville, 111, May 28, 89. Ph.C, St. Louis C...",0.0,6_1200_1400,,
...,...,...,...,...,...,...,...,...,...,...,...
27725,12005.0,599,Harries,Dr. F(ord) H(owell),Box 1100,ENTOMOLOGY,"Salt Lake City, Utah, Aug. 2, 03. A.B, Utah, 2...",0.0,6_200_600,,
27726,12006.0,599,Harriman,N(orman) F(ollett),U. S. Treasury Depart-ment,CHEMICAL ENGINEERING,"Cameron, Mo, Jan. 12, 78. Chief chemist and en...",0.0,6_200_600,,
27727,12007.0,599,Harrington,Arthur W(illiam),526 Federal Bldg,HYDRAULIC ENGINEERING,"Watertown, N. Y, June 7, 88. C. E, Cornell, 09...",0.0,6_200_600,,
27728,12008.0,599,Harrington,Prof. Carlos E(Imer),University of Buffalo,"MATHEMATICS, ENGINEERING","Buffalo, N. Y, Feb. 9, 92. M.E, Cornell, 18; M...",0.0,6_200_600,,


In [29]:
dfs[0]

Unnamed: 0,Ocr_Index,Page,Surname,Other_Names,Affiliation,Field,Full-Text,Indicator,sheet_name,EMPTY,Cleaning_Comments
0,22968.0,1200,Rogers,Prof. Charles E(dwin),33 Concord St,CIVIL ENGINEERING,"Saratoga Co, N. Y, June 5, 74. C.E, Rensselaer...",0.0,6_1200_1400,Unnamed: 9,Unnamed: 10
1,22969.0,1200,Rogers,Charles F(letcher),University Farm,BIOCHEMISTRY,"Denver, Colo, June 15, 02. A.B, Nebr. Wesleyan...",0.0,6_1200_1400,,
2,22970.0,1200,Rogers,Prof. C(harles) G(ardner),378 Reamer Place,PHYSIOLOGY,"Perry, N. Y, March 4, 75. A.B, Syracuse, 97, A...",0.0,6_1200_1400,,
3,22971.0,1200,Rogers,Charles H(enry),,,,0.0,6_1200_1400,,
4,22972.0,1200,Rogers,Prof. Charles (Herbert),University of Minnesota,CHEMISTRY,"Belleville, 111, May 28, 89. Ph.C, St. Louis C...",0.0,6_1200_1400,,
...,...,...,...,...,...,...,...,...,...,...,...
3461,26627.0,1399,Taussig*,Prof. Albert E(rnst),4500 Olive St,MEDICINE,"St. Louis, Mo, May 6, 71. A.B, Harvard, 91; M....",0.0,6_1200_1400,,
3462,26628.0,1399,Taussig,Dr. Fred(erlck) J(oseph),Washington University,"ECOLOGY, OBSTETRICS","Brooklyn, N. Y. Oct. 26, 72. A.B, Harvard, 93,...",0.0,6_1200_1400,,
3463,26629.0,1399,Tavanlar,E(ligio) J,Carrier Corp,ENGINEERING,"Binalonan, Pangasinan, P. I, Dec, 05. B.S, Yal...",0.0,6_1200_1400,,
3464,26630.0,1399,Taverner,P(ercy) A(lgernon),National Museum of Canada,ORNITHOLOGY,"Guelph, Ont, June 10, 75. Asst, curator, NAT. ...",0.0,6_1200_1400,,


In [14]:
df  = pd.read_excel(sheet_paths[0])

In [15]:
df

Unnamed: 0,22968,1200,Rogers,Prof. Charles E(dwin),33 Concord St,CIVIL ENGINEERING,"Saratoga Co, N. Y, June 5, 74. C.E, Rensselaer Polytech, 96; M.C.E, Harvard, 15. Practising engineer, 96-01; instr. civil eng, Lehigh, 01—04; Prof. math, and civil eng, Clarkson Tech, 04-05; CIVIL ENG, TRINITY COL. (.CONN), 05- A.A; Eng. Educ; Astron. Soc; Conn. Soc. Civil Eng. Solar radiation.",0,6_1200_1400,Unnamed: 9,Unnamed: 10
0,22969.0,1200,Rogers,Charles F(letcher),University Farm,BIOCHEMISTRY,"Denver, Colo, June 15, 02. A.B, Nebr. Wesleyan...",0.0,6_1200_1400,,
1,22970.0,1200,Rogers,Prof. C(harles) G(ardner),378 Reamer Place,PHYSIOLOGY,"Perry, N. Y, March 4, 75. A.B, Syracuse, 97, A...",0.0,6_1200_1400,,
2,22971.0,1200,Rogers,Charles H(enry),,,,0.0,6_1200_1400,,
3,22972.0,1200,Rogers,Prof. Charles (Herbert),University of Minnesota,CHEMISTRY,"Belleville, 111, May 28, 89. Ph.C, St. Louis C...",0.0,6_1200_1400,,
4,22973.0,1200,Rogers,Dr. C(olonel) H(oyt),Temple,PLANT PATHOLOGY,"Mullins, S. C, Jan. 6, 06. B.S, Clemson Col, 2...",0.0,6_1200_1400,,
...,...,...,...,...,...,...,...,...,...,...,...
3460,26627.0,1399,Taussig*,Prof. Albert E(rnst),4500 Olive St,MEDICINE,"St. Louis, Mo, May 6, 71. A.B, Harvard, 91; M....",0.0,6_1200_1400,,
3461,26628.0,1399,Taussig,Dr. Fred(erlck) J(oseph),Washington University,"ECOLOGY, OBSTETRICS","Brooklyn, N. Y. Oct. 26, 72. A.B, Harvard, 93,...",0.0,6_1200_1400,,
3462,26629.0,1399,Tavanlar,E(ligio) J,Carrier Corp,ENGINEERING,"Binalonan, Pangasinan, P. I, Dec, 05. B.S, Yal...",0.0,6_1200_1400,,
3463,26630.0,1399,Taverner,P(ercy) A(lgernon),National Museum of Canada,ORNITHOLOGY,"Guelph, Ont, June 10, 75. Asst, curator, NAT. ...",0.0,6_1200_1400,,


In [13]:
dfs[0]


Unnamed: 0,22968,1200,Rogers,Prof. Charles E(dwin),33 Concord St,CIVIL ENGINEERING,"Saratoga Co, N. Y, June 5, 74. C.E, Rensselaer Polytech, 96; M.C.E, Harvard, 15. Practising engineer, 96-01; instr. civil eng, Lehigh, 01—04; Prof. math, and civil eng, Clarkson Tech, 04-05; CIVIL ENG, TRINITY COL. (.CONN), 05- A.A; Eng. Educ; Astron. Soc; Conn. Soc. Civil Eng. Solar radiation.",0,6_1200_1400,Unnamed: 9,Unnamed: 10,filename
0,22969.0,1200,Rogers,Charles F(letcher),University Farm,BIOCHEMISTRY,"Denver, Colo, June 15, 02. A.B, Nebr. Wesleyan...",0.0,6_1200_1400,,,6_1200_1400_afterRachel.xls
1,22970.0,1200,Rogers,Prof. C(harles) G(ardner),378 Reamer Place,PHYSIOLOGY,"Perry, N. Y, March 4, 75. A.B, Syracuse, 97, A...",0.0,6_1200_1400,,,6_1200_1400_afterRachel.xls
2,22971.0,1200,Rogers,Charles H(enry),,,,0.0,6_1200_1400,,,6_1200_1400_afterRachel.xls
3,22972.0,1200,Rogers,Prof. Charles (Herbert),University of Minnesota,CHEMISTRY,"Belleville, 111, May 28, 89. Ph.C, St. Louis C...",0.0,6_1200_1400,,,6_1200_1400_afterRachel.xls
4,22973.0,1200,Rogers,Dr. C(olonel) H(oyt),Temple,PLANT PATHOLOGY,"Mullins, S. C, Jan. 6, 06. B.S, Clemson Col, 2...",0.0,6_1200_1400,,,6_1200_1400_afterRachel.xls
...,...,...,...,...,...,...,...,...,...,...,...,...
3460,26627.0,1399,Taussig*,Prof. Albert E(rnst),4500 Olive St,MEDICINE,"St. Louis, Mo, May 6, 71. A.B, Harvard, 91; M....",0.0,6_1200_1400,,,6_1200_1400_afterRachel.xls
3461,26628.0,1399,Taussig,Dr. Fred(erlck) J(oseph),Washington University,"ECOLOGY, OBSTETRICS","Brooklyn, N. Y. Oct. 26, 72. A.B, Harvard, 93,...",0.0,6_1200_1400,,,6_1200_1400_afterRachel.xls
3462,26629.0,1399,Tavanlar,E(ligio) J,Carrier Corp,ENGINEERING,"Binalonan, Pangasinan, P. I, Dec, 05. B.S, Yal...",0.0,6_1200_1400,,,6_1200_1400_afterRachel.xls
3463,26630.0,1399,Taverner,P(ercy) A(lgernon),National Museum of Canada,ORNITHOLOGY,"Guelph, Ont, June 10, 75. Asst, curator, NAT. ...",0.0,6_1200_1400,,,6_1200_1400_afterRachel.xls


In [6]:
files


['6_1200_1400_afterRachel.xls',
 '6_600_800_afterRachel.xls',
 '6_1400_1600_afterNisha.xls',
 '6_800_1000_afterRachel.xls',
 '6_100_200_afterRachel.xls',
 '6_1000_1200_afterNisha.xls',
 '6_1_100_afterRachel.xls',
 '6_200_600_afterRachel.xls']