In [24]:
import pandas as pd
import os

from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

from collections import Counter

### Classifying Scientists
Aim: To classify Scientists into those with links to Germany and those without links to Germany.

<b> First Loading in each xlsx into 1 dataframe </b>

Loading the excel sheets into dataframes, fixing the column names and adding a column to indicate the source file.

[ The Ocr_Index is a guess - it's not defined for manually inserted rows so am assuming it's from the OCR process(?) - won't matter either way. ]

In [25]:
cols = ['Ocr_Index', 'Page', 'Surname', 'Other_Names', 'Affiliation', 'Field', 'Full_Text', 'Indicator', 'sheet_name', 'EMPTY', 'Cleaning_Comments']
# get all xls in the folder
sheet_paths = [f for f in os.listdir(
    '.') if os.path.isfile(f) and f.endswith('.xls')]
dfs = []
for sheet_path in sheet_paths:
    df = pd.read_excel(sheet_path)

    df.loc[-1] = df.columns.tolist() # the sheets have no header, so pandas is using the first row as header
    df.index = df.index + 1  # so we have to re-add the columns as the first row
    df.sort_index(inplace=True)

    df_cols = cols[:len(df.columns)] # then fill in the correct column names
    df.columns = df_cols
    df['sheet_name'] = sheet_path

    dfs.append(df)
# concat all dataframes
df = pd.concat(dfs, ignore_index=True)
df.head(3)

Unnamed: 0,Ocr_Index,Page,Surname,Other_Names,Affiliation,Field,Full_Text,Indicator,sheet_name,EMPTY,Cleaning_Comments
0,22968.0,1200,Rogers,Prof. Charles E(dwin),33 Concord St,CIVIL ENGINEERING,"Saratoga Co, N. Y, June 5, 74. C.E, Rensselaer...",0.0,6_1200_1400_afterRachel.xls,Unnamed: 9,Unnamed: 10
1,22969.0,1200,Rogers,Charles F(letcher),University Farm,BIOCHEMISTRY,"Denver, Colo, June 15, 02. A.B, Nebr. Wesleyan...",0.0,6_1200_1400_afterRachel.xls,,
2,22970.0,1200,Rogers,Prof. C(harles) G(ardner),378 Reamer Place,PHYSIOLOGY,"Perry, N. Y, March 4, 75. A.B, Syracuse, 97, A...",0.0,6_1200_1400_afterRachel.xls,,


and just fixing str types and making Full_Text lower case

In [26]:
# Converting NaNs and other to empty strings
df = df.fillna('')

df['Full_Text'] = df['Full_Text'].str.lower()

<img src="https://raw.githubusercontent.com/FM-ds/FM-ds.github.io/main/misc_resource/men_of_science.png"> </img>

## First Approach: Just search for 'Germany', 'German' and 'Deutschland' etc
This is easy, and should be a good start.  There are likely unlabeled Germans in the dataset but unsure how many so far. Doing this will yield a list of institutions which can be searched for in futher steps

e.g: if a "Gortmund, Germany" is found, then scientists with just "Gortmund" later on can be classified as German

In [27]:
search_terms = ["German", "Germany", "Deutschland"]
search_terms = [term.lower() for term in search_terms]

In [28]:
def is_de(full_text, search_terms):
    global counter
    counter+=1
    if(counter%5000==0):
        print(f"Processed {counter} Rows")
    matched_terms = []
    for term in search_terms:
        if term in full_text:
            matched_terms.append(term)
    return matched_terms

In [29]:
# Filter for rows where full_text contains any of the search terms
counter = 0
df['matched_terms'] = df.apply(lambda x: is_de(x['Full_Text'], search_terms), axis=1)
df['de'] = df.apply(lambda x: len(x['matched_terms']) > 0, axis=1)
df['de'].value_counts()

Processed 5000 Rows
Processed 10000 Rows
Processed 15000 Rows
Processed 20000 Rows
Processed 25000 Rows


False    27267
True       463
Name: de, dtype: int64

Here we've only matched 436 of the 27,000 - around 1.7% which is almost definitely an undercount.
However, if we compare term frequencies in matched rows with non-matched rows we can gain an insight into other terms to consider.

We can do this by using term frequency-inverse document frequency (tf-idf) which is a measure of how important a term is to a document in a collection or corpus.  If we find tf-idf for each full-text row, we can compare the tf-idf of the matched rows with the non-matched rows.

In [30]:
# Find term counts in de=True rows
# Calculate tf–idf for rows with German and non-German matches
v = TfidfVectorizer()
tf_idf = v.fit_transform(df['Full_Text'])

Now We can put this in its own data frame to find the 10 terms in the matched rows and the top terms in the non-matched rows. This takes about 40 seconds to run on my laptop.

In [31]:
tf_idf_df = pd.DataFrame(tf_idf.toarray(), columns=v.get_feature_names())
tf_idf_df.head(3)
# Merge de from df with tf_idf_df
tf_idf_df = tf_idf_df.merge(df[['de']], left_index=True, right_index=True)

tf_grouped_df = tf_idf_df.groupby('de_y').mean().T
tf_grouped_df.columns = ['Not Matched', 'Matched']

tf_grouped_df['diff'] = tf_grouped_df['Matched']-tf_grouped_df['Not Matched']
tf_grouped_df = tf_grouped_df.sort_values(by=['diff'], ascending=False)
tf_grouped_df.head(25)



Unnamed: 0,Not Matched,Matched,diff
germany,0.0,0.095555,0.095555
berlin,0.001619,0.031425,0.029805
german,0.0,0.028542,0.028542
munich,0.000847,0.018043,0.017196
gesell,0.001385,0.018177,0.016792
and,0.04535,0.059665,0.014316
inst,0.015114,0.029351,0.014237
gottingen,0.00082,0.014707,0.013886
med,0.01943,0.03246,0.01303
germanium,0.0,0.012586,0.012586


There aren't any huge surprises here but probably sensible to start grabbing terms from this list to build a bigger list.

### 2. Expanding our Term List with the tf-idf Insights

From this list I've selected ~35 terms that are unambiguous and are likely to be good indicators of German scientists. Let's expand our search to include these terms and see how many more we can find.

In [32]:
new_terms = ['germany',  'berlin',  'german',  'munich',  'gottingen',  'deuts',  'hamburg',  'heidelberg',  'hochschule',  'freiburg',  'breslau',  'kiel',  'karlsruhe',  'stuttgart',  'darmstadt',  'leipzig',  'baden',  'tubingen',  'dresden',  'cologne',  'erlangen',  'montefiore',  'strassburg',  'wiesbaden',  'charlottenburg',  'hanover',  'chemnitz',  'bielefeld',  'konigsberg',  'gnissau',  'bavarian',  'eberswalde',  'schoeneberg',  'braunschweig'] 
search_terms = list(set(search_terms + new_terms))

In [33]:
# Filter for rows where full_text contains any of the search terms
counter = 0
df['matched_terms'] = df.apply(lambda x: is_de(x['Full_Text'], search_terms), axis=1)
df['de_2'] = df.apply(lambda x: len(x['matched_terms']) > 0, axis=1)
df['de_2'].value_counts()

Processed 5000 Rows
Processed 10000 Rows
Processed 15000 Rows
Processed 20000 Rows
Processed 25000 Rows


False    25803
True      1927
Name: de_2, dtype: int64

1927 matches is a lot better! 7.5% is much higher than 1.6%. Let's look at a random sample of these rows to check the match:

In [34]:
df[df['de_2'] == True][["Surname", "Other_Names", "Affiliation", "Full_Text"]].sample(20)

Unnamed: 0,Surname,Other_Names,Affiliation,Full_Text
670,Saxton,Prof. Ren G(eorge),Oklahoma Agricultural and Mechanical College,"berlin, wis, 86. b.s, wisconsin, 09, c.e, 15; ..."
6680,Kleckner,Prof. Martin E(zra),333 E. Market St,"davis, 111, may 6, 61. a.b, heidelberg col. (o..."
25947,Gabel,Dr. Charles E(rnst),1107 Neal Ave,"milwaukee, wis, march 9, 77. 13.s,wisconsin, 9..."
10441,Kraus,Dean E(dward) H(enry),University of Michigan,"syracuse, n. y, dec. 1, 75. b.s, syracuse, 96,..."
19544,Angell,Dr. James R(owland),Yale University,"burlington, vt, may 8, 69. a.b, michigan, 90, ..."
6378,Kenrick,Prof. Frank B(oteler),77 Lonsdale Road,"send, eng. feb. 4, 74. b.a, toronto, 94, 1851 ..."
23975,Dubin,Dr. H(arry) E(nnis),250 E. 43rd St,"russia, march 4, 91. b.s, col. city of n. y, 1..."
5975,Jones,Dr. Lynds,352 W. College St,"jefferson, ohio, jan. 5, 65. grinnell col, 88-..."
10640,Laird,Prof. E(lizabeth) R(ebecca),Mt. Holyoke College,"owen sound, ont, can, dec. 6, 74. b.a, toronto..."
5637,Jellinek,E(lvin) Morton,Worcester State Hospital,"new york, n. y, aug. 15, 90. berlin; grenoble;..."


and just the full_texts:

In [35]:
list(df[df['de_2'] == True][["Surname", "Other_Names", "Affiliation", "Full_Text"]].sample(20)["Full_Text"])

['new york, n. y, sept. 11, 02. b.s, n. y. univ, 27, i.e. 28; a.m, fordham, 32, ph.d, 36. indust, engineer, bosch magneto co, stuttgart, germany, 28-29; otis elevator co, yonkers, n. y, 29-30; prof. psychol, villa maria col, 34-35; indust. eng. and head dept, manhattan col, 35- rep, psychol. corp, new york, 35. a.a; assoc. psychol. asn; soc. adv. management (sec’y- treas, new york chap, 36-); eng. educ; mech. eng; statist. asn. industrial and educational psychology; tests and) measurements; statistics.',
 'clinton, ind, nov. 25, 68. b.pd, mo. state teachers col; b.s, drury col; ph.b, chicago; ph.d, leipzig. teacher science, southern (ala), 03-09; supervisor german, high sch, calif, 14-18; teacher chem, birmingham-southern col, 18-20; tennessee col, 20-21; okla. baptist univ, 21-22; howard col, 22-23; prof, mercer, 23- chem. soc.',
 'bowling green, ohio, jan. 27, 00. b.s, michigan, 21, m.s, 22, ph.d, 24. instr. chem, michigan, 23-25; lecturer and heckscher research fellow, cornell, 25-2

and finally comparing some of the terms that matched the second time but not the first:

In [36]:
list(df.query('de_2 == True & de == False')[ "Full_Text"].sample(20))

['centralia, 111, oct. 13, 75. b.s, nebraska, 96, a. m, 97; ph.d, berlin, 00. instr. physics, nebraska, 00-02, adj. prof, 02-05, asst, prof, 05-07, assoc, prof, 07-09, prof, 09-, chairman dept, 19-20. a.a; physical soc; nebr. acad. accidental double refraction; spark potentials; discharge through gases; dielectric strength glasses and double refracting media; cathode potential; minimum spark potentials; corona-discharge currents.',
 'ports-mouth, ohio, sept. 22, 76. b.s, baldwin-wallace col, 02, lion. sc.d, 35; western reserve, 05; a.m, ohio state, 06; austin fellow, harvard, 07-08. prof, commerce and prin. sch. commerce, baldwin-wallace col, 02-05, instr. biol, 04-05; teaching fellow, ohio state, 05-06; acting head dept, zool, ohio wesleyan, 06-07; asst. prof, entom, california, 08-12, parasitol, 12-15, assoc, prof, 15-20, prof, and head div. entom. and parasitol, 20- consulting entomologist, state board health, calif. summer, visiting prof, ohio state, 30. nat. malaria cmn. chevalier

This shows a lot of matches of scientists with German education but not neccesarily birth in Germany. This is positive but could be a sign we're searching too broadly.

1927 is a lot more than 436.  We can use the tf-idf method again to see if there are any more terms we can add to our list. It's unlikely to be as fruitful as the first time but there's no harm in trying.

In [37]:
tf_idf_df = tf_idf_df.merge(df[['de_2']], left_index=True, right_index=True)

In [38]:
tf_grouped_df = tf_idf_df.groupby('de_2').mean().T
tf_grouped_df.columns = ['Not Matched', 'Matched']

tf_grouped_df['diff'] = tf_grouped_df['Matched']-tf_grouped_df['Not Matched']
tf_grouped_df = tf_grouped_df.sort_values(by=['diff'], ascending=False)
tf_grouped_df.head(25)

# filter for rows where the index is not in search_terms
tf_grouped_df[tf_grouped_df.index.isin(search_terms) == False].head(25)

Unnamed: 0,Not Matched,Matched,diff
de_y,0.0,0.24027,0.24027
chem,0.030998,0.054219,0.023221
oberlin,0.0,0.019609,0.019609
soc,0.036316,0.054307,0.017991
gesell,0.000437,0.018112,0.017675
prof,0.03618,0.051634,0.015454
and,0.044615,0.058622,0.014007
med,0.018712,0.03218,0.013469
pres,0.012078,0.025321,0.013242
of,0.036965,0.050118,0.013153


yeah, there's not anything particularly interesting here

<h3> 3. Expanding our Term List with a list of German Cities - <b style="color:darkred"> Poor Solution </b> </h3>

<h5> Constructing the Term List </h5>

The German government has a (modern and incomplete) list of some cities <a href="https://raw.githubusercontent.com/robbi5/german-gov-domains/master/data/domains.cities.csv"> here <a>

In [39]:
de_cities_df = pd.read_csv("https://raw.githubusercontent.com/robbi5/german-gov-domains/master/data/domains.cities.csv")

In [40]:
# Get ascii version of the cities
de_cities_df['ascii_cities']=de_cities_df.City.apply(lambda x: x.encode('ascii', 'ignore').decode('ascii'))
# Append the ascii version and the standard version to the search terms
search_terms.extend(de_cities_df.ascii_cities.tolist())
search_terms.extend(de_cities_df.City.tolist())
search_terms = [term.lower() for term in search_terms]

In [41]:
# Filter for rows where full_text contains any of the search terms
counter = 0
df['de_city_list'] = df.apply(lambda x: is_de(x['Full_Text'], search_terms), axis=1)

Processed 5000 Rows
Processed 10000 Rows
Processed 15000 Rows
Processed 20000 Rows
Processed 25000 Rows


An issue here is with German city names that are also English words, e.g: "March" or "Bell"
Limiting the usefullness of searching across the whole Full_Text

In [42]:
# Find most common items in matched_terms
Counter([item for sublist in df[df['de_city_list'].apply(lambda x: len(x) > 0)]['de_city_list'].tolist() for item in sublist]).most_common(15)

[('au', 9392),
 ('burg', 6488),
 ('march', 3620),
 ('berg', 3260),
 ('vil', 2316),
 ('rain', 2312),
 ('berlin', 2136),
 ('bell', 2052),
 ('ering', 1672),
 ('riol', 1464),
 ('sen', 1446),
 ('rust', 1186),
 ('nebra', 1112),
 ('lam', 1100),
 ('lf', 993)]

### Exporting back to Excel

In [50]:
# divide by sheetname and save to xlsx
for sheet_name in list(df['sheet_name'].unique()):
    sheet_df = df[df['sheet_name'] == sheet_name]
    # drop de and de_city_list columns
    sheet_df = sheet_df.drop(columns=['de', 'de_city_list'])
    # rename de_2 to de
    sheet_df = sheet_df.rename(columns={'de_2': 'de'})
    sheet_df.to_excel(f"matched/{sheet_name[:-5]}.xlsx", index=False)

In [20]:
df

Unnamed: 0,Ocr_Index,Page,Surname,Other_Names,Affiliation,Field,Full_Text,Indicator,sheet_name,EMPTY,Cleaning_Comments,matched_terms,de,de_2,de_city_list
0,22968.0,1200,Rogers,Prof. Charles E(dwin),33 Concord St,CIVIL ENGINEERING,"saratoga co, n. y, june 5, 74. c.e, rensselaer...",0.0,6_1200_1400,Unnamed: 9,Unnamed: 10,[],False,False,"[laer, ssel, vil, laer]"
1,22969.0,1200,Rogers,Charles F(letcher),University Farm,BIOCHEMISTRY,"denver, colo, june 15, 02. a.b, nebr. wesleyan...",0.0,6_1200_1400,,,[],False,False,[]
2,22970.0,1200,Rogers,Prof. C(harles) G(ardner),378 Reamer Place,PHYSIOLOGY,"perry, n. y, march 4, 75. a.b, syracuse, 97, a...",0.0,6_1200_1400,,,[berlin],False,True,"[berlin, berlin, march, berlin, march]"
3,22971.0,1200,Rogers,Charles H(enry),,,,0.0,6_1200_1400,,,[],False,False,[]
4,22972.0,1200,Rogers,Prof. Charles (Herbert),University of Minnesota,CHEMISTRY,"belleville, 111, may 28, 89. ph.c, st. louis c...",0.0,6_1200_1400,,,[],False,False,"[bell, bell, bel, vil, bell, bell]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27725,12005.0,599,Harries,Dr. F(ord) H(owell),Box 1100,ENTOMOLOGY,"salt lake city, utah, aug. 2, 03. a.b, utah, 2...",0.0,6_200_600,,,[],False,False,"[au, au]"
27726,12006.0,599,Harriman,N(orman) F(ollett),U. S. Treasury Depart-ment,CHEMICAL ENGINEERING,"cameron, mo, jan. 12, 78. chief chemist and en...",0.0,6_200_600,,,[],False,False,[sen]
27727,12007.0,599,Harrington,Arthur W(illiam),526 Federal Bldg,HYDRAULIC ENGINEERING,"watertown, n. y, june 7, 88. c. e, cornell, 09...",0.0,6_200_600,,,[],False,False,"[au, flo, riol, sen, vil, au, riol]"
27728,12008.0,599,Harrington,Prof. Carlos E(Imer),University of Buffalo,"MATHEMATICS, ENGINEERING","buffalo, n. y, feb. 9, 92. m.e, cornell, 18; m...",0.0,6_200_600,,,[],False,False,[bel]
