In [1]:
__author__ = "Jon Ball"
__version__ = "Summer 2022"

In [2]:
!python --version

Python 3.10.5


# Data preprocessing for "The Collective Mind and the Machine"

1. [*Sociology of Education* and peer journals ](#soced) (1990-2019)
    1. [*American Sociological Review*](#asr)
    2. [*American Journal of Sociology*](#ajs)
    3. [*American Journal of Education*](#aje)
    4. [*American Education Research Journal*](#aerj)
2. [ERIC in its entirety](#eric) (1990-2019)
3. [Token counts for ETM train datasets](#toks)

All data pertaining to specific journals accessed using the <a href="https://eric.ed.gov/?api">ERIC API</a>.

In [3]:
from eric_utils import (
    parse_eric_api,
    parse_eric_database,
    make_df,
    save_author_list
)
from nltk.tokenize import word_tokenize, sent_tokenize
from string import punctuation
from lxml import etree
from tqdm import tqdm
import pandas as pd
import spacy
import html
import json
import time
import re
import os

### *Sociology of Education* (*n* = 540) <a id="soced"></a>

In [4]:
%time soced1 = parse_eric_api(os.path.join("eric_data", "soced1.xml"))

2000 docs parsed.
CPU times: user 95.6 ms, sys: 6.96 ms, total: 103 ms
Wall time: 103 ms


In [5]:
%time soced2 = parse_eric_api(os.path.join("eric_data", "soced2.xml"))

489 docs parsed.
CPU times: user 15.4 ms, sys: 2.21 ms, total: 17.6 ms
Wall time: 18.6 ms


The ERIC API query for "source:'Sociology of Education'" returned articles published in *Sociology of Education*, *British Journal of Sociology of Education*, and *International Studies in Sociology of Education*.

In [6]:
soced = soced1 + soced2 # Add the results of the two queries
seDF = make_df(soced)

1742 relevant articles published from 1990-2019.


Save author names from *British Journal of Sociology of Education* to check Brint's claims about the U.S. focus of the subfield:

In [7]:
bjseDF = seDF[seDF.issn == "ISSN-0142-5692"]
save_author_list(bjseDF, "bjse")

1193 individual authors indexed in bjse publications from 1990-2019.
Author list saved as bjse_authors.json


Save author names from *International Studies in Sociology of Education* to check Brint's claims about the U.S. focus of the subfield:

In [8]:
isseDF = seDF[seDF.issn == "ISSN-0962-0214"]
save_author_list(isseDF, "isse")

438 individual authors indexed in isse publications from 1990-2019.
Author list saved as isse_authors.json


Finally, save author names from *Sociology of Education* itself:

In [9]:
seDF = seDF[seDF.issn == "ISSN-0038-0407"]
save_author_list(seDF, "soced")
seDF.info()

706 individual authors indexed in soced publications from 1990-2019.
Author list saved as soced_authors.json
<class 'pandas.core.frame.DataFrame'>
Int64Index: 540 entries, 0 to 539
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   index        540 non-null    int64 
 1   title        540 non-null    object
 2   author       540 non-null    object
 3   description  540 non-null    object
 4   subject      540 non-null    object
 5   year         540 non-null    int64 
 6   issn         540 non-null    object
 7   etm_input    540 non-null    object
dtypes: int64(2), object(6)
memory usage: 38.0+ KB


### *American Sociological Review* (*n* = 106) <a id="asr"></a>

In [10]:
asr = parse_eric_api(os.path.join("eric_data", "asr.xml"))
asrDF = make_df(asr)
save_author_list(asrDF, "asr")

350 docs parsed.
106 relevant articles published from 1990-2019.
172 individual authors indexed in asr publications from 1990-2019.
Author list saved as asr_authors.json


### *American Journal of Sociology* (*n* = 38) <a id="ajs"></a>

In [11]:
ajs = parse_eric_api(os.path.join("eric_data", "ajs.xml"))
ajsDF = make_df(ajs)
save_author_list(ajsDF, "ajs")

333 docs parsed.
38 relevant articles published from 1990-2019.
68 individual authors indexed in ajs publications from 1990-2019.
Author list saved as ajs_authors.json


Create a list of sociologists who published in *Soc of Ed*, *ASR*, or *AJS* and are indexed in ERIC.<br>
This list will be used to filter articles published in *AJE* and *AERJ* which have at least one sociologist as a coauthor.<br>
It provides a definition of the *AJE* and *AERJ* articles which "count" as the Sociology of Education.

In [12]:
sociologists = []
for filename in ["soced_authors.json", "asr_authors.json", "ajs_authors.json"]:
    with open(os.path.join("eric_data", filename), "r") as infile:
        sociologists.extend(
            json.load(infile)
        )
        
sociologists = list(set(sociologists))
print("%s individual authors published in Soc of Ed, ASR, or AJS from 1990-2019 and indexed in ERIC." % (len(sociologists),))

887 individual authors published in Soc of Ed, ASR, or AJS from 1990-2019 and indexed in ERIC.


### *American Journal of Education* (*n* = 138) <a id="aje"></a>

In [13]:
aje = parse_eric_api(os.path.join("eric_data", "aje.xml"))
ajeDF = make_df(aje)

735 docs parsed.
504 relevant articles published from 1990-2019.


Filter *AJE* articles for those with sociologist coauthors:

In [14]:
aje_ids = []
for index, row in ajeDF.iterrows():
    for author in sociologists:
        if author in row["author"]:
            aje_ids.append(index)

ajeDF = ajeDF.loc[aje_ids]    
print("%s AJE articles (1990-2019) were co-authored by researchers who also published in Soc." % (ajeDF.shape[0],))

138 AJE articles (1990-2019) were co-authored by researchers who also published in Soc.


### *American Educational Research Journal* (*n* = 251) <a id="aerj"></a>

In [15]:
aerj = parse_eric_api(os.path.join("eric_data", "aerj.xml"))
aerjDF = make_df(aerj)

1872 docs parsed.
1036 relevant articles published from 1990-2019.


Filter *AERJ* articles for those with sociologist coauthors:

In [16]:
aerj_ids = []
for index, row in aerjDF.iterrows():
    for author in sociologists:
        if author in row["author"]:
            aerj_ids.append(index)

aerjDF = aerjDF.loc[aerj_ids]    
print("%s AERJ articles (1990-2019) were co-authored by researchers who also published in Soc." % (aerjDF.shape[0],))

251 AERJ articles (1990-2019) were co-authored by researchers who also published in Soc.


### Create Soc of Ed inputs for embedded topic modeling:

In [17]:
etm_inputs = []
for df in [seDF, asrDF, ajsDF, ajeDF, aerjDF]:
    etm_inputs.extend(df["etm_input"].tolist())

soced_sents = []
for doc in etm_inputs:
    soced_sents.extend(
        sent_tokenize(doc) # Dieng. et al used sentences as docs; this is conventional for word2vec as well
    )
del etm_inputs
    
with open(os.path.join("eric_data", "soced_etm_inputs.txt"), "w") as outfile:
    for sent in soced_sents:
        outfile.write(
            "%s\n" % (sent,)
        )
print("%s sentences saved as inputs for embedded topic modeling." % (len(soced_sents),))

5721 sentences saved as inputs for embedded topic modeling.


In [18]:
del soced, soced1, soced2, seDF, bjseDF, isseDF, asr, asrDF, ajs, ajsDF, sociologists
del aje, ajeDF, aerj, aerjDF, soced_sents

## All journal articles indexed in ERIC, 1990-2019 <br><a id="eric"></a>
.xml files accessed at https://eric.ed.gov/?download.

In [19]:
%%time

for roor, dirs, files in os.walk("eric_data"):
    
    eric_articles = []
    
    for file in sorted(files):
        if re.search(r"\d{4}", file): # Match files with year in filename; "eric1990.xml"
            
            eric_articles.extend(
                parse_eric_database(os.path.join("eric_data", file),
                                    item_type="Journal Articles")
            )

print("%s journal articles parsed in total." % len(eric_articles))

808094 journal articles parsed in total.
CPU times: user 1min 18s, sys: 3.08 s, total: 1min 21s
Wall time: 1min 21s


In [20]:
eDF = make_df(eric_articles)
eDF.info()

793971 relevant articles published from 1990-2019.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 793971 entries, 0 to 793970
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   index        793971 non-null  int64 
 1   subject      793971 non-null  object
 2   author       793971 non-null  object
 3   description  793971 non-null  object
 4   title        793971 non-null  object
 5   year         793971 non-null  int64 
 6   source       793940 non-null  object
 7   etm_input    793971 non-null  object
dtypes: int64(2), object(6)
memory usage: 48.5+ MB


Check that the year range is correct:

In [21]:
print(sorted(eDF["year"].unique()))

[1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019]


Remove duplicates:

In [22]:
eDF["title"].duplicated().sum()

4158

In [23]:
eDF.drop_duplicates(subset=["title"], inplace=True)
print(eDF.shape)

(789813, 8)


In [24]:
eDF["etm_input"].duplicated().any() or eDF["etm_input"].isnull().any() or eDF["author"].isnull().any()

False

In [25]:
del eric_articles

Save to .csv:

In [26]:
eDF.to_csv(os.path.join("eric_data", "eric_articles_1990-2019.csv"), index=False)

Save a list of tuples matching each author to a journal in which they published.<br>
An author's name will be the first element in multiple tuples if they published in multiple venues.

In [27]:
author2source = []
for author_list, source in list(zip(eDF["author"], eDF["source"])):
    for author in author_list:
        author2source.append((author, source))
        
author2source = list(set(author2source))
print("%s unique author to journal pairings." % len(author2source))

1287361 unique author to journal pairings.


In [28]:
with open(os.path.join("eric_data", "author2jsource.json"), "w") as outfile:
    json.dump(author2source, outfile)
del author2source

### Save the cleaned ETM / Word2Vec inputs for all of ERIC (1990-2019):

In [29]:
etm_inputs = eDF["etm_input"].tolist()
del eDF

eric_sents = []
for doc in etm_inputs:
    eric_sents.extend(
        sent_tokenize(doc) # Gensim word2vec requires sentences
    )
del etm_inputs

with open(os.path.join("eric_data", "eric_etm_inputs.txt"), "w") as outfile:
    for sent in eric_sents:
        outfile.write(
            "%s\n" % (sent,)
        )
    
print("%s sentences saved as inputs for embedded topic modeling." % (len(eric_sents),))

4807373 sentences saved as inputs for embedded topic modeling.


In [30]:
del eric_sents

## Token counts for ETM train datasets: <a id="toks"></a>

In [31]:
n_toks = 0
with open(os.path.join("eric_data", "soced_etm_inputs.txt"), "r") as infile:
    sents = [line for line in infile.readlines()]

for sent in sents:
    n_toks += len([tok for tok in word_tokenize(sent) if tok not in punctuation])

print("%s tokens in Soc of Ed ETM inputs." % (n_toks,))
del n_toks, sents

121670 tokens in Soc of Ed ETM inputs.


In [32]:
n_toks = 0
with open(os.path.join("eric_data", "eric_etm_inputs.txt"), "r") as infile:
    sents = [line for line in infile.readlines()]

for sent in tqdm(sents):
    n_toks += len([tok for tok in word_tokenize(sent) if tok not in punctuation])

print("%s tokens in ERIC ETM inputs." % (n_toks,))

100%|██████████████████████████████| 4809053/4809053 [05:30<00:00, 14554.15it/s]

103929107 tokens in ERIC ETM inputs.



