In [None]:
%load_ext autoreload
%autoreload 2
import os
import sys
import pandas as pd
import numpy as np
sys.path.insert(0, '../src')
import utils

# Results from other Text Analyzers

To evaluate the performance of our application <i>AutoLibrary</i>, we compare it against three other text analyzers.

- AutoLibrary: Uses AutoPhrase to extract phrases and keywords and assigns a quality score to each of them. Requires human labor to label quality scores.

- Jstor: https://www.jstor.org/analyze/. This search engine has its own text analyzer. It requires human labor to label quality scores.

- Webtools:  https://www.webtools.services/text-analyzer
    This website tool helps us find the frequency of phrases in a document. We then min-max scale the frequencies to form scores in the range of 0 to 1.

- MonkeyLearn: https://monkeylearn.com/text-analyzer/
    This text analyzer extracts keywords from the document. It requires human labor to label quality scores.

In [None]:
# Accuracy tables: both start empty with the same three columns; the cells
# below add one row per (analyzer, domain) pair.
ACC_COLUMNS = ["Analyzer", "Accuracy", "Domain"]
top40_acc_df, top10_acc_df = (pd.DataFrame(columns = ACC_COLUMNS) for _ in range(2))

Load the weighted AutoPhrase scores (`weighted_AutoPhrase.csv`) for every domain

In [None]:
# Load the weighted AutoPhrase scores of every experiment domain into one
# wide frame (one column group per domain directory).
EXPERIMENT_DIR = '../references/experiment data'

# os.listdir returns entries in arbitrary order and also lists stray files
# (e.g. macOS '.DS_Store'). Keep only non-hidden sub-directories and sort
# them so the column order of weighted_df is reproducible across machines.
dirs = sorted(
    entry for entry in os.listdir(EXPERIMENT_DIR)
    if not entry.startswith('.')
    and os.path.isdir(os.path.join(EXPERIMENT_DIR, entry))
)

weighted = {}
weighted_all = {}
weighted_stats = pd.DataFrame()
for directory in dirs:
    # A domain directory without the CSV still raises, so a missing domain
    # is noticed instead of being silently dropped from the comparison.
    fp = os.path.join(EXPERIMENT_DIR, directory, 'weighted_AutoPhrase.csv')
    df = pd.read_csv(fp, index_col='Unnamed: 0')
    weighted_all[directory] = df
    weighted[directory] = df
    # Summary statistics of the 'score' column, one column per domain.
    weighted_stats[directory] = df['score'].describe()
weighted_df = pd.concat(weighted, axis=1)
weighted_df.head(15)

## Computer Science

### Jstor

In [None]:
# phrases
jstor_cs_phrases = ["Psychotherapy",
"Machine learning",
"Text analytics",
"Health care quality",
"Academic aptitude",
"Art therapy",
"Big data",
"Biomedical data",
"Computer centers",
"Audio engineering",
"Code pages",
"Compensatory education",
"Computer programming",
"Data analysis",
"Data collection",
"Educational technology",
"Data transcription",
"Data models",
"Government information",
"Health care utilization",
"In state students",
"Information attributes",
"Learning",
"Learning by doing",
"Logistic regression",
"Learning strategies",
"Out of state students",
"Psychological research",
"Public health",
"Speech acts",
"School counseling",
"Second language learning",
"Student interests",
"Students",
"Time series",
"Test theory",
"Small area data",
"Surveillance",
"Time series forecasting",
"ZIP codes"]
len(jstor_cs_phrases)

In [None]:
# Manual 0/1 judgements for the 40 Jstor phrases (presumably 1 = quality
# phrase); their mean is recorded as Jstor's top-40 accuracy.
jstor_cs_labels = [1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0]
# DataFrame.append was removed in pandas 2.0, so the new row is wrapped in a
# one-row frame and concatenated onto the running table instead.
top40_acc_df = pd.concat(
    [top40_acc_df,
     pd.DataFrame([{"Analyzer": "Jstor", "Accuracy": np.mean(jstor_cs_labels), "Domain": "Computer Science"}])],
    ignore_index = True)
top40_acc_df

### AutoLibrary

In [None]:
autophrase_cs_df = weighted_df['Computer Science'].dropna().head(40)
autophrase_cs_df.head(5)

In [None]:
# Manual 0/1 judgements for AutoLibrary's top 40 Computer Science phrases
# (presumably 1 = quality phrase); their mean is the top-40 accuracy.
autophrase_cs_labels = [1, 1, 1, 1, 1, 0, 1, 1, 1, 0,
                       0, 0, 1, 0, 1, 0, 1, 0, 1, 1,
                       0, 0, 0, 1, 0, 1, 0, 1, 1, 0,
                       0, 0, 1, 1, 0, 0, 0, 1, 0, 0]
# DataFrame.append was removed in pandas 2.0, so the new row is wrapped in a
# one-row frame and concatenated onto the running table instead.
top40_acc_df = pd.concat(
    [top40_acc_df,
     pd.DataFrame([{"Analyzer": "AutoLibrary", "Accuracy": np.mean(autophrase_cs_labels), "Domain": "Computer Science"}])],
    ignore_index = True)
top40_acc_df

In [None]:
# Record AutoLibrary's top-10 accuracy for Computer Science (mean of the
# first ten 0/1 labels). DataFrame.append was removed in pandas 2.0, so the
# row is wrapped in a one-row frame and concatenated instead.
top10_acc_df = pd.concat(
    [top10_acc_df,
     pd.DataFrame([{"Analyzer": "AutoLibrary", "Accuracy": np.mean(autophrase_cs_labels[:10]), "Domain": "Computer Science"}])],
    ignore_index = True)
top10_acc_df

### Webtools

In [None]:
webtools_cs_phrases = {"of the": 74,
"et al": 45,
"to the": 36,
"in the": 28,
"for the": 27,
"in order": 15,
"the therapist": 15,
"and the": 15,
"in order to": 14,
"the system": 14,
"by the": 13,
"it is": 12,
"of a": 12,
"the session": 12,
"at the": 12,
"from the": 12,
"of psychotherapy": 11,
"able to": 11,
"with the": 10,
"we have": 10,
"we are": 10,
"is not": 10,
"the two": 10,
"between the": 10,
"university of": 9,
"in a": 9,
"has been": 9,
"behavioral coding": 9,
"of the system": 9,
"speech and": 8,
"more than": 8,
"is a": 8,
"as a": 8,
"a variety of": 8,
"in this": 8,
"based on": 8,
"the client": 8,
"to be": 8,
"using the": 8,
"automated evaluation": 7}

In [None]:
# standardize it
webtools_cs_phrases = utils.min_max_scale(webtools_cs_phrases)
webtools_cs_phrases

In [None]:
webtools_cs_labels = [0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1]
len(webtools_cs_labels) == len(webtools_cs_phrases)

### MonkeyLearn

In [None]:
monkeylearn_cs_phrases = ["sessions",
"codes",
"utterances",
"behavioral coding",
"therapist",
"international speech communication",
"Automatic Speech Recognition",
"speech communication association",
"Proc",
"psychotherapy"]
len(monkeylearn_cs_phrases)

In [None]:
# Manual 0/1 judgements for the ten MonkeyLearn phrases (presumably
# 1 = quality phrase); their mean is MonkeyLearn's top-10 accuracy.
monkeylearn_cs_labels = [1, 0, 0, 1, 1, 1, 1, 1, 0, 1]
# DataFrame.append was removed in pandas 2.0, so the new row is wrapped in a
# one-row frame and concatenated onto the running table instead.
top10_acc_df = pd.concat(
    [top10_acc_df,
     pd.DataFrame([{"Analyzer": "MonkeyLearn", "Accuracy": np.mean(monkeylearn_cs_labels[:10]), "Domain": "Computer Science"}])],
    ignore_index = True)
top10_acc_df

## Economics

### AutoLibrary

In [None]:
autophrase_econ_df = weighted_df['Economics'].dropna().head(40)
autophrase_econ_df.head(5)

In [None]:
# Manual 0/1 judgements for AutoLibrary's top 40 Economics phrases
# (presumably 1 = quality phrase); their mean is the top-40 accuracy.
autophrase_econ_labels = [1, 1, 1, 0, 1, 0, 1, 1, 0, 0,
                         0, 0, 0, 1, 1, 1, 0, 0, 0, 0,
                         0, 0, 0, 0, 0, 1, 0, 1, 0, 1,
                         0, 0, 0, 0, 0, 1, 0, 0, 0, 1]
# DataFrame.append was removed in pandas 2.0, so the new row is wrapped in a
# one-row frame and concatenated onto the running table instead.
top40_acc_df = pd.concat(
    [top40_acc_df,
     pd.DataFrame([{"Analyzer": "AutoLibrary", "Accuracy": np.mean(autophrase_econ_labels), "Domain": "Economics"}])],
    ignore_index = True)
top40_acc_df

In [None]:
# Record AutoLibrary's top-10 accuracy for Economics (mean of the first ten
# 0/1 labels). DataFrame.append was removed in pandas 2.0, so the row is
# wrapped in a one-row frame and concatenated instead.
top10_acc_df = pd.concat(
    [top10_acc_df,
     pd.DataFrame([{"Analyzer": "AutoLibrary", "Accuracy": np.mean(autophrase_econ_labels[:10]), "Domain": "Economics"}])],
    ignore_index = True)
top10_acc_df

### Jstor

In [None]:
# phrases
# Jstor topics for the Economics paper. Kept as a list (not a set literal):
# a set discards the order in which the phrases were recorded, and the
# 0/1 labels for these phrases are positional.
jstor_econ_phrases = ["Treaty lands",
"Axiom of choice",
"Test theory",
"Affirmative action",
"Indian history",
"Antennas",
"Caste system",
"Casting",
"Fire protection",
"Cognitive dissonance",
"Equal protection",
"Environmental protection",
"Genetic correlation",
"Grammemes",
"Legislative branch",
"Incantations",
"Caste determination",
"Logistic regression",
"Lost wax casting",
"Neuroscience",
"Multilevel models",
"Medicaid",
"Personality inventories",
"Pro choice movements",
"Rational choice theory",
"Rule of 72",
"Simplex method",
"Transfer pricing",
"Sundials",
"Rule of 78",
"School choice",
"Ray tracing",
"Product choice",
"Plasticizers",
"Preferential voting",
"Recursion",
"Rule of 70",
"Self control",
"Syntax",
"Tribal constitutions"]
len(jstor_econ_phrases)

In [None]:
# Manual 0/1 judgements for the Jstor Economics phrases (presumably
# 1 = quality phrase); their mean is recorded as Jstor's top-40 accuracy.
# NOTE(review): this list holds only 39 labels although 40 Jstor phrases are
# listed for this domain, so the mean is taken over 39 values. One judgement
# appears to be missing; confirm against the original annotation.
jstor_econ_labels = [0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0]
# DataFrame.append was removed in pandas 2.0, so the new row is wrapped in a
# one-row frame and concatenated onto the running table instead.
top40_acc_df = pd.concat(
    [top40_acc_df,
     pd.DataFrame([{"Analyzer": "Jstor", "Accuracy": np.mean(jstor_econ_labels), "Domain": "Economics"}])],
    ignore_index = True)
top40_acc_df

### Webtools

In [None]:
webtools_econ_phrases = {"choice rule": 101,
"of the": 58,
"hr protections": 36,
"set of": 34,
"i i": 33,
"of individuals": 28,
"in india": 26,
"sciakg choice rule": 26,
"in the": 25,
"set of individuals": 24,
"category v": 23,
"affirmative action": 22,
"is the":	21,
"can be":	21,
"justified envy": 21,
"the sciakg choice rule": 21,
"the set": 18,
"for every": 18,
"in section": 17,
"an individual": 17,
"m i": 17,
"individuals in": 16,
"v v": 16,
"merit score": 16,
"the set of": 16,
"meritorious horizontal": 16,
"2smh choice rule": 16,
"minimum guarantee": 15,
"number of": 15,
"sonmez and": 15,
"and yenmez": 15,
"meritorious horizontal choice rule": 15,
"c2s m": 15,
"et al": 15,
"a set": 14,
"the akghas": 14,
"hrprotected position": 14,
"the highest": 14,
"for the": 14,
"it is": 14,
}

In [None]:
# standardize it
webtools_econ_phrases = utils.min_max_scale(webtools_econ_phrases)
webtools_econ_phrases

In [None]:
webtools_econ_labels = [1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0]
len(webtools_econ_labels) == len(webtools_econ_phrases)

### MonkeyLearn

In [None]:
monkeylearn_econ_phrases = ["affirmative action",
"horizontal reservations",
"highest merit score",
"akg choice rules",
"2smh choice rules",
"justified envy",
"set of individual",
"vr protection",
"hr protection",
"traits"]

In [None]:
# Manual 0/1 judgements for the ten MonkeyLearn phrases (presumably
# 1 = quality phrase); their mean is MonkeyLearn's top-10 accuracy.
monkeylearn_econ_labels = [1, 1, 1, 0, 0, 1, 0, 1, 0, 0]
# DataFrame.append was removed in pandas 2.0, so the new row is wrapped in a
# one-row frame and concatenated onto the running table instead.
top10_acc_df = pd.concat(
    [top10_acc_df,
     pd.DataFrame([{"Analyzer": "MonkeyLearn", "Accuracy": np.mean(monkeylearn_econ_labels), "Domain": "Economics"}])],
    ignore_index = True)
top10_acc_df

## Electrical Engineering and Systems Science

### AutoLibrary

In [None]:
autophrase_eess_df = weighted_df['Electrical Engineering and Systems Science'].dropna().head(40)
autophrase_eess_df.head(5)

In [None]:
autophrase_eess_labels = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                         1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
                         1, 1, 0, 1, 1, 1, 0, 0, 1, 0,
                         0, 1, 1, 0, 1, 1, 1, 1, 0, 0]

In [None]:
# Record AutoLibrary's top-40 accuracy for EESS (mean of the 0/1 labels).
# DataFrame.append was removed in pandas 2.0, so the row is wrapped in a
# one-row frame and concatenated instead.
top40_acc_df = pd.concat(
    [top40_acc_df,
     pd.DataFrame([{"Analyzer": "AutoLibrary", "Accuracy": np.mean(autophrase_eess_labels), "Domain": "EE & System Design"}])],
    ignore_index = True)
top40_acc_df

In [None]:
# Record AutoLibrary's top-10 accuracy for EESS (mean of the first ten 0/1
# labels). DataFrame.append was removed in pandas 2.0, so the row is wrapped
# in a one-row frame and concatenated instead.
top10_acc_df = pd.concat(
    [top10_acc_df,
     pd.DataFrame([{"Analyzer": "AutoLibrary", "Accuracy": np.mean(autophrase_eess_labels[:10]), "Domain": "EE & System Design"}])],
    ignore_index = True)
top10_acc_df

### Jstor

In [None]:
# phrases
jstor_eess_phrases = ["Imaging",
"Computer networking",
"Machine learning",
"Image reconstruction",
"Nuclear magnetic resonance",
"Toilet training",
"Systems librarians",
"ARPA computer network",
"Anthropic principle",
"Astronomical cosmology",
                      
"Artificial neural networks",
"Astronomical spectroscopy",
"Bayesian analysis",
"Community structure",
"Citation indexes",
"Bayesian networks",
"Diagnostic imaging",
"Data models",
"Electrical networks",
"Fractals",
                      
"Functional neuroimaging",
"Image files",
"Human biology",
"Hilbert spaces",
"Images", 
"Inverse problems",
"Logic circuits",
"Kalman filters",
"Magnetic resonance angiography", 
"Magnetic resonance imaging",
                      
"Multilevel marketing", 
"Microvessels",
"Mathematical objects",
"Network topology",
"Neuroscience",
"Online social networking",
"RLC circuits",
"Radiology",
"Particle image velocimetry",
"Spacetime"]
len(jstor_eess_phrases)

In [None]:
jstor_eess_labels = [0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 
                    1, 1, 1, 0, 0, 1, 0, 1, 0, 0,
                    0, 0, 0, 1, 0, 0, 1, 1, 1, 1,
                    0, 0, 0, 1, 1, 0, 1, 1, 0, 1]

In [None]:
# Record Jstor's top-40 accuracy for EESS (mean of the 0/1 labels).
# DataFrame.append was removed in pandas 2.0, so the row is wrapped in a
# one-row frame and concatenated instead.
top40_acc_df = pd.concat(
    [top40_acc_df,
     pd.DataFrame([{"Analyzer": "Jstor", "Accuracy": np.mean(jstor_eess_labels), "Domain": "EE & System Design"}])],
    ignore_index = True)
top40_acc_df

### Webtools

In [None]:
webtools_eess_phrases = {"of the": 127,
"in the": 56,
"for the": 30,
"can be": 29,
"the network": 28,
"the cnnblock": 28,
"to the": 27,
"and the": 27,
"as well": 26,
"the proposed": 25,
                         
"and ncg": 25,
"number of": 24,
"our proposed": 23,
"the entire": 21,
"network architecture": 20,
"as well as": 19,
"in terms of": 18,
"of the cnnblock": 18,
"to be": 17,
"note that": 17,
                         
"the entire network": 16,
"m 1": 16,
"reconstruction network": 15,
"ah i": 15,
"as the": 14,
"the training": 14,
"kspace data": 13,
"on the": 13,
"by the": 13,
"radial spokes": 13,
                         
"1 and": 13,
"that the": 12,
"the forward": 12,
"image reconstruction": 12,
"from the": 12,
"m 1 and": 12,
"the measured": 11,
"of our": 11,
"proposed method": 11,
"proposed cnnblock": 11
}

In [None]:
# standardize it
webtools_eess_phrases = utils.min_max_scale(webtools_eess_phrases)
webtools_eess_phrases

In [None]:
webtools_eess_labels = [0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
                       1, 0, 0, 0, 1, 0, 0, 1, 0, 0,
                       1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 
                       0, 0, 0, 1, 0, 0, 0, 0, 0, 1]
len(webtools_eess_labels) == len(webtools_eess_phrases)

### MonkeyLearn

In [None]:
monkeylearn_eess_phrases = ["cine mr image",
"mr image reconstruction",
"cnn",
"proposed methods",
"test time",
"block",
"proposed cnn",
"number of radial",
"entire network",
"radial spoke"]

In [None]:
# Manual 0/1 judgements for the ten MonkeyLearn phrases (presumably
# 1 = quality phrase); their mean is MonkeyLearn's top-10 accuracy.
monkeylearn_eess_labels = [1, 1, 1, 0, 0, 0, 0, 0, 1, 1]
# DataFrame.append was removed in pandas 2.0, so the new row is wrapped in a
# one-row frame and concatenated onto the running table instead.
top10_acc_df = pd.concat(
    [top10_acc_df,
     pd.DataFrame([{"Analyzer": "MonkeyLearn", "Accuracy": np.mean(monkeylearn_eess_labels), "Domain": "EE & System Design"}])],
    ignore_index = True)
top10_acc_df

## Mathematics

### AutoLibrary

In [None]:
autophrase_math_df = weighted_df['Mathematics'].dropna().head(40)
autophrase_math_df.head(5)

In [None]:
# Manual 0/1 judgements for AutoLibrary's top 40 Mathematics phrases
# (presumably 1 = quality phrase); their mean is the top-40 accuracy.
autophrase_math_labels = [1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
                         1, 0, 0, 0, 0, 1, 1, 1, 0, 1,
                         1, 1, 0, 1, 1, 0, 1, 1, 0, 0,
                         1, 0, 1, 0, 0, 0, 0, 0, 1, 1]
# DataFrame.append was removed in pandas 2.0, so the new row is wrapped in a
# one-row frame and concatenated onto the running table instead.
top40_acc_df = pd.concat(
    [top40_acc_df,
     pd.DataFrame([{"Analyzer": "AutoLibrary", "Accuracy": np.mean(autophrase_math_labels), "Domain": "Mathematics"}])],
    ignore_index = True)
top40_acc_df

In [None]:
# Record AutoLibrary's top-10 accuracy for Mathematics (mean of the first ten
# 0/1 labels). DataFrame.append was removed in pandas 2.0, so the row is
# wrapped in a one-row frame and concatenated instead.
top10_acc_df = pd.concat(
    [top10_acc_df,
     pd.DataFrame([{"Analyzer": "AutoLibrary", "Accuracy": np.mean(autophrase_math_labels[:10]), "Domain": "Mathematics"}])],
    ignore_index = True)
top10_acc_df

### Jstor

In [None]:
# phrases
jstor_math_phrases = ["Policy making",
"Power functions",
"Quantitative genetics",
"Newtons method",
"Mathematical induction",
"Work functions",
"Too big to fail policy",
"Resuscitation orders", 
"Rational functions",
"Stochastic processes",
                      
"Public policy", 
"Probability distributions",
"Piecewise continuous functions",
"Polynomials",
"Pareto efficiency",
"Oxygen consumption",
"One to one functions",
"Nonstandard analysis",
"Mental health policy",
"Mathematical problems",
                      
"Mathematical continuity",
"Markov chains",
"Mathematical completeness",
"Machine learning",
"Log buildings",
"Lattice theory",
"International environmental policy",
"International trade policy",
"Information policy",
"Homeowners insurance",
                      
"Hilbert spaces",
"Environmental policy",
"Formal languages",
"Function words",
"Entropy",
"Crime control",
"Claims made policies",
"Area function",
"Artificial neural networks",
"Central limit theorem"]
len(jstor_math_phrases)

In [None]:
# Manual 0/1 judgements for the 40 Jstor Mathematics phrases (presumably
# 1 = quality phrase); their mean is recorded as Jstor's top-40 accuracy.
jstor_math_labels = [0, 1, 1, 0, 1, 1, 0, 0, 1, 1,
                    0, 1, 1, 1, 1, 0, 1, 0, 0, 1,
                    1, 1, 0, 1, 0, 1, 0, 0, 0, 0,
                    1, 0, 0, 0, 1, 0, 0, 1, 1, 1]
# DataFrame.append was removed in pandas 2.0, so the new row is wrapped in a
# one-row frame and concatenated onto the running table instead.
top40_acc_df = pd.concat(
    [top40_acc_df,
     pd.DataFrame([{"Analyzer": "Jstor", "Accuracy": np.mean(jstor_math_labels), "Domain": "Mathematics"}])],
    ignore_index = True)
top40_acc_df

### Webtools

In [None]:
webtools_math_phrases = {"s 1": 70,
"s0 t": 56,
"1 2": 52,
"for any": 51,
"v s": 48,
"s s0": 46,
"of the": 40,
"s s0 t": 35,
"proof of": 33,
"1 1": 31,
                         
"for all": 31,
"s 2": 30,
"s h": 29,
"with the": 26,
"2 s": 25,
"and any": 24,
"any t": 24,
"one has": 23,
"sk s": 23,
"any s": 22,
                         
"sk s s0": 21,
"1 s": 21,
"p sk": 21,
"a1 1": 20,
"and the": 19,
"the proof": 19,
"the last": 19,
"and any t": 19,
"s1 s": 19,
"sk s s0 t": 19,
                         
"the following": 18,
"ta1 s": 18,
"s 3": 18,
"we have": 18,
"for any s": 18,
"all t": 18,
"that v": 18,
"p s0": 18,
"p sk s": 18,
"t p": 18}

In [None]:
# standardize it
webtools_math_phrases = utils.min_max_scale(webtools_math_phrases)
webtools_math_phrases

In [None]:
webtools_math_labels = [0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
                       1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
                       0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
                       1, 0, 0, 0, 1, 0, 1, 0, 0, 0]
len(webtools_math_labels) == len(webtools_math_phrases)

### MonkeyLearn

In [None]:
monkeylearn_math_phrases = ["softmax pg methods",
"proof of lemma",
"lemma",
"a0",
"natural policy gradient",
"a1",
"a1 a2 a1",
"a2 a1 a2",
"a1 a2 a0",
"last inequality"]

In [None]:
# Manual 0/1 judgements for the ten MonkeyLearn phrases (presumably
# 1 = quality phrase); their mean is MonkeyLearn's top-10 accuracy.
monkeylearn_math_labels = [1, 0, 1, 0, 1, 0, 0, 0, 0, 0]
# DataFrame.append was removed in pandas 2.0, so the new row is wrapped in a
# one-row frame and concatenated onto the running table instead.
top10_acc_df = pd.concat(
    [top10_acc_df,
     pd.DataFrame([{"Analyzer": "MonkeyLearn", "Accuracy": np.mean(monkeylearn_math_labels), "Domain": "Mathematics"}])],
    ignore_index = True)
top10_acc_df

## Physics

### AutoLibrary

In [None]:
autophrase_physics_df = weighted_df['Physics'].dropna().head(40)
autophrase_physics_df.head()

In [None]:
# Manual 0/1 judgements for AutoLibrary's top 40 Physics phrases
# (presumably 1 = quality phrase); their mean is the top-40 accuracy.
autophrase_physics_labels = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                            0, 1, 1, 1, 0, 1, 1, 1, 1, 1,
                            1, 1, 1, 1, 1, 0, 0, 1, 0, 1,
                            1, 1, 1, 0, 1, 1, 0, 1, 0, 1]
# DataFrame.append was removed in pandas 2.0, so the new row is wrapped in a
# one-row frame and concatenated onto the running table instead.
top40_acc_df = pd.concat(
    [top40_acc_df,
     pd.DataFrame([{"Analyzer": "AutoLibrary", "Accuracy": np.mean(autophrase_physics_labels), "Domain": "Physics"}])],
    ignore_index = True)
top40_acc_df

In [None]:
# Record AutoLibrary's top-10 accuracy for Physics (mean of the first ten
# 0/1 labels). DataFrame.append was removed in pandas 2.0, so the row is
# wrapped in a one-row frame and concatenated instead.
top10_acc_df = pd.concat(
    [top10_acc_df,
     pd.DataFrame([{"Analyzer": "AutoLibrary", "Accuracy": np.mean(autophrase_physics_labels[:10]), "Domain": "Physics"}])],
    ignore_index = True)
top10_acc_df

### Jstor

In [None]:
# phrases
jstor_physics_phrases = ["Relativity",
"Quantum field theory",
"Quantum mechanics",
"Gravitational waves",
"Lectures",
"Astronomical cosmology",
"Black holes",
"Conservation laws",
"General relativity",
"Field research",
                         
"Far fields",
"Gravitation theory",
"Gravitational fields",
"Gravitational potential",
"Hawking radiation",
"Gravity",
"Lunar gravitation",
"Methodism",
"Near fields",
"Old fields",
                         
"Physical sciences",
"Philosophy of religion",
"Quantum computers",
"Quantum cosmology",
"Quantum states",
"Radio astronomy",
"Quantum wells",
"Reissner Nordstrom black holes",
"Ring theory",
"Special relativity",
                         
"Space research",
"Schwarzschild radius",
"String theory",
"Tensors",
"Waves",
"Waving",
"Yang Mills theory",
"Bondi",
"1966-7",
"GM"]
len(jstor_physics_phrases)

In [None]:
# Manual 0/1 judgements for the 40 Jstor Physics phrases (presumably
# 1 = quality phrase); their mean is recorded as Jstor's top-40 accuracy.
jstor_physics_labels = [1, 1, 1, 1, 0, 1, 1, 1, 1, 0,
                       0, 1, 1, 1, 1, 1, 1, 0, 0, 0,
                       0, 0, 1, 1, 1, 0, 1, 1, 1, 1,
                       0, 1, 1, 0, 0, 0, 1, 0, 0, 0]
# DataFrame.append was removed in pandas 2.0, so the new row is wrapped in a
# one-row frame and concatenated onto the running table instead.
top40_acc_df = pd.concat(
    [top40_acc_df,
     pd.DataFrame([{"Analyzer": "Jstor", "Accuracy": np.mean(jstor_physics_labels), "Domain": "Physics"}])],
    ignore_index = True)
top40_acc_df

### Webtools

In [None]:
webtools_physics_phrases = {"of the": 108,
"in the": 68,
"to the": 37,
"gravitational waves": 31,
"on the": 30,
"the gravitational": 28,
"to be": 24,
"gravitational field": 24,
"quantum mechanics": 23,
"chapel hill": 21,
                            
"general relativity": 20,
"with the": 20,
"of a": 20,
"that the": 20,
"for the": 19,
"at the": 17,
"it is": 17,
"the gravitational field": 17,
"1 p": 16,
"and the": 15,
                            
"of gravitational": 15,
"from the": 14,
"the chapel hill": 12,
"as a": 12,
"the first": 12,
"in a": 12,
"chapel hill conference": 12,
"of gravitational waves": 12,
"in particular": 11,
"the chapel hill conference": 11,
                            
"by the": 10,
"of the gravitational": 10,
"with a": 10,
"a gravitational": 10,
"quantum gravity": 9,
"of quantum": 9,
"due to": 9,
"to a": 9,
"such a": 9,
"the conference": 9}

In [None]:
# standardize it
webtools_physics_phrases = utils.min_max_scale(webtools_physics_phrases)
webtools_physics_phrases

In [None]:
webtools_physics_labels = [0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
                          1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
                          0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 
                          0, 0, 0, 0, 1, 0, 0, 0, 0, 0]
len(webtools_physics_labels) == len(webtools_physics_phrases)

### MonkeyLearn

In [None]:
monkeylearn_physics_phrases = ["feynman",
"gravity",
"chapel hill conference",
"quantum mechanics",
"quantum gravity",
"caltech lecture",
"general relativity",
"quantum theory",
"gravitational wave",
"gravitational field"]

In [None]:
# Manual 0/1 judgements for the ten MonkeyLearn phrases (presumably
# 1 = quality phrase); their mean is MonkeyLearn's top-10 accuracy.
monkeylearn_physics_labels = [0, 1, 1, 1, 1, 0, 1, 1, 1, 1]
# DataFrame.append was removed in pandas 2.0, so the new row is wrapped in a
# one-row frame and concatenated onto the running table instead.
top10_acc_df = pd.concat(
    [top10_acc_df,
     pd.DataFrame([{"Analyzer": "MonkeyLearn", "Accuracy": np.mean(monkeylearn_physics_labels), "Domain": "Physics"}])],
    ignore_index = True)
top10_acc_df

## Quantitative Biology

### AutoLibrary

In [None]:
autophrase_qbio_df = weighted_df['Quantitative Biology'].dropna().head(40)
autophrase_qbio_df.head()

In [None]:
# Manual 0/1 judgements for AutoLibrary's top 40 Quantitative Biology phrases
# (presumably 1 = quality phrase); their mean is the top-40 accuracy.
autophrase_qbio_labels = [1, 1, 1, 1, 0, 1, 0, 0, 1, 0,
                         1, 0, 1, 1, 1, 0, 0, 1, 0, 0,
                         1, 0, 0, 1, 1, 0, 1, 0, 0, 0,
                         0, 0, 1, 0, 1, 0, 0, 1, 1, 0]
# DataFrame.append was removed in pandas 2.0, so the new row is wrapped in a
# one-row frame and concatenated onto the running table instead.
top40_acc_df = pd.concat(
    [top40_acc_df,
     pd.DataFrame([{"Analyzer": "AutoLibrary", "Accuracy": np.mean(autophrase_qbio_labels), "Domain": "Quant Biology"}])],
    ignore_index = True)
top40_acc_df

In [None]:
# Record AutoLibrary's top-10 accuracy for Quantitative Biology (mean of the
# first ten 0/1 labels). DataFrame.append was removed in pandas 2.0, so the
# row is wrapped in a one-row frame and concatenated instead.
top10_acc_df = pd.concat(
    [top10_acc_df,
     pd.DataFrame([{"Analyzer": "AutoLibrary", "Accuracy": np.mean(autophrase_qbio_labels[:10]), "Domain": "Quant Biology"}])],
    ignore_index = True)
top10_acc_df

### Jstor

In [None]:
# phrases
jstor_qbio_phrases = ["Materials tests",
"Elasticity",
"Brain",
"Fluids",
"Soil strength",
"Astronomical cosmology",
"Computer networking",
"Deformation",
"Creep rupture strength",
"Fluid solid interactions",
                      
"Forced expiratory flow rates",
"Glaciers",
"Fracture strength",
"Fracture mechanics",
"Gray literature",
"Hydrogels",
"Information resources",
"Labor force participation rates",
"Interstitial fluids",
"Material world",
                      
"Materials",
"Materials flow analysis",
"Mechanical engineering",
"Materials science",
"Moduli of elasticity",
"Neuroglia",
"Ocean tides",
"Optics",
"Spacetime",
"Price rigidities",
                      
"Physiology",
"Steels",
"Stress distribution",
"Stress strain diagrams",
"Tectonic plate interactions",
"Stress strain relationships",
"Temperature",
"Tensile stress",
"Tools",
"Video recording"]
len(jstor_qbio_phrases)

In [None]:
# Manual 0/1 judgements for the 40 Jstor Quantitative Biology phrases
# (presumably 1 = quality phrase); their mean is Jstor's top-40 accuracy.
jstor_qbio_labels = [0, 0, 0, 0, 1, 1, 1, 0, 1, 1,
                    1, 0, 0, 1, 0, 0, 0, 0, 1, 0,
                    0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
                    0, 0, 0, 1, 1, 0, 0, 0, 0, 0]
# DataFrame.append was removed in pandas 2.0, so the new row is wrapped in a
# one-row frame and concatenated onto the running table instead.
top40_acc_df = pd.concat(
    [top40_acc_df,
     pd.DataFrame([{"Analyzer": "Jstor", "Accuracy": np.mean(jstor_qbio_labels), "Domain": "Quant Biology"}])],
    ignore_index = True)
top40_acc_df

### Webtools

In [None]:
webtools_qbio_phrases = {"of the": 109,
"et al": 77,
"in the": 74,
"to the": 46,
"brain tissue": 33,
"on the": 30,
"is the": 27,
"the crack": 25,
"crack propagation": 24,
"and the": 21,
                         
"the brain": 21,
"forte et al": 21,
"can be": 21,
"for the": 20,
"by the": 19,
"we have": 19,
"the fracture": 18,
"the brain tissue": 18,
"that the": 18,
"the cracktip": 18,
                         
"wire cutting": 17,
"the material": 17,
"crack tip": 17,
"the wire": 16,
"of fluid": 15,
"due to": 15,
"in fig": 15,
"of a": 14,
"as a": 13,
"the solid": 13,
                         
"wire diameter": 13,
"with the": 12,
"dw 0": 12,
"process zone": 11,
"from the": 11,
"with respect to": 11,
"in the brain tissue": 11,
"fracture process": 11,
"is a": 10,
"which is": 10}

In [None]:
# standardize it
webtools_qbio_phrases = utils.min_max_scale(webtools_qbio_phrases)
webtools_qbio_phrases

In [None]:
webtools_qbio_labels = [0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
                       1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 
                       0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
                       1, 0, 0, 1, 0, 0, 1, 1, 0, 0]
len(webtools_qbio_labels) == len(webtools_qbio_phrases)

### MonkeyLearn

In [None]:
monkeylearn_qbio_phrases = ["brain tissue",
"budday et al",
"forte et al",
"energy dissipation",
"fracture process",
"crack propagation",
"fracture toughness",
"tips region",
"wire diameter dw",
"physics of solid"]

In [None]:
# Manual 0/1 judgements for the ten MonkeyLearn phrases (presumably
# 1 = quality phrase); their mean is MonkeyLearn's top-10 accuracy.
monkeylearn_qbio_labels = [1, 0, 0, 1, 1, 1, 1, 0, 0, 0]
# DataFrame.append was removed in pandas 2.0, so the new row is wrapped in a
# one-row frame and concatenated onto the running table instead.
top10_acc_df = pd.concat(
    [top10_acc_df,
     pd.DataFrame([{"Analyzer": "MonkeyLearn", "Accuracy": np.mean(monkeylearn_qbio_labels), "Domain": "Quant Biology"}])],
    ignore_index = True)
top10_acc_df

## Quantitative Finance

### AutoLibrary

In [None]:
autophrase_qfin_df = weighted_df['Quantitative Finance'].dropna().head(40)
autophrase_qfin_df.head()

In [None]:
# Manual 0/1 judgements for AutoLibrary's top 40 Quantitative Finance phrases
# (presumably 1 = quality phrase); their mean is the top-40 accuracy.
autophrase_qfin_labels = [1, 1, 0, 1, 0, 1, 0, 1, 1, 1,
                         1, 1, 0, 0, 0, 1, 0, 0, 1, 1,
                         0, 0, 1, 0, 0, 0, 0, 1, 0, 1,
                         0, 1, 0, 0, 1, 0, 0, 1, 0, 0]
# DataFrame.append was removed in pandas 2.0, so the new row is wrapped in a
# one-row frame and concatenated onto the running table instead.
top40_acc_df = pd.concat(
    [top40_acc_df,
     pd.DataFrame([{"Analyzer": "AutoLibrary", "Accuracy": np.mean(autophrase_qfin_labels), "Domain": "Quant Finance"}])],
    ignore_index = True)
top40_acc_df

In [None]:
# Record AutoLibrary's top-10 accuracy for Quantitative Finance (mean of the
# first ten 0/1 labels). DataFrame.append was removed in pandas 2.0, so the
# row is wrapped in a one-row frame and concatenated instead.
top10_acc_df = pd.concat(
    [top10_acc_df,
     pd.DataFrame([{"Analyzer": "AutoLibrary", "Accuracy": np.mean(autophrase_qfin_labels[:10]), "Domain": "Quant Finance"}])],
    ignore_index = True)
top10_acc_df

### Jstor

In [None]:
# phrases
jstor_qfin_phrases = ["Microeconomics",
"Game theory",
"Nonstandard analysis",
"Hilbert spaces",
"Monte Carlo methods",
"Anesthetics",
"Antiallergics",
"Antifertility agents",
"Antimitotics",
"Astronomical cosmology",
                      
"Central limit theorem",
"Convexity",
"Defense policy",
"Economic competition",
"Economic principles",
"Economic theory",
"Economics",
"Expected utility",
"Formal languages",
"Free agents",
                      
"Genetic drift",
"Hallucinogens",
"Hardy Weinberg law",
"Home economics",
"Iron oxides",
"Market clearing prices",
"Mathematical objects",
"Mathematical problems",
"Musical scales",
"Nash equilibrium",
                      
"Neoclassical economics",
"Optimal control",
"Pareto efficiency",
"Positive economics",
"Posted price markets",
"Prices",
"Saltwater economics",
"Simian virus 40",
"Steady state economies",
"Stochastic processes"]
len(jstor_qfin_phrases)

In [None]:
# Manual 0/1 judgements for the 40 Jstor Quantitative Finance phrases
# (presumably 1 = quality phrase); their mean is Jstor's top-40 accuracy.
jstor_qfin_labels = [1, 1, 0, 1, 0, 0, 0, 0, 1, 0,
                    1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
                    0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
                    1, 1, 1, 0, 1, 0, 0, 0, 0, 1]
# DataFrame.append was removed in pandas 2.0, so the new row is wrapped in a
# one-row frame and concatenated onto the running table instead.
top40_acc_df = pd.concat(
    [top40_acc_df,
     pd.DataFrame([{"Analyzer": "Jstor", "Accuracy": np.mean(jstor_qfin_labels), "Domain": "Quant Finance"}])],
    ignore_index = True)
top40_acc_df

### Webtools

In [None]:
webtools_qfin_phrases = {"0 t": 283,
"c 0": 125,
"t c": 114,
"1 t": 114,
"i t": 112,
"c 0 t": 88,
"0 t c": 78,
"t t": 64,
"c 0 t c": 59,
"of the": 58,
                         
"c i": 58,
"e 0 t": 53,
"t c 0": 51,
"c 0 t c i": 46,
"t c 0 t": 42,
"in the": 41,
"t y": 40,
"0 0": 38,
"to the": 37,
"n 0": 36,
                         
"c 1 t": 36,
"for the": 35,
"c i t": 35,
"t x": 35,
"y 1": 33,
"major agent": 32,
"y 1 t": 32,
"n i1": 31,
"t y 1": 31,
"t y 1 t": 30,
                         
"t 0": 29,
"the major": 28,
"t dt": 28,
"the major agent": 26,
"c 0 t c i t": 26,
"j t": 26,
"0 t y": 26,
"dw 0 t": 25,
"c 0 ci": 25,
"0 0 0": 24}

In [None]:
# standardize it
webtools_qfin_phrases = utils.min_max_scale(webtools_qfin_phrases)
webtools_qfin_phrases

In [None]:
webtools_qfin_labels = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
                       0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                       0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
                       0, 1, 0, 1, 0, 0, 0, 0, 0, 0]
len(webtools_qfin_labels) == len(webtools_qfin_phrases)

### MonkeyLearn

In [None]:
monkeylearn_qfin_phrases = ["equilibrium price formation",
"equilibrium price process",
"means field games",
"major agent",
"stochastic differential equation",
"large population limit",
"minor agent",
"dw",
"c0",
"system of fbsdes"]

In [None]:
# Manual 0/1 judgements for the ten MonkeyLearn phrases (presumably
# 1 = quality phrase); their mean is MonkeyLearn's top-10 accuracy.
monkeylearn_qfin_labels = [1, 1, 1, 0, 1, 1, 0, 0, 0, 0]
# DataFrame.append was removed in pandas 2.0, so the new row is wrapped in a
# one-row frame and concatenated onto the running table instead.
top10_acc_df = pd.concat(
    [top10_acc_df,
     pd.DataFrame([{"Analyzer": "MonkeyLearn", "Accuracy": np.mean(monkeylearn_qfin_labels), "Domain": "Quant Finance"}])],
    ignore_index = True)
top10_acc_df

## Statistics

### AutoLibrary

In [None]:
autophrase_stat_df = weighted_df['Statistics'].dropna().head(40)
autophrase_stat_df.head()

In [None]:
# Manual 0/1 judgements for AutoLibrary's top 40 Statistics phrases
# (presumably 1 = quality phrase); their mean is the top-40 accuracy.
autophrase_stat_labels = [1, 1, 0, 1, 1, 1, 1, 1, 0, 0,
                         1, 1, 0, 0, 1, 1, 0, 1, 0, 1,
                         0, 0, 0, 0, 0, 1, 1, 0, 1, 1,
                         0, 1, 0, 0, 1, 1, 1, 1, 0, 0]
# DataFrame.append was removed in pandas 2.0, so the new row is wrapped in a
# one-row frame and concatenated onto the running table instead.
top40_acc_df = pd.concat(
    [top40_acc_df,
     pd.DataFrame([{"Analyzer": "AutoLibrary", "Accuracy": np.mean(autophrase_stat_labels), "Domain": "Statistics"}])],
    ignore_index = True)
top40_acc_df

In [None]:
# Record AutoLibrary's top-10 accuracy for Statistics (mean of the first ten
# 0/1 labels). DataFrame.append was removed in pandas 2.0, so the row is
# wrapped in a one-row frame and concatenated instead.
top10_acc_df = pd.concat(
    [top10_acc_df,
     pd.DataFrame([{"Analyzer": "AutoLibrary", "Accuracy": np.mean(autophrase_stat_labels[:10]), "Domain": "Statistics"}])],
    ignore_index = True)
top10_acc_df

### Jstor

In [None]:
# phrases
jstor_stat_phrases = ["Text analytics",
"Parallel computing",
"Credit ratings",
"Machine learning",
"Information classification",
"Z score",
"Time series forecasting",
"Work credits",
"Test theory",
"Test scores",
                      
"Test data",
"Teaching methods",
"T score",
"Test score decline",
"Ratio test",
"Preeclampsia",
"Production estimates",
"Opportunity equality",
"Mathematical programming",
"Long run profit maximization",
                      
"Judicial powers",
"Letters of credit",
"Logistic regression",
"Information science",
"Information attributes",
"Full scores",
"Financial institutions",
"Diversity indices",
"Education credits",
"Fairness",
                      
"Disparate impact",
"Dimensionality reduction",
"Debt collection",
"Credit default swaps",
"Big data",
"Charge accounts",
"Classified information",
"Data models",
"Credit",
"Yield curves"]
len(jstor_stat_phrases)

In [None]:
# Manual 0/1 judgements for the 40 Jstor Statistics phrases (presumably
# 1 = quality phrase); their mean is recorded as Jstor's top-40 accuracy.
jstor_stat_labels = [0, 0, 1, 1, 1, 0, 1, 0, 0, 0,
                    0, 0, 0, 0, 1, 0, 0, 1, 0, 1,
                    0, 0, 1, 1, 0, 0, 0, 1, 1, 1,
                    1, 1, 0, 0, 0, 0, 0, 0, 0, 1]
# DataFrame.append was removed in pandas 2.0, so the new row is wrapped in a
# one-row frame and concatenated onto the running table instead.
top40_acc_df = pd.concat(
    [top40_acc_df,
     pd.DataFrame([{"Analyzer": "Jstor", "Accuracy": np.mean(jstor_stat_labels), "Domain": "Statistics"}])],
    ignore_index = True)
top40_acc_df

### Webtools

In [None]:
webtools_stat_phrases = {"et al": 62,
"of the": 53,
"credit scoring": 38,
"on the": 28,
"to the": 27,
"and the": 25,
"in the": 24,
"of a": 23,
"is the": 21,
"fairness processors": 20,
                         
"the same": 19,
"based on": 19,
"p 1": 18,
"that the": 17,
"sensitive attribute": 17,
"of fairness": 17,
"can be": 15,
"fairness criteria": 15,
"the sensitive": 14,
"1 1": 14,
                         
"0 1": 13,
"is a": 13,
"for the": 12,
"to a": 11,
"has a": 11,
"based on the": 11,
"the other": 11,
"data sets": 11,
"1 0": 10,
"the separation": 10,
                         
"the emp": 10,
"the sensitive attribute": 9,
"barocas et al": 9,
"hardt et al": 9,
"a loan": 9,
"the separation criterion": 9,
"with a": 9,
"in credit scoring": 9,
"scoring model": 9,
"platt scaling": 9}

In [None]:
# standardize it
webtools_stat_phrases = utils.min_max_scale(webtools_stat_phrases)
webtools_stat_phrases

In [None]:
webtools_stat_labels = [0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
                       0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
                       0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                       0, 1, 0, 0, 0, 0, 0, 0, 1, 1]
len(webtools_stat_labels) == len(webtools_stat_phrases)

### MonkeyLearn

In [None]:
monkeylearn_stat_phrases = ["credit scoring",
"fairness criteria",
"fairness processor",
"processor",
"separation",
"barocas et al",
"sufficiency",
"reject options classification",
"disparate impact remover",
"equalized odds processor"]

In [None]:
# Manual 0/1 judgements for the ten MonkeyLearn phrases (presumably
# 1 = quality phrase); their mean is MonkeyLearn's top-10 accuracy.
monkeylearn_stat_labels = [1, 1, 0, 0, 0, 0, 0, 1, 1, 1]
# DataFrame.append was removed in pandas 2.0, so the new row is wrapped in a
# one-row frame and concatenated onto the running table instead.
top10_acc_df = pd.concat(
    [top10_acc_df,
     pd.DataFrame([{"Analyzer": "MonkeyLearn", "Accuracy": np.mean(monkeylearn_stat_labels), "Domain": "Statistics"}])],
    ignore_index = True)
top10_acc_df

## Comparison

In [None]:
# autolibrary vs. webtools
# One entry per domain, in plotting order: AutoLibrary labels and scores,
# Webtools labels and scaled scores, then the domain name handed to the helper.
pr_comparisons = [
    (autophrase_cs_labels, autophrase_cs_df, webtools_cs_labels, webtools_cs_phrases, "Computer Science"),
    (autophrase_econ_labels, autophrase_econ_df, webtools_econ_labels, webtools_econ_phrases, "Economics"),
    (autophrase_eess_labels, autophrase_eess_df, webtools_eess_labels, webtools_eess_phrases, "EE & System Design"),
    (autophrase_math_labels, autophrase_math_df, webtools_math_labels, webtools_math_phrases, "Mathematics"),
    (autophrase_physics_labels, autophrase_physics_df, webtools_physics_labels, webtools_physics_phrases, "Physics"),
    (autophrase_qbio_labels, autophrase_qbio_df, webtools_qbio_labels, webtools_qbio_phrases, "Quantitative Biology"),
    (autophrase_qfin_labels, autophrase_qfin_df, webtools_qfin_labels, webtools_qfin_phrases, "Quantitative Finance"),
    (autophrase_stat_labels, autophrase_stat_df, webtools_stat_labels, webtools_stat_phrases, "Statistics"),
]
# Same eight calls as before, driven by the table above instead of copy-paste.
for auto_labels, auto_scores, web_labels, web_scores, domain in pr_comparisons:
    utils.graph_precision_recall(auto_labels, auto_scores, web_labels, web_scores, domain)

We can see that the area under the precision-recall curve of AutoLibrary is bigger than that of Webtools in all eight domains. This means that AutoLibrary performs better than the Webtools analyzer across different domains. This makes intuitive sense, since the latter is very simple: its results often contain non-quality phrases such as “of the” and “can be”. Moreover, in the domain of Quantitative Finance, where there were lots of mathematical equations, the Webtools analyzer failed to filter out symbols like “ts”.

In [None]:
# autolibrary vs. jstor
import seaborn as sns
import matplotlib.pyplot as plt

# Apply the seaborn theme before the axes are created so it styles them.
sns.set(font_scale = 1.8)
fig, ax = plt.subplots(figsize = (27, 12))
# Grouped bars: each table row contributes its "Accuracy" as the weight of
# its Domain bin, with one dodged bar per analyzer.
sns.histplot(data = top40_acc_df, x = "Domain", weights = "Accuracy", hue = "Analyzer",
             hue_order = ["AutoLibrary", "Jstor"], multiple="dodge", shrink=.8, ax = ax)
ax.set_xlabel('Domains', fontsize = 30)
ax.set_ylabel('Accuracy', fontsize = 30)
# Keep the legend seaborn builds from the hue mapping instead of re-labelling
# the bars by position: positional labels depend on the internal artist order
# and can silently swap the two analyzers. Only the styling is adjusted here.
legend = ax.get_legend()
legend.set_title("")
plt.setp(legend.get_texts(), fontsize = 30)
ax.set_title("Precision of Top 40 Extracted Phrases: AutoPhrase vs. Jstor", fontsize = 40)
plt.show()

We can see that in all domains, AutoLibrary performs better than the Jstor analyzer. The main drawback of Jstor is that its recommendations are based on a fixed set of predefined topics. Therefore, it cannot make customized recommendations for specific papers. For example, when analyzing a statistics paper, it recommended “debt collection”, which doesn’t even appear in the original text. In contrast, AutoLibrary first extracts phrases from the input paper, which keeps the quality phrases consistent with the original paper.

Another big shortcoming of Jstor is that the topics it recommends are sometimes too general. Although they make sense in English, they don't qualify as quality phrases. For example, when analyzing the Computer Science paper, one of the phrases extracted is “computer programming.” It doesn’t really help users since it’s too general for them to search for related papers.

Last but not least, Jstor doesn’t offer quality scores to recommended phrases, which means that users don’t know which phrases can best represent their input papers. AutoLibrary, on the other hand, ranks the top 5 phrases in order, so that users can have a better overview of the papers.


In [None]:
# autolibrary vs. MonkeyLearn
# Apply the seaborn theme before the axes are created so it styles them.
sns.set(font_scale = 1.8)
fig, ax = plt.subplots(figsize = (27, 12))
# Grouped bars: each table row contributes its "Accuracy" as the weight of
# its Domain bin, with one dodged bar per analyzer.
sns.histplot(data = top10_acc_df, x = "Domain", weights = "Accuracy", hue = "Analyzer",
             hue_order = ["AutoLibrary", "MonkeyLearn"], multiple="dodge", shrink=.8, ax = ax)
ax.set_xlabel('Domains', fontsize = 30)
ax.set_ylabel('Accuracy', fontsize = 30)
# Keep the legend seaborn builds from the hue mapping instead of re-labelling
# the bars by position: positional labels depend on the internal artist order
# and can silently swap the two analyzers. Only the styling is adjusted here.
legend = ax.get_legend()
legend.set_title("")
plt.setp(legend.get_texts(), fontsize = 30)
ax.set_title("Precision of Top 10 Extracted Phrases: AutoPhrase vs. MonkeyLearn", fontsize = 40)
plt.show()

From the figure, we can see that AutoLibrary outperformed MonkeyLearn in all domains. One big disadvantage of MonkeyLearn we found is that it probably relies heavily on the frequency of phrases. Although it can extract some really meaningful phrases among its top 5 phrases, it also recommends some phrases that make no sense. For example, when analyzing the Mathematics paper, it extracted "a1 a2 a1", "a2 a1 a2" and "a1 a2 a0" among the top 10 phrases. We believe that AutoLibrary beats MonkeyLearn by weighting phrases against domain knowledge pools, which eliminates such meaningless phrases.