### Preprocessing the collected abstracts to remove uncommon words

In [218]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.probability import FreqDist

from string import punctuation
from data_utils import clean_data, clean_authors

In [253]:
## May need to install pyarrow or fastparquet to process this
## df = pd.read_parquet("./data/recent_50k.parquet")
## df = pd.read_parquet("./data/eng_175k.parquet")
## df = pd.read_parquet("./data/eng_50k.parquet")
df = pd.read_parquet("./data/filter_20k.parquet")

In [254]:
df.head(5)

Unnamed: 0,id,title,abstract,update_date,authors_parsed,strip_cat
182244,1412.3275,Limit cycles bifurcating from a degenerate center,We study the maximum number of limit cycles ...,2014-12-11,"[['Llibre', 'J.', ''], ['Pantazi', 'C.', '']]",[DS]
196425,809.351,Shrinking Point Bifurcations of Resonance Tong...,Resonance tongues are modelocking regions of...,2015-05-13,"[['Simpson', 'D. J. W.', ''], ['Meiss', 'J. D....",[DS]
479424,2201.04222,Classification of Codimension1 Singular Bifurc...,The study of bifurcations of differentialalg...,2022-01-13,"[['Ovsyannikov', 'Ivan', ''], ['Ruan', 'Haibo'...",[DS]
176385,1408.5812,Partial sums of excursions along random geodes...,"For a nonuniform lattice in SL(2,R), we cons...",2014-10-09,"[['Gadre', 'Vaibhav', '']]","[GT, DS]"
291058,1707.03102,Uniform dimension results for a family of Mark...,In this paper we prove uniform Hausdorff and...,2017-10-03,"[['Sun', 'Xiaobin', ''], ['Xiao', 'Yimin', '']...",[PR]


In [255]:
## Clean the title and abstracts in the database
## Adding three new columns to the database
df['clean_title'] = df['title'].apply(clean_data)
df['clean_abstract'] = df['abstract'].apply(clean_data)
df['clean_authors'] = df['authors_parsed'].apply(clean_authors)

In [256]:
df.head()

Unnamed: 0,id,title,abstract,update_date,authors_parsed,strip_cat,clean_title,clean_abstract,clean_authors
182244,1412.3275,Limit cycles bifurcating from a degenerate center,We study the maximum number of limit cycles ...,2014-12-11,"[['Llibre', 'J.', ''], ['Pantazi', 'C.', '']]",[DS],limit cycles bifurcating from a degenerate center,we study the maximum number of limit cycles th...,"[['llibre', 'j', ''], ['pantazi', 'c', '']]"
196425,809.351,Shrinking Point Bifurcations of Resonance Tong...,Resonance tongues are modelocking regions of...,2015-05-13,"[['Simpson', 'D. J. W.', ''], ['Meiss', 'J. D....",[DS],shrinking point bifurcations of resonance tong...,resonance tongues are modelocking regions of p...,"[['simpson', 'd j w', ''], ['meiss', 'j d', '']]"
479424,2201.04222,Classification of Codimension1 Singular Bifurc...,The study of bifurcations of differentialalg...,2022-01-13,"[['Ovsyannikov', 'Ivan', ''], ['Ruan', 'Haibo'...",[DS],classification of codimension singular bifurca...,the study of bifurcations of differentialalgeb...,"[['ovsyannikov', 'ivan', ''], ['ruan', 'haibo'..."
176385,1408.5812,Partial sums of excursions along random geodes...,"For a nonuniform lattice in SL(2,R), we cons...",2014-10-09,"[['Gadre', 'Vaibhav', '']]","[GT, DS]",partial sums of excursions along random geodes...,for a nonuniform lattice in slr we consider ex...,"[['gadre', 'vaibhav', '']]"
291058,1707.03102,Uniform dimension results for a family of Mark...,In this paper we prove uniform Hausdorff and...,2017-10-03,"[['Sun', 'Xiaobin', ''], ['Xiao', 'Yimin', '']...",[PR],uniform dimension results for a family of mark...,in this paper we prove uniform hausdorff and p...,"[['sun', 'xiaobin', ''], ['xiao', 'yimin', '']..."


In [258]:
df['abstract_tokenized'] = df['clean_abstract'].apply(nltk.word_tokenize)

In [259]:
## Get rid of the occasional empty string.
def clear_empty(clean_string):
    return [word for word in clean_string if word != '']

In [260]:
df['abstract_tokenized'] = df['abstract_tokenized'].apply(clear_empty) 

Now we create a corpus of all the words (non-unique) that appear in the abstracts and then find the frequency distribution.

In [261]:
indices = df.index.values

# Corpus will be a list of lists
corpus = []
for i  in indices:
    corpus.append(df['abstract_tokenized'][i])

# Convert a list of lists to a list because FreqDist takes in a
# list of strings
flat_corpus = []
for sublist in corpus:
    for item in sublist:
        flat_corpus.append(item)
        
print(len(flat_corpus))

1988337


In [262]:
# Create a frequency distribution from the flattened corpus
freq = FreqDist(flat_corpus)
print("There are", len(freq), "words in the frequency distribution.")
## For the eng_50k dataset there were 66993 words.

There are 44183 words in the frequency distribution.


In [265]:
df_fdist = pd.DataFrame(list(freq.items()), columns = ["Word","Frequency"])
df_fdist = df_fdist.sort_values(by="Frequency", ascending=True)
df_fdist.head(50)

Unnamed: 0,Word,Frequency
22091,evidently,1
30532,amplitudesquared,1
30533,gsignature,1
30534,fullerene,1
16061,thermostats,1
16060,generelized,1
30536,uqslmodules,1
30538,teepee,1
30539,chameleon,1
16065,epde,1


In [266]:
## Create a list of the words that appear only once
unique_words = list(df_fdist[df_fdist['Frequency'] == 1]['Word'])
print("There are", len(unique_words), "words that appear only once in the abstracts.")
## Fort he eng_50k dataset there were 28097 words that appear only once in the abstracts.

There are 21707 words that appear only once in the abstracts.


We also want to remove common stopwords from the abstracts. We will append these to the unique words, so then wew only need to iterate through the dataframe once.

In [267]:
eng_stopwords = stopwords.words('english')
print(stopwords.words('english'))

## Remove punctutation from stopwords because we've already r
## removed it from the abstracts
## punctuation is imported from the string class
new_punct = punctuation + "’" + "‘"
print("\n", new_punct)
print()

new_stop = []
for word in eng_stopwords:
    new_word = ""
    for char in word:
        if char not in new_punct:
            new_word = new_word + char
        # print(new_word)
    new_stop.append(new_word)
    
print(new_stop)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [268]:
remove_words = new_stop + unique_words

## Remove infrequent words and stop words from the tokenized abstracts
def remove_stop_unique(tokens):
    return [token for token in tokens if token not in remove_words]

In [269]:
%%time
df['abstract_reduced_tokens'] = df['abstract_tokenized'].apply(remove_stop_unique)

CPU times: user 8min 9s, sys: 2.4 s, total: 8min 12s
Wall time: 8min 16s


In [270]:
df.head()

Unnamed: 0,id,title,abstract,update_date,authors_parsed,strip_cat,clean_title,clean_abstract,clean_authors,abstract_tokenized,abstract_reduced_tokens
182244,1412.3275,Limit cycles bifurcating from a degenerate center,We study the maximum number of limit cycles ...,2014-12-11,"[['Llibre', 'J.', ''], ['Pantazi', 'C.', '']]",[DS],limit cycles bifurcating from a degenerate center,we study the maximum number of limit cycles th...,"[['llibre', 'j', ''], ['pantazi', 'c', '']]","[we, study, the, maximum, number, of, limit, c...","[study, maximum, number, limit, cycles, bifurc..."
196425,809.351,Shrinking Point Bifurcations of Resonance Tong...,Resonance tongues are modelocking regions of...,2015-05-13,"[['Simpson', 'D. J. W.', ''], ['Meiss', 'J. D....",[DS],shrinking point bifurcations of resonance tong...,resonance tongues are modelocking regions of p...,"[['simpson', 'd j w', ''], ['meiss', 'j d', '']]","[resonance, tongues, are, modelocking, regions...","[resonance, tongues, modelocking, regions, par..."
479424,2201.04222,Classification of Codimension1 Singular Bifurc...,The study of bifurcations of differentialalg...,2022-01-13,"[['Ovsyannikov', 'Ivan', ''], ['Ruan', 'Haibo'...",[DS],classification of codimension singular bifurca...,the study of bifurcations of differentialalgeb...,"[['ovsyannikov', 'ivan', ''], ['ruan', 'haibo'...","[the, study, of, bifurcations, of, differentia...","[study, bifurcations, differentialalgebraic, e..."
176385,1408.5812,Partial sums of excursions along random geodes...,"For a nonuniform lattice in SL(2,R), we cons...",2014-10-09,"[['Gadre', 'Vaibhav', '']]","[GT, DS]",partial sums of excursions along random geodes...,for a nonuniform lattice in slr we consider ex...,"[['gadre', 'vaibhav', '']]","[for, a, nonuniform, lattice, in, slr, we, con...","[nonuniform, lattice, slr, consider, excursion..."
291058,1707.03102,Uniform dimension results for a family of Mark...,In this paper we prove uniform Hausdorff and...,2017-10-03,"[['Sun', 'Xiaobin', ''], ['Xiao', 'Yimin', '']...",[PR],uniform dimension results for a family of mark...,in this paper we prove uniform hausdorff and p...,"[['sun', 'xiaobin', ''], ['xiao', 'yimin', '']...","[in, this, paper, we, prove, uniform, hausdorf...","[paper, prove, uniform, hausdorff, packing, di..."


In [271]:
# Corpus will be a list of lists
corpus_2 = []
for i  in indices:
    corpus_2.append(df['abstract_reduced_tokens'][i])

# Convert a list of lists to a list because FreqDist takes in a
# list of strings
flat_corpus_2 = []
for sublist in corpus_2:
    for item in sublist:
        flat_corpus_2.append(item)
        
print(len(flat_corpus_2))
print(len(flat_corpus) - len(flat_corpus_2))

1117534
870803


In [272]:
## Let's look at the frequency distribution again.
freq_2 = FreqDist(flat_corpus_2)
print("There are", len(freq_2), "words in the frequency distribution.")

There are 22344 words in the frequency distribution.


In [273]:
## These are the most common words, perhaps we would want to remove some of them?
freq_2.most_common(50)

[('show', 6191),
 ('prove', 5995),
 ('paper', 5811),
 ('also', 5518),
 ('equation', 5469),
 ('space', 5416),
 ('solutions', 5301),
 ('results', 5280),
 ('study', 5101),
 ('equations', 5029),
 ('model', 4837),
 ('problem', 4608),
 ('case', 4445),
 ('theory', 4332),
 ('system', 4303),
 ('two', 4278),
 ('random', 4063),
 ('one', 3931),
 ('time', 3893),
 ('function', 3652),
 ('result', 3617),
 ('systems', 3478),
 ('boundary', 3443),
 ('solution', 3435),
 ('consider', 3338),
 ('new', 3332),
 ('general', 3126),
 ('quantum', 3119),
 ('functions', 3099),
 ('existence', 3039),
 ('using', 2981),
 ('conditions', 2959),
 ('given', 2941),
 ('class', 2872),
 ('field', 2857),
 ('method', 2854),
 ('first', 2853),
 ('set', 2794),
 ('particular', 2721),
 ('limit', 2648),
 ('finite', 2644),
 ('order', 2623),
 ('give', 2572),
 ('process', 2510),
 ('terms', 2481),
 ('operator', 2463),
 ('number', 2458),
 ('type', 2457),
 ('group', 2432),
 ('theorem', 2334)]

In [274]:
## Save the dataframe to a parquet file
df.to_parquet('./filter_20k_tokenized.parquet')

In [276]:
## Read in the parquet file
# df_new = pd.read_parquet('./eng_50k_tokenized.gzip')
#df_new = pd.read_parquet('./filter_20k_tokenized.parquet')
#df_new.head()