### Preprocessing the collected data to remove uncommon words

In [218]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.probability import FreqDist

from string import punctuation
from data_utils import clean_data, clean_authors

In [219]:
## May need to install pyarrow or fastparquet to process this
## df = pd.read_parquet("./data/recent_50k.parquet")
## df = pd.read_parquet("./data/eng_175k.parquet")
df = pd.read_parquet("./data/eng_50k.parquet")

In [220]:
df.head(5)

Unnamed: 0,id,title,abstract,update_date,authors_parsed,new_cat
0,math/9907166,Vertex representations via finite groups and t...,Given a finite group $\Gamma$ and a virtual ...,2023-05-19,"[['Frenkel', 'Igor', ''], ['Jing', 'Naihuan', ...","['math.QA', 'hep-th', 'math.RT']"
1,math/9602216,Categoricity and amalgamation for AEC and $ \k...,"In the original version of this paper, we as...",2023-05-19,"[['Kolman', 'Oren', ''], ['Shelah', 'Saharon',...",['math.LO']
2,math/0504123,From Loop Groups to 2-Groups,We describe an interesting relation between ...,2023-05-16,"[['Baez', 'John C.', ''], ['Crans', 'Alissa S....","['math.QA', 'hep-th', 'math.DG']"
3,quant-ph/0401139,Finite Supersymmetry Transformations,We investigate simple examples of supersymme...,2023-05-09,"[['Ilieva', 'Nevena', ''], ['Narnhofer', 'Heid...","['quant-ph', 'hep-th', 'math-ph', 'math.MP']"
4,math/0212249,Super black box (formerly: Middle diamond),This is a slightly corrected version of an o...,2023-05-04,"[['Shelah', 'Saharon', '']]",['math.LO']


In [221]:
## Clean the title and abstracts in the database
## Adding three new columns to the database

df['clean_title'] = df['title'].apply(clean_data)
df['clean_summary'] = df['abstract'].apply(clean_data)
df['clean_authors'] = df['authors_parsed'].apply(clean_authors)

In [222]:
df.head()

Unnamed: 0,id,title,abstract,update_date,authors_parsed,new_cat,clean_title,clean_summary,clean_authors
0,math/9907166,Vertex representations via finite groups and t...,Given a finite group $\Gamma$ and a virtual ...,2023-05-19,"[['Frenkel', 'Igor', ''], ['Jing', 'Naihuan', ...","['math.QA', 'hep-th', 'math.RT']",vertex representations via finite groups and t...,given a finite group and a virtual character o...,"[['frenkel', 'igor', ''], ['jing', 'naihuan', ..."
1,math/9602216,Categoricity and amalgamation for AEC and $ \k...,"In the original version of this paper, we as...",2023-05-19,"[['Kolman', 'Oren', ''], ['Shelah', 'Saharon',...",['math.LO'],categoricity and amalgamation for aec and meas...,in the original version of this paper we assum...,"[['kolman', 'oren', ''], ['shelah', 'saharon',..."
2,math/0504123,From Loop Groups to 2-Groups,We describe an interesting relation between ...,2023-05-16,"[['Baez', 'John C.', ''], ['Crans', 'Alissa S....","['math.QA', 'hep-th', 'math.DG']",from loop groups to groups,we describe an interesting relation between li...,"[['baez', 'john c', ''], ['crans', 'alissa s',..."
3,quant-ph/0401139,Finite Supersymmetry Transformations,We investigate simple examples of supersymme...,2023-05-09,"[['Ilieva', 'Nevena', ''], ['Narnhofer', 'Heid...","['quant-ph', 'hep-th', 'math-ph', 'math.MP']",finite supersymmetry transformations,we investigate simple examples of supersymmetr...,"[['ilieva', 'nevena', ''], ['narnhofer', 'heid..."
4,math/0212249,Super black box (formerly: Middle diamond),This is a slightly corrected version of an o...,2023-05-04,"[['Shelah', 'Saharon', '']]",['math.LO'],super black box formerly middle diamond,this is a slightly corrected version of an old...,"[['shelah', 'saharon', '']]"


In [223]:
df['abstract_tokenized'] = df['clean_summary'].apply(nltk.word_tokenize)

In [224]:
## Get rid of the occasional empty string.
def clear_empty(clean_string):
    return [word for word in clean_string if word != '']

In [225]:
df['abstract_tokenized'] = df['abstract_tokenized'].apply(clear_empty) 

Now we create a corpus of all the words (non-unique) that appear in the abstracts and then find the frequency distribution.

In [226]:
indices = df.index.values

# Corpus will be a list of lists
corpus = []
for i  in indices:
    corpus.append(df['abstract_tokenized'][i])

# Convert a list of lists to a list because FreqDist takes in a
# list of strings
flat_corpus = []
for sublist in corpus:
    for item in sublist:
        flat_corpus.append(item)
        
print(len(flat_corpus))

5937867


In [227]:
# Create a frequency distribution from the flattened corpus
freq = FreqDist(flat_corpus)
print("There are", len(freq), "words in the frequency distribution.")

There are 66993 words in the frequency distribution.


In [229]:
df_fdist = pd.DataFrame(list(freq.items()), columns = ["Word","Frequency"])
df_fdist = df_fdist.sort_values(by="Frequency", ascending=True)
df_fdist.head(20)

Unnamed: 0,Word,Frequency
66992,matilde,1
50245,counterparty,1
50246,differenceselements,1
50247,statisticalcomputational,1
50248,seriousness,1
41945,wente,1
50249,delayers,1
41943,pelts,1
50250,mummert,1
50251,oberste,1


In [230]:
## Create a list of the words that appear only once
unique_words = list(df_fdist[df_fdist['Frequency'] == 1]['Word'])
print("There are", len(unique_words), "words that appear only once in the abstracts.")

There are 28097 words that appear only once in the abstracts.


We also want to remove common stopwords from the abstracts. We will append these to the unique words, so then wew only need to iterate through the dataframe once.

In [248]:
eng_stopwords = stopwords.words('english')
print(stopwords.words('english'))

## Remove punctutation from stopwords because we've already r
## removed it from the abstracts
## punctuation is imported from the string class
new_punct = punctuation + "’" + "‘"
print("\n", new_punct)
print()

new_stop = []
for word in eng_stopwords:
    new_word = ""
    for char in word:
        if char not in new_punct:
            new_word = new_word + char
        # print(new_word)
    new_stop.append(new_word)
    
print(new_stop)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [232]:
remove_words = new_stop + unique_words

## Remove infrequent words and stop words from the tokenized abstracts
def remove_stop_unique(tokens):
    return [token for token in tokens if token not in remove_words]

In [233]:
%time
df['abstract_reduced_tokens'] = df['abstract_tokenized'].apply(remove_stop_unique)

In [234]:
df.head()

Unnamed: 0,id,title,abstract,update_date,authors_parsed,new_cat,clean_title,clean_summary,clean_authors,abstract_tokenized,abstract_reduced_tokens
0,math/9907166,Vertex representations via finite groups and t...,Given a finite group $\Gamma$ and a virtual ...,2023-05-19,"[['Frenkel', 'Igor', ''], ['Jing', 'Naihuan', ...","['math.QA', 'hep-th', 'math.RT']",vertex representations via finite groups and t...,given a finite group and a virtual character o...,"[['frenkel', 'igor', ''], ['jing', 'naihuan', ...","[given, a, finite, group, and, a, virtual, cha...","[given, finite, group, virtual, character, con..."
1,math/9602216,Categoricity and amalgamation for AEC and $ \k...,"In the original version of this paper, we as...",2023-05-19,"[['Kolman', 'Oren', ''], ['Shelah', 'Saharon',...",['math.LO'],categoricity and amalgamation for aec and meas...,in the original version of this paper we assum...,"[['kolman', 'oren', ''], ['shelah', 'saharon',...","[in, the, original, version, of, this, paper, ...","[original, version, paper, assume, theory, log..."
2,math/0504123,From Loop Groups to 2-Groups,We describe an interesting relation between ...,2023-05-16,"[['Baez', 'John C.', ''], ['Crans', 'Alissa S....","['math.QA', 'hep-th', 'math.DG']",from loop groups to groups,we describe an interesting relation between li...,"[['baez', 'john c', ''], ['crans', 'alissa s',...","[we, describe, an, interesting, relation, betw...","[describe, interesting, relation, lie, algebra..."
3,quant-ph/0401139,Finite Supersymmetry Transformations,We investigate simple examples of supersymme...,2023-05-09,"[['Ilieva', 'Nevena', ''], ['Narnhofer', 'Heid...","['quant-ph', 'hep-th', 'math-ph', 'math.MP']",finite supersymmetry transformations,we investigate simple examples of supersymmetr...,"[['ilieva', 'nevena', ''], ['narnhofer', 'heid...","[we, investigate, simple, examples, of, supers...","[investigate, simple, examples, supersymmetry,..."
4,math/0212249,Super black box (formerly: Middle diamond),This is a slightly corrected version of an o...,2023-05-04,"[['Shelah', 'Saharon', '']]",['math.LO'],super black box formerly middle diamond,this is a slightly corrected version of an old...,"[['shelah', 'saharon', '']]","[this, is, a, slightly, corrected, version, of...","[slightly, corrected, version, old, work, cert..."


In [237]:
# Corpus will be a list of lists
corpus_2 = []
for i  in indices:
    corpus_2.append(df['abstract_reduced_tokens'][i])

# Convert a list of lists to a list because FreqDist takes in a
# list of strings
flat_corpus_2 = []
for sublist in corpus_2:
    for item in sublist:
        flat_corpus_2.append(item)
        
print(len(flat_corpus_2))

3481879


In [238]:
3577852 - 3549126

28726

In [244]:
## Save the dataframe to a parquet file
df.to_parquet('./eng_50k_tokenized.gzip',compression='gzip')

In [245]:
## Read in the parquet file
# df_new = pd.read_parquet('./eng_50k_tokenized.gzip')
# df_new.head()

In [251]:
## Let's look at the frequency distribution again.
freq_2 = FreqDist(flat_corpus_2)
print("There are", len(freq_2), "words in the frequency distribution.")

There are 38760 words in the frequency distribution.


In [252]:
## These are the most common words, perhaps we would want to remove some of them?
freq_2.most_common(50)

[('show', 19020),
 ('paper', 18302),
 ('results', 16682),
 ('problem', 16663),
 ('also', 15665),
 ('two', 15039),
 ('prove', 14638),
 ('time', 13008),
 ('model', 12713),
 ('study', 12660),
 ('space', 12519),
 ('method', 12136),
 ('one', 12008),
 ('non', 11912),
 ('based', 11575),
 ('number', 11390),
 ('function', 10926),
 ('using', 10430),
 ('new', 10425),
 ('finite', 10363),
 ('order', 10359),
 ('case', 9958),
 ('set', 9776),
 ('system', 9747),
 ('theory', 9484),
 ('functions', 9298),
 ('linear', 9101),
 ('first', 9089),
 ('group', 8902),
 ('result', 8870),
 ('data', 8769),
 ('algorithm', 8558),
 ('dimensional', 8364),
 ('solutions', 7895),
 ('graph', 7866),
 ('systems', 7828),
 ('problems', 7792),
 ('given', 7735),
 ('proposed', 7727),
 ('type', 7698),
 ('equation', 7541),
 ('solution', 7528),
 ('equations', 7503),
 ('class', 7491),
 ('well', 7454),
 ('numerical', 7415),
 ('general', 7345),
 ('consider', 7031),
 ('methods', 6908),
 ('work', 6857)]