In [1]:
import pymongo
import pandas as pd
from bson import Regex

In [2]:
# Database Information
cnx = 'mongodb+srv://gabe:gabe_mongo@arxiv.xawxi.mongodb.net/test'
# Connection to MongoDB
client = pymongo.MongoClient(cnx)

In [3]:
db = client['arxiv']
collections = db.list_collection_names()
collections

['Math',
 'Math_Clean',
 'Stat_Clean',
 'CS_Clean',
 'Statistics',
 'ComputerScience']

In [None]:
query = {'published': {'$type': 'string'}}
date_format = {'$set': {'published': {'dateFromString': {'dateString': 'published'}}}}

db.Math.update_many(query, date_format)

In [None]:
c = db.ComputerScience.aggregate([
    {'$project': {'_id': 0}},
    {'$unwind': '$author.name'},
    {'$group': {'_id': '$author.name', 'count': {'$sum': 1}}}
    #{'sort': {'count': 1}}
])

authors = pd.DataFrame(c)

In [None]:
def art_by_year(col):
    project = {'$project': {'_id': 0}}
    group_by = {'$group': {'_id': {'year': {'$year': '$formatted_date'}},
                'count': {'$sum': 1}}}
    
    group_by_date = col.aggregate([project, group_by])
    byYear = pd.DataFrame(group_by_date)
    byYear['_id'] = pd.json_normalize(byYear['_id'])

    return byYear

In [None]:
statYear = art_by_year(db.Statistics_Clean)
mathYear = art_by_year(db.Math_Clean)
econYear = art_by_year(db.Economics_Clean)
csYear = art_by_year(db.ComputerScience_Clean)

## Math

In [None]:
unwind = [{'$project': {'_id': 0, 'category.@term': 1}}]

for doc in db.Math.aggregate(unwind):
    print(doc)

In [None]:
# Title, author name, affiliation, and date published
project = {'$project': {'_id': 0, 'title':1, 'author.name':1}}
unwind = {'$unwind': '$author.name'}
limit = {'$limit': 5}

pipeline = [project, limit]

for doc in db.Math.aggregate(pipeline):
    print(doc)

#cursor = db.Math.aggregate(pipeline)

In [None]:
# Group by category_id, return the number of authors in a paper
c = db.Math.aggregate([
    {'$project': {'_id':0}},
    #{'$unwind': '$author.name'},
    {'$group': {'_id': '$author.name', 'count': {'$sum': 1}}}
])

authors = pd.DataFrame(c)

In [None]:
authors['count'].value_counts()

In [5]:
for doc in db.Math.aggregate([
    {'$project': {'_id':0, 'category.@term':1}},
    {'$unwind': '$category'},
    {'$limit': 10}
    #{'$group': {'_id': '$category.@term', 'count': {'$sum': 1}}}
    ]):
    print(doc)


{'category': {'@term': 'math.PR'}}
{'category': {'@term': 'math.PR'}}
{'category': {'@term': 'math.PR'}}
{'category': {'@term': 'math.PR'}}
{'category': {'@term': 'math.PR'}}
{'category': {'@term': 'math.PR'}}
{'category': {'@term': 'math.PR'}}
{'category': {'@term': 'math.PR'}}
{'category': {'@term': 'math.PR'}}
{'category': {'@term': 'math.PR'}}


In [146]:
unwind = {'$unwind': '$arxiv:comment.#text'}

project = {'$project': {'_id':0, 'title':1,  'arxiv:comment.#text': 1}}

match = {'$match': {'arxiv:comment.#text': {'$regex': 'pages[^0-9]'}}}

limit = {'$limit': 5}

pipeline = [project, match]

long = db.Math.aggregate(pipeline)

longPaper = pd.DataFrame(long)


#for doc in db.Math.aggregate(pipeline):
#    print(doc)

In [147]:
longPaper['arxiv:comment'] = longPaper['arxiv:comment'].apply(pd.Series).astype(str)

In [128]:
longPaper['arxiv:comment'] = longPaper['arxiv:comment'].str.replace('pages', '')

In [148]:
longPaper['arxiv:comment'] = longPaper['arxiv:comment'].str.replace('[^0-9]', '', regex=True)

In [154]:
longPaper['arxiv:comment'] = longPaper['arxiv:comment'].astype(float)

longPaper['arxiv:comment'] = longPaper['arxiv:comment'] // 10


In [None]:
c = db.ComputerScience.aggregate([
    {'$project': {'_id': 0}},
    {'$unwind': '$author.name'},
    {'$group': {'_id': '$author.name', 'count': {'$sum': 1}}}
    #{'sort': {'count': 1}}
])

authors = pd.DataFrame(c)

In [None]:
csAuthors = authors.sort_values(by=['count'], ascending=False)
csAuthors

In [61]:
c = db.Math.aggregate([
    {'$project': {'_id': 0}},
    {'$unwind': '$author.name'},
    {'$group': {'_id': '$author.name', 'count': {'$sum': 1}}}
    #{'sort': {'count': 1}}
])

authors = pd.DataFrame(c)

In [63]:
mathAuthors = authors.sort_values(by=['count'], ascending=False)
mathAuthors

Unnamed: 0,_id,count
384,Robin Pemantle,12
385,Boris Tsirelson,8
310,Ashkan Nikeghbali,7
146,Inder Jeet Taneja,7
409,Vyacheslav M. Abramov,6
...,...,...
189,Frank Hampel,1
187,Berwin A. Turlach,1
186,Christophe Sabot,1
184,Firas Rassoul-Agha,1


In [None]:
stage_lookup = {
    '$lookup': {
        'from': 'Math',
        'localField': 'author.name',
        'foreignField': 'author.name',
        'as': 'same_author'
    }
}

match = {'$match': {'same_author.0': {'$exists': True}}}

add_fields = {'$addFields': {
    'author_name': 'author.name',
    'paper_title': 'title'
}}

project = {'$project': {'_id': 0, 'author.name':1, 'title': 1}}

unwind = {'$unwind': '$author.name'}

group_by = {'$group': {'_id': '$author.name', 'count': {'$sum': 1}}}

limit = {'$limit': 3}

pipeline = [stage_lookup, match, project, add_fields, project, limit]
#pipeline = [stage_lookup, match, project, unwind, group_by, limit]

for doc in db.ComputerScience.aggregate(pipeline):
    print(doc)



In [None]:
group = {'$group':
            {
                '_id': {'title': '$title'},
                'authors': {'$cnt': 'author.names'}
            }}

for doc in db.ComputerScience.aggregate([group]):
    print(doc)

In [None]:
for doc in db.ComputerScience.aggregate([
    {'$group': {'_id': 'author.names', 'count': {'$sum': 1}}}]):
    print(doc)

In [None]:
# Number of authors for a given paper
size = {
    '$addFields': {
        'author_count': {
            '$size': '$author'
        }
    }
}

project = {
    '$project': {'_id': 0, 'title':1}
}

pipeline = [size, project]

for doc in db.Statistics.aggregate(pipeline):
    print(doc)

In [None]:
for doc in db.Statistics.find({'author': {'$size': 5}}):
    print(doc)

In [50]:
project = {'$project': {'_id': 0, 'author': 1}}
unwind = {'$unwind': '$author'}
add_fields = {'$addFields': {
    'number_authors': {
        'author': {'$size': {'$author'}}
        }
    }
}
#project = {'$project': {'_id':0, 'author': {'$size': '$author'}}}

limit = {'$limit': 5}

pipe = [project, limit]

for doc in db.Math.aggregate(pipe):
    print(doc)

#authors = [doc for doc in db.Math.aggregate(pipe)]

{'author': [{'name': 'Martin T. Barlow'}, {'name': 'Richard F. Bass'}]}
{'author': [{'name': 'Edward C. Waymire'}, {'name': 'Stanley C. Williams'}]}
{'author': [{'name': 'Christopher J. Bishop'}, {'name': 'Peter Jones'}, {'name': 'Robin Pemantle'}, {'name': 'Yuval Peres'}]}
{'author': [{'name': 'Lincoln Chayes'}, {'name': 'Robin Pemantle'}, {'name': 'Yuval Peres'}]}
{'author': [{'name': 'Robin Pemantle'}, {'name': 'Stanislav Volkov'}]}
