In [179]:
from pymongo import MongoClient

# Open a client against the local MongoDB instance and bind the database and
# collection handles that the following cells work with.
client = MongoClient('mongodb://localhost:27017/')
db = client['ceur_ws_fix']
papers = db['papers']

# Print the document count of both collections.
paper_total = papers.count_documents({})
print(f"papers: {paper_total}")
volume_total = db['volumes'].count_documents({})
print(f"volumes: {volume_total}")

papers: 13589
volumes: 773


We have to clean the dataset before migrating it to the Neo4j database.
Example of a query that counts malformed papers:

In [180]:
# A paper is selected by this filter when any one of the clauses below holds.
no_related_papers = {"paper_info.related_papers": {"$size": 0}}  # array of size 0
no_keywords = {"paper_info.keywords": {"$size": 0}}              # array of size 0
paper_info_missing = {"paper_info": {"$exists": False}}
paper_info_null = {"paper_info": None}

query_empty_paper_info = {
    "$or": [no_related_papers, no_keywords, paper_info_missing, paper_info_null]
}

# Last expression of the cell: the number of papers matching the filter.
papers.count_documents(query_empty_paper_info)

1714

Delete the malformed papers.

In [47]:
# Remove every paper matched by `query_empty_paper_info`.
# NOTE: this is destructive and changes the collection for all later cells;
# re-running the cell simply deletes nothing the second time.
papers = db['papers']  # re-resolve the handle so the cell only depends on `db`
delete_result = papers.delete_many(query_empty_paper_info)
print(f"deleted: {delete_result.deleted_count}")

# Sanity check: no document matching the malformed-paper filter may remain.
assert papers.count_documents(query_empty_paper_info) == 0, \
    "malformed papers still present after delete_many"

# Number of papers left in the collection.
print(papers.count_documents({}))

12160


Count the papers that contain at least one related paper without authors.

In [61]:
# Match papers where at least one element of `paper_info.related_papers`
# has an `authors` array of size 0.
related_without_authors = {"authors": {"$size": 0}}
query = {
    "paper_info.related_papers": {"$elemMatch": related_without_authors}
}

papers_with_authorless_related = papers.count_documents(query)
print(papers_with_authorless_related)

11630


Now we count how many related papers there are in total and how many of them have no authors.

In [178]:
# Count every element of `paper_info.related_papers` across the collection and,
# among those, the elements whose `authors` list is empty.
#
# The aggregation `$size` operator raises a server-side error when its argument
# is missing or is not an array, so a non-array `authors` value is replaced by
# an empty list first; such an entry is therefore counted as "no authors".
authors_or_empty = {
    "$cond": [
        {"$isArray": "$related_papers.authors"},
        "$related_papers.authors",
        [],
    ]
}

pipeline = [
    # Keep only papers whose related_papers field exists and is not null.
    {"$match": {"paper_info.related_papers": {"$exists": True, "$ne": None}}},
    # Carry forward only the field the later stages need.
    {"$project": {
        "related_papers": "$paper_info.related_papers"
    }},
    # One output document per related paper.
    {"$unwind": "$related_papers"},
    # Single group (_id=None) holding the overall totals.
    {"$group": {
        "_id": None,
        "total_related": {"$sum": 1},
        "no_authors": {
            "$sum": {
                "$cond": [
                    {"$eq": [{"$size": authors_or_empty}, 0]},
                    1,
                    0,
                ]
            }
        },
    }},
]

# `$group` emits no document at all when nothing reaches it (e.g. an empty
# collection), so fall back to zero counts instead of indexing an empty list.
rows = list(papers.aggregate(pipeline))
result = rows[0] if rows else {"_id": None, "total_related": 0, "no_authors": 0}

print("Total related_papers:", result["total_related"])
print("Related_papers without authors:", result["no_authors"])
print("Related_papers with authors:", result["total_related"] - result["no_authors"])

Total related_papers: 369096
Related_papers without authors: 200929
Related_papers with authors: 168167
