In [2]:
from pymongo import MongoClient

# Connect to MongoDB
client = MongoClient('mongodb://localhost:27017/')
db = client['ceur_ws_fix']
papers = db['papers']
print(f"papers: {papers.count_documents({})}")
print(f"volumes: {db['volumes'].count_documents({})}")

papers: 19247
volumes: 3008


We have to clear the dataset before migration to the neo4j database.
Malformed papers example:

In [3]:
query_empty_paper_info = {
    "$or": [
        {"paper_info.related_papers": {"$size": 0}},
        {"paper_info.keywords": {"$size": 0}},
        {"paper_info": {"$exists": False}},
        {"paper_info": None}
    ]
}
papers.count_documents(query_empty_paper_info)

11351

Delete malformed papers

In [4]:
papers = db['papers']
papers.delete_many(query_empty_paper_info)
print(papers.count_documents({}))

19247


Those are papers that contain at least one related paper without authors.

In [15]:
query = {
    "paper_info.related_papers": {
        "$elemMatch": {
            "authors": { "$size": 0}
        }
    }
}
print(papers.count_documents(query))

26480


Now we count how many related papers we have and how many have no authors.

In [6]:
pipeline = [
    {"$match": {"paper_info.related_papers": {"$exists": True, "$ne": None}}},
    {"$project": {
        "related_papers": "$paper_info.related_papers"
    }},
    {"$unwind": "$related_papers"},
    {"$group": {
        "_id": None,
        "total_related": {"$sum": 1},
        "no_authors": {
            "$sum": {
                "$cond": [
        {"$eq": [{"$size": "$related_papers.authors"}, 0]},
        1,
        0
    ]
            }
        }
    }}
]

result = list(papers.aggregate(pipeline))[0]
print("Total related_papers:", result["total_related"])
print("Related_papers without authors:", result["no_authors"])
print("Related_papers with authors:", result["total_related"] - result["no_authors"])

Total related_papers: 386407
Related_papers without authors: 104366
Related_papers with authors: 282041


Number of parsing errors in the references

In [17]:
pipeline_ref_err = [
    { "$unwind": "$paper_info.related_papers" },
    { "$match": {
        "$expr": {
            "$gt": [ { "$strLenCP": "$paper_info.related_papers.text" }, 450 ]
        }
    }},
    { "$group": { "_id": "$_id" } }
]

count = len(list(papers.aggregate(pipeline_ref_err)))
print(f"Faulty Papers: {count}")

Faulty Papers: 5981


Updating Faulty Papers by Truncating Related Papers with Text Longer than 450 characters

In [30]:
faulty_ids = [doc["_id"] for doc in papers.aggregate(pipeline_ref_err)]

for paper_id in faulty_ids:
    doc = papers.find_one({ "_id": paper_id })
    related = doc.get("paper_info", {}).get("related_papers", [])

    cleaned = []
    for rp in related:
        if len(rp.get("text", "")) < 450:
            rp["text"] = rp["text"][:450]
            cleaned.append(rp)

    if len(cleaned) < len(related):
        papers.update_one(
            { "_id": paper_id },
            { "$set": { "paper_info.related_papers": cleaned } }
        )

print(f"Updated {len(faulty_ids)} papers.")

Updated 0 papers.


Searching for Errors in the References Indexes (Truncating Text and Having a new Element without Index)

In [31]:
pipeline_index_err = [
  { "$unwind": "$paper_info.related_papers" },
  { "$match": {
      "$expr": {
        "$regexMatch": {
          "input": "$paper_info.related_papers.text",
          "regex": "^\\[\\d+\\]"
        }
      }
    }
  },
  { "$group": { "_id": "$_id" } }
]

count_index_err = len(list(papers.aggregate(pipeline_index_err)))
print(f"Faulty Papers: {count_index_err}")

Faulty Papers: 8278


Deleting the Index Error from the Related Papers

In [32]:
import re

faulty_ids = [doc["_id"] for doc in papers.aggregate(pipeline_index_err)]

pattern = re.compile("^\\[\\d+\\]")

for paper_id in faulty_ids:
    doc = papers.find_one({ "_id": paper_id })
    related = doc.get("paper_info", {}).get("related_papers", [])

    cleaned = []
    for rp in related:
        if pattern.match(rp["text"]):
            cleaned.append(rp)

    if len(cleaned) != len(related):
        papers.update_one(
            { "_id": paper_id },
            { "$set": { "paper_info.related_papers": cleaned } }
        )

print(f"Updated {len(faulty_ids)} papers.")

Updated 8278 papers.


Searching for Related Papers found in many Papers → Given a Paper found how many Papers Cite It

In [15]:
targetTitle = "Digital Twin"

pipeline = [
  { "$unwind": "$paper_info.related_papers" },
  {
   "$match": {
      "paper_info.related_papers.text": {
        "$regex": targetTitle, "$options": "i"
      }
    }
  },
    { "$group": {
        "_id": "$_id"
    }}
]

result = list(papers.aggregate(pipeline))
print(f"Found {len(result)} papers citing '{targetTitle}'")

Found 123 papers citing 'Digital Twin'


Finding the Related Papers with Indented Title

In [16]:
pipeline_title_err = [
  { "$unwind": "$paper_info.related_papers" },
  { "$match": {
      "$expr": {
        "$regexMatch": {
          "input": "$paper_info.related_papers.title",
          "regex": "^\\[\\d+\\]"
        }
      }
    }
  },
  { "$group": { "_id": "$_id" } }
]

count_title_err = len(list(papers.aggregate(pipeline_title_err)))
print(f"Faulty Papers: {count_title_err}")

Faulty Papers: 5673


Removing the Related Papers with Indented Title

In [18]:
import re

faulty_ids = [doc["_id"] for doc in papers.aggregate(pipeline_title_err)]

pattern = re.compile("^\\[\\d+\\]")

for paper_id in faulty_ids:
    doc = papers.find_one({ "_id": paper_id })
    related = doc.get("paper_info", {}).get("related_papers", [])

    cleaned = []
    for rp in related:
        if not pattern.match(rp["title"]):
            cleaned.append(rp)

    if len(cleaned) != len(related):
        papers.update_one(
            { "_id": paper_id },
            { "$set": { "paper_info.related_papers": cleaned } }
        )

print(f"Updated {len(faulty_ids)} papers.")

Updated 5673 papers.
