In [20]:
import json
import pymongo
from pymongo.database import Database
from pymongo.collection import Collection
from time import time
import datetime
with open("watermelon.config", "rb") as f:
    js = json.load(f)
    mongo_key: str = js["mongo_key"]
    prefix: str = js["prefix"]

if prefix in ["w?", "t?"]:  # only access mongodb for w? and t?
    client = pymongo.MongoClient(mongo_key)
    db: Database = client.get_database("AlexMindustry")
    expgains: Collection = db["expgains"]
    convertedexp: Collection = db["convertedexp"]
    muuid: Collection = db["muuid"]
    # V7 stuff
    expv7: Collection = db["expv7"]
    convertedexpv7: Collection = db["convertedexpv7"]
    ingamecosmeticsv7: Collection = db["ingamecosmeticsv7"]
    serverplayerupdates1: Collection = db["serverplayerupdates1"]
    hexv7: Collection = db["hexdataV7"]

## this notebook filters documents with duplicated muuid,usid,ip. and keeps the unique one with the latest date.

## to clean up, run the first 3 cells

In [2]:
def clean_up_duplicate_muuid(timedeltadays=3,num_of_days_in_the_past=100):
    for i in range(0,num_of_days_in_the_past//timedeltadays):
        start_date = datetime.datetime.now() - datetime.timedelta(days=(i+1)*timedeltadays)
        end_date = datetime.datetime.now() - datetime.timedelta(days=i*timedeltadays)
        current_date = start_date
        would_delete = []
        while current_date < end_date:
            next_date = current_date + datetime.timedelta(days=timedeltadays)
            # Group documents by unique fields excluding _id and date
            pipeline = [
                {
                    '$match': {
                        'date': {'$gte': current_date, '$lt': next_date}
                    }
                },
                {
                    '$group': {
                        '_id': {
                            'musername': '$musername',
                            'muuid': '$muuid',
                            'musid': '$musid',
                            'con_address': '$con_address',
                            'color': '$color',
                            'servername': '$servername'
                        },
                        'latest': {'$max': '$date'},
                        'docs': {'$push': {'_id': '$_id', 'date': '$date'}}
                    }
                }
            ]
            # Execute the aggregation pipeline
            result = muuid.aggregate(pipeline)
            # Loop through the aggregation result
            keep=0
            for doc in result:
                latest_date = doc['latest']
                docs = doc['docs']
                latest_id = None
                # Find the document with the latest date
                for d in docs:
                    if d['date'] == latest_date:
                        latest_id = d['_id']
        
                # Print all other documents except the latest one
                if latest_id:
                    keep+=1
                    for d in docs:
                        if d['_id'] != latest_id:
                            #print(f"Would delete document: {d['_id']}")
                            would_delete.append(d['_id'])
            print("docs",len(would_delete),"keep",keep,next_date)
            current_date = next_date
        res = muuid.delete_many({'_id': {'$in': would_delete}})
        print("deleted",res.deleted_count)
    print("Run complete!")

In [3]:
%%time
clean_up_duplicate_muuid(timedeltadays=3,num_of_days_in_the_past=100)
clean_up_duplicate_muuid(timedeltadays=10,num_of_days_in_the_past=100)
clean_up_duplicate_muuid(timedeltadays=30,num_of_days_in_the_past=100) # max is 50.

docs 4288 keep 7324 2024-12-14 14:05:22.556333
deleted 4288
docs 4482 keep 7151 2024-12-11 14:05:29.391986
deleted 4482
docs 5681 keep 8889 2024-12-08 14:05:36.468924
deleted 5681
docs 4460 keep 12776 2024-12-05 14:05:42.185975
deleted 4460
docs 6009 keep 9659 2024-12-02 14:05:50.879045
deleted 6009
docs 4871 keep 8176 2024-11-29 14:05:58.325674
deleted 4871
docs 5326 keep 8458 2024-11-26 14:06:04.575170
deleted 5326
docs 5132 keep 8726 2024-11-23 14:06:09.265128
deleted 5132
docs 5220 keep 8210 2024-11-20 14:06:15.326936
deleted 5220
docs 5906 keep 9375 2024-11-17 14:06:26.989688
deleted 5906
docs 4307 keep 10899 2024-11-14 14:06:34.458271
deleted 4307
docs 6155 keep 9973 2024-11-11 14:06:41.698526
deleted 6155
docs 4157 keep 7218 2024-11-08 14:06:48.889393
deleted 4157
docs 4647 keep 7418 2024-11-05 14:06:54.825495
deleted 4647
docs 4759 keep 8145 2024-11-02 14:07:01.155866
deleted 4759
docs 5166 keep 8793 2024-10-30 14:07:07.480654
deleted 5166
docs 5114 keep 10616 2024-10-27 14:07:

# delete old docs in playerserverupdates1

# this will delete incrementally in batches of 10 days

In [4]:
print("DRY RUN")
num_days_ago = 30 # delete docs older than X days.
for num_days_ago in list(range(90,5,-10)):
    days_ago = datetime.datetime.now() - datetime.timedelta(days=num_days_ago)
    filterr = {'date': {'$lt': days_ago}}
    # Count the number of documents that would be deleted
    count = serverplayerupdates1.count_documents(filterr)
    print(f"Number of documents to be deleted: {count}")

DRY RUN
Number of documents to be deleted: 0
Number of documents to be deleted: 0
Number of documents to be deleted: 0
Number of documents to be deleted: 0
Number of documents to be deleted: 55623
Number of documents to be deleted: 110223
Number of documents to be deleted: 161371
Number of documents to be deleted: 218958
Number of documents to be deleted: 272586


In [5]:
print("ACTUAL RUN")
num_days_ago = 30 # delete docs older than X days.
for num_days_ago in list(range(90,5,-10)):
    days_ago = datetime.datetime.now() - datetime.timedelta(days=num_days_ago)
    filterr = {'date': {'$lt': days_ago}}
    # Count the number of documents that would be deleted
    count = serverplayerupdates1.count_documents(filterr)
    print(f"Number of documents to be deleted: {count}")
    result = serverplayerupdates1.delete_many(filterr)
    print(f"Deleted {result.deleted_count} documents.")

ACTUAL RUN
Number of documents to be deleted: 0
Deleted 0 documents.
Number of documents to be deleted: 0
Deleted 0 documents.
Number of documents to be deleted: 0
Deleted 0 documents.
Number of documents to be deleted: 0
Deleted 0 documents.
Number of documents to be deleted: 55623
Deleted 55623 documents.
Number of documents to be deleted: 54600
Deleted 54600 documents.
Number of documents to be deleted: 51151
Deleted 51151 documents.
Number of documents to be deleted: 57584
Deleted 57584 documents.
Number of documents to be deleted: 53638
Deleted 53638 documents.


## try to delete every muuid, up to the latest one

In [25]:
%%time
# Step 1: Get all unique 'muuid'
unique_muuids = muuid.distinct("muuid")

# Step 2: Process 100 'muuid' at a time
batch_size = 100
total_counts=0
for i in range(1, len(unique_muuids), batch_size):
    counts=0
    batch = unique_muuids[i:i+batch_size]  # Get a batch of 100 muuids
    for muuid1 in batch:
        # Find the latest document for this muuid
        latest_doc = muuid.find({"muuid": muuid1}).sort("date", -1).limit(1).next()
        latest_doc_id = latest_doc["_id"]
        #print("keep",latest_doc_id,latest_doc)
        # Delete all other documents with this muuid except the latest one
        result = muuid.delete_many({"muuid": muuid1, "_id": {"$ne": latest_doc_id}})
        counts+=result.deleted_count
        #print(f"Deleted {result.deleted_count} old documents for muuid: {muuid1}")
    print(f"{counts}",end=" ")
    total_counts+=counts
print(f"\nCleanup complete. total deleted={total_counts}")

5 399 388 643 772 568 402 550 576 807 475 648 703 600 940 872 749 422 431 463 665 387 385 586 626 437 489 563 938 529 629 393 614 559 1219 331 428 501 567 441 488 658 447 455 369 503 378 491 640 580 775 319 535 463 468 657 532 646 886 464 593 668 561 384 671 765 455 424 371 744 492 606 896 696 661 559 435 771 585 343 560 963 533 492 719 621 531 517 510 516 579 480 456 391 422 477 393 523 428 561 472 559 325 567 565 511 292 570 389 466 406 487 675 496 646 563 586 572 703 432 531 593 475 624 631 782 662 1611 550 488 526 454 765 513 421 395 469 527 513 487 1000 564 378 663 1007 787 401 469 678 691 531 408 565 1085 776 387 601 497 346 368 595 633 570 683 411 587 486 404 427 416 405 409 633 576 472 688 578 385 698 512 678 481 620 386 428 327 638 489 653 446 419 759 744 723 798 692 430 625 356 569 599 793 598 482 407 480 637 616 846 781 606 475 363 528 755 582 1000 475 617 612 605 397 705 783 453 598 957 392 478 678 514 471 430 966 417 555 354 357 450 699 360 679 555 425 575 407 761 1409 449

KeyboardInterrupt: 

In [27]:
i,total_counts

(101201, 593805)

# END OF FILE

In [15]:
%%time
num_days_ago = 30 # delete docs older than X days.
days_ago = datetime.datetime.now() - datetime.timedelta(days=num_days_ago)
filterr = {'date': {'$lt': days_ago}}
# Count the number of documents that would be deleted
count = serverplayerupdates1.count_documents(filterr)
print(f"Number of documents to be deleted: {count}")

Number of documents to be deleted: 80007
CPU times: total: 15.6 ms
Wall time: 411 ms


In [17]:
list(range(90,5,-10))

[90, 80, 70, 60, 50, 40, 30, 20, 10]

In [16]:
%%time
result = serverplayerupdates1.delete_many(filterr)
print(f"Deleted {result.deleted_count} documents.")

Deleted 80007 documents.
CPU times: total: 0 ns
Wall time: 17.3 s


In [2]:
t1=time()
alexid=612861256189083669
xxx1=315764312700485632 #cohl
xxx2=1185756027929501718 #cen
duuid=alexid
t2=time()

In [38]:
%%time
collection = muuid

# Calculate the start date for 2 months ago
for i in range(0,4):
    start_date = datetime.datetime.now() - datetime.timedelta(days=(i+1)*40) #150 to 
    end_date = datetime.datetime.now() - datetime.timedelta(days=i*40)
    
    current_date = start_date
    
    would_delete = []
    
    while current_date < end_date:
        next_date = current_date + datetime.timedelta(days=40)
        
        # Group documents by unique fields excluding _id and date
        pipeline = [
            {
                '$match': {
                    'date': {'$gte': current_date, '$lt': next_date}
                }
            },
            {
                '$group': {
                    '_id': {
                        'musername': '$musername',
                        'muuid': '$muuid',
                        'musid': '$musid',
                        'con_address': '$con_address',
                        'color': '$color',
                        'servername': '$servername'
                    },
                    'latest': {'$max': '$date'},
                    'docs': {'$push': {'_id': '$_id', 'date': '$date'}}
                }
            }
        ]
    
        # Execute the aggregation pipeline
        result = collection.aggregate(pipeline)
    
        # Loop through the aggregation result
        keep=0
        for doc in result:
            latest_date = doc['latest']
            docs = doc['docs']
            latest_id = None
    
            # Find the document with the latest date
            for d in docs:
                if d['date'] == latest_date:
                    latest_id = d['_id']
    
            # Print all other documents except the latest one
            if latest_id:
                keep+=1
                for d in docs:
                    if d['_id'] != latest_id:
                        #print(f"Would delete document: {d['_id']}")
                        would_delete.append(d['_id'])
        print("docs",len(would_delete),"keep",keep,next_date)
    
        current_date = next_date
    
    print("Run complete!")
    res = muuid.delete_many({'_id': {'$in': would_delete}})
    print("delted",res.deleted_count)

docs 690 keep 188641 2024-06-22 17:17:44.687712
Run complete!
delted 690
docs 3004 keep 157589 2024-05-13 17:20:24.273383
Run complete!
delted 3004
docs 1008 keep 158618 2024-04-03 17:22:16.759669
Run complete!
delted 1008
docs 4 keep 24270 2024-02-23 17:23:56.078185
Run complete!
delted 4
CPU times: total: 6.44 s
Wall time: 6min 32s


In [13]:
%%time
from bson.objectid import ObjectId

object_id_list = [ObjectId(idd) for idd in would_delete]
query = {'_id': {'$in': object_id_list}}

matching_documents = muuid.find(query)
counts=0
for doc in matching_documents:
    #print(doc)
    counts+=1

CPU times: total: 46.9 ms
Wall time: 7.6 s


In [14]:
counts

13346