In [1]:
import json
import pymongo
from pymongo.database import Database
from pymongo.collection import Collection
from time import time
import datetime
with open("watermelon.config", "rb") as f:
    js = json.load(f)
    mongo_key: str = js["mongo_key"]
    prefix: str = js["prefix"]

if prefix in ["w?", "t?"]:  # only access mongodb for w? and t?
    client = pymongo.MongoClient(mongo_key)
    db: Database = client.get_database("AlexMindustry")
    expgains: Collection = db["expgains"]
    convertedexp: Collection = db["convertedexp"]
    muuid: Collection = db["muuid"]
    # V7 stuff
    expv7: Collection = db["expv7"]
    convertedexpv7: Collection = db["convertedexpv7"]
    ingamecosmeticsv7: Collection = db["ingamecosmeticsv7"]
    serverplayerupdates1: Collection = db["serverplayerupdates1"]
    hexv7: Collection = db["hexdataV7"]

## this notebook filters documents with duplicated muuid,usid,ip. and keeps the unique one with the latest date.

## to clean up, run the first 3 cells

In [2]:
def clean_up_duplicate_muuid(timedeltadays=3,num_of_days_in_the_past=100):
    for i in range(0,num_of_days_in_the_past//timedeltadays):
        start_date = datetime.datetime.now() - datetime.timedelta(days=(i+1)*timedeltadays)
        end_date = datetime.datetime.now() - datetime.timedelta(days=i*timedeltadays)
        current_date = start_date
        would_delete = []
        while current_date < end_date:
            next_date = current_date + datetime.timedelta(days=timedeltadays)
            # Group documents by unique fields excluding _id and date
            pipeline = [
                {
                    '$match': {
                        'date': {'$gte': current_date, '$lt': next_date}
                    }
                },
                {
                    '$group': {
                        '_id': {
                            'musername': '$musername',
                            'muuid': '$muuid',
                            'musid': '$musid',
                            'con_address': '$con_address',
                            'color': '$color',
                            'servername': '$servername'
                        },
                        'latest': {'$max': '$date'},
                        'docs': {'$push': {'_id': '$_id', 'date': '$date'}}
                    }
                }
            ]
            # Execute the aggregation pipeline
            result = muuid.aggregate(pipeline)
            # Loop through the aggregation result
            keep=0
            for doc in result:
                latest_date = doc['latest']
                docs = doc['docs']
                latest_id = None
                # Find the document with the latest date
                for d in docs:
                    if d['date'] == latest_date:
                        latest_id = d['_id']
        
                # Print all other documents except the latest one
                if latest_id:
                    keep+=1
                    for d in docs:
                        if d['_id'] != latest_id:
                            #print(f"Would delete document: {d['_id']}")
                            would_delete.append(d['_id'])
            print("docs",len(would_delete),"keep",keep,next_date)
            current_date = next_date
        res = muuid.delete_many({'_id': {'$in': would_delete}})
        print("deleted",res.deleted_count)
    print("Run complete!")

In [3]:
%%time
clean_up_duplicate_muuid(timedeltadays=3,num_of_days_in_the_past=100)
clean_up_duplicate_muuid(timedeltadays=10,num_of_days_in_the_past=100)
clean_up_duplicate_muuid(timedeltadays=30,num_of_days_in_the_past=100) # max is 50.

docs 4809 keep 8263 2024-10-25 22:53:41.055078
deleted 4809
docs 5202 keep 9251 2024-10-22 22:53:49.704345
deleted 5202
docs 6394 keep 10236 2024-10-19 22:53:59.064843
deleted 6394
docs 5596 keep 8955 2024-10-16 22:54:10.537684
deleted 5596
docs 7687 keep 16842 2024-10-13 22:54:18.863983
deleted 7687
docs 5095 keep 11457 2024-10-10 22:54:31.001212
deleted 5095
docs 7062 keep 12480 2024-10-07 22:54:39.862712
deleted 7062
docs 5391 keep 9881 2024-10-04 22:54:49.507628
deleted 5391
docs 5681 keep 9687 2024-10-01 22:54:59.980359
deleted 5681
docs 5990 keep 10201 2024-09-28 22:55:10.009160
deleted 5990
docs 4895 keep 7525 2024-09-25 22:55:19.284274
deleted 4895
docs 6702 keep 10153 2024-09-22 22:55:27.048955
deleted 6702
docs 5954 keep 9607 2024-09-19 22:55:37.265990
deleted 5954
docs 7056 keep 11205 2024-09-16 22:55:47.847675
deleted 7056
docs 5442 keep 8972 2024-09-13 22:55:57.566336
deleted 5442
docs 5719 keep 9664 2024-09-10 22:56:08.222235
deleted 5719
docs 6330 keep 11182 2024-09-07 2

# delete old docs in playerserverupdates1

# this will delete incrementally in batches of 10 days

In [4]:
print("DRY RUN")
num_days_ago = 30 # delete docs older than X days.
for num_days_ago in list(range(90,5,-10)):
    days_ago = datetime.datetime.now() - datetime.timedelta(days=num_days_ago)
    filterr = {'date': {'$lt': days_ago}}
    # Count the number of documents that would be deleted
    count = serverplayerupdates1.count_documents(filterr)
    print(f"Number of documents to be deleted: {count}")

DRY RUN
Number of documents to be deleted: 0
Number of documents to be deleted: 0
Number of documents to be deleted: 0
Number of documents to be deleted: 36916
Number of documents to be deleted: 95756
Number of documents to be deleted: 162676
Number of documents to be deleted: 223773
Number of documents to be deleted: 290041
Number of documents to be deleted: 352808


In [5]:
print("ACTUAL RUN")
num_days_ago = 30 # delete docs older than X days.
for num_days_ago in list(range(90,5,-10)):
    days_ago = datetime.datetime.now() - datetime.timedelta(days=num_days_ago)
    filterr = {'date': {'$lt': days_ago}}
    # Count the number of documents that would be deleted
    count = serverplayerupdates1.count_documents(filterr)
    print(f"Number of documents to be deleted: {count}")
    result = serverplayerupdates1.delete_many(filterr)
    print(f"Deleted {result.deleted_count} documents.")

ACTUAL RUN
Number of documents to be deleted: 0
Deleted 0 documents.
Number of documents to be deleted: 0
Deleted 0 documents.
Number of documents to be deleted: 0
Deleted 0 documents.
Number of documents to be deleted: 36916
Deleted 36916 documents.
Number of documents to be deleted: 58840
Deleted 58840 documents.
Number of documents to be deleted: 66920
Deleted 66920 documents.
Number of documents to be deleted: 61104
Deleted 61104 documents.
Number of documents to be deleted: 66267
Deleted 66267 documents.
Number of documents to be deleted: 62767
Deleted 62767 documents.


# END OF FILE

In [15]:
%%time
num_days_ago = 30 # delete docs older than X days.
days_ago = datetime.datetime.now() - datetime.timedelta(days=num_days_ago)
filterr = {'date': {'$lt': days_ago}}
# Count the number of documents that would be deleted
count = serverplayerupdates1.count_documents(filterr)
print(f"Number of documents to be deleted: {count}")

Number of documents to be deleted: 80007
CPU times: total: 15.6 ms
Wall time: 411 ms


In [17]:
list(range(90,5,-10))

[90, 80, 70, 60, 50, 40, 30, 20, 10]

In [16]:
%%time
result = serverplayerupdates1.delete_many(filterr)
print(f"Deleted {result.deleted_count} documents.")

Deleted 80007 documents.
CPU times: total: 0 ns
Wall time: 17.3 s


In [2]:
t1=time()
alexid=612861256189083669
xxx1=315764312700485632 #cohl
xxx2=1185756027929501718 #cen
duuid=alexid
t2=time()

In [38]:
%%time
collection = muuid

# Calculate the start date for 2 months ago
for i in range(0,4):
    start_date = datetime.datetime.now() - datetime.timedelta(days=(i+1)*40) #150 to 
    end_date = datetime.datetime.now() - datetime.timedelta(days=i*40)
    
    current_date = start_date
    
    would_delete = []
    
    while current_date < end_date:
        next_date = current_date + datetime.timedelta(days=40)
        
        # Group documents by unique fields excluding _id and date
        pipeline = [
            {
                '$match': {
                    'date': {'$gte': current_date, '$lt': next_date}
                }
            },
            {
                '$group': {
                    '_id': {
                        'musername': '$musername',
                        'muuid': '$muuid',
                        'musid': '$musid',
                        'con_address': '$con_address',
                        'color': '$color',
                        'servername': '$servername'
                    },
                    'latest': {'$max': '$date'},
                    'docs': {'$push': {'_id': '$_id', 'date': '$date'}}
                }
            }
        ]
    
        # Execute the aggregation pipeline
        result = collection.aggregate(pipeline)
    
        # Loop through the aggregation result
        keep=0
        for doc in result:
            latest_date = doc['latest']
            docs = doc['docs']
            latest_id = None
    
            # Find the document with the latest date
            for d in docs:
                if d['date'] == latest_date:
                    latest_id = d['_id']
    
            # Print all other documents except the latest one
            if latest_id:
                keep+=1
                for d in docs:
                    if d['_id'] != latest_id:
                        #print(f"Would delete document: {d['_id']}")
                        would_delete.append(d['_id'])
        print("docs",len(would_delete),"keep",keep,next_date)
    
        current_date = next_date
    
    print("Run complete!")
    res = muuid.delete_many({'_id': {'$in': would_delete}})
    print("delted",res.deleted_count)

docs 690 keep 188641 2024-06-22 17:17:44.687712
Run complete!
delted 690
docs 3004 keep 157589 2024-05-13 17:20:24.273383
Run complete!
delted 3004
docs 1008 keep 158618 2024-04-03 17:22:16.759669
Run complete!
delted 1008
docs 4 keep 24270 2024-02-23 17:23:56.078185
Run complete!
delted 4
CPU times: total: 6.44 s
Wall time: 6min 32s


In [13]:
%%time
from bson.objectid import ObjectId

object_id_list = [ObjectId(idd) for idd in would_delete]
query = {'_id': {'$in': object_id_list}}

matching_documents = muuid.find(query)
counts=0
for doc in matching_documents:
    #print(doc)
    counts+=1

CPU times: total: 46.9 ms
Wall time: 7.6 s


In [14]:
counts

13346