In [4]:
import json
import pymongo
from pymongo.database import Database
from pymongo.collection import Collection
from time import time
import datetime
with open("watermelon.config", "rb") as f:
    js = json.load(f)
    mongo_key: str = js["mongo_key"]
    prefix: str = js["prefix"]

if prefix in ["w?", "t?"]:  # only access mongodb for w? and t?
    client = pymongo.MongoClient(mongo_key)
    db: Database = client.get_database("AlexMindustry")
    expgains: Collection = db["expgains"]
    convertedexp: Collection = db["convertedexp"]
    muuid: Collection = db["muuid"]
    # V7 stuff
    expv7: Collection = db["expv7"]
    convertedexpv7: Collection = db["convertedexpv7"]
    ingamecosmeticsv7: Collection = db["ingamecosmeticsv7"]
    serverplayerupdates1: Collection = db["serverplayerupdates1"]
    hexv7: Collection = db["hexdataV7"]

## this notebook filters documents with duplicated muuid,usid,ip. and keeps the unique one with the latest date.

## to clean up, run the first 3 cells

In [2]:
def clean_up_duplicate_muuid(timedeltadays=3,num_of_days_in_the_past=100):
    for i in range(0,num_of_days_in_the_past//timedeltadays):
        start_date = datetime.datetime.now() - datetime.timedelta(days=(i+1)*timedeltadays)
        end_date = datetime.datetime.now() - datetime.timedelta(days=i*timedeltadays)
        current_date = start_date
        would_delete = []
        while current_date < end_date:
            next_date = current_date + datetime.timedelta(days=timedeltadays)
            # Group documents by unique fields excluding _id and date
            pipeline = [
                {
                    '$match': {
                        'date': {'$gte': current_date, '$lt': next_date}
                    }
                },
                {
                    '$group': {
                        '_id': {
                            'musername': '$musername',
                            'muuid': '$muuid',
                            'musid': '$musid',
                            'con_address': '$con_address',
                            'color': '$color',
                            'servername': '$servername'
                        },
                        'latest': {'$max': '$date'},
                        'docs': {'$push': {'_id': '$_id', 'date': '$date'}}
                    }
                }
            ]
            # Execute the aggregation pipeline
            result = muuid.aggregate(pipeline)
            # Loop through the aggregation result
            keep=0
            for doc in result:
                latest_date = doc['latest']
                docs = doc['docs']
                latest_id = None
                # Find the document with the latest date
                for d in docs:
                    if d['date'] == latest_date:
                        latest_id = d['_id']
        
                # Print all other documents except the latest one
                if latest_id:
                    keep+=1
                    for d in docs:
                        if d['_id'] != latest_id:
                            #print(f"Would delete document: {d['_id']}")
                            would_delete.append(d['_id'])
            print("docs",len(would_delete),"keep",keep,next_date)
            current_date = next_date
        res = muuid.delete_many({'_id': {'$in': would_delete}})
        print("deleted",res.deleted_count)
    print("Run complete!")

In [3]:
%%time
clean_up_duplicate_muuid(timedeltadays=3,num_of_days_in_the_past=100)
clean_up_duplicate_muuid(timedeltadays=10,num_of_days_in_the_past=100)
clean_up_duplicate_muuid(timedeltadays=30,num_of_days_in_the_past=100) # max is 50.

docs 9 keep 12612 2024-07-16 16:33:40.141333
deleted 9
docs 0 keep 14563 2024-07-13 16:33:54.220665
deleted 0
docs 0 keep 14269 2024-07-10 16:34:06.079679
deleted 0
docs 0 keep 10687 2024-07-07 16:34:16.752974
deleted 0
docs 0 keep 10250 2024-07-04 16:34:25.961808
deleted 0
docs 0 keep 10859 2024-07-01 16:34:34.705233
deleted 0
docs 0 keep 11221 2024-06-28 16:34:43.173262
deleted 0
docs 0 keep 16134 2024-06-25 16:34:51.759847
deleted 0
docs 0 keep 17126 2024-06-22 16:35:04.516645
deleted 0
docs 0 keep 11434 2024-06-19 16:35:18.371400
deleted 0
docs 0 keep 13451 2024-06-16 16:35:29.085304
docs 0 keep 11434 2024-06-19 16:35:29.085304
deleted 0
docs 0 keep 12452 2024-06-13 16:35:49.281117
deleted 0
docs 0 keep 14850 2024-06-10 16:36:00.050760
deleted 0
docs 0 keep 12612 2024-06-07 16:36:12.670533
deleted 0
docs 0 keep 12454 2024-06-04 16:36:22.928347
deleted 0
docs 0 keep 12862 2024-06-01 16:36:33.029829
deleted 0
docs 0 keep 13506 2024-05-29 16:36:45.770291
deleted 0
docs 0 keep 14850 20

# delete old docs in playerserverupdates1

In [21]:
%%time
num_days_ago = 30
days_ago = datetime.datetime.now() - datetime.timedelta(days=num_days_ago)
filterr = {'date': {'$lt': days_ago}}
# Count the number of documents that would be deleted
count = serverplayerupdates1.count_documents(filterr)
print(f"Number of documents to be deleted: {count}")

Number of documents to be deleted: 71
CPU times: total: 0 ns
Wall time: 2.12 s


In [22]:
%%time
result = serverplayerupdates1.delete_many(filterr)
print(f"Deleted {result.deleted_count} documents.")

Deleted 71 documents.
CPU times: total: 0 ns
Wall time: 434 ms


In [2]:
t1=time()
alexid=612861256189083669
xxx1=315764312700485632 #cohl
xxx2=1185756027929501718 #cen
duuid=alexid
t2=time()

In [38]:
%%time
collection = muuid

# Calculate the start date for 2 months ago
for i in range(0,4):
    start_date = datetime.datetime.now() - datetime.timedelta(days=(i+1)*40) #150 to 
    end_date = datetime.datetime.now() - datetime.timedelta(days=i*40)
    
    current_date = start_date
    
    would_delete = []
    
    while current_date < end_date:
        next_date = current_date + datetime.timedelta(days=40)
        
        # Group documents by unique fields excluding _id and date
        pipeline = [
            {
                '$match': {
                    'date': {'$gte': current_date, '$lt': next_date}
                }
            },
            {
                '$group': {
                    '_id': {
                        'musername': '$musername',
                        'muuid': '$muuid',
                        'musid': '$musid',
                        'con_address': '$con_address',
                        'color': '$color',
                        'servername': '$servername'
                    },
                    'latest': {'$max': '$date'},
                    'docs': {'$push': {'_id': '$_id', 'date': '$date'}}
                }
            }
        ]
    
        # Execute the aggregation pipeline
        result = collection.aggregate(pipeline)
    
        # Loop through the aggregation result
        keep=0
        for doc in result:
            latest_date = doc['latest']
            docs = doc['docs']
            latest_id = None
    
            # Find the document with the latest date
            for d in docs:
                if d['date'] == latest_date:
                    latest_id = d['_id']
    
            # Print all other documents except the latest one
            if latest_id:
                keep+=1
                for d in docs:
                    if d['_id'] != latest_id:
                        #print(f"Would delete document: {d['_id']}")
                        would_delete.append(d['_id'])
        print("docs",len(would_delete),"keep",keep,next_date)
    
        current_date = next_date
    
    print("Run complete!")
    res = muuid.delete_many({'_id': {'$in': would_delete}})
    print("delted",res.deleted_count)

docs 690 keep 188641 2024-06-22 17:17:44.687712
Run complete!
delted 690
docs 3004 keep 157589 2024-05-13 17:20:24.273383
Run complete!
delted 3004
docs 1008 keep 158618 2024-04-03 17:22:16.759669
Run complete!
delted 1008
docs 4 keep 24270 2024-02-23 17:23:56.078185
Run complete!
delted 4
CPU times: total: 6.44 s
Wall time: 6min 32s


In [13]:
%%time
from bson.objectid import ObjectId

object_id_list = [ObjectId(idd) for idd in would_delete]
query = {'_id': {'$in': object_id_list}}

matching_documents = muuid.find(query)
counts=0
for doc in matching_documents:
    #print(doc)
    counts+=1

CPU times: total: 46.9 ms
Wall time: 7.6 s


In [14]:
counts

13346