In [1]:
import os

from pymongo import MongoClient

In [2]:
# Connection string for the MongoDB server. Set the MONGODB_URI environment
# variable to target a different host/port; when it is unset, the local
# default below is used, so no source edit is needed to switch servers.
MONGODB_URI = os.environ.get("MONGODB_URI", "mongodb://localhost:27017/")
client = MongoClient(MONGODB_URI)

In [3]:
# Handle to the "adtech" database on the connected client
db = client.get_database("adtech")

In [4]:
# Handle to the "impressions" collection (a collection plays the role a table
# does in a relational database)
collection1 = db.get_collection("impressions")

In [5]:
# Count the documents in the collection whose is_click field equals 1.
click_filter = {"is_click": 1}
count_clicks = collection1.count_documents(click_filter)
print(f"Total clicks: {count_clicks}")

Total clicks: 10862


In [6]:
# Collect every distinct value stored under the os_version field.
os_version_field = "os_version"
distinct_os_versions = collection1.distinct(os_version_field)
print(f"Distinct OS Versions: {distinct_os_versions}")

Distinct OS Versions: ['intermediate', 'latest', 'old']


In [7]:
# Sum of is_click per os_version, restricted to documents where is_4G is 0.
match_stage = {"$match": {"is_4G": 0}}
group_stage = {
    "$group": {
        "_id": "$os_version",
        "countClick": {"$sum": "$is_click"},
    }
}
pipeline = [match_stage, group_stage]
result = collection1.aggregate(pipeline)
for doc in result:
    print(f"OS Version: {doc['_id']}, Clicks: {doc['countClick']}")

OS Version: old, Clicks: 1896
OS Version: latest, Clicks: 3201
OS Version: intermediate, Clicks: 1923


In [13]:
# Sum of is_click per app_code for app codes above 500, highest total first.
match_stage = {"$match": {"app_code": {"$gt": 500}}}
group_stage = {
    "$group": {
        "_id": "$app_code",
        "countClick": {"$sum": "$is_click"},
    }
}
sort_stage = {"$sort": {"countClick": -1}}  # -1 means descending order
pipeline = [match_stage, group_stage, sort_stage]

result = collection1.aggregate(pipeline)

for doc in result:
    print(f"App Code: {doc['_id']}, Clicks: {doc['countClick']}")

App Code: 508, Clicks: 132
App Code: 504, Clicks: 47
App Code: 509, Clicks: 27
App Code: 512, Clicks: 23
App Code: 507, Clicks: 14
App Code: 522, Clicks: 6
App Code: 503, Clicks: 5
App Code: 505, Clicks: 4
App Code: 514, Clicks: 4
App Code: 513, Clicks: 3
App Code: 521, Clicks: 2
App Code: 517, Clicks: 1
App Code: 519, Clicks: 1
App Code: 520, Clicks: 1
App Code: 518, Clicks: 0
App Code: 510, Clicks: 0
App Code: 516, Clicks: 0
App Code: 515, Clicks: 0
App Code: 502, Clicks: 0
App Code: 506, Clicks: 0


In [15]:
# Users whose summed is_click value is exactly TARGET_CLICK_COUNT.
# The threshold is named so the filter value and its description cannot drift
# apart.
TARGET_CLICK_COUNT = 10

pipeline = [
    {
        "$group": {
            "_id": "$user_id",  # One group per user_id
            "CountOfClick": {
                "$sum": "$is_click"  # Add up is_click within each group
            }
        }
    },
    {
        "$match": {
            # Runs after $group, so it filters on the aggregated total
            "CountOfClick": TARGET_CLICK_COUNT
        }
    }
]

result = collection1.aggregate(pipeline)

for doc in result:
    print(f"User ID: {doc['_id']}, Count of Clicks: {doc['CountOfClick']}")

User ID: 37747, Count of Clicks: 10
User ID: 90953, Count of Clicks: 10
User ID: 52737, Count of Clicks: 10
User ID: 64389, Count of Clicks: 10
User ID: 3364, Count of Clicks: 10


In [16]:
# Handle to the "viewlog" collection (a collection plays the role a table does
# in a relational database)
collection2 = db.get_collection("viewlog")

In [32]:
# Android view-log documents whose item_id is above MIN_ITEM_ID, ordered by
# ascending user_id.
MIN_ITEM_ID = 132861  # Exclusive lower bound for item_id

pipeline = [
    {
        # Both filters sit in one $match ahead of $sort, so only documents that
        # will be returned are sorted. The set of documents is the same as
        # filtering on item_id after the sort; this form just does not rely on
        # the server reordering the stages.
        "$match": {
            "device_type": "android",
            "item_id": {"$gt": MIN_ITEM_ID},
        }
    },
    {
        "$sort": {
            "user_id": 1  # 1 means ascending order
        }
    }
]

result = collection2.aggregate(pipeline)

for doc in result:
    print(doc)  # Print each matching document in full

{'_id': ObjectId('66f339bb0b5d4df30046ddd8'), 'server_time': datetime.datetime(2018, 11, 17, 21, 15), 'device_type': 'android', 'session_id': 699238, 'user_id': 111, 'item_id': 132864}
{'_id': ObjectId('66f339730b5d4df3002f3d78'), 'server_time': datetime.datetime(2018, 10, 19, 0, 5), 'device_type': 'android', 'session_id': 692656, 'user_id': 131, 'item_id': 132865}
{'_id': ObjectId('66f339830b5d4df30034a049'), 'server_time': datetime.datetime(2018, 10, 26, 5, 48), 'device_type': 'android', 'session_id': 103223, 'user_id': 5822, 'item_id': 132863}
{'_id': ObjectId('66f339770b5d4df30030af40'), 'server_time': datetime.datetime(2018, 10, 21, 1, 53), 'device_type': 'android', 'session_id': 182390, 'user_id': 10613, 'item_id': 132865}
{'_id': ObjectId('66f339fa0b5d4df3005aa751'), 'server_time': datetime.datetime(2018, 12, 10, 23, 20), 'device_type': 'android', 'session_id': 1026181, 'user_id': 10947, 'item_id': 132862}
{'_id': ObjectId('66f339740b5d4df3002f9476'), 'server_time': datetime.dat