In [67]:
import pandas as pd
import numpy as np
from pymongo import MongoClient
import re
from datetime import datetime
from dotenv import load_dotenv
import os 

In [68]:
# Run load_dotenv() first, then read the MongoDB server address settings from
# the process environment (either value is None when its variable is not set).
load_dotenv()
server_ip, server_port = os.getenv("SERVER_IP"), os.getenv("SERVER_PORT")

# Initiate the connection to the database

In [69]:
# Build the "<ip>:<port>" address from the environment settings and open the client
server_adress = "{}:{}".format(server_ip, server_port)
client = MongoClient(server_adress)

# Database used by every query in this notebook
db = client.dblp

# Collection handle for the awards documents
collection = db.awards

# Queries

In [70]:
# Aggregation pipelines to benchmark, in execution order
queries = []

# Query 1: number of awards per institution city (city name lower-cased so that
# different capitalisations fall into the same group), largest groups first.
# NOTE(review): 'montant_cumul' / 'Cumulative Amount' is a document count
# ($sum: 1), not a sum of award amounts — confirm this is intended.
queries.append([
    {'$group': {
        '_id': {'$toLower': '$institution.city_name'},
        'montant_cumul': {'$sum': 1},
    }},
    {'$sort': {'montant_cumul': -1}},
    {'$project': {
        'City Name': '$_id',
        'Cumulative Amount': '$montant_cumul',
        '_id': 0,
    }},
])

In [71]:
# Query 2: institutions whose city name contains "New York" (case-insensitive),
# grouped by (name, zipcode) and ranked by the number of awards they received.
queries.append([
    {
        '$match': {
            # Case-insensitivity is passed as a compile flag. The previous form
            # ended the pattern with an inline "(?i)"; a global inline flag that
            # is not at the start of the expression is deprecated since
            # Python 3.6 and raises re.error on Python 3.11+.
            'institution.city_name': re.compile(r"New York", re.IGNORECASE)
        }
    }, {
        # One group per (institution name, zipcode) pair, counting its awards
        '$group': {
            '_id': {
                'instName': '$institution.name',
                'instZip': '$institution.zipcode'
            },
            'quantity': {
                '$sum': 1
            }
        }
    }, {
        '$sort': {
            'quantity': -1
        }
    }, {
        # Output labels keep their original spelling ("Insitution") so that the
        # result field names are unchanged.
        '$project': {
            'Insitution name': '$_id.instName',
            'Insitution zip': '$_id.instZip',
            'Awards received': '$quantity',
            '_id': 0
        }
    }
])

In [72]:
# Query 3: the 10 investigator e-mail values associated with the largest number
# of distinct `foa_info` values.
queries.append([
    # Ignore awards whose investigator e-mail is null
    {'$match': {'investigators.email_id': {'$ne': None}}},
    {'$group': {
        '_id': '$investigators.email_id',
        # Distinct foa_info values seen for this e-mail
        'setDomaines': {'$addToSet': '$foa_info'},
        # Kept from the original pipeline; no later stage reads this field
        'email': {'$first': '$investigators.email_id'},
    }},
    {'$project': {
        'email': '$_id',
        'Number of domains': {'$size': '$setDomaines'},
        '_id': 0,
    }},
    {'$sort': {'Number of domains': -1}},
    {'$limit': 10},
])

In [73]:
# Query 4: among awards whose institution state is 'California', the 5
# `organisation_code` values with the most matching award documents.
queries.append([
    {'$match': {'institution.state_name': 'California'}},
    {'$group': {
        '_id': '$organisation_code',
        # Counts matching documents per organisation code
        'nbInstitutions': {'$sum': 1},
    }},
    {'$sort': {'nbInstitutions': -1}},
    {'$limit': 5},
])

In [74]:
# Query 5: per-institution statistics on `amount` (count, mean, max, min and
# population standard deviation), institutions with the most awards first.
# The grouping key is the whole `institution` sub-document.
queries.append([
    {'$group': {
        '_id': '$institution',
        'count': {'$sum': 1},
        'averageAwardAmount': {'$avg': '$amount'},
        'maxAwardAmount': {'$max': '$amount'},
        'minAwardAmount': {'$min': '$amount'},
        'stdAwardAmount': {'$stdDevPop': '$amount'},
    }},
    {'$sort': {'count': -1}},
])

In [75]:
# Query 6: total `amount` per program, highest total first.
queries.append([
    # One document per element of `programs`; the element's array index is
    # written to a field literally named 'string', and documents without any
    # program element are dropped.
    {'$unwind': {
        'path': '$programs',
        'includeArrayIndex': 'string',
        'preserveNullAndEmptyArrays': False,
    }},
    {'$group': {
        '_id': '$programs',
        'nb_amount': {'$sum': '$amount'},
    }},
    {'$sort': {'nb_amount': -1}},
    # Keeps both fields produced by the $group stage
    {'$project': {'_id': 1, 'nb_amount': 1}},
])

In [76]:
# Query 7: for investigator e-mails containing the letter "a" (case-insensitive),
# the number of awards per (effective_date suffix, investigator e-mail) pair.
queries.append([
    {
        '$match': {
            # Case-insensitivity is passed as a compile flag. The previous form
            # ended the pattern with an inline "(?i)"; a global inline flag that
            # is not at the start of the expression is deprecated since
            # Python 3.6 and raises re.error on Python 3.11+.
            'investigators.email_id': re.compile(r"(a)", re.IGNORECASE)
        }
    }, {
        '$group': {
            '_id': {
                # Everything from character offset 6 to the end of effective_date
                # NOTE(review): presumably the year of an MM/DD/YYYY string — confirm
                'date': {
                    '$substr': [
                        '$effective_date', 6, -1
                    ]
                },
                'invest': '$investigators.email_id'
            },
            'nb': {
                '$sum': 1
            }
        }
    }, {
        # Flatten the compound group key into readable output fields
        '$project': {
            'Investigator': '$_id.invest',
            'Date': '$_id.date',
            'Number': '$nb',
            '_id': 0
        }
    }
])

In [77]:
# Query 8: for the awards of one specific investigator e-mail, the average
# number of days between `investigators.start_date` and the moment this cell
# was executed.
queries.append([
    {
        # Keep only the awards linked to this investigator e-mail
        '$match': {
            'investigators.email_id': 'jeremygtaylor@compuserve.com'
        }
    }, {
        # Convert the start date with $toDate and attach the reference instant.
        # `today` is evaluated once in Python, when this cell runs, and is then
        # embedded in the pipeline as a constant.
        '$project': {
            'Email': '$investigators.email_id', 
            'date': {
                '$toDate': '$investigators.start_date'
            }, 
            'today': datetime.utcnow()
        }
    }, {
        # Difference between the two dates (the next stage treats it as milliseconds)
        '$addFields': {
            '_id': '$_id', 
            'dateDiff': {
                '$subtract': [
                    '$today', '$date'
                ]
            }
        }
    }, {
        # milliseconds -> seconds
        '$project': {
            'Email': 1, 
            'dateDiff': 1, 
            'secs': {
                '$divide': [
                    '$dateDiff', 1000
                ]
            }
        }
    }, {
        # seconds -> minutes
        '$project': {
            'Email': 1, 
            'mins': {
                '$divide': [
                    '$secs', 60
                ]
            }
        }
    }, {
        # minutes -> hours
        '$project': {
            'Email': 1, 
            'hours': {
                '$divide': [
                    '$mins', 60
                ]
            }
        }
    }, {
        # hours -> days
        '$project': {
            'Email': 1, 
            'days': {
                '$divide': [
                    '$hours', 24
                ]
            }
        }
    }, {
        # Average the day counts per Email value
        '$group': {
            '_id': '$Email', 
            'avgDays': {
                '$avg': '$days'
            }
        }
    }, {
        # Rename the output fields and drop the group key
        '$project': {
            'Email': '$_id', 
            'Average days': '$avgDays', 
            '_id': 0
        }
    }
])

# Definitions

In [78]:
def _plan_execution_time_ms(plan):
    """Return `executionTimeMillis` from one explain plan (a shard's plan or a whole unsharded result)."""
    stages = plan.get("stages")
    if stages:
        # Plan reported as a list of stages: the stats hang off the leading $cursor stage
        return stages[0]["$cursor"]["executionStats"]["executionTimeMillis"]
    # No stage list: expect the execution stats directly on the plan document
    return plan["executionStats"]["executionTimeMillis"]


def get_aggregation_execTimeMs(query, db, collection):
    """Explain an aggregation pipeline and return its execution time in milliseconds.

    Runs the `explain` database command with `executionStats` verbosity instead
    of timing the query client-side, so the figure is the one reported by the
    server.

    Parameters
    ----------
    query : list
        Aggregation pipeline (list of stage documents).
    db : object
        Database handle exposing `command(...)` (e.g. a pymongo `Database`).
    collection : str or object
        Name of the collection to aggregate on, or an object exposing a
        `.name` attribute holding that name. Previously this argument was
        ignored and "awards" was always used.

    Returns
    -------
    The largest `executionTimeMillis` among the shards when the explain output
    has a "shards" section (shards run in parallel, so the slowest one bounds
    the query); otherwise the single execution time found in the output.

    Raises
    ------
    KeyError
        If the explain output does not have the expected structure.
    ValueError
        If the "shards" section is present but empty.
    """
    # Accept either the collection name or a collection-like object
    collection_name = getattr(collection, "name", collection)

    explain_result = db.command(
        'explain',
        {
            'aggregate': collection_name,
            'pipeline': query,
            'cursor': {}
        },
        verbosity='executionStats'
    )

    if "shards" in explain_result:
        # Sharded cluster: one plan per shard; keep only the longest execution time
        times = [_plan_execution_time_ms(plan) for plan in explain_result["shards"].values()]
    else:
        # Non-sharded deployment: the result itself is the only plan
        times = [_plan_execution_time_ms(explain_result)]
    return np.max(times)

In [79]:
def run_query_n_times(query, db, collection, n):
    """Measure `query` n times in a row.

    Each run goes through get_aggregation_execTimeMs(query, db, collection);
    the measured time is printed as soon as it is available. Returns the list
    of the n measured times, in run order.
    """
    def _timed_run():
        # One measurement, reported immediately
        elapsed = get_aggregation_execTimeMs(query, db, collection)
        print(f"Ran in {elapsed}ms")
        return elapsed

    return [_timed_run() for _ in range(n)]

In [80]:
def initialize_df(queries, n):
    """Create the zero-filled results table for a benchmark of `queries`.

    The table has one row per query and the columns, in order:
    'query_number' (1-based), 'query' (the pipeline itself), one '<i>_stats'
    column per run for i in 1..n, then 'min_runtime', 'max_runtime' and
    'all_avg_runtime'.

    Returns a tuple (dataframe, per-run column names, summary column names).
    """
    run_columns = [f"{run}_stats" for run in range(1, n + 1)]
    summary_columns = ['min_runtime', 'max_runtime', 'all_avg_runtime']
    all_columns = ['query_number', 'query', *run_columns, *summary_columns]

    # Start from an all-zero float table, then overwrite the two identifying columns
    table_shape = (len(queries), len(all_columns))
    frame = pd.DataFrame(np.zeros(table_shape), columns=all_columns)
    frame['query_number'] = list(range(1, len(queries) + 1))
    frame['query'] = queries

    return frame, run_columns, summary_columns

# Initialize a dataframe

In [81]:
n = 10 # Number of times to run each query
# Build the zero-filled results table (one row per query) together with the
# names of its per-run columns and of its summary-statistic columns.
df, times_list, stats_list = initialize_df(queries, n)
# Display the still-empty table
df

Unnamed: 0,query_number,query,1_stats,2_stats,3_stats,4_stats,5_stats,6_stats,7_stats,8_stats,9_stats,10_stats,min_runtime,max_runtime,all_avg_runtime
0,1,[{'$group': {'_id': {'$toLower': '$institution...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,[{'$match': {'institution.city_name': re.compi...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,[{'$match': {'investigators.email_id': {'$ne':...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,[{'$match': {'institution.state_name': 'Califo...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,"[{'$group': {'_id': '$institution', 'count': {...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,6,"[{'$unwind': {'path': '$programs', 'includeArr...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,7,[{'$match': {'investigators.email_id': re.comp...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,8,[{'$match': {'investigators.email_id': 'jeremy...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Loop through the queries

In [82]:
# Number of shards of the cluster under test; only used to name the output file
num_shards = 6

# Run every pipeline n times and record the timings in `df` (updated in place).
# enumerate() provides the 1-based query number directly: looking it up with
# queries.index(query) rescans the list on every iteration and returns the
# first match, which mis-numbers a pipeline that appears more than once.
for query_number, query in enumerate(queries, start=1):
    print(f"Running query {query_number}")
    times = run_query_n_times(query, db, "awards", n)

    print("Times for query", query_number, ":", times)
    row = df['query_number'] == query_number  # Row mask for this query, computed once
    df.loc[row, times_list] = times
    df.loc[row, 'min_runtime'] = min(times)
    df.loc[row, 'max_runtime'] = max(times)
    df.loc[row, 'all_avg_runtime'] = np.mean(times)

    # Drop one occurrence of the min and of the max value, working on a copy so
    # the raw timings stay intact. With fewer than 3 runs nothing would be left
    # to average, so all runs are kept in that case.
    trimmed = list(times)
    if len(trimmed) > 2:
        trimmed.remove(min(trimmed))
        trimmed.remove(max(trimmed))
    print("Times without outliers for query", query_number, ":", trimmed)

    df.loc[row, 'avg_runtime_no_outliers'] = np.mean(trimmed)

# Make sure the output directory exists before writing the results
os.makedirs("results", exist_ok=True)
df.to_csv(f"results/S{num_shards}_results.csv", index=False)

Running query 1
Ran in 2225ms
Ran in 2228ms
Ran in 2207ms
Ran in 2213ms
Ran in 2207ms
Ran in 2206ms
Ran in 2234ms
Ran in 2264ms
Ran in 2266ms
Ran in 2238ms
Times for query 1 : [2225, 2228, 2207, 2213, 2207, 2206, 2234, 2264, 2266, 2238]
Times without outliers for query 1 : [2225, 2228, 2207, 2213, 2207, 2234, 2264, 2238]
Running query 2
Ran in 758ms
Ran in 754ms
Ran in 754ms
Ran in 756ms
Ran in 755ms
Ran in 753ms
Ran in 756ms
Ran in 756ms
Ran in 754ms
Ran in 756ms
Times for query 2 : [758, 754, 754, 756, 755, 753, 756, 756, 754, 756]
Times without outliers for query 2 : [754, 754, 756, 755, 756, 756, 754, 756]
Running query 3
Ran in 629ms
Ran in 624ms
Ran in 622ms
Ran in 624ms
Ran in 632ms
Ran in 626ms
Ran in 626ms
Ran in 631ms
Ran in 625ms
Ran in 626ms
Times for query 3 : [629, 624, 622, 624, 632, 626, 626, 631, 625, 626]
Times without outliers for query 3 : [629, 624, 624, 626, 626, 631, 625, 626]
Running query 4
Ran in 845ms
Ran in 842ms
Ran in 841ms
Ran in 848ms
Ran in 859ms
Ran in

In [83]:
# Display the results table now that the benchmark loop has filled in the timings
df

Unnamed: 0,query_number,query,1_stats,2_stats,3_stats,4_stats,5_stats,6_stats,7_stats,8_stats,9_stats,10_stats,min_runtime,max_runtime,all_avg_runtime,avg_runtime_no_outliers
0,1,[{'$group': {'_id': {'$toLower': '$institution...,2225.0,2228.0,2207.0,2213.0,2207.0,2206.0,2234.0,2264.0,2266.0,2238.0,2206.0,2266.0,2228.8,2227.0
1,2,[{'$match': {'institution.city_name': re.compi...,758.0,754.0,754.0,756.0,755.0,753.0,756.0,756.0,754.0,756.0,753.0,758.0,755.2,755.125
2,3,[{'$match': {'investigators.email_id': {'$ne':...,629.0,624.0,622.0,624.0,632.0,626.0,626.0,631.0,625.0,626.0,622.0,632.0,626.5,626.375
3,4,[{'$match': {'institution.state_name': 'Califo...,845.0,842.0,841.0,848.0,859.0,844.0,843.0,842.0,842.0,844.0,841.0,859.0,845.0,843.75
4,5,"[{'$group': {'_id': '$institution', 'count': {...",3386.0,3409.0,3410.0,3415.0,3421.0,3396.0,3406.0,3401.0,3438.0,3502.0,3386.0,3502.0,3418.4,3412.0
5,6,"[{'$unwind': {'path': '$programs', 'includeArr...",1911.0,1900.0,1917.0,1922.0,1919.0,1988.0,1985.0,1955.0,1955.0,1952.0,1900.0,1988.0,1940.4,1939.5
6,7,[{'$match': {'investigators.email_id': re.comp...,644.0,644.0,639.0,644.0,650.0,643.0,644.0,646.0,644.0,645.0,639.0,650.0,644.3,644.25
7,8,[{'$match': {'investigators.email_id': 'jeremy...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,4.0,3.1,3.0
