In [210]:
import os
import re
from datetime import datetime

import numpy as np
import pandas as pd
from pymongo import MongoClient

# Initiate the connection to the database

In [211]:
# Address string handed to MongoClient (looks like host:port).
server_adress = "mesiin592022-0031.westeurope.cloudapp.azure.com:30000"
client = MongoClient(server_adress)

# Handle on the 'dblp' database; reused by the timing helpers below.
db = client.dblp

# Handle on the 'awards' collection of that database.
collection = db.awards

# Queries

In [212]:
# Ordered list of the aggregation pipelines to benchmark; later cells append to it.
queries = [
    # Query 1: number of documents per lower-cased institution city, largest first.
    # Despite its name, 'montant_cumul' / 'Cumulative Amount' is a document count ($sum: 1).
    [
        {"$group": {
            "_id": {"$toLower": "$institution.city_name"},
            "montant_cumul": {"$sum": 1},
        }},
        {"$sort": {"montant_cumul": -1}},
        {"$project": {
            "City Name": "$_id",
            "Cumulative Amount": "$montant_cumul",
            "_id": 0,
        }},
    ],
]

In [213]:
# Query 2: number of documents per institution (name + zip code) among documents whose
# institution city matches "New York" case-insensitively, largest count first.
queries.append([
    {
        '$match': {
            # Case-insensitivity is requested through the flags argument: a global inline
            # "(?i)" placed anywhere but the start of the pattern is deprecated since
            # Python 3.6 and raises re.error on Python 3.11+.
            'institution.city_name': re.compile(r"New York", re.IGNORECASE)
        }
    }, {
        '$group': {
            '_id': {
                'instName': '$institution.name',
                'instZip': '$institution.zipcode'
            },
            'quantity': {
                '$sum': 1
            }
        }
    }, {
        '$sort': {
            'quantity': -1
        }
    }, {
        # NOTE(review): 'Insitution' is misspelled in the output field names; left as-is
        # because nothing visible here shows whether other code reads these names.
        '$project': {
            'Insitution name': '$_id.instName',
            'Insitution zip': '$_id.instZip',
            'Awards received': '$quantity',
            '_id': 0
        }
    }
])

In [214]:
# Query 3: the 10 investigators.email_id values with the most distinct foa_info values.
queries.append([
    # Keep documents whose investigators.email_id is not null.
    {"$match": {"investigators.email_id": {"$ne": None}}},
    # One group per email_id value, collecting the distinct foa_info values seen for it.
    {"$group": {
        "_id": "$investigators.email_id",
        "setDomaines": {"$addToSet": "$foa_info"},
        "email": {"$first": "$investigators.email_id"},
    }},
    # Expose the group key as 'email' together with the size of the collected set.
    {"$project": {
        "email": "$_id",
        "Number of domains": {"$size": "$setDomaines"},
        "_id": 0,
    }},
    {"$sort": {"Number of domains": -1}},
    {"$limit": 10},
])

In [215]:
# Query 4: the 5 organisation_code values with the most documents whose institution
# state is 'California'.
queries.append([
    {"$match": {"institution.state_name": "California"}},
    # 'nbInstitutions' adds 1 per matching document.
    {"$group": {
        "_id": "$organisation_code",
        "nbInstitutions": {"$sum": 1},
    }},
    {"$sort": {"nbInstitutions": -1}},
    {"$limit": 5},
])

In [216]:
# Query 5: per-institution document count plus average / max / min / population standard
# deviation of 'amount', ordered by descending count.
queries.append([
    # The whole 'institution' sub-document is the group key.
    {"$group": {
        "_id": "$institution",
        "count": {"$sum": 1},
        "averageAwardAmount": {"$avg": "$amount"},
        "maxAwardAmount": {"$max": "$amount"},
        "minAwardAmount": {"$min": "$amount"},
        "stdAwardAmount": {"$stdDevPop": "$amount"},
    }},
    {"$sort": {"count": -1}},
])

In [217]:
# Query 6: summed 'amount' per element of the 'programs' array, largest total first.
queries.append([
    # One output document per 'programs' element; the element index is written to a
    # field literally named 'string', and null/empty arrays are dropped.
    {"$unwind": {
        "path": "$programs",
        "includeArrayIndex": "string",
        "preserveNullAndEmptyArrays": False,
    }},
    {"$group": {
        "_id": "$programs",
        "nb_amount": {"$sum": "$amount"},
    }},
    {"$sort": {"nb_amount": -1}},
    # Keeps exactly the two fields the $group stage produced.
    {"$project": {"_id": 1, "nb_amount": 1}},
])

In [218]:
# Query 7: number of documents per (effective_date suffix, investigators.email_id) pair,
# restricted to documents whose email_id contains the letter "a" in either case.
queries.append([
    {
        '$match': {
            # The flag is passed explicitly: a global inline "(?i)" at the end of the
            # pattern is deprecated since Python 3.6 and raises re.error on Python 3.11+.
            'investigators.email_id': re.compile(r"(a)", re.IGNORECASE)
        }
    }, {
        '$group': {
            '_id': {
                # Substring of effective_date from index 6 to the end (length -1).
                # NOTE(review): presumably the year of an MM/DD/YYYY string; confirm.
                'date': {
                    '$substr': [
                        '$effective_date', 6, -1
                    ]
                },
                'invest': '$investigators.email_id'
            },
            'nb': {
                '$sum': 1
            }
        }
    }, {
        '$project': {
            'Investigator': '$_id.invest',
            'Date': '$_id.date',
            'Number': '$nb',
            '_id': 0
        }
    }
])

In [219]:
# Query 8: for one specific e-mail address, the average number of days between
# investigators.start_date and the moment this cell was executed.
queries.append([
    {"$match": {"investigators.email_id": "jeremygtaylor@compuserve.com"}},
    # 'today' is evaluated once, client-side, when this cell runs.
    {"$project": {
        "Email": "$investigators.email_id",
        "date": {"$toDate": "$investigators.start_date"},
        "today": datetime.utcnow(),
    }},
    {"$addFields": {
        "_id": "$_id",
        "dateDiff": {"$subtract": ["$today", "$date"]},
    }},
    # The next four stages divide dateDiff step by step: /1000 -> secs, /60 -> mins,
    # /60 -> hours, /24 -> days.
    {"$project": {
        "Email": 1,
        "dateDiff": 1,
        "secs": {"$divide": ["$dateDiff", 1000]},
    }},
    {"$project": {
        "Email": 1,
        "mins": {"$divide": ["$secs", 60]},
    }},
    {"$project": {
        "Email": 1,
        "hours": {"$divide": ["$mins", 60]},
    }},
    {"$project": {
        "Email": 1,
        "days": {"$divide": ["$hours", 24]},
    }},
    {"$group": {
        "_id": "$Email",
        "avgDays": {"$avg": "$days"},
    }},
    {"$project": {
        "Email": "$_id",
        "Average days": "$avgDays",
        "_id": 0,
    }},
])

# Definitions

In [220]:
def _explain_exec_time_ms(node):
    """Return ``executionTimeMillis`` from one explain node (a shard entry or a whole reply)."""
    if "stages" in node:
        # Pipeline-style explain: the stats sit under the leading $cursor stage.
        return node["stages"][0]["$cursor"]["executionStats"]["executionTimeMillis"]
    # Otherwise the execution stats are expected directly on the node.
    return node["executionStats"]["executionTimeMillis"]


def get_aggregation_execTimeMs(query, db, collection):
    """Explain an aggregation with ``executionStats`` verbosity and return its run time.

    Parameters
    ----------
    query : list
        Aggregation pipeline (list of stage documents).
    db : object
        Database handle exposing ``command(name, value, **kwargs)``.
    collection : str or object
        Name of the collection to aggregate on, or an object with a ``name``
        attribute holding that name.

    Returns
    -------
    The largest ``executionTimeMillis`` reported; one value per shard when the
    reply has a ``shards`` section, otherwise the single value of the reply.

    Raises
    ------
    KeyError
        If the explain reply has neither of the layouts handled here.
    """
    # Honour the `collection` argument instead of always explaining "awards".
    target = getattr(collection, "name", collection)
    explain = db.command(
        'explain',
        {
            'aggregate': target,
            'pipeline': query,
            'cursor': {}
        },
        verbosity='executionStats'
    )
    # Sharded replies carry one node per shard; any other reply is treated as a single node.
    if "shards" in explain:
        nodes = list(explain["shards"].values())
    else:
        nodes = [explain]
    # Keep only the longest execution time
    times = [_explain_exec_time_ms(node) for node in nodes]
    return np.max(times)

In [221]:
def run_query_n_times(query, db, collection, n):
    """Measure `query` `n` times in a row and return the measurements as a list.

    Every measurement is obtained from
    ``get_aggregation_execTimeMs(query, db, collection)`` and echoed to stdout
    as soon as it is taken.
    """
    def _timed_once():
        # One measurement, reported immediately.
        elapsed_ms = get_aggregation_execTimeMs(query, db, collection)
        print(f"Ran in {elapsed_ms}ms")
        return elapsed_ms

    return [_timed_once() for _ in range(n)]

In [222]:
def initialize_df(queries, n):
    """Build the zero-filled results table, one row per query.

    Parameters
    ----------
    queries : list
        The pipelines to benchmark; stored as-is in the 'query' column.
    n : int
        Number of timed runs per query; yields columns '1_stats' .. '<n>_stats'.

    Returns
    -------
    tuple
        ``(df, times_list, stats_list)``: the dataframe, the names of the
        per-run columns and the names of the summary columns.
    """
    times_list = [f"{run}_stats" for run in range(1, n + 1)]
    stats_list = ['min_runtime', 'max_runtime', 'all_avg_runtime']
    header = ['query_number', 'query', *times_list, *stats_list]
    row_count = len(queries)

    # Start from an all-zero table, then overwrite the two identifying columns.
    df = pd.DataFrame(np.zeros((row_count, len(header))), columns=header)
    df['query_number'] = list(range(1, row_count + 1))
    df['query'] = queries

    return df, times_list, stats_list

# Initialize a dataframe

In [223]:
# How many times each query is timed.
n = 10
df, times_list, stats_list = initialize_df(queries=queries, n=n)
df

Unnamed: 0,query_number,query,1_stats,2_stats,3_stats,4_stats,5_stats,6_stats,7_stats,8_stats,9_stats,10_stats,min_runtime,max_runtime,all_avg_runtime
0,1,[{'$group': {'_id': {'$toLower': '$institution...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,[{'$match': {'institution.city_name': re.compi...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,[{'$match': {'investigators.email_id': {'$ne':...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,[{'$match': {'institution.state_name': 'Califo...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,"[{'$group': {'_id': '$institution', 'count': {...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,6,"[{'$unwind': {'path': '$programs', 'includeArr...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,7,[{'$match': {'investigators.email_id': re.comp...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,8,[{'$match': {'investigators.email_id': 'jeremy...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Loop through the queries

In [224]:
num_shards = 2  # Only used to label the output file.

# Create the output directory before benchmarking so the final to_csv cannot fail
# after all the timing runs have completed.
os.makedirs("results", exist_ok=True)

# enumerate yields each pipeline's position directly; queries.index(query) rescans the
# list on every iteration and returns the first match when two pipelines compare equal.
for query_number, query in enumerate(queries, start=1):
    print(f"Running query {query_number}")
    times = run_query_n_times(query, db, "awards", n)

    print("Times for query", query_number, ":", times)
    row = df['query_number'] == query_number  # Selects this query's row.
    df.loc[row, times_list] = times
    df.loc[row, 'min_runtime'] = min(times)
    df.loc[row, 'max_runtime'] = max(times)
    df.loc[row, 'all_avg_runtime'] = np.mean(times)

    # Drop one min and one max value, working on a copy so `times` stays intact.
    # With fewer than three samples nothing is dropped, so the mean below never
    # runs on an empty list.
    trimmed = list(times)
    if len(trimmed) > 2:
        trimmed.remove(min(trimmed))
        trimmed.remove(max(trimmed))
    print("Times without outliers for query", query_number, ":", trimmed)

    df.loc[row, 'avg_runtime_no_outliers'] = np.mean(trimmed)

df.to_csv(f"results/S{num_shards}_results.csv", index=False)

Running query 1
Ran in 2937ms
Ran in 2926ms
Ran in 2928ms
Ran in 2922ms
Ran in 2947ms
Ran in 2935ms
Ran in 2916ms
Ran in 2923ms
Ran in 2940ms
Ran in 2923ms
Times for query 1 : [2937, 2926, 2928, 2922, 2947, 2935, 2916, 2923, 2940, 2923]
Times without outliers for query 1 : [2937, 2926, 2928, 2922, 2935, 2923, 2940, 2923]
Running query 2
Ran in 1009ms
Ran in 1012ms
Ran in 1010ms
Ran in 1010ms
Ran in 1010ms
Ran in 1009ms
Ran in 1010ms
Ran in 1011ms
Ran in 1011ms
Ran in 1011ms
Times for query 2 : [1009, 1012, 1010, 1010, 1010, 1009, 1010, 1011, 1011, 1011]
Times without outliers for query 2 : [1010, 1010, 1010, 1009, 1010, 1011, 1011, 1011]
Running query 3
Ran in 1426ms
Ran in 1433ms
Ran in 1425ms
Ran in 1436ms
Ran in 1416ms
Ran in 1425ms
Ran in 1445ms
Ran in 1421ms
Ran in 1421ms
Ran in 1429ms
Times for query 3 : [1426, 1433, 1425, 1436, 1416, 1425, 1445, 1421, 1421, 1429]
Times without outliers for query 3 : [1426, 1433, 1425, 1436, 1425, 1421, 1421, 1429]
Running query 4
Ran in 1126ms
R

In [227]:
# Display the results table after the benchmark loop has filled it in.
df

Unnamed: 0,query_number,query,1_stats,2_stats,3_stats,4_stats,5_stats,6_stats,7_stats,8_stats,9_stats,10_stats,min_runtime,max_runtime,all_avg_runtime,avg_runtime_no_outliers
0,1,[{'$group': {'_id': {'$toLower': '$institution...,2937.0,2926.0,2928.0,2922.0,2947.0,2935.0,2916.0,2923.0,2940.0,2923.0,2916.0,2947.0,2929.7,2929.25
1,2,[{'$match': {'institution.city_name': re.compi...,1009.0,1012.0,1010.0,1010.0,1010.0,1009.0,1010.0,1011.0,1011.0,1011.0,1009.0,1012.0,1010.3,1010.25
2,3,[{'$match': {'investigators.email_id': {'$ne':...,1426.0,1433.0,1425.0,1436.0,1416.0,1425.0,1445.0,1421.0,1421.0,1429.0,1416.0,1445.0,1427.7,1427.0
3,4,[{'$match': {'institution.state_name': 'Califo...,1126.0,1125.0,1124.0,1127.0,1123.0,1124.0,1126.0,1126.0,1126.0,1125.0,1123.0,1127.0,1125.2,1125.25
4,5,"[{'$group': {'_id': '$institution', 'count': {...",4483.0,4422.0,4483.0,4431.0,4520.0,4494.0,4500.0,4475.0,4482.0,4457.0,4422.0,4520.0,4474.7,4475.625
5,6,"[{'$unwind': {'path': '$programs', 'includeArr...",2570.0,2594.0,2585.0,2573.0,2581.0,2567.0,2565.0,2562.0,2576.0,2578.0,2562.0,2594.0,2575.1,2574.375
6,7,[{'$match': {'investigators.email_id': re.comp...,1490.0,1526.0,1499.0,1513.0,1503.0,1513.0,1489.0,1516.0,1509.0,1494.0,1489.0,1526.0,1505.2,1504.625
7,8,[{'$match': {'investigators.email_id': 'jeremy...,3.0,4.0,4.0,4.0,4.0,3.0,3.0,3.0,4.0,4.0,3.0,4.0,3.6,3.625
