In [63]:
import pandas as pd
import numpy as np
from pymongo import MongoClient
import re
from datetime import datetime

# Initiate the connectoin to the database

In [64]:
# Connect to MongoDB
server_adress = 'mesiin592022-0031.westeurope.cloudapp.azure.com:30000' 
client = MongoClient(server_adress)

# Select database
db = client['dblp']

# Select collection
collection = db['awards']

# Queries

In [65]:
# Queries
queries = []

queries.append([
    {
        '$group': {
            '_id': {
                '$toLower': '$institution.city_name'
            }, 
            'montant_cumul': {
                '$sum': 1
            }
        }
    }, {
        '$sort': {
            'montant_cumul': -1
        }
    }, {
        '$project': {
            'City Name': '$_id', 
            'Cumulative Amount': '$montant_cumul', 
            '_id': 0
        }
    }
])

In [66]:
queries.append([
    {
        '$match': {
            'institution.city_name': re.compile(r"New York(?i)")
        }
    }, {
        '$group': {
            '_id': {
                'instName': '$institution.name', 
                'instZip': '$institution.zipcode'
            }, 
            'quantity': {
                '$sum': 1
            }
        }
    }, {
        '$sort': {
            'quantity': -1
        }
    }, {
        '$project': {
            'Insitution name': '$_id.instName', 
            'Insitution zip': '$_id.instZip', 
            'Awards received': '$quantity', 
            '_id': 0
        }
    }
])

In [67]:
queries.append([
    {
        '$match': {
            'investigators.email_id': {
                '$ne': None
            }
        }
    }, {
        '$group': {
            '_id': '$investigators.email_id', 
            'setDomaines': {
                '$addToSet': '$foa_info'
            }, 
            'email': {
                '$first': '$investigators.email_id'
            }
        }
    }, {
        '$project': {
            'email': '$_id', 
            'Number of domains': {
                '$size': '$setDomaines'
            }, 
            '_id': 0
        }
    }, {
        '$sort': {
            'Number of domains': -1
        }
    }, {
        '$limit': 10
    }
])

In [68]:
queries.append([
    {
        '$match': {
            'institution.state_name': 'California'
        }
    }, {
        '$group': {
            '_id': '$organisation_code', 
            'nbInstitutions': {
                '$sum': 1
            }
        }
    }, {
        '$sort': {
            'nbInstitutions': -1
        }
    }, {
        '$limit': 5
    }
])

In [69]:
queries.append([
    {
        '$group': {
            '_id': '$institution', 
            'count': {
                '$sum': 1
            }, 
            'averageAwardAmount': {
                '$avg': '$amount'
            }, 
            'maxAwardAmount': {
                '$max': '$amount'
            }, 
            'minAwardAmount': {
                '$min': '$amount'
            }, 
            'stdAwardAmount': {
                '$stdDevPop': '$amount'
            }
        }
    }, {
        '$sort': {
            'count': -1
        }
    }
])

In [70]:
queries.append([
    {
        '$unwind': {
            'path': '$programs', 
            'includeArrayIndex': 'string', 
            'preserveNullAndEmptyArrays': False
        }
    }, {
        '$group': {
            '_id': '$programs', 
            'nb_amount': {
                '$sum': '$amount'
            }
        }
    }, {
        '$sort': {
            'nb_amount': -1
        }
    }, {
        '$project': {
            '_id': 1, 
            'nb_amount': 1
        }
    }
])

In [71]:
queries.append([
    {
        '$match': {
            'investigators.email_id': re.compile(r"(a)(?i)")
        }
    }, {
        '$group': {
            '_id': {
                'date': {
                    '$substr': [
                        '$effective_date', 6, -1
                    ]
                }, 
                'invest': '$investigators.email_id'
            }, 
            'nb': {
                '$sum': 1
            }
        }
    }, {
        '$project': {
            'Investigator': '$_id.invest', 
            'Date': '$_id.date', 
            'Number': '$nb', 
            '_id': 0
        }
    }, {
        '$sort': {
            'Number': -1
        }
    }
])

In [72]:
queries.append([
    {
        '$match': {
            'investigators.email_id': 'jeremygtaylor@compuserve.com'
        }
    }, {
        '$project': {
            'Email': '$investigators.email_id', 
            'date': {
                '$toDate': '$investigators.start_date'
            }, 
            'today': datetime.utcnow()
        }
    }, {
        '$addFields': {
            '_id': '$_id', 
            'dateDiff': {
                '$subtract': [
                    '$today', '$date'
                ]
            }
        }
    }, {
        '$project': {
            'Email': 1, 
            'dateDiff': 1, 
            'secs': {
                '$divide': [
                    '$dateDiff', 1000
                ]
            }
        }
    }, {
        '$project': {
            'Email': 1, 
            'mins': {
                '$divide': [
                    '$secs', 60
                ]
            }
        }
    }, {
        '$project': {
            'Email': 1, 
            'hours': {
                '$divide': [
                    '$mins', 60
                ]
            }
        }
    }, {
        '$project': {
            'Email': 1, 
            'days': {
                '$divide': [
                    '$hours', 24
                ]
            }
        }
    }, {
        '$group': {
            '_id': '$Email', 
            'avgDays': {
                '$avg': '$days'
            }
        }
    }, {
        '$project': {
            'Email': '$_id', 
            'Average days': '$avgDays', 
            '_id': 0
        }
    }
])

# Execute the queries

In [73]:
# Create a dataframe with 8 rows for the 8 queries and 10 columns for the 10 runtimes
stats_list = [f"{i}_stats" for i in range(1, 11)]
columns = ['query_number', 'query']
columns.extend(stats_list)
df = pd.DataFrame(np.zeros((len(queries),12)), columns=columns)
# Fill the dataframe with the query number and the query
df['query_number'] = [i for i in range(1, len(queries)+1)]
df['query'] = queries
df

Unnamed: 0,query_number,query,1_stats,2_stats,3_stats,4_stats,5_stats,6_stats,7_stats,8_stats,9_stats,10_stats
0,1,[{'$group': {'_id': {'$toLower': '$institution...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,[{'$match': {'institution.city_name': re.compi...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,[{'$match': {'investigators.email_id': {'$ne':...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,[{'$match': {'institution.state_name': 'Califo...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,"[{'$group': {'_id': '$institution', 'count': {...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,6,"[{'$unwind': {'path': '$programs', 'includeArr...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,7,[{'$match': {'investigators.email_id': re.comp...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,8,[{'$match': {'investigators.email_id': 'jeremy...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [74]:
query = df[df['query_number'] == 1]['query'].values[0]
query

[{'$group': {'_id': {'$toLower': '$institution.city_name'},
   'montant_cumul': {'$sum': 1}}},
 {'$sort': {'montant_cumul': -1}},
 {'$project': {'City Name': '$_id',
   'Cumulative Amount': '$montant_cumul',
   '_id': 0}}]

In [92]:
cursor = db.command(
    'explain', 
    {
        'aggregate': "awards", 
        'pipeline': query, 
        'cursor': {}
    }, 
    verbosity='executionStats'
)
cursor

{'serverInfo': {'host': 'MESIIN592022-0031',
  'port': 30000,
  'version': '4.4.17',
  'gitVersion': '85de0cc83f4dc64dbbac7fe028a4866228c1b5d1'},
 'splitPipeline': None,
 'shards': {'RS1': {'host': 'MESIIN592022-0047:27017',
   'stages': [{'$cursor': {'queryPlanner': {'plannerVersion': 1,
       'namespace': 'dblp.awards',
       'indexFilterSet': False,
       'parsedQuery': {},
       'queryHash': '55955545',
       'planCacheKey': '55955545',
       'winningPlan': {'stage': 'PROJECTION_DEFAULT',
        'transformBy': {'institution.city_name': 1, '_id': 0},
        'inputStage': {'stage': 'SHARDING_FILTER',
         'inputStage': {'stage': 'COLLSCAN', 'direction': 'forward'}}},
       'rejectedPlans': []},
      'executionStats': {'executionSuccess': True,
       'nReturned': 1351992,
       'executionTimeMillis': 3651,
       'totalKeysExamined': 0,
       'totalDocsExamined': 1351992,
       'executionStages': {'stage': 'PROJECTION_DEFAULT',
        'nReturned': 1351992,
        '

In [56]:
collection.aggregate(query, {"explain": "true"})

AttributeError: 'dict' object has no attribute '_txn_read_preference'

In [7]:
# For each query, run it 10 times and store the runtime in the dataframe
for i in range(len(queries)):
    query = df[df['query_number'] == i+1]['query'].values[0]
    print("Executing query", i+1, ":", query)
    for j in range(1, 11):
        # Run the query
        cursor = collection.aggregate(query)
        # Store the runtime
        df.loc[i, f"{j}_stats"] = cursor.explain()['executionStats']['executionTimeMillis']
        print("Query", i+1, "run", j, "done")

Executing query 1 : {'investigators.email_id': 'tilarson@vt.edu'}
Query 1 run 1 done
Query 1 run 2 done
Query 1 run 3 done
Query 1 run 4 done
Query 1 run 5 done
Query 1 run 6 done
Query 1 run 7 done
Query 1 run 8 done
Query 1 run 9 done
Query 1 run 10 done
Executing query 2 : {'investigators.email_id': 'jziegert@uncc.edu'}
Query 2 run 1 done
Query 2 run 2 done
Query 2 run 3 done
Query 2 run 4 done
Query 2 run 5 done
Query 2 run 6 done
Query 2 run 7 done
Query 2 run 8 done
Query 2 run 9 done
Query 2 run 10 done
Executing query 3 : {'insitution.state_name': 'California'}
Query 3 run 1 done
Query 3 run 2 done
Query 3 run 3 done
Query 3 run 4 done
Query 3 run 5 done
Query 3 run 6 done
Query 3 run 7 done
Query 3 run 8 done
Query 3 run 9 done
Query 3 run 10 done


In [8]:
df

Unnamed: 0,query_number,query,1_stats,2_stats,3_stats,4_stats,5_stats,6_stats,7_stats,8_stats,9_stats,10_stats
0,1,{'investigators.email_id': 'tilarson@vt.edu'},12.0,2.0,2.0,2.0,3.0,2.0,1.0,2.0,1.0,2.0
1,2,{'investigators.email_id': 'jziegert@uncc.edu'},2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
2,3,{'insitution.state_name': 'California'},997.0,993.0,995.0,998.0,999.0,995.0,997.0,997.0,997.0,998.0
