In [1]:
import pickle
import pandas as pd
import pymongo
from pymongo import TEXT
import numpy as np

myclient = pymongo.MongoClient("mongodb://localhost")
mydb = myclient["yelp"]
business = mydb["business"]
review = mydb["review"]
user = mydb["user"]

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
import zipfile
import os.path

if not os.path.isfile('yelp_dataset/yelp_academic_dataset_review.json'):
    with zipfile.ZipFile('yelp_dataset/yelp_academic_dataset_review.json', 'r') as zip_ref:
        zip_ref.extractall('data')

if not os.path.isfile('yelp_dataset/yelp_academic_dataset_user.json'):
    with zipfile.ZipFile('yelp_dataset/yelp_academic_dataset_user.json', 'r') as zip_ref:
        zip_ref.extractall('data')

if not os.path.isfile('yelp_dataset/yelp_academic_dataset_business.json'):
    with zipfile.ZipFile('yelp_dataset/yelp_academic_dataset_business.json', 'r') as zip_ref:
        zip_ref.extractall('data')

In [3]:
import json

if business.count_documents({}) == 0:
    print("Loading business collection...")
    with open('yelp_dataset/yelp_academic_dataset_business.json', encoding='utf-8') as f:
        for line in f:
            business.insert_one(json.loads(line))

if review.count_documents({}) == 0:
    print("Loading review collection...")
    with open('yelp_dataset/yelp_academic_dataset_review.json', encoding='utf-8') as f:
        for line in f:
            review.insert_one(json.loads(line))
            
if user.count_documents({}) == 0:
    print("Loading user collection...")
    with open('yelp_dataset/yelp_academic_dataset_user.json', encoding='utf-8') as f:
        for line in f:
            user.insert_one(json.loads(line))

In [4]:
pipeline = [
    {
        "$match": { "state": "FL" }
    },
    {
        "$group": {
            "_id": "$city",
            "business_count": { "$sum": 1 }
        }
    },
    {
        "$sort": { "business_count": -1 }
    },
    {
        "$project": {
            "_id": 0,
            "city": "$_id",
            "business_count": 1
        }
    }
]

results = business.aggregate(pipeline)
df = pd.DataFrame(list(results))

print("Cities in FL with the Most Businesses:")
print(df)

Cities in FL with the Most Businesses:
     business_count                  city
0              9048                 Tampa
1              2221            Clearwater
2              1663      Saint Petersburg
3              1185        St. Petersburg
4              1033               Brandon
..              ...                   ...
221               1           spring hill
222               1               ​Lithia
223               1        TARPON SPRINGS
224               1          Apollo beach
225               1  Wyndlake Condominium

[226 rows x 2 columns]


In [5]:
pipeline = [
    {
        "$match": { "state": "FL" }
    },
    {
        "$group": {
            "_id": "$city",
            "business_count": { "$sum": 1 }
        }
    },
    {
        "$sort": { "business_count": -1 }
    },
    {
        "$limit": 3
    },
    {
        "$project": {
            "_id": 0,
            "city": "$_id",
            "business_count": 1
        }
    }
]

results = business.aggregate(pipeline)

df_top_cities = pd.DataFrame(list(results))

print("Top 3 Cities in FL with the Most Businesses:")
print(df_top_cities)

Top 3 Cities in FL with the Most Businesses:
   business_count              city
0            9048             Tampa
1            2221        Clearwater
2            1663  Saint Petersburg


TAMPAS BIGGESST HATER QUERY

In [6]:
business.create_index([("business_id", 1)], name="business_id_idx")
business.create_index([("city", 1)], name="city_idx")
business.create_index([("state", 1)], name="state_idx")

review.create_index([("business_id", 1)], name="review_business_id_idx")
review.create_index([("user_id", 1)], name="review_user_id_idx")

user.create_index([("user_id", 1)], name="user_id_idx")

'user_id_idx'

In [7]:
user_ids_to_remove = [
    '5XiPz5mJK_RtJQVkXIqxYg',
    'NCeW1I6C4K7qhY4kRH8cOA',
    'vq2H7lJ73VwXMDqC8DiImw'
]
result = review.delete_many({
    'user_id': {'$in': user_ids_to_remove}
})

#verify
remaining_reviews = review.count_documents({
    'user_id': {'$in': user_ids_to_remove}
})

print(f"Remaining reviews for specified user_ids: {remaining_reviews}")



Remaining reviews for specified user_ids: 0


In [8]:
pipeline_fl_businesses = [
    {"$match": {"state": "FL"}}
]

fl_businesses = list(business.aggregate(pipeline_fl_businesses))


In [9]:
import re

city_variations = ["Tampa", "Tampa Bay", "tampa", "TAMPA"]

escaped_cities = [re.escape(city) for city in city_variations]
regex_pattern = f"^({'|'.join(escaped_cities)})$"

pipeline_tampa_businesses = [
    {"$match": {
        "city": {"$regex": regex_pattern, "$options": "i"},
        "state": "FL"
    }}
]

tampa_businesses = list(business.aggregate(pipeline_tampa_businesses))


In [10]:
tampa_business_ids = [business["business_id"] for business in tampa_businesses]



In [11]:
pipeline_tampa_reviews = [
    {"$match": {"business_id": {"$in": tampa_business_ids}}}
]

tampa_reviews = list(review.aggregate(pipeline_tampa_reviews))


In [12]:
pipeline_hater_query = [

    {"$match": {"business_id": {"$in": tampa_business_ids}}},
    # Join with user collection to get user names
    {
        "$lookup": {
            "from": "user",
            "localField": "user_id",
            "foreignField": "user_id",
            "as": "user_details"
        }
    },
    {"$unwind": "$user_details"},
    {
        "$group": {
            "_id": {"user_id": "$user_id", "user_name": "$user_details.name"},
            "review_count": {"$sum": 1},
            "avg_stars": {"$avg": "$stars"}
        }
    },
    {"$match": {"review_count": {"$gt": 5}}},
    # Sort by avg_stars ascending and review_count descending
    {"$sort": { "user_name" : 1,"avg_stars": 1,  "review_count": -1}},
    {"$limit": 10},
    {
        "$project": {
            "_id": 0,
            "user_id": "$_id.user_id",
            "user_name": "$_id.user_name",
            "review_count": 1,
            "avg_stars": {"$round": ["$avg_stars", 2]}
        }
    }
]

hater_query_results = list(review.aggregate(pipeline_hater_query))

df_hater_query = pd.DataFrame(hater_query_results)

print("Top 10 Haters in Tampa, FL:")
print(df_hater_query)


Top 10 Haters in Tampa, FL:
   review_count                 user_id  user_name  avg_stars
0            14  C7fbmhCmXXhtWRZyW6Nzmg          A        1.0
1            10  k2lXKvz4iSw0OZSSojCkdw  Bob-A-Lou        1.0
2            10  YgeZ-tJ0ZEU_m41pGBng_A     Carlos        1.0
3             9  mAI_E0rVjyC4FF9PSn37rA      Terry        1.0
4             9  zHKbREByeKvgbKPxbKIweA        Vic        1.0
5             9  CGp3KducoS3lemPVanz0Xw       Lola        1.0
6             9  OL87RR63x07Y0K2xpCG4Aw     Nicole        1.0
7             9  fluPpH9ap58Y5xkPPyFQuQ       Rick        1.0
8             9  EXL55TgQAo1RRUIpZvKmWQ      Sandy        1.0
9             9  mvUw8RPzmXkVdCR4Pr_WJg      Mauro        1.0


In [13]:
# explain_cmd = {
#     "explain": {
#         "aggregate": "review",          # The name of the collection
#         "pipeline": pipeline_hater_query,
#         "cursor": {}
#     },
#     "verbosity": "executionStats"        # or "queryPlanner" / "allPlansExecution"
# }

# # Run the explain command
# explain_result = mydb.command(explain_cmd)

# # Print the explain output nicely
# print(json.dumps(explain_result, indent=4))

In [14]:
mydb.command('explain', {'aggregate': 'review', 'pipeline': pipeline_hater_query, 'cursor': {}}, verbosity='executionStats')

{'explainVersion': '1',
 'stages': [{'$cursor': {'queryPlanner': {'namespace': 'yelp.review',
     'parsedQuery': {'business_id': {'$in': ['--ARBQr1WMsTWiwOKOj-FQ',
        '--LC8cIrALInl2vyo701tg',
        '--eBbs3HpZYIym5pEw8Qdw',
        '--gJkxbsiSIwsQKbiwm_Ng',
        '--pDYWb4DzqKdAdrPcxuaA',
        '--rS-rnOIZxoiDA8yctWpQ',
        '-0i2KNr7WrCsDF5m0IViJg',
        '-0oPt7sSKtJG1ysLwV_E9g',
        '-1WM2044r3jVZC6oQ2QeVA',
        '-1oygVebK81K8JEPI6H6Lw',
        '-2C-Ll8cbgNMpEKokXK6Dw',
        '-2CPhK6ik9ZBgFX_F-dkxQ',
        '-2ZIgbTr50xq64ODisvKhw',
        '-2dvQxx3cYXd5XmFdDDsDA',
        '-2slH9PUjJ9YK16b0hGxWg',
        '-2wh7NTLkWEgsrLJvilnFQ',
        '-34c4hcDPIInTROr8Xtxtw',
        '-361Hc0tlxSYdrH_C3OgzA',
        '-3w58iNJL_sZuT0Ozpz0yQ',
        '-4HTX6yWNuRJusCTezaUnQ',
        '-53z4kzdbB9F4kPQqmQjyg',
        '-5fXJjU1oeWnCHW5MPOuxg',
        '-5iuX3tPbwH5LpoWN9QhMQ',
        '-5psHqEISccHDdHaWF2-6Q',
        '-788YqeAedOZseDJOo7kfQ',
        '-7GjicSH_r

One review at restaurant per user query

In [15]:
pipeline = [
    {
        "$lookup": {
            "from": "business",
            "localField": "business_id",
            "foreignField": "business_id",
            "as": "business_details"
        }
    },
    {
        "$unwind": "$business_details"
    },
    {
        "$match": {
            "business_details.city": {
                "$regex": regex_pattern,
                "$options": "i"
            },
            "business_details.state": {
                "$regex": "^FL$",
                "$options": "i"
            }
        }
    },
    {
        "$group": {
            "_id": {
                "business_id": "$business_id",
                "user_id": "$user_id"
            },
            "review_count": { "$sum": 1 },
            "avg_stars": { "$avg": "$stars" }
        }
    },
    {
        "$match": {
            "review_count": 1
        }
    },
    {
        "$group": {
            "_id": "$_id.business_id",
            "single_review_user_count": { "$sum": 1 },
            "avg_stars_for_single_reviews": { "$avg": "$avg_stars" }
        }
    },
    {
        "$lookup": {
            "from": "business",
            "localField": "_id",
            "foreignField": "business_id",
            "as": "business_details"
        }
    },
    {
        "$unwind": "$business_details"
    },
    
    {
        "$project": {
            "_id": 0,
            "business_id": "$_id",
            "business_name": "$business_details.name",
            "single_review_user_count": 1,
            "avg_stars_for_single_reviews": { "$round": ["$avg_stars_for_single_reviews", 2] }
        }
    },
    {
        "$sort": {
            "single_review_user_count": -1
        }
    },
    {"$limit": 3}

]

results = review.aggregate(pipeline)
df_single_review_businesses = pd.DataFrame(list(results))

print("Businesses in Tampa with High Number of Single-Review Users:")
print(df_single_review_businesses)

Businesses in Tampa with High Number of Single-Review Users:
   single_review_user_count             business_id       business_name  \
0                      3148  QHWYlmVbLC3K6eglWoHVvA                Datz   
1                      2962  L5LLN0RafiV1Z9cddzvuCw               Ulele   
2                      2831  dsfRniRgfbDjC8os848B6A  Bern's Steak House   

   avg_stars_for_single_reviews  
0                          4.14  
1                          4.17  
2                          4.26  


In [16]:
# explain_cmd = {
#     "explain": {
#         "aggregate": "review",          
#         "pipeline": pipeline,
#         "cursor": {}
#     },
#     "verbosity": "executionStats"       
# }

# explain_result = mydb.command(explain_cmd)

# print(json.dumps(explain_result, indent=4))

In [17]:
mydb.command('explain', {'aggregate': 'review', 'pipeline': pipeline, 'cursor': {}}, verbosity='executionStats')

{'explainVersion': '1',
 'stages': [{'$cursor': {'queryPlanner': {'namespace': 'yelp.review',
     'parsedQuery': {},
     'indexFilterSet': False,
     'queryHash': '4E7FE95E',
     'planCacheKey': 'D8CBDF1E',
     'optimizationTimeMillis': 0,
     'maxIndexedOrSolutionsReached': False,
     'maxIndexedAndSolutionsReached': False,
     'maxScansToExplodeReached': False,
     'prunedSimilarIndexes': False,
     'winningPlan': {'isCached': False,
      'stage': 'PROJECTION_SIMPLE',
      'transformBy': {'business_id': 1, 'stars': 1, 'user_id': 1, '_id': 0},
      'inputStage': {'stage': 'COLLSCAN', 'direction': 'forward'}},
     'rejectedPlans': []},
    'executionStats': {'executionSuccess': True,
     'nReturned': 6990277,
     'executionTimeMillis': 365113,
     'totalKeysExamined': 0,
     'totalDocsExamined': 6990277,
     'executionStages': {'isCached': False,
      'stage': 'PROJECTION_SIMPLE',
      'nReturned': 6990277,
      'executionTimeMillisEstimate': 4803,
      'works': 

Simple query

In [18]:
pipeline = [
    {
        "$match": {
            "city": { "$regex": regex_pattern, "$options": "i" },  
            "state": { "$regex": "^FL$", "$options": "i" },         
            "postal_code": { "$ne": None }                        
        }
    },
    {
        "$group": {
            "_id": "$postal_code",
            "total_businesses": { "$sum": 1 }
        }
    },
    {
        "$sort": { "total_businesses": -1 }
    },
    {
        "$limit": 10
    },
    {
        "$project": {
            "_id": 0,
            "postal_code": "$_id",
            "total_businesses": 1
        }
    }
]


results = business.aggregate(pipeline)
df_top_postal_codes = pd.DataFrame(list(results))
print("Top 10 Postal Codes in Tampa, FL with Most Businesses:")
print(df_top_postal_codes)

Top 10 Postal Codes in Tampa, FL with Most Businesses:
   total_businesses postal_code
0               794       33607
1               649       33602
2               639       33609
3               591       33618
4               580       33612
5               509       33606
6               496       33629
7               434       33614
8               413       33611
9               410       33647


In [19]:
mydb.command('explain', {'aggregate': 'review', 'pipeline': pipeline, 'cursor': {}}, verbosity='executionStats')

{'explainVersion': '2',
 'stages': [{'$cursor': {'queryPlanner': {'namespace': 'yelp.review',
     'parsedQuery': {'$and': [{'city': {'$regex': '^(Tampa|Tampa\\ Bay|tampa|TAMPA)$',
         '$options': 'i'}},
       {'state': {'$regex': '^FL$', '$options': 'i'}},
       {'postal_code': {'$not': {'$eq': None}}}]},
     'indexFilterSet': False,
     'queryHash': '452F5FAB',
     'planCacheKey': '552FAAD7',
     'optimizationTimeMillis': 0,
     'maxIndexedOrSolutionsReached': False,
     'maxIndexedAndSolutionsReached': False,
     'maxScansToExplodeReached': False,
     'prunedSimilarIndexes': False,
     'winningPlan': {'isCached': False,
      'queryPlan': {'stage': 'GROUP',
       'planNodeId': 3,
       'inputStage': {'stage': 'COLLSCAN',
        'planNodeId': 1,
        'filter': {'$and': [{'city': {'$regex': '^(Tampa|Tampa\\ Bay|tampa|TAMPA)$',
            '$options': 'i'}},
          {'state': {'$regex': '^FL$', '$options': 'i'}},
          {'postal_code': {'$not': {'$eq': None}}

In [20]:
unique_postal_codes = business.distinct("postal_code", {
        "city": { "$regex": regex_pattern, "$options": "i" },
        "state": { "$regex": "^FL$", "$options": "i" },
        "postal_code": { "$ne": None }
    })
    
distinct_postal_code_count = len(unique_postal_codes)
print(f"Distinct Postal Codes in Tampa Businesses: {distinct_postal_code_count}")


Distinct Postal Codes in Tampa Businesses: 97


In [21]:
pipeline = [
    {
        "$match": {
            "city": { "$regex": regex_pattern, "$options": "i" },  
            "state": { "$regex": "^FL$", "$options": "i" },         
            "postal_code": { "$ne": None }                        
        }
    },
    {
        "$group": {
            "_id": "$postal_code",
            "total_businesses": { "$sum": 1 }
        }
    },
    {
        "$group": {
            "_id": None,
            "min_businesses": { "$min": "$total_businesses" },
            "max_businesses": { "$max": "$total_businesses" },
            "avg_businesses": { "$avg": "$total_businesses" },
            "stddev_businesses": { "$stdDevSamp": "$total_businesses" }  
        }
    },
    {
        "$project": {
            "_id": 0,
            "min_businesses": 1,
            "max_businesses": 1,
            "avg_businesses": 1,
            "stddev_businesses": 1
        }
    }
]
results = list(business.aggregate(pipeline))
    
df_stats = pd.DataFrame(results)
print("\nBusiness Statistics per Postal Code in Tampa, FL:")
print(df_stats)


Business Statistics per Postal Code in Tampa, FL:
   min_businesses  max_businesses  avg_businesses  stddev_businesses
0               1             794       94.917526         184.141062
