In [1]:
import sys
import json
import csv
import yaml

import importlib

import pandas as pd
import numpy as np
import math

import matplotlib as mpl
import matplotlib.pyplot as plt
import os
from dotenv import load_dotenv

from datetime import time
from datetime import date
from datetime import datetime
# with the above choices, the imported datetime.time(2023,07,01) is recognized
# from datetime import date
# from datetime import datetime

import pprint

import psycopg2
from sqlalchemy import create_engine, text as sql_text

# Make the local utilities module util.py (kept in the helper_functions/ folder) importable
sys.path.append('helper_functions/')
# import util as util
import util

In [2]:
# Open a MongoDB client. No host/port/URI is passed, so pymongo uses its
# defaults (localhost:27017 according to the pymongo documentation).
from pymongo import MongoClient

client = MongoClient()

In [3]:
# Handle to the "airbnb" database (it may already exist or be created later).
db = client.airbnb


# Sanity check: list what this client can currently see.
print('The list of all databases currently in the MongoDB client is:')
print(client.list_database_names())

print('\nThe list of all collections in the airbnb database is:')
print(db.list_collection_names())
# Note: a collection such as calendar may not be listed yet; it is created
# only when its first document is inserted.

The list of all databases currently in the MongoDB client is:
['admin', 'airbnb', 'config', 'local', 'test']

The list of all collections in the airbnb database is:
['listings_with_calendar', 'listings', 'reviews', 'listings_with_reviews_m', 'calendar', 'listings_test']


### Building **listings_with_reviews_and_cal**

In [8]:
# Start from a clean slate so the cell can be re-run.
# NOTE(review): per the MongoDB docs, $out replaces an existing collection on
# success, so this drop is presumably only a safeguard — confirm before removing.
db.listings_with_reviews_and_cal.drop()

# Aggregation run against listings_with_reviews_m: attach the per-listing
# calendar summary from listings_with_calendar and write the merged documents
# to listings_with_reviews_and_cal.
pipeline = [
    {
        # Join: listing `id` (this collection) == `_id` of listings_with_calendar.
        # Matches are collected in the array field `cal_docs`.
        '$lookup':
        {
            'from': 'listings_with_calendar',
            'localField': 'id',
            'foreignField': '_id',
            'as': 'cal_docs'
        }
    },
    {
        # Turn the `cal_docs` array into a single embedded document.
        # preserveNullAndEmptyArrays=True keeps listings that had no calendar match.
        '$unwind':
        {
            'path': '$cal_docs',
            'preserveNullAndEmptyArrays': True
        }
    },
    {
        # Copy the four calendar fields up to the top level of the listing.
        # NOTE(review): for listings without a calendar match `cal_docs` is absent,
        # so these fields are presumably absent in the output as well — TODO confirm.
        '$addFields':{
            'average_price': '$$ROOT.cal_docs.average_price',
            'first_available_date': '$$ROOT.cal_docs.first_available_date',
            'last_available_date': '$$ROOT.cal_docs.last_available_date',
            'dates_list': '$$ROOT.cal_docs.dates_list'

        }
    },
    {
        # The embedded copy is no longer needed once its fields are promoted.
        '$unset': 'cal_docs'
    },
    {
        # Persist the result as a collection instead of returning documents.
        '$out': 'listings_with_reviews_and_cal'
    }
]

# Time the aggregation with wall-clock timestamps taken before and after.
time1 = datetime.now()
test1 = db.listings_with_reviews_m.aggregate(pipeline)
time2 = datetime.now()
# util.time_diff is a project helper; presumably it returns the elapsed time as
# a number (it is formatted with '.4f' below) — units not visible here.
diff = util.time_diff(time1, time2)

print('\nTime it took was:', format(diff, '.4f'), '.')

# Confirm that the new collection now exists.
print(db.list_collection_names())


Time it took was: 15.9775 .
['listings_with_reviews_and_cal', 'listings_with_calendar', 'listings', 'reviews', 'listings_with_reviews_m', 'calendar', 'listings_test']


In [9]:
# Total number of documents in the collection written by the $out stage.
print(db.listings_with_reviews_and_cal.count_documents({}))

# Select the listings whose string `id` starts with "1000" (regex anchored at ^).
cursor = db.listings_with_reviews_and_cal.find({'id' : {'$regex' : '^1000.*$'}})
    
# Materialise the cursor into a list in order to count the matches.
l = list(cursor)
print(len(l))

39202
43


In [10]:
# Show one document to inspect the merged schema (listing fields, reviews, calendar fields).
db.listings_with_reviews_and_cal.find_one()

{'_id': ObjectId('665ea170e9a75767d0524773'),
 'id': '977395984065981849',
 'name': 'Home in Brooklyn · 1 bedroom · 1 bed · 1 bath',
 'host_id': '95344065',
 'host_name': 'Derek',
 'neighbourhood_group': 'Brooklyn',
 'neighbourhood': 'Sheepshead Bay',
 'latitude': 40.59179,
 'longitude': -73.94285,
 'room_type': 'Private room',
 'price': 30.0,
 'minimum_nights': 31,
 'number_of_reviews': 1,
 'last_review': datetime.datetime(2024, 1, 3, 0, 0),
 'reviews_per_month': 0.86,
 'calculated_host_listings_count': 7,
 'availability_365': 339,
 'number_of_reviews_ltm': 1,
 'license': nan,
 'reviews': [{'listing_id': '977395984065981849',
   'id': '1060927930986644037',
   'date': datetime.datetime(2024, 1, 3, 0, 0),
   'reviewer_id': '56179331',
   'reviewer_name': 'Sean',
   'comments': 'I fear it’s kind of a “you get what you pay for” situation. The place is very dirty, especially the common rooms where the cupboards are caked and grease and stickiness. To the point that even opening a drawer o

In [15]:
def isnan(val):
    """Tell whether ``val`` is NaN.

    Relies on the fact that NaN is the only value that compares unequal to
    itself, so no type check or numeric conversion is needed and non-numeric
    values (strings, None, ...) simply yield False.
    """
    differs_from_itself = val != val
    return differs_from_itself

In [26]:
def _is_nan(value):
    """Return True only for NaN (the one value that is not equal to itself)."""
    # Kept local so the converter does not depend on other notebook cells.
    return value != value


def _nan_to_none(value):
    """Map NaN to None (JSON null); pass every other value through unchanged."""
    return None if _is_nan(value) else value


def _format_date(value):
    """Format a datetime-like value as 'YYYY-MM-DD'; None for missing, falsy or NaN values."""
    # NaN is a truthy float, so a plain truthiness test would let it reach
    # .strftime and raise AttributeError.
    if not value or _is_nan(value):
        return None
    return value.strftime('%Y-%m-%d')


def _convert_review(review):
    """Convert one embedded review document into JSON-serialisable form."""
    converted = {}
    for key in ('listing_id', 'id'):
        converted[key] = _nan_to_none(review[key])
    converted['date'] = _format_date(review['date'])
    for key in ('reviewer_id', 'reviewer_name', 'comments'):
        converted[key] = _nan_to_none(review[key])
    return converted


def _convert_calendar_day(day):
    """Convert one entry of `dates_list` into JSON-serialisable form."""
    converted = {'date': _format_date(day['date'])}
    for key in ('price', 'minimum_nights', 'maximum_nights'):
        converted[key] = _nan_to_none(day[key])
    converted['available'] = day['available']
    return converted


def convert_lwrac_to_json(doc):
    """Convert a `listings_with_reviews_and_cal` document into a JSON-serialisable dict.

    Conversions applied:
      * `_id` is turned into a string.
      * NaN values become None (JSON null), except `license`, which becomes "".
      * Date values are formatted as 'YYYY-MM-DD'; missing or NaN dates become None.
      * The calendar-derived fields (`average_price`, `first_available_date`,
        `last_available_date`, `dates_list`) may be absent, because the
        aggregation keeps listings without a calendar match; absent values
        become None, and an absent `dates_list` becomes an empty list.

    Parameters
    ----------
    doc : dict
        A document as returned by pymongo for this collection.

    Returns
    -------
    dict
        A new dict with the same key order as before; `doc` is not modified.

    Raises
    ------
    KeyError
        If one of the listing's own fields (or `reviews`) is missing.
    """
    doc_new = {}

    doc_new['_id'] = str(doc['_id'])

    for key in ('id', 'name', 'host_id', 'host_name', 'neighbourhood_group',
                'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
                'minimum_nights', 'number_of_reviews'):
        doc_new[key] = _nan_to_none(doc[key])

    doc_new['last_review'] = _format_date(doc['last_review'])

    for key in ('reviews_per_month', 'calculated_host_listings_count',
                'availability_365', 'number_of_reviews_ltm'):
        doc_new[key] = _nan_to_none(doc[key])

    # A NaN license is exported as an empty string rather than null.
    license_value = doc['license']
    doc_new['license'] = "" if _is_nan(license_value) else license_value

    # Calendar-derived fields: use .get() because listings without a calendar
    # match do not carry them, and guard NaN so the output stays valid JSON.
    doc_new['average_price'] = _nan_to_none(doc.get('average_price'))
    doc_new['first_available_date'] = _format_date(doc.get('first_available_date'))
    doc_new['last_available_date'] = _format_date(doc.get('last_available_date'))

    doc_new['reviews'] = [_convert_review(review) for review in doc['reviews']]

    calendar_days = _nan_to_none(doc.get('dates_list')) or []
    doc_new['dates_list'] = [_convert_calendar_day(day) for day in calendar_days]

    return doc_new

In [27]:
# Re-run the "id starts with 1000" query; a fresh cursor is created here for
# the conversion loop below.
cursor = db.listings_with_reviews_and_cal.find({'id' : {'$regex' : '^1000.*$'}})

# JSON-ready versions of the selected documents, in cursor order.
output = []

for doc in cursor:
    output.append(convert_lwrac_to_json(doc))

# Number of converted documents.
print(len(output))

43


In [28]:
# Write a JSON-serialisable object to a JSON file inside a subdirectory.
# This function is also being added to util.py.
def write_dict_to_dir_json(dict, dir, filename):
    """Serialise ``dict`` as JSON into the file ``filename`` inside directory ``dir``.

    Parameters
    ----------
    dict : object
        Any object accepted by ``json.dump`` (despite the name, a list works too).
    dir : str
        Target directory; it is created (including parents) if it does not exist.
    filename : str
        Name of the file to write; an existing file is overwritten.

    Raises
    ------
    TypeError
        If ``dict`` contains values that ``json.dump`` cannot serialise.
    OSError
        If the directory cannot be created or the file cannot be written.
    """
    # The parameter names shadow builtins but are kept so keyword callers keep working.
    if dir:
        # Previously a missing output directory made open() fail with FileNotFoundError.
        os.makedirs(dir, exist_ok=True)
    with open(os.path.join(dir, filename), 'w') as fp:
        json.dump(dict, fp)

# Write the converted subset to OUTPUTS/listings_with_reviews_and_cal_subset_1000.json.
# NOTE(review): the global name `dir` shadows the builtin dir() for the rest of the notebook.
dir = 'OUTPUTS'
filename = 'listings_with_reviews_and_cal_subset_1000.json'
write_dict_to_dir_json(output, dir, filename)