# Data Cleaning
- Find major cities
- Filter by category


In [1]:
import lightfm
import scipy as sp
import pymongo
import pandas as pd
import numpy as np



In [22]:
# Read in data
# Connect to mongodb on 192.168.1.87
client = pymongo.MongoClient(
    host='192.168.1.87',
    port=27017)
# Get the yelp database
db = client.yelp
review = db.review
business = db.business
user = db.user

In [23]:
# Get count of business by city and state
business_by_city = business.aggregate(
    [
        {
            '$group': {
                '_id': {
                    'city': '$city',
                    'state': '$state'
                },
                'count': {
                    '$sum': 1
                }
            }
        }, {
            '$sort': {
                'count': -1
            }
        }
    ])
business_by_city = pd.DataFrame(list(business_by_city))

In [24]:
business_by_city.head(10)

Unnamed: 0,_id,count
0,"{'city': 'Philadelphia', 'state': 'PA'}",14567
1,"{'city': 'Tucson', 'state': 'AZ'}",9249
2,"{'city': 'Tampa', 'state': 'FL'}",9048
3,"{'city': 'Indianapolis', 'state': 'IN'}",7540
4,"{'city': 'Nashville', 'state': 'TN'}",6968
5,"{'city': 'New Orleans', 'state': 'LA'}",6208
6,"{'city': 'Reno', 'state': 'NV'}",5932
7,"{'city': 'Edmonton', 'state': 'AB'}",5054
8,"{'city': 'Saint Louis', 'state': 'MO'}",4827
9,"{'city': 'Santa Barbara', 'state': 'CA'}",3829


In [25]:
# Check business types
# Split "categories" column into list based on "," delim
business_cats = business.aggregate([
    {
        '$project': {
            "categories": {"$split": ["$categories", ", "]}
        }
    },
    {
        '$unwind': {
            'path': '$categories'
        }
    }, {
        '$group': {
            '_id': '$categories',
            'count': {
                '$sum': 1
            }
        }
    }, {
        '$sort': {
            'count': -1
        }
    }
])
businesses_by_category = pd.DataFrame(list(business_cats))
businesses_by_category.head(20)

Unnamed: 0,_id,count
0,Restaurants,52268
1,Food,27781
2,Shopping,24395
3,Home Services,14356
4,Beauty & Spas,14292
5,Nightlife,12281
6,Health & Medical,11890
7,Local Services,11198
8,Bars,11065
9,Automotive,10773


In [26]:
keep_categories = [
    'Restaurants',
    'Food',
    'Shopping',
    'Beauty & Spas',
    'Nightlife',
    'Bars',
    'Sandwiches',
    'American (Traditional)',
    'Pizza',
    'Coffee & Tea',
    'Fast Food',
    'Breakfast & Brunch	',
    'American (New)',
    ]
# Get list of business ids that are in the categories we want to keep
# First split the categories column into a list of categories based on the "," delim
# Then match on the categories we want to keep
businesses_by_category = business.aggregate([
    {
        '$project': {
            "categories": {"$split": ["$categories", ", "]},
            "business_id": "$business_id",
            "_id": 0
        }
    },
    {
        '$unwind': {
            'path': '$categories'
        }
    }, {
        '$match': {
            'categories': {
                '$in': keep_categories
            }
        },
    }
])
businesses_by_category = pd.DataFrame(list(businesses_by_category))
businesses_by_category.head(10)


Unnamed: 0,categories,business_id
0,Shopping,tUFrWirKiKi_TAnsVWINQQ
1,Restaurants,MUTTqe8uqyMdBl186RmNeA
2,Food,0bPLkL0QhhPO5kt1_EXmNQ
3,Restaurants,0bPLkL0QhhPO5kt1_EXmNQ
4,Shopping,M0XSSHqrASOnhgbWDJIpQA
5,Coffee & Tea,WKMJwqnfZKsAae75RMP6jA
6,Food,WKMJwqnfZKsAae75RMP6jA
7,Bars,WKMJwqnfZKsAae75RMP6jA
8,Restaurants,WKMJwqnfZKsAae75RMP6jA
9,Nightlife,WKMJwqnfZKsAae75RMP6jA


In [28]:
# Return businesses where the business_id is in businesses_by_category['business_id']
kept_bus = business.find({
    'business_id': {
        '$in': businesses_by_category['business_id'].tolist()
    }
})
kept_bus = pd.DataFrame(list(kept_bus))
kept_bus.head(10)

Unnamed: 0,_id,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,631ea3aea5cde8cc0d6d80f6,---kPU91CF4Lq2-WlRu9Lw,Frankie's Raw Bar,4903 State Rd 54,New Port Richey,FL,34652,28.217288,-82.733344,4.5,24,1,"{'Alcohol': 'u'none'', 'OutdoorSeating': 'True...","Seafood, Restaurants, Latin American, Food, Fo...","{'Monday': '0:0-0:0', 'Tuesday': '11:0-21:0', ..."
1,631ea3b2a5cde8cc0d6edb40,--0iUa4sNDFiZFrAdIWhZQ,Pupuseria Y Restaurant Melba,6 S White Horse Pike,Clementon,NJ,08021,39.81785,-74.993364,3.0,14,1,"{'Alcohol': 'u'none'', 'RestaurantsDelivery': ...","Food, Restaurants, Specialty Food, Mexican, Et...",
2,631ea3afa5cde8cc0d6dbdd1,--7PUidqRWpRSpXebiyxTg,Humpty's Family Restaurant,9910 108A Avenue,Edmonton,AB,T5H,53.554659,-113.49304,2.0,12,0,"{'GoodForKids': 'True', 'RestaurantsAttire': '...","Breakfast & Brunch, Restaurants",
3,631ea3b0a5cde8cc0d6df47e,--8IbOsAAxjKRoYsBFL-PA,The Original Italian Pie,4706 Paris Ave,Gentilly,LA,70122,30.006341,-90.074523,3.0,27,0,"{'RestaurantsDelivery': 'True', 'RestaurantsRe...","Food, Restaurants, Italian","{'Monday': '11:0-21:30', 'Tuesday': '11:0-21:3..."
4,631ea3b2a5cde8cc0d6eb869,--ARBQr1WMsTWiwOKOj-FQ,Traveling Corks,3219 Bay To Bay Blvd,Tampa,FL,33629,27.919884,-82.495936,4.5,23,0,"{'RestaurantsGoodForGroups': 'True', 'Restaura...","Wine Bars, Nightlife, Bars","{'Tuesday': '15:0-22:0', 'Wednesday': '15:0-22..."
5,631ea3aea5cde8cc0d6d882a,--FWWsIwxRwuw9vIMImcQg,Blue River Canyon Day Spa & Store,4908 Thoroughbred Ln,Brentwood,TN,37027,36.036732,-86.791045,3.5,8,0,"{'RestaurantsPriceRange2': '3', 'ByAppointment...","Day Spas, Beauty & Spas","{'Monday': '9:0-19:30', 'Tuesday': '9:0-19:30'..."
6,631ea3b1a5cde8cc0d6e81d8,--LC8cIrALInl2vyo701tg,Studio G Salon,6537 Gunn Hwy,Tampa,FL,33625,28.065886,-82.559301,5.0,8,1,"{'BikeParking': 'True', 'BusinessParking': '{'...","Hair Salons, Hair Stylists, Beauty & Spas","{'Tuesday': '9:0-17:0', 'Wednesday': '9:0-20:0..."
7,631ea3b0a5cde8cc0d6e31a4,--MbOh2O1pATkXa7xbU6LA,Sweet Lizard,"6360 N Campbell, Ste 130",Tucson,AZ,85718,32.322391,-110.928958,4.0,25,0,"{'WiFi': ''no'', 'BikeParking': 'True', 'Busin...","Food, Ice Cream & Frozen Yogurt","{'Monday': '12:0-21:0', 'Tuesday': '12:0-21:0'..."
8,631ea3b1a5cde8cc0d6e4bef,--S43ruInmIsGrnnkmavRw,Peaches Records,4318 Magazine St,New Orleans,LA,70115,29.920622,-90.100819,3.5,91,1,"{'RestaurantsPriceRange2': '2', 'WheelchairAcc...","Music & DVDs, Vinyl Records, Fashion, Women's ...","{'Monday': '0:0-0:0', 'Tuesday': '10:0-17:0', ..."
9,631ea3afa5cde8cc0d6da487,--SJXpAa0E-GCp2smaHf0A,Winn Dixie,10667 Big Bend Rd,Riverview,FL,33579,27.791333,-82.33224,2.5,13,1,"{'BikeParking': 'False', 'RestaurantsPriceRang...","Grocery, Beer, Wine & Spirits, Food","{'Monday': '7:0-23:0', 'Tuesday': '7:0-23:0', ..."


In [34]:
print(f'Number of businesses: {len(kept_bus):,}')
print(f"That are in any of these categories: \n{keep_categories}")

Number of businesses: 100,623
That are in any of these categories: 
['Restaurants', 'Food', 'Shopping', 'Beauty & Spas', 'Nightlife', 'Bars', 'Sandwiches', 'American (Traditional)', 'Pizza', 'Coffee & Tea', 'Fast Food', 'Breakfast & Brunch\t', 'American (New)']


In [35]:
# Write data to csv in FilteredData folder
kept_bus.to_csv('FilteredData/business.csv', index=False)

In [37]:
# Read to make sure it looks the same
kept_bus = pd.read_csv('FilteredData/business.csv')
kept_bus.head(10)

Unnamed: 0,_id,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,631ea3aea5cde8cc0d6d80f6,---kPU91CF4Lq2-WlRu9Lw,Frankie's Raw Bar,4903 State Rd 54,New Port Richey,FL,34652,28.217288,-82.733344,4.5,24,1,"{'Alcohol': ""u'none'"", 'OutdoorSeating': 'True...","Seafood, Restaurants, Latin American, Food, Fo...","{'Monday': '0:0-0:0', 'Tuesday': '11:0-21:0', ..."
1,631ea3b2a5cde8cc0d6edb40,--0iUa4sNDFiZFrAdIWhZQ,Pupuseria Y Restaurant Melba,6 S White Horse Pike,Clementon,NJ,08021,39.81785,-74.993364,3.0,14,1,"{'Alcohol': ""u'none'"", 'RestaurantsDelivery': ...","Food, Restaurants, Specialty Food, Mexican, Et...",
2,631ea3afa5cde8cc0d6dbdd1,--7PUidqRWpRSpXebiyxTg,Humpty's Family Restaurant,9910 108A Avenue,Edmonton,AB,T5H,53.554659,-113.49304,2.0,12,0,"{'GoodForKids': 'True', 'RestaurantsAttire': ""...","Breakfast & Brunch, Restaurants",
3,631ea3b0a5cde8cc0d6df47e,--8IbOsAAxjKRoYsBFL-PA,The Original Italian Pie,4706 Paris Ave,Gentilly,LA,70122,30.006341,-90.074523,3.0,27,0,"{'RestaurantsDelivery': 'True', 'RestaurantsRe...","Food, Restaurants, Italian","{'Monday': '11:0-21:30', 'Tuesday': '11:0-21:3..."
4,631ea3b2a5cde8cc0d6eb869,--ARBQr1WMsTWiwOKOj-FQ,Traveling Corks,3219 Bay To Bay Blvd,Tampa,FL,33629,27.919884,-82.495936,4.5,23,0,"{'RestaurantsGoodForGroups': 'True', 'Restaura...","Wine Bars, Nightlife, Bars","{'Tuesday': '15:0-22:0', 'Wednesday': '15:0-22..."
5,631ea3aea5cde8cc0d6d882a,--FWWsIwxRwuw9vIMImcQg,Blue River Canyon Day Spa & Store,4908 Thoroughbred Ln,Brentwood,TN,37027,36.036732,-86.791045,3.5,8,0,"{'RestaurantsPriceRange2': '3', 'ByAppointment...","Day Spas, Beauty & Spas","{'Monday': '9:0-19:30', 'Tuesday': '9:0-19:30'..."
6,631ea3b1a5cde8cc0d6e81d8,--LC8cIrALInl2vyo701tg,Studio G Salon,6537 Gunn Hwy,Tampa,FL,33625,28.065886,-82.559301,5.0,8,1,"{'BikeParking': 'True', 'BusinessParking': ""{'...","Hair Salons, Hair Stylists, Beauty & Spas","{'Tuesday': '9:0-17:0', 'Wednesday': '9:0-20:0..."
7,631ea3b0a5cde8cc0d6e31a4,--MbOh2O1pATkXa7xbU6LA,Sweet Lizard,"6360 N Campbell, Ste 130",Tucson,AZ,85718,32.322391,-110.928958,4.0,25,0,"{'WiFi': ""'no'"", 'BikeParking': 'True', 'Busin...","Food, Ice Cream & Frozen Yogurt","{'Monday': '12:0-21:0', 'Tuesday': '12:0-21:0'..."
8,631ea3b1a5cde8cc0d6e4bef,--S43ruInmIsGrnnkmavRw,Peaches Records,4318 Magazine St,New Orleans,LA,70115,29.920622,-90.100819,3.5,91,1,"{'RestaurantsPriceRange2': '2', 'WheelchairAcc...","Music & DVDs, Vinyl Records, Fashion, Women's ...","{'Monday': '0:0-0:0', 'Tuesday': '10:0-17:0', ..."
9,631ea3afa5cde8cc0d6da487,--SJXpAa0E-GCp2smaHf0A,Winn Dixie,10667 Big Bend Rd,Riverview,FL,33579,27.791333,-82.33224,2.5,13,1,"{'BikeParking': 'False', 'RestaurantsPriceRang...","Grocery, Beer, Wine & Spirits, Food","{'Monday': '7:0-23:0', 'Tuesday': '7:0-23:0', ..."


In [38]:
# Get reviews for businesses in kept_bus
kept_reviews = review.find({
    'business_id': {
        '$in': kept_bus['business_id'].tolist()
    }
})
kept_reviews = pd.DataFrame(list(kept_reviews))
kept_reviews.head(10)

Unnamed: 0,_id,review_id,user_id,business_id,stars,useful,funny,cool,text,date,compliment_count
0,631e9f7fedf65856ab0dfe37,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3.0,0.0,0.0,0.0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30,
1,631e9f7fedf65856ab0dfe3a,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,0.0,0.0,0.0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11,
2,631e9f7fedf65856ab0dfe3b,JrIxlS1TzJ-iCu79ul40cQ,eUta8W_HdHMXPzLBBZhL1A,04UD14gamNjLY0IDYVhHJg,1.0,1.0,2.0,1.0,I am a long term frequent customer of this est...,2015-09-23 23:10:31,
3,631e9f7fedf65856ab0dfe3d,_ZeMknuYdlQcUqng_Im3yg,yfFzsLmaWF2d4Sr0UNbBgg,LHSTtnW3YHCeUkRDGyJOyw,5.0,2.0,0.0,0.0,Amazingly amazing wings and homemade bleu chee...,2015-08-07 02:29:16,
4,631e9f7fedf65856ab0dfe3e,8JFGBuHMoiNDyfcxuWNtrA,smOvOajNG0lS4Pq7d8g4JQ,RZtGWDLCAtuipwaZ-UfjmQ,4.0,0.0,0.0,0.0,Good food--loved the gnocchi with marinara\nth...,2009-10-14 19:57:14,
5,631e9f7fedf65856ab0dfe3f,XW_LfMv0fV21l9c6xQd_lw,9OAtfnWag-ajVxRbUTGIyg,lj-E32x9_FA7GmUrBGBEWg,4.0,0.0,0.0,0.0,Love going here for happy hour or dinner! Gre...,2014-06-27 22:44:01,
6,631e9f7fedf65856ab0dfe40,l3Wk_mvAog6XANIuGQ9C7Q,ZbqSHbgCjzVAqaa7NKWn5A,EQ-TZ2eeD_E0BHuvoaeG5Q,4.0,0.0,0.0,0.0,"Locals recommended Milktooth, and it's an amaz...",2015-08-19 14:31:45,
7,631e9f7fedf65856ab0dfe41,LnGZB0fjfgeVDVz5IHuEVA,j2wlzrntrbKwyOcOiB3l3w,rBdG_23USc7DletfZ11xGA,4.0,1.0,0.0,0.0,The hubby and I have been here on multiple occ...,2014-08-10 19:41:43,
8,631e9f7fedf65856ab0dfe42,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5.0,1.0,0.0,1.0,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03,
9,631e9f7fedf65856ab0dfe43,J-4NdnDZ0pUQaUEEwDI9KQ,vrKkXsozqqecF3CW4cGaVQ,rjuWz_AD3WfXJc03AhIO_w,5.0,2.0,2.0,2.0,I thoroughly enjoyed the show. Chill way to s...,2012-12-04 16:46:20,


In [50]:
# Print out all column types in kept_reviews
for col in kept_reviews.columns:
    print(f'{col}: {kept_reviews[col].dtype}')
    if str(kept_reviews[col].dtype) != 'float64':
        kept_reviews[col] = kept_reviews[col].astype('str')
# Print after converting to string
for col in kept_reviews.columns:
    print(f'{col}: {kept_reviews[col].dtype}')

_id: object
review_id: object
user_id: object
business_id: object
stars: float64
useful: float64
funny: float64
cool: float64
text: object
date: object
compliment_count: float64
_id: object
review_id: object
user_id: object
business_id: object
stars: float64
useful: float64
funny: float64
cool: float64
text: object
date: object
compliment_count: float64


In [51]:
# Print out length of data
print(f'Number of reviews: {len(kept_reviews):,}')
# Write to parquet in FilteredData folder
kept_reviews.to_parquet('FilteredData/review.parquet', index=False, compression='gzip')

Number of reviews: 5,977,341


In [52]:
# Read to make sure it looks the same
kept_reviews = pd.read_parquet('FilteredData/review.parquet')
kept_reviews.head(10)

Unnamed: 0,_id,review_id,user_id,business_id,stars,useful,funny,cool,text,date,compliment_count
0,631e9f7fedf65856ab0dfe37,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3.0,0.0,0.0,0.0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30,
1,631e9f7fedf65856ab0dfe3a,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,0.0,0.0,0.0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11,
2,631e9f7fedf65856ab0dfe3b,JrIxlS1TzJ-iCu79ul40cQ,eUta8W_HdHMXPzLBBZhL1A,04UD14gamNjLY0IDYVhHJg,1.0,1.0,2.0,1.0,I am a long term frequent customer of this est...,2015-09-23 23:10:31,
3,631e9f7fedf65856ab0dfe3d,_ZeMknuYdlQcUqng_Im3yg,yfFzsLmaWF2d4Sr0UNbBgg,LHSTtnW3YHCeUkRDGyJOyw,5.0,2.0,0.0,0.0,Amazingly amazing wings and homemade bleu chee...,2015-08-07 02:29:16,
4,631e9f7fedf65856ab0dfe3e,8JFGBuHMoiNDyfcxuWNtrA,smOvOajNG0lS4Pq7d8g4JQ,RZtGWDLCAtuipwaZ-UfjmQ,4.0,0.0,0.0,0.0,Good food--loved the gnocchi with marinara\nth...,2009-10-14 19:57:14,
5,631e9f7fedf65856ab0dfe3f,XW_LfMv0fV21l9c6xQd_lw,9OAtfnWag-ajVxRbUTGIyg,lj-E32x9_FA7GmUrBGBEWg,4.0,0.0,0.0,0.0,Love going here for happy hour or dinner! Gre...,2014-06-27 22:44:01,
6,631e9f7fedf65856ab0dfe40,l3Wk_mvAog6XANIuGQ9C7Q,ZbqSHbgCjzVAqaa7NKWn5A,EQ-TZ2eeD_E0BHuvoaeG5Q,4.0,0.0,0.0,0.0,"Locals recommended Milktooth, and it's an amaz...",2015-08-19 14:31:45,
7,631e9f7fedf65856ab0dfe41,LnGZB0fjfgeVDVz5IHuEVA,j2wlzrntrbKwyOcOiB3l3w,rBdG_23USc7DletfZ11xGA,4.0,1.0,0.0,0.0,The hubby and I have been here on multiple occ...,2014-08-10 19:41:43,
8,631e9f7fedf65856ab0dfe42,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5.0,1.0,0.0,1.0,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03,
9,631e9f7fedf65856ab0dfe43,J-4NdnDZ0pUQaUEEwDI9KQ,vrKkXsozqqecF3CW4cGaVQ,rjuWz_AD3WfXJc03AhIO_w,5.0,2.0,2.0,2.0,I thoroughly enjoyed the show. Chill way to s...,2012-12-04 16:46:20,


In [56]:
# Check city and state counts in kept_bus
kept_bus_by_city = kept_bus.groupby(['city', 'state']).count()
# Sort and print top 10
kept_bus_by_city.sort_values('business_id', ascending=False).head(10)['business_id']
# Find businesses in Philadelphia
kept_bus_philly = kept_bus[kept_bus['city'] == 'Philadelphia']
kept_bus_philly.sort_values('review_count', ascending=False).head(10)

Unnamed: 0,_id,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
98908,631ea3b3a5cde8cc0d6f3b6f,ytynqOUb3hjKeJfRj5Tshw,Reading Terminal Market,51 N 12th St,Philadelphia,PA,19107,39.953341,-75.158855,4.5,5721,1,"{'RestaurantsGoodForGroups': 'True', 'Restaura...","Candy Stores, Shopping, Department Stores, Fas...","{'Monday': '8:0-18:0', 'Tuesday': '8:0-18:0', ..."
41166,631ea3b2a5cde8cc0d6ec980,PP3BBaVxZLcJU54uP_wL6Q,Pat's King of Steaks,1237 E Passyunk Ave,Philadelphia,PA,19147,39.933201,-75.159266,3.0,4250,1,"{'RestaurantsReservations': 'False', 'Corkage'...","Italian, American (Traditional), Sandwiches, F...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W..."
30930,631ea3b0a5cde8cc0d6dec2b,IkY2ticzHEn4QFn8hQLSWg,Geno's Steaks,1219 S 9th St,Philadelphia,PA,19147,39.933837,-75.158814,2.5,3401,1,"{'RestaurantsReservations': 'False', 'Corkage'...","Sandwiches, Cheesesteaks, Steakhouses, Restaur...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W..."
16468,631ea3b3a5cde8cc0d6f0e92,9PZxjhTIU7OgPIzuGi89Ew,El Vez,121 S 13th St,Philadelphia,PA,19107,39.949702,-75.16177,4.0,3187,1,"{'RestaurantsReservations': 'True', 'BYOBCorka...","Lounges, Bars, Nightlife, Breakfast & Brunch, ...","{'Monday': '0:0-0:0', 'Tuesday': '12:0-22:0', ..."
63988,631ea3aea5cde8cc0d6d85fc,ctHjyadbDQAtUFfkcAFEHw,Zahav,237 St James Pl,Philadelphia,PA,19106,39.946261,-75.145135,4.5,3065,1,"{'RestaurantsAttire': ""'casual'"", 'BikeParking...","Nightlife, Bars, Food, Ethnic Food, Middle Eas...","{'Monday': '0:0-0:0', 'Tuesday': '16:45-21:30'..."
12030,631ea3b0a5cde8cc0d6e09db,6ajnOk0GcY9xbb5Ocaw8Gw,Barbuzzo,110 S 13th St,Philadelphia,PA,19107,39.950007,-75.162158,4.5,2893,1,"{'WiFi': ""u'no'"", 'Caters': 'False', 'Restaura...","Mediterranean, Restaurants, Pizza, Italian","{'Monday': '17:0-22:0', 'Tuesday': '17:0-22:0'..."
73705,631ea3aea5cde8cc0d6d5aa9,j-qtdD55OLfSqfsWuQTDJg,Parc,227 S 18th St,Philadelphia,PA,19103,39.949172,-75.170727,4.0,2761,1,"{'OutdoorSeating': 'True', 'RestaurantsGoodFor...","Restaurants, French, Wine Bars, Nightlife, Ame...","{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'..."
88594,631ea3afa5cde8cc0d6ddd03,sTPueJEwcRDj7ZJmG7okYA,Jim's South St,400 S St,Philadelphia,PA,19147,39.941498,-75.149272,3.5,2736,1,"{'BusinessParking': ""{'garage': False, 'street...","Bars, Restaurants, Pizza, Cheesesteaks, Italia...","{'Monday': '0:0-0:0', 'Tuesday': '11:0-0:0', '..."
44427,631ea3afa5cde8cc0d6d9c06,RQAF6a0akMiot5lZZnMNNw,Dalessandro’s Steaks & Hoagies,600 Wendover St,Philadelphia,PA,19128,40.029494,-75.205971,4.0,2686,1,"{'NoiseLevel': ""u'average'"", 'RestaurantsAttir...","Sandwiches, Delis, Restaurants, Cheesesteaks","{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'..."
2290,631ea3b3a5cde8cc0d6f5383,0RuvlgTnKFbX3IK0ZOOocA,Green Eggs Café,212 S 13th St,Philadelphia,PA,19107,39.948123,-75.162463,4.0,2679,1,"{'BusinessParking': ""{'garage': False, 'street...","Restaurants, American (New), Diners, Breakfast...","{'Monday': '9:0-15:0', 'Tuesday': '9:0-15:0', ..."


In [57]:
# New dataframe for businesses in Philly
kept_bus_philly = kept_bus[kept_bus['city'] == 'Philadelphia']
# Write to csv in FilteredData folder
kept_bus_philly.to_csv('FilteredData/business_philly.csv', index=False)


In [59]:
print(f"Number of businesses in Philly: {len(kept_bus_philly):,}")

Number of businesses in Philly: 10,391


In [60]:
# Get reviews for businesses in kept_bus_philly
kept_reviews_philly = review.find({
    'business_id': {
        '$in': kept_bus_philly['business_id'].tolist()
    }
})
kept_reviews_philly = pd.DataFrame(list(kept_reviews_philly))
kept_reviews_philly.head(10)

Unnamed: 0,_id,review_id,user_id,business_id,stars,useful,funny,cool,text,date,compliment_count
0,631e9f7fedf65856ab0dfe3b,JrIxlS1TzJ-iCu79ul40cQ,eUta8W_HdHMXPzLBBZhL1A,04UD14gamNjLY0IDYVhHJg,1.0,1.0,2.0,1.0,I am a long term frequent customer of this est...,2015-09-23 23:10:31,
1,631e9f7fedf65856ab0dfe3e,8JFGBuHMoiNDyfcxuWNtrA,smOvOajNG0lS4Pq7d8g4JQ,RZtGWDLCAtuipwaZ-UfjmQ,4.0,0.0,0.0,0.0,Good food--loved the gnocchi with marinara\nth...,2009-10-14 19:57:14,
2,631e9f7fedf65856ab0dfe42,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5.0,1.0,0.0,1.0,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03,
3,631e9f7fedf65856ab0dfe43,J-4NdnDZ0pUQaUEEwDI9KQ,vrKkXsozqqecF3CW4cGaVQ,rjuWz_AD3WfXJc03AhIO_w,5.0,2.0,2.0,2.0,I thoroughly enjoyed the show. Chill way to s...,2012-12-04 16:46:20,
4,631e9f7fedf65856ab0dfe55,JBWZmBy69VMggxj3eYn17Q,aFa96pz67TwOFu4Weq5Agg,kq5Ghhh14r-eCxlVmlyd8w,5.0,0.0,0.0,0.0,My boyfriend and I tried this deli for the fir...,2018-08-23 21:39:38,
5,631e9f7fedf65856ab0dfe58,Xs8Z8lmKkosqW5mw_sVAoA,IQsF3Rc6IgCzjVV9DE8KXg,eFvzHawVJofxSnD7TgbZtg,5.0,0.0,0.0,0.0,My absolute favorite cafe in the city. Their b...,2014-11-12 15:30:27,
6,631e9f7fedf65856ab0dfe5c,oyaMhzBSwfGgemSGuZCdwQ,Dd1jQj7S-BFGqRbApFzCFw,YtSqYv1Q_pOltsVPSx54SA,5.0,0.0,0.0,0.0,Tremendous service (Big shout out to Douglas) ...,2013-06-24 11:21:25,
7,631e9f7fedf65856ab0dfe67,YcLXh-3UC9y6YFAI9xxzPQ,G0DHgkSsDozqUPWtlxVEMw,oBhJuukGRqPVvYBfTkhuZA,4.0,0.0,0.0,0.0,The only reason I didn't give this restaurant ...,2015-03-05 03:37:54,
8,631e9f7fedf65856ab0dfe69,cvQXRFLCyr0S7EgFb4lZqw,ZGjgfSvjQK886kiTzLwfLQ,EtKSTHV5Qx_Q7Aur9o4kQQ,5.0,3.0,1.0,1.0,"On a scale of one to things that are awesome, ...",2009-10-14 01:15:04,
9,631e9f7fedf65856ab0dfe6a,r2IBPY_E8AE5_GpsqlONyg,IKbjLnfBQtEyVzEu8CuOLg,VJEzpfLs_Jnzgqh5A_FVTg,4.0,0.0,0.0,0.0,It was my fiance's birthday and he decided he ...,2014-04-01 13:05:18,


In [61]:
# Print number of reviews
print(f'Number of reviews in Philly: {len(kept_reviews_philly):,}')

Number of reviews in Philly: 855,080


In [63]:
# Write to parquet in FilteredData folder
# Convert any obj columns to string
for col in kept_reviews_philly.columns:
    if str(kept_reviews_philly[col].dtype) != 'float64':
        kept_reviews_philly[col] = kept_reviews_philly[col].astype('str')
kept_reviews_philly.to_parquet('FilteredData/review_philly.parquet', index=False, compression='gzip')
# File is 238MB

In [64]:
kept_reviews_philly.to_csv('FilteredData/review_philly.csv', index=False)
# File is 613MB

In [65]:
kept_reviews_philly.to_parquet('FilteredData/review_philly.parquet', index=False, compression='snappy')
# File is 375 MB

In [66]:
kept_reviews_philly.to_parquet('FilteredData/review_philly.parquet', index=False, compression='brotli')
# File is 202 MB

In [70]:
kept_reviews_philly.to_feather('FilteredData/review_philly.feather', compression='zstd')
# 265 MB
# But very fast to write

In [71]:
kept_reviews_philly = pd.read_feather('FilteredData/review_philly.feather')
kept_reviews_philly.head(10)
print(f'Number of reviews in Philly: {len(kept_reviews_philly):,}')

Number of reviews in Philly: 855,080


In [72]:
kept_reviews_philly.head(10)

Unnamed: 0,_id,review_id,user_id,business_id,stars,useful,funny,cool,text,date,compliment_count
0,631e9f7fedf65856ab0dfe3b,JrIxlS1TzJ-iCu79ul40cQ,eUta8W_HdHMXPzLBBZhL1A,04UD14gamNjLY0IDYVhHJg,1.0,1.0,2.0,1.0,I am a long term frequent customer of this est...,2015-09-23 23:10:31,
1,631e9f7fedf65856ab0dfe3e,8JFGBuHMoiNDyfcxuWNtrA,smOvOajNG0lS4Pq7d8g4JQ,RZtGWDLCAtuipwaZ-UfjmQ,4.0,0.0,0.0,0.0,Good food--loved the gnocchi with marinara\nth...,2009-10-14 19:57:14,
2,631e9f7fedf65856ab0dfe42,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5.0,1.0,0.0,1.0,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03,
3,631e9f7fedf65856ab0dfe43,J-4NdnDZ0pUQaUEEwDI9KQ,vrKkXsozqqecF3CW4cGaVQ,rjuWz_AD3WfXJc03AhIO_w,5.0,2.0,2.0,2.0,I thoroughly enjoyed the show. Chill way to s...,2012-12-04 16:46:20,
4,631e9f7fedf65856ab0dfe55,JBWZmBy69VMggxj3eYn17Q,aFa96pz67TwOFu4Weq5Agg,kq5Ghhh14r-eCxlVmlyd8w,5.0,0.0,0.0,0.0,My boyfriend and I tried this deli for the fir...,2018-08-23 21:39:38,
5,631e9f7fedf65856ab0dfe58,Xs8Z8lmKkosqW5mw_sVAoA,IQsF3Rc6IgCzjVV9DE8KXg,eFvzHawVJofxSnD7TgbZtg,5.0,0.0,0.0,0.0,My absolute favorite cafe in the city. Their b...,2014-11-12 15:30:27,
6,631e9f7fedf65856ab0dfe5c,oyaMhzBSwfGgemSGuZCdwQ,Dd1jQj7S-BFGqRbApFzCFw,YtSqYv1Q_pOltsVPSx54SA,5.0,0.0,0.0,0.0,Tremendous service (Big shout out to Douglas) ...,2013-06-24 11:21:25,
7,631e9f7fedf65856ab0dfe67,YcLXh-3UC9y6YFAI9xxzPQ,G0DHgkSsDozqUPWtlxVEMw,oBhJuukGRqPVvYBfTkhuZA,4.0,0.0,0.0,0.0,The only reason I didn't give this restaurant ...,2015-03-05 03:37:54,
8,631e9f7fedf65856ab0dfe69,cvQXRFLCyr0S7EgFb4lZqw,ZGjgfSvjQK886kiTzLwfLQ,EtKSTHV5Qx_Q7Aur9o4kQQ,5.0,3.0,1.0,1.0,"On a scale of one to things that are awesome, ...",2009-10-14 01:15:04,
9,631e9f7fedf65856ab0dfe6a,r2IBPY_E8AE5_GpsqlONyg,IKbjLnfBQtEyVzEu8CuOLg,VJEzpfLs_Jnzgqh5A_FVTg,4.0,0.0,0.0,0.0,It was my fiance's birthday and he decided he ...,2014-04-01 13:05:18,


To set up GIT LFS to push to GitHub go to
https://git-lfs.github.com/