## **Imports**


In [1]:
import os
import pprint
import asyncio
import aiohttp
import requests
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


## **Loading Dataset**


In [2]:
# Read the three CSV exports (all latin-1 encoded) in one pass:
# reviews, the restaurants registered for them, and the reviewers' accounts.
reviewsDf, restaurantsDf, usersDf = (
    pd.read_csv(csv_path, encoding='latin-1')
    for csv_path in (
        './data/reviews.csv',
        './data/registered_restaurants.csv',
        './data/review_users.csv',
    )
)

# Row counts as a quick sanity check on what was loaded.
print(f'Reviews: {len(reviewsDf)} , Restaurants: {len(restaurantsDf)} , Users: {len(usersDf)}')

Reviews: 459915 , Restaurants: 5000 , Users: 293001


In [3]:
# Display the first review row to check the column layout of the reviews file.
reviewsDf.head(1)

Unnamed: 0,review_id,user_id,business_id,rating,content,createdAt
0,JrIxlS1TzJ-iCu79ul40cQ,eUta8W_HdHMXPzLBBZhL1A,04UD14gamNjLY0IDYVhHJg,1.0,I am a long term frequent customer of this est...,2015-09-23 23:10:31


In [4]:
# Lookup table: the reviews' business_id -> the restaurant's "id" column.
# If a business_id occurs more than once, the last row wins.
restaurants_dict = dict(zip(restaurantsDf["business_id"], restaurantsDf["id"]))

len(restaurants_dict)

5000

In [5]:
# One row per distinct reviewer with its review count; reset_index renumbers
# the rows 0..n-1 in the order groupby emits the user_ids.
grouped_reviews = reviewsDf.groupby("user_id").size().reset_index(name='counts')

# Pair each distinct reviewer with the usersDf row that carries the same row label.
# NOTE(review): the match is purely by position, not by a shared key. It presumably
# relies on usersDf having been written in this same grouped order -- confirm,
# since nothing here verifies that the two frames line up.
users_dict = {}
for position, source_user_id in enumerate(grouped_reviews['user_id']):
    account = usersDf.loc[position]
    users_dict[source_user_id] = {"id": account["id"], "token": account["token"]}

len(users_dict)

292250

### **Resume from the first review that has no DineEase Id yet**


In [6]:
modified_reviews_file = './data/modified_reviews.csv'

# Resume from the checkpoint file when one exists; otherwise start from scratch.
# NOTE(review): the checkpoint's row count is used directly as the offset into
# reviewsDf, which presumably assumes every earlier review was saved, in order,
# with none skipped -- confirm before relying on it after a run with failures.
if not os.path.exists(modified_reviews_file):
    modified_reviews = pd.DataFrame()
    filteredDf = reviewsDf
else:
    modified_reviews = pd.read_csv(modified_reviews_file)
    filteredDf = reviewsDf[len(modified_reviews):]

print(f'Modified Reviews: {len(modified_reviews)} , Filtered Reviews: {len(filteredDf)} , Total Reviews: {len(reviewsDf)}')

Modified Reviews: 41100 , Filtered Reviews: 418815 , Total Reviews: 459915


## **Workers**


### **Insert Review**


In [7]:
async def process_review(session, index, row):
    """Post one review to the DineEase API and record the created review.

    Args:
        session: HTTP client whose ``post(...)`` is usable as an async context
            manager (an ``aiohttp.ClientSession`` in this notebook).
        index: Row label of the review. It is reused as the row label in the
            module-level ``modified_reviews`` frame and in log messages.
        row: Mapping-like review record providing ``review_id``, ``user_id``,
            ``business_id``, ``rating``, ``content`` and ``createdAt``.

    Returns:
        None. On HTTP 201 the created review is written into ``modified_reviews``.
        On any other status, or when the reviewer or restaurant cannot be
        resolved, a message is printed and nothing is recorded.
    """
    user = users_dict.get(row['user_id'])
    restaurantId = restaurants_dict.get(row['business_id'])

    # Either lookup can miss. Without this guard a missing user fails with an
    # opaque TypeError on user['token'], and a missing restaurant would be
    # posted to ".../review/None". Report the row and skip it instead.
    if user is None or restaurantId is None:
        missing = 'user' if user is None else 'restaurant'
        pprint.pprint(f"Skipping row {index + 1}. No DineEase {missing} found for this review.")
        return

    payload = {
        'rating': row['rating'],
        'content': row['content'],
        'createdAt': row['createdAt'],
    }

    # The review is submitted on behalf of the reviewer, using that user's token.
    headers = {'Authorization': 'Bearer ' + user['token'], 'Content-Type': 'application/json'}

    async with session.post(f'http://dine-ease.dev/api/review/{restaurantId}', json=payload, headers=headers) as response:
        if response.status == 201:
            data = await response.json()

            # Assigning to a not-yet-existing row label appends that row.
            modified_reviews.at[index, 'id'] = data['id']
            modified_reviews.at[index, 'review_id'] = row['review_id']
            modified_reviews.at[index, 'user_id'] = user['id']
            modified_reviews.at[index, 'restaurant_id'] = restaurantId
            modified_reviews.at[index, 'slug'] = data['slug']
            modified_reviews.at[index, 'rating'] = row['rating']
            modified_reviews.at[index, 'content'] = row['content']
            modified_reviews.at[index, 'createdAt'] = row['createdAt']

        else:
            pprint.pprint(f"Request failed for row {index + 1}. Status code: {response.status}")
            pprint.pprint(await response.text())

In [8]:
async def main(batch_size=10, num_batches=1000):
    """Upload the pending reviews in concurrent batches, checkpointing after each batch.

    Reviews are taken from the module-level ``filteredDf``. Each batch is sent
    concurrently through ``process_review``, after which ``modified_reviews``
    is written to ``modified_reviews_file``.

    Args:
        batch_size: Number of reviews posted concurrently per batch.
        num_batches: Maximum number of batches processed by this call.

    Raises:
        ValueError: If ``batch_size`` or ``num_batches`` is smaller than 1.
        BaseException: The first exception raised by a ``process_review`` call
            in a batch, re-raised after that batch's checkpoint has been written.
    """
    if batch_size < 1 or num_batches < 1:
        raise ValueError('batch_size and num_batches must both be at least 1')

    async with aiohttp.ClientSession() as session:
        # Slicing the range caps this run at num_batches batches.
        for start in range(0, len(filteredDf), batch_size)[:num_batches]:
            batch = filteredDf.iloc[start:start + batch_size]
            batch_tasks = [process_review(session, index, row) for index, row in batch.iterrows()]

            # Collect exceptions instead of aborting on the first one, so every
            # request in the batch finishes before the checkpoint is written.
            results = await asyncio.gather(*batch_tasks, return_exceptions=True)

            # Checkpoint first: reviews already recorded in this batch are
            # saved even if a sibling request failed.
            modified_reviews.to_csv(modified_reviews_file, index=False)

            for result in results:
                if isinstance(result, BaseException):
                    raise result
        
# Start the upload run.
# NOTE(review): a top-level `await` presumably relies on the notebook kernel's
# await support; this line is not valid in a plain .py script.
await main()