In [1]:
import re
import json
import requests
import pandas as pd
import datetime as dt
import time

class ShopeeCrawler:
    """ A class to crawl product reviews on Shopee.vn """
    def __init__(self):
        self.data = {"itemid": [], "shopid": [], "username": [], "rating": [], "time": [], "source": [], "comment": []}
        self.shop_id, self.item_id = None, None

    def get_ids_from_link(self, base_url):
        """
        Gets Product id and Shop id

        Parameters
        ----------
        base_url : str
            Product link

        Returns
        ----------
        tuple
            a tuple containing Product id and Shop id
        """

        r = re.search(r"i\.(\d+)\.(\d+)", base_url)
        return (r[1], r[2])

    def Crawl(self, item_id, shop_id, display = False, most_recent = False, verbose = 100):
        """
        Gets reviews and related information about a product

        Parameters
        ----------
        item_id : int
            Product ID
        
        shop_id : int
            Shop ID 
        
        display : bool 
            Display data as crawled

        most_recent: bool
            only save reviews that are at most 1 day old

        Returns
        ----------
        dict
            a dictionary containing username (reviewer's account name), rating (number of stars the reviewer gave), comment (the review on the product), time (date and time of the comment in unix time),
            itemid and shopid.


        """
        offset = 0
        while True:
            ratings_url = f"https://shopee.vn/api/v2/item/get_ratings?filter=0&flag=1&itemid={item_id}&limit=20&offset={offset}&shopid={shop_id}&type=0"
            response = requests.get(ratings_url).json()
            if not response["data"]["ratings"]:
                break

            for i, rating in enumerate(response["data"]["ratings"], 1):
                if most_recent:
                    delta = dt.datetime.now() - dt.datetime.fromtimestamp(rating["ctime"])
                    if delta <= dt.timedelta(days=1):
                        self.data["username"].append(rating["author_username"])
                        self.data["rating"].append(rating["rating_star"])
                        self.data["comment"].append(rating["comment"])
                        self.data["time"].append(rating["ctime"]) # convert to unix timestamp
                        self.data["shopid"].append(item_id)
                        self.data["itemid"].append(shop_id)
                        self.data["source"].append("Shopee")
                else:
                        self.data["username"].append(rating["author_username"])
                        self.data["rating"].append(rating["rating_star"])
                        self.data["comment"].append(rating["comment"])
                        self.data["time"].append(rating["ctime"])
                        self.data["shopid"].append(item_id)
                        self.data["itemid"].append(shop_id)
                        self.data["source"].append("Shopee")
            if display:
                print(rating["author_username"])
                print(rating["rating_star"])
                print(rating["comment"])
                print("-" * 100)
                print(offset)
            

            offset += 20
            

        return self.data
    
    def get_data(self):
        """
        Get all data crawled within the object

        Returns
        ----------
        dict
            a dictionary containing username (reviewer's account name), rating (number of stars the reviewer gave), comment (the review on the product), time (date and time of the comment in unix time),
            itemid and shopid.    
        """
        return self.data
    
    def CrawlByCat(self, catid, cat_level = 2, limit = None):
        """
        Crawl reviews by categories

        Parameters
        ----------
        catid : int
            category ID

        cat_level : int
            1 for category, 2 for subcategory
        
        limit : int
            limit number of products in the category (None to crawl all products)

        Returns
        ----------
        dict
            a dictionary containing username (reviewer's account name), rating (number of stars the reviewer gave), comment (the review on the product), time (date and time of the comment in unix time),
            itemid and shopid.        
        
        """
        
        params = {
        "bundle": "category_landing_page",
        "cat_level": cat_level,
        "catid": catid, 
        "offset": 0,
        }
        
            
        crawler = ShopeeCrawler()
        product_data = []
        url = 'https://shopee.vn/api/v4/recommend/recommend'

        response = requests.get(url, params=params)
        n = response.json().get('data').get('sections')[0].get('data').get('item')

        for record in n:
            product_data.append({'itemid': record['itemid'], 'shopid': record['shopid']})

        for product in product_data:
            crawler.Crawl(product["itemid"], product["shopid"])
        self.data = crawler.get_data()
        return self.data

    def GetShopInfo(self):
        """
        Get shop information

        Returns
        ----------
        dict
            a dictionary containing shop information    
        
        """
        df = pd.DataFrame(self.data)
        itemids = df["itemid"].unique()

        output = {"shopid": [], "name": [], "ctime": [], "is_shopee_verified": [], "is_preferred_plus_seller": [], "is_official_shop": [], "shop_location": [], "item_count": [],
                  "rating_star": [], "response_rate": [], "response_time": [],  'rating_bad': [],'rating_good': [], 'rating_normal': []}
        for itemid in itemids:
            url = f'https://shopee.vn/api/v4/product/get_shop_info?shopid={itemid}'
            response = requests.get(url).json()
            data = response.get("data")
            for key in list(output.keys())[1::]:
                output[key].append(data[key])
            output["shopid"].append(df[df["itemid"] == str(itemid)]["shopid"].iloc[0])
        return output
    


In [2]:
products = pd.read_csv(r'prod_final_clean_jul26_sort.csv')[["product_url", 'average_rating']]

crawler = ShopeeCrawler()

products["productid"] = products["product_url"].apply(lambda i: crawler.get_ids_from_link(i)[0])
products["shopid"] = products["product_url"].apply(lambda i: crawler.get_ids_from_link(i)[1])

  products = pd.read_csv(r'prod_final_clean_jul26_sort.csv')[["product_url", 'average_rating']]


In [3]:
products.shape

(18040, 4)

In [4]:
products.groupby('average_rating').count()

Unnamed: 0_level_0,product_url,productid,shopid
average_rating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,219,219,219
1.0,8,8,8
2.0,2,2,2
2.3,3,3,3
2.5,2,2,2
2.7,2,2,2
3.0,19,19,19
3.1,2,2,2
3.2,2,2,2
3.3,6,6,6


In [14]:
low_rating_df = products[products['average_rating'] <= 4.4]
low_rating_df.shape

(830, 4)

In [15]:
review_df = pd.read_csv(r'C:\Users\ADMIN\PycharmProjects\NLP\Product_review.csv')
review_df.shape

(358600, 7)

In [17]:
review_df = review_df[~review_df['comment'].isnull()]
review_df.shape

(200249, 7)

In [None]:
review_df.to_csv("./Product_review_1.csv", index= False)

In [2]:
review_df1 = pd.read_csv(r'C:\Users\ADMIN\PycharmProjects\NLP\Product_review_1.csv')
review_df2 = pd.read_csv(r'C:\Users\ADMIN\PycharmProjects\NLP\Product_review_2.csv')

In [3]:
review_df1 = review_df1[~review_df1['comment'].isnull()]
review_df1.shape

(200249, 7)

In [5]:
review_df2 = review_df2[~review_df2['comment'].isnull()]
review_df2.shape

(300646, 7)

In [9]:
review_df_final = pd.concat([review_df1, review_df2])
review_df_final

Unnamed: 0,itemid,shopid,username,rating,time,source,comment
0,149456305,23469575157,lynguyen030585,5,1692601818,Shopee,Chất lượng sản phẩm:tamj oonr\n\nTúi to . Đựng...
1,149456305,23469575157,yumishoploan,5,1695100282,Shopee,"Ko có ngon bổ rẻ ,mà chỉ có hợp lý các bác nhé..."
2,149456305,23469575157,5*****7,4,1692351022,Shopee,"Chất lượng sản phẩm:sản phẩm giống quảng cáo, ..."
3,149456305,23469575157,grey.nguyen,5,1692684394,Shopee,Chất lượng sản phẩm:san pham chat luong tot\n\...
4,607816557,14297501887,t*****6,1,1691942775,Shopee,Bấm hết nút và thử đủ hết các chế độ rồi máy c...
...,...,...,...,...,...,...,...
451132,54927730,7339265886,p*****8,5,1659973204,Shopee,https://phohen.com/post/top-16-am-is-are-works...
451133,54927730,7339265886,c3m0wjel_7,5,1650979887,Shopee,"Chào mừng bạn đến với bảng nhớ tạm của Gboard,..."
451993,133777682,19836854796,nhimtit2009,5,1668255280,Shopee,Hương vị:dịu ngọt k quá mặn\nBao bì/Mẫu mã:bìn...
451994,133777682,19836854796,nhimtit2009,4,1661319588,Shopee,Bao bì/Mẫu mã:ổn áp\nHương vị:chưa dùng nên k ...


In [10]:
review_df_final.to_csv("./Product_review_unlabel.csv", index= False)

In [14]:
import xlsxwriter
review_df_final.to_excel("./Product_review_unlabel.xlsx", engine='xlsxwriter', index= False)