In [None]:
#runtime sample(30): 289 reviews [crawl - 3m32s, parse - 32s] 

from bs4 import BeautifulSoup
import scrapy
from scrapy.crawler import CrawlerProcess
import re
import logging
import os
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from autocorrect import Speller

# English stop words as a set; sentence_parse drops every word found in it.
stopWords = set(stopwords.words('english'))
# English spell checker; sentence_parse calls it word by word in its step 1.
spell = Speller(lang = 'en')
# Upper bound for MySpider.page_number: pagination stops once it is reached.
number_of_pages_to_scrape = 100             # 10 reviews per page

# game_name names the output folder and the output JSON file;
# game_steam_number is the app id inserted into the Steam community URLs.
game_name = "Among Us"
game_steam_number = 945360
# App ids of some games (name - number):
# Vampire Survivors - 1794680
# Elden Ring - 1245620
# Among Us - 945360

In [None]:
class MySpider(scrapy.Spider):
    """Scrape the Steam community review pages of one game.

    The crawl starts at the review hub of ``game_steam_number`` and follows the
    hidden "MoreContentForm<N>" form found on each page until
    ``number_of_pages_to_scrape`` pages were visited or no such form is present.
    Every review card is yielded as a dict with the keys ``thumbs_up``,
    ``text``, ``helpful_votes``, ``funny_votes``, ``reward`` and
    ``gametime_record``; the ``FEEDS`` setting writes them to ``items.json``.
    """

    name = "MySpider"
    download_delay = 6
    page_number = 1
    start_urls = (
        'https://steamcommunity.com/app/{}/reviews/'.format(game_steam_number),
    )

    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'LOG_ENABLED': False,
        'LOG_FILE': 'logging.txt',
        'LOG_FILE_APPEND': False,
        'REQUEST_FINGERPRINTER_IMPLEMENTATION': '2.7',
        'FEEDS': {"items.json": {"format": "json", 'overwrite': True},},
    }

    # "Load more" endpoint. {0} is the app id, {1}..{14} are the values of the
    # first 14 elements of the page's MoreContentForm, in document order.
    _MORE_CONTENT_URL = 'https://steamcommunity.com/app/{0}/homecontent/?userreviewscursor={1}&userreviewsoffset={2}&p={3}&workshopitemspage={4}&readytouseitemspage={5}&mtxitemspage={6}&itemspage={7}&screenshotspage={8}&videospage={9}&artpage={10}&allguidepage={11}&webguidepage={12}&integratedguidepage={13}&discussionspage={14}&numperpage=10&browsefilter=trendweek&browsefilter=trendweek&l=english&appHubSubSection=10&filterLanguage=default&searchText=&maxInappropriateScore=100'

    @staticmethod
    def _count(text):
        """Return the leading number of a phrase such as "1,234 people ..." as an int.

        Only the first space-separated token is looked at. Thousands separators
        are removed before conversion; an empty token or the token "No" gives 0.
        Raises ValueError for any other non-numeric token.
        """
        token = text.strip().split(" ")[0].replace(",", "")
        if token in ("", "No"):
            return 0
        return int(token)

    def parse(self, response):
        """Yield one item per review card in ``response``, then request the next page.

        Parameters
        ----------
        response : scrapy.http.Response
            A review page; its text is parsed with BeautifulSoup/lxml.

        Yields
        ------
        dict
            One item per "apphub_UserReviewCardContent" element.
        scrapy.Request
            At most one follow-up request, with this method as its callback.
        """
        # Local import: only the next-page URL needs it, keeps the class self-contained.
        from urllib.parse import quote

        print("hello, page ", self.page_number)
        soup = BeautifulSoup(response.text, 'lxml')
        for review in soup.find_all('div', class_="apphub_UserReviewCardContent"):
            votes = review.find('div', class_="found_helpful")

            # The funny count, when present, is the node right after the <br>.
            line_break = votes.find('br')
            funny_node = line_break.next_sibling if line_break is not None else None

            reward_icon = review.find("div", class_="reward_btn_icon")

            yield {
                'thumbs_up': review.find('div', class_="title").text == "Recommended",
                # The review body is everything that follows the "date_posted" element.
                'text': "".join(
                    node.text.strip()
                    for node in review.find('div', class_="date_posted").next_siblings
                ),
                'helpful_votes': self._count(votes.text),
                # Always an int now (was a str whenever a funny count existed).
                'funny_votes': self._count(funny_node.text) if funny_node is not None else 0,
                'reward': self._count(reward_icon.text) if reward_icon is not None else 0,
                # Strip thousands separators so "1,234.5" parses.
                'gametime_record': float(
                    review.find("div", class_="hours").text.split(" ")[0].replace(",", "")
                ),
            }

        # send Request for more reviews
        if self.page_number < number_of_pages_to_scrape:
            more_form = soup.find('form', id="MoreContentForm{}".format(self.page_number))
            if more_form is None:
                # No form for this page number (presumably the last page):
                # stop paginating instead of failing on None.
                return
            # Slice before reading 'value' so only the 14 used elements are touched.
            fields = more_form.find_all()[:14]
            self.page_number += 1
            inputs = [game_steam_number] + [field['value'] for field in fields]
            # Form values are raw text; percent-encode them so characters such as
            # '+' or '&' in the review cursor survive the query string.
            next_url = self._MORE_CONTENT_URL.format(
                *(quote(str(value), safe='') for value in inputs)
            )
            yield scrapy.Request(next_url, method='GET', callback=self.parse)
            

In [None]:
# Create a crawler process, register MySpider and run the crawl.
# Scraped items go to the feed configured in MySpider.custom_settings ("items.json").
# NOTE(review): the underlying Twisted reactor can presumably be started only once
# per kernel, so re-running this cell likely requires a kernel restart - confirm.
process = CrawlerProcess()
process.crawl(MySpider)
process.start() # the script will block here until the crawling is finished

In [None]:
# clean a review text so that only its "useful" words remain
def sentence_parse(badText, applyList=[2,1,3]):
    """Normalise a raw review text into a space-separated string of cleaned words.

    The text is stripped of non-word characters, lower-cased, reduced to ASCII,
    stripped of ASCII punctuation, split on whitespace and filtered against the
    module-level ``stopWords``. The remaining words then go through the steps
    listed in ``applyList``, in the given order:

    * 1 - spelling correction with the module-level ``spell``
    * 2 - Snowball (English) stemming
    * 3 - WordNet lemmatisation

    Parameters
    ----------
    badText : str
        Raw text. Any non-``str`` value (e.g. a missing value) yields ``None``.
    applyList : list of int, optional
        Step numbers to apply; the list is only read, never modified.

    Returns
    -------
    str or None
        The cleaned text, or ``None`` when no word is left.

    Raises
    ------
    KeyError
        If ``applyList`` contains a number other than 1, 2 or 3.
    """
    if not isinstance(badText, str):
        return None

    def correct_spelling(words):
        return [spell(word) for word in words]

    def stem(words):
        # Build the stemmer once per call instead of once per word.
        stemmer = nltk.stem.SnowballStemmer('english')
        return [stemmer.stem(word) for word in words]

    def lemmatize(words):
        # Works on its own argument (the original read the enclosing variable).
        lemmatizer = nltk.stem.WordNetLemmatizer()
        return [lemmatizer.lemmatize(word) for word in words]

    funcDict = {1: correct_spelling, 2: stem, 3: lemmatize}

    # str.replace treats its argument literally, so the pattern needs re.sub.
    text = re.sub(r"[^\s\w]", "", badText).lower()
    text = text.encode('ascii', 'ignore').decode('ascii')
    # \w keeps "_", which string.punctuation removes here.
    text = text.translate(str.maketrans('', '', string.punctuation))

    words = [word for word in text.split() if word not in stopWords]
    for step in applyList:
        words = funcDict[step](words)
    if not words:
        return None
    return " ".join(words)

In [None]:
# create folder for the searched game
# exist_ok lets the cell be re-run without a separate existence check
os.makedirs(game_name, exist_ok=True)

# add cleaned text to the json and save it in the new folder
df = pd.read_json("items.json")
# dropna() removes every row with a missing value in any column; this includes
# reviews whose text cleaned down to nothing (sentence_parse returned None)
df = df.assign(cleaned_text = df['text'].apply(sentence_parse)).dropna()
# one JSON record per line, written to <game_name>/<game_name>.json
df.to_json(os.path.join(game_name, game_name + ".json"), orient='records', lines=True)