In [34]:
# Author: Brian Hardenstein
# pixelatedbrian@gmail.com
# v0.33

from selenium import webdriver

# import MongoDB modules
from pymongo import MongoClient

# we can always use more time
import time

def extract_user_from_div(html):
    '''
    Takes in an html div from the steam website and pulls out the user_id
    that is embedded in the profile url
    
    returns the username as a string
    '''
    # find the profile by splitting by links
    links = html.split("<a href=")
    
    # step through the different links until we (hopefully) find what we want
    for i in range(1,len(links)):
        # see which link we're examining (debugging)
#         print i
#         print links[i]

        # check to see if the desired url is in this link split
        if "http://steamcommunity.com/id/" in links[i] or "http://steamcommunity.com/profiles/" in links[i]:
            #break out the string that we care about
            path = links[i].split(">")[0].strip('"').encode('ascii', 'ignore')
            
            # http://steamcommunity.com/id/koltira/
            user_name = path[:-1].split("/")[-1]
            
            # rarely there is a div in the username, trying to figure out why
            if "div" in user_name:
                for item in links:
                    print "####################### Error in username:", item
                    print
                    print
            
            # mission complete, return the username
            return user_name
    
    # It seems like some users don't have alias/usernames but still have
    # profile numbers.  Attempt to fail down to this profile # and use
    # that in place of the username
    
    
    return "-17 Error extracting username"
        
def extract_review_from_subdiv(review_html):
    '''
    Having been provided the div that contains the review text attempt
    to strip out some extra formatting and html and return a string
    that contains the written review.
    
    return the review as a string
    '''
    # chop out the junk div with the date in front
    #
    # ex:
    # u'\n\t\t\t\t<div class="date_posted">Posted: July 2</div>\n\t\t\t\t\t\t\t\t\t
    # \t\t\ti had 20 hours into this game within 2 days of buying 
    # it please send help\t\t\t'
    
#     print
#     print review_html
#     print
    
    raw_review = review_html.split("/div>")[1].encode("ascii", "ignore")
    
    review = " ".join(raw_review.split())
    
    return review

def extract_rating_from_div(html):
    '''
    Take in review tab html and extract if the user did thumbs up
    or thumbs down
    
    Return 1 if thumbs up, otherwise return 0
    '''
    vote = html.get_attribute("innerHTML")
    
    # try to be specific
    # if it's Recommended return 1
    # if it's Not Recommended return 0
    # if it's something else return -17 and print an error
    # (negative values being indicators that something unexpected occurred)
    if vote.lower() == "recommended":
        return 1
    elif vote.lower() == "not recommended":
        return 0
    else:
        print "Error in extract_rating_from_div!!! vote =", vote
        return -17
    
    
def get_game_reviews(app_id, count, app_title="test"):
    '''
    Using the path and the app_id go to the review website for a game
    and pull the first <count> top rated reviews for both positive and
    negative. (or the number of reviews that do exist if there's less than
    1000 reviews)
    
    returns:
    dictionary that contains:
    {
        "app_id": xxxx,
        "title": yyyy,
        "positive_reviews" : [<list of user info dictionaries>],
        "negative_reviews": [<list of user info dictionaries>]    
    }
    '''
    
    pos_path = "http://steamcommunity.com/app/{}/positivereviews/?p=1&browsefilter=toprated".format(app_id)
    neg_path = "http://steamcommunity.com/app/{}/negativereviews/?p=1&browsefilter=toprated".format(app_id)
    
    pos_reviews = strip_mine_path(pos_path, count)
    neg_reviews = strip_mine_path(neg_path, count)
    
    
    results = {
        "app_id": app_id,
        "title": app_title,
        "positive_reviews": pos_reviews,
        "negative_reviews": neg_reviews
    }
    
    return results


def strip_mine_path(path, count):
    '''
    Drill into the path provided and pull <count> amount
    of reviews.  Process the page and pull data from the
    review divs. Insert data into a small dictionary and
    add it to an ongoing list.
    
    Returns:
    List of user reviews which consist of dictionaries.
    ex:
    {"user_id": user_id, "rating": 1, "review": 'review text'}
    '''
    
    #driver = webdriver.Chrome()
    driver = webdriver.PhantomJS()
    driver.set_window_size(900, 800)
    driver.get(path)

    assert "Steam" in driver.title

    # try to scroll down several times then slight pause before scrolling again
    # count // 10 because there's 10 reviews per page
    for x in range(count // 10):
        # try to get the before height
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        
        print "{}th tab of reviews loading...\r".format(x) ,
        
        # trying to scroll down
        time.sleep(2)

    # At this point data should be loaded by the page
    # start processing!
    big_results = driver.find_elements_by_class_name("apphub_Card")

    # find the div with the review text
    text_results = driver.find_elements_by_class_name("apphub_CardTextContent")

    # find the div with thumbs up/down
    y_labels =  driver.find_elements_by_class_name("title")

    print
    print "Number of divs found:", len(big_results)
    print
    print

    user_data = []
    user_set = set()
    
    # step through all off the resulting divs
    # use an index because we access multiple lists
    # at the same place
    for idx in xrange(len(big_results)):

        # full review div element
        big_result = big_results[idx]
        text_result = text_results[idx]
        y_label = y_labels[idx]

        # extract html from selenium element
        big_data = big_result.get_attribute("innerHTML")
        text_review = text_result.get_attribute("innerHTML")
        raw_text_review = big_result.find_element_by_class_name("apphub_CardTextContent").get_attribute("innerHTML")

        # extract data
        text_review = extract_review_from_subdiv(raw_text_review)
        user_name = extract_user_from_div(big_data)
        rating = extract_rating_from_div(y_label)
        
        # consolidate data (not being appended to a list anymore but
        # still of use for printing to the log)
        user_info = {"user":user_name, 
                     "rating":rating, 
                     "review":text_review}
    
        # add data to list
        # user_data.append(user_info)
        
        user_set.add((user_name, rating, text_review))

        # log what was found on screen
        print "Rating: {} User: {:<20} Review: {:<40}".format(user_info["rating"], 
                                                              user_info["user"], 
                                                              user_info["review"][:50])
    
    # close the web page
    driver.close()  
    
    print "list len:", len(user_data)
    print "set len:", len(user_set)
    
    # mongoDB doesn't like sets, convert to a list of dictionaries that happens
    # to have unique entries
    results = [{"user":user[0], "rating":user[1], "review":user[2]}  for user in user_set]
    
    return results

def insert(collection, dictionary, _key):
    '''
    Using the provided collection attempt to add the provided dictionary
    to the collection. Check to see if the new dictionary being added
    already exists in the collection before adding.
    '''
    if not collection.find_one({_key: dictionary[_key]}):
        try:
            collection.insert_one(dictionary)
            print "inserted", dictionary[_key]

        except Exception, e:
            print e

    else:
        print dictionary[_key], "already exists"

def scrape_to_db(collection, app_id_list, count):
    '''
    Attempt to scrape <count> reviews from each game in the <app_list>
    and then try to add the resulting dictionary to the provided
    <collection>.
    '''
    
    for app_id in app_id_list:
        game_results = get_game_reviews(app_id, count)
        
    
        insert(collection, game_results, "app_id")
    
def main():
    '''
    For __name__ == "__main__" establish link to db/collection
    and start scraping
    '''
    # connect to the hosted MongoDB instance
    db = MongoClient('mongodb://localhost:27017/')["capstone"]
    
    dest_collection = db.selenium_game_review_scrape
    
    list_of_apps = ["413150", "367520", "286160", "246620", "257850", "105600", "211820", "311690", "233450", "250760"]
    
    
    
    scrape_to_db(dest_collection, list_of_apps, 1100)

In [35]:
game_results = get_game_reviews(413150, 10)

0th tab of reviews loading...
Number of divs found: 20


Rating: 1 User: mysticmad            Review: <b>**NOTE: This review will constantly update to a
Rating: 1 User: bexyish              Review: This is a game that I have been looking forward to
Rating: 1 User: 76561197989330960    Review: The new update promises to improve the quality of 
Rating: 1 User: nuance               Review: I'm genuinely impressed an indie game of this scal
Rating: 1 User: 76561198155735547    Review: Can I just say thankyou to the developer of this g
Rating: 1 User: Evanz111             Review: I spent three years building up hype, and I was st
Rating: 1 User: 76561198082459183    Review: There is a character in this game that is sufferin
Rating: 1 User: 76561198042161073    Review: Ok so far from my game this is what I got:<br>&gt;
Rating: 1 User: ErickaUnlimited      Review: <div class="bb_h1"> Overview <          
Rating: 1 User: amandath3panda       Review: This game has so much thought and detail put

In [36]:
len(game_results)

4

In [39]:
for x in range(10):
    print game_results["negative_reviews"][x]
    print

{'rating': 0, 'review': "I hate that there's no neutral option on steam.", 'user': 'atlas1205'}

{'rating': 0, 'review': "I have put a huge amount of time into this game over the last few weeks. I absolutely love the game, It is like Harvest Moon, but better. Like, 10 times better, deeper battle system, and everyone loves mayonnaise. I have had a huge amount of fun making bombs to go down the desert mine in the winter, making ancient fruit wine for money, and raising black demon chickens.<br>However, I LOATHE the fishing mini game. I have probably played dozens of fishing mini games in other games, and i usually at least tolerate them. But this one is so frustrating, I just can't stand it. Unfortunately, you can't fix up the old community center without it,<br>Update: 10/9/16<br>Game no longer works. Something about a disk read error. It has been saying this since the 1.1 update WHICH CANNOT BE OPTED OUT OF, and I can't reload an earlier version of the game.<br>I've put my problem out 

In [12]:
print game_results["title"]

test


In [14]:
for x in range(10):
    print '{0}\r'.format(x),
    time.sleep(0.05)
print


9


In [41]:
main()

2th tab of reviews loading...
Number of divs found: 40


Rating: 1 User: mysticmad            Review: <b>**NOTE: This review will constantly update to a
Rating: 1 User: bexyish              Review: This is a game that I have been looking forward to
Rating: 1 User: 76561197989330960    Review: The new update promises to improve the quality of 
Rating: 1 User: nuance               Review: I'm genuinely impressed an indie game of this scal
Rating: 1 User: 76561198155735547    Review: Can I just say thankyou to the developer of this g
Rating: 1 User: Evanz111             Review: I spent three years building up hype, and I was st
Rating: 1 User: 76561198082459183    Review: There is a character in this game that is sufferin
Rating: 1 User: 76561198042161073    Review: Ok so far from my game this is what I got:<br>&gt;
Rating: 1 User: ErickaUnlimited      Review: <div class="bb_h1"> Overview <          
Rating: 1 User: amandath3panda       Review: This game has so much thought and detail put

In [1]:
big_list_of_apps = [383870, 251570,387290, 252950,305620,433340,242760,304430,391540,26800,204360, 275850,265000, 437220,308420,205730,271240,250700,231200,251270,329130,474750,264710,222880,9500,237990,281640,4000,65300,219990,398850,241600,220780,95300,239820,107100,294100,224760,427520,22000, 322500,221910,48000]

In [2]:
len(big_list_of_apps)

43

In [3]:
app_set = set(big_list_of_apps)

In [4]:
len(app_set)

43

In [5]:
def add_to_set(game, a_set):
    a_set.add(game)
    print len(a_set)
    
    return a_set

In [52]:
app_set = add_to_set(361300 , app_set)

90


In [53]:
print repr(list(app_set))

[427520, 371200, 398850, 264710, 219150, 431120, 252950, 210970, 290340, 281640, 475190, 261180, 322110, 274500, 204360, 107100, 554600, 425580, 474750, 296470, 405640, 220780, 595140, 234650, 222880, 26800, 251570, 433340, 308420, 212680, 265930, 239820, 312530, 294100, 221910, 356570, 247080, 470260, 383870, 231160, 421120, 65300, 230190, 318230, 9500, 387290, 231200, 265000, 304430, 469820, 365450, 253250, 435530, 250700, 361300, 219990, 367450, 326460, 206190, 391540, 527230, 48000, 505730, 364420, 251270, 271240, 275850, 504210, 387990, 95300, 568220, 4000, 205730, 237990, 329130, 242760, 457140, 239030, 538100, 241600, 322500, 394970, 396750, 305620, 282070, 437220, 258030, 22000, 248820, 224760]


In [56]:
big_app_list = [ str(app) for app in list(app_set)]

In [64]:
print repr(big_app_list)

['427520', '371200', '398850', '264710', '219150', '431120', '252950', '210970', '290340', '281640', '475190', '261180', '322110', '274500', '204360', '107100', '554600', '425580', '474750', '296470', '405640', '220780', '595140', '234650', '222880', '26800', '251570', '433340', '308420', '212680', '265930', '239820', '312530', '294100', '221910', '356570', '247080', '470260', '383870', '231160', '421120', '65300', '230190', '318230', '9500', '387290', '231200', '265000', '304430', '469820', '365450', '253250', '435530', '250700', '361300', '219990', '367450', '326460', '206190', '391540', '527230', '48000', '505730', '364420', '251270', '271240', '275850', '504210', '387990', '95300', '568220', '4000', '205730', '237990', '329130', '242760', '457140', '239030', '538100', '241600', '322500', '394970', '396750', '305620', '282070', '437220', '258030', '22000', '248820', '224760']


In [60]:
big_set = set(big_app_list)

In [65]:
len(big_app_list)

90

In [62]:
for item in ["413150", "367520", "286160", "246620", "257850", "105600", "211820", "311690", "233450", "250760"]:
    big_set.add(item)

In [63]:
len(big_set)

100

In [66]:
boink = big_app_list[:30]

In [67]:
len(boink)

30