In [2]:
# import MongoDB modules
from pymongo import MongoClient

# import the Requests HTTP library
import requests

# import the Beautiful Soup module 
from bs4 import BeautifulSoup

# import the time module for the sleep functionality
# in order to be polite when scraping
import time

# randomize a bit for the scraping
import random


In [3]:
def insert(collection, dictionary, d_key):
    if not collection.find_one({d_key: dictionary[d_key]}):
        try:
            collection.insert_one(dictionary)
            print "inserted", dictionary[d_key]
            
        except Exception, e:
            print e
            
    else:
        print ">>>>>>>>>>>>>>>>>>>>>>>>>>>", d_key, "already exists"

In [3]:
# true_path = doc["path"].replace("reviews/?p=", "") + doc["title"]

true_path = "http://steamcommunity.com/app/413150/Stardew_Valley"

# make the request for the path
req = requests.get(true_path)

print "web page code:", str(req)
data = BeautifulSoup(req.content, "lxml")

print data.prettify()

# page_dict = {"title":doc["title"], "data": data}

# print page_dict["title"]

# insert(dest_collection, page_dict, "title")

web page code: <Response [200]>
<!DOCTYPE html>
<html class=" responsive" lang="en">
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="width=device-width,initial-scale=1" name="viewport"/>
  <meta content="#171a21" name="theme-color"/>
  <title>
   Steam Community :: Stardew Valley
  </title>
  <link href="/favicon.ico" rel="shortcut icon" type="image/x-icon"/>
  <link href="http://community.edgecast.steamstatic.com/public/shared/css/motiva_sans.css?v=Sd0odMs2NjL1" rel="stylesheet" type="text/css"/>
  <link href="http://community.edgecast.steamstatic.com/public/shared/css/buttons.css?v=FMXZx9fv9yp_" rel="stylesheet" type="text/css"/>
  <link href="http://community.edgecast.steamstatic.com/public/shared/css/shared_global.css?v=aXnR3BCCrqcR" rel="stylesheet" type="text/css"/>
  <link href="http://community.edgecast.steamstatic.com/public/css/globalv2.css?v=3X1Hz39cVBqV" rel="stylesheet" type="text/css"/>
  <link href="http://community.edgecas

In [16]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait

# we can always use more time
import time

def extract_user_from_div(html):
    '''
    Takes in an html div from the steam website and pulls out the user_id
    that is embedded in the profile url
    
    returns the username as a string
    '''
    # find the profile by splitting by links
    links = html.split("<a href=")
    
    # step through the different links until we (hopefully) find what we want
    for i in range(1,len(links)):
        # see which link we're examining (debugging)
#         print i
#         print links[i]

        # check to see if the desired url is in this link split
        if "http://steamcommunity.com/id/" in links[i] or "http://steamcommunity.com/profiles/" in links[i]:
            #break out the string that we care about
            path = links[i].split(">")[0].strip('"').encode('ascii', 'ignore')
            
            # http://steamcommunity.com/id/koltira/
            user_name = path[:-1].split("/")[-1]
            
            # rarely there is a div in the username, trying to figure out why
            if "div" in user_name:
                for item in links:
                    print "####################### Error in username:", item
                    print
                    print
            
            # mission complete, return the username
            return user_name
    
    # It seems like some users don't have alias/usernames but still have
    # profile numbers.  Attempt to fail down to this profile # and use
    # that in place of the username
    
    
    return "-17 Error extracting username"
        
def extract_review_from_subdiv(review_html):
    '''
    Having been provided the div that contains the review text attempt
    to strip out some extra formatting and html and return a string
    that contains the written review.
    
    return the review as a string
    '''
    # chop out the junk div with the date in front
    #
    # ex:
    # u'\n\t\t\t\t<div class="date_posted">Posted: July 2</div>\n\t\t\t\t\t\t\t\t\t
    # \t\t\ti had 20 hours into this game within 2 days of buying 
    # it please send help\t\t\t'
    
#     print
#     print review_html
#     print
    
    raw_review = review_html.split("/div>")[1].encode("ascii", "ignore")
    
    review = " ".join(raw_review.split())
    
    return review

def extract_rating_from_div(html):
    '''
    Take in review tab html and extract if the user did thumbs up
    or thumbs down
    
    Return 1 if thumbs up, otherwise return 0
    '''
    vote = html.get_attribute("innerHTML")
    
    # try to be specific
    # if it's Recommended return 1
    # if it's Not Recommended return 0
    # if it's something else return -17 and print an error
    # (negative values being indicators that something unexpected occurred)
    if vote.lower() == "recommended":
        return 1
    elif vote.lower() == "not recommended":
        return 0
    else:
        print "Error in extract_rating_from_div!!! vote =", vote
        return -17
    
    
def get_game_reviews(app_id, count, app_title="test"):
    '''
    Using the path and the app_id go to the review website for a game
    and pull the first <count> top rated reviews for both positive and
    negative. (or the number of reviews that do exist if there's less than
    1000 reviews)
    
    returns:
    dictionary that contains:
    {
        "app_id": xxxx,
        "title": yyyy,
        "positive_reviews" : [<list of user info dictionaries>],
        "negative_reviews": [<list of user info dictionaries>]    
    }
    '''
    
    pos_path = "http://steamcommunity.com/app/{}/positivereviews/?p=1&browsefilter=toprated".format(app_id)
    neg_path = "http://steamcommunity.com/app/{}/negativereviews/?p=1&browsefilter=toprated".format(app_id)
    
    pos_reviews = strip_mine_path(pos_path, count)
    neg_reviews = strip_mine_path(neg_path, count)
    
    
    results = {
        "app_id": app_id,
        "title": app_title,
        "positive_reviews": pos_reviews,
        "negative_reviews": neg_reviews
    }
    
    return results


def strip_mine_path(path, count):
    '''
    Drill into the path provided and pull <count> amount
    of reviews.  Process the page and pull data from the
    review divs. Insert data into a small dictionary and
    add it to an ongoing list.
    
    Returns:
    List of user reviews which consist of dictionaries.
    ex:
    {"user_id": user_id, "rating": 1, "review": 'review text'}
    '''
    
    driver = webdriver.Chrome()
    #driver = webdriver.PhantomJS()
    driver.set_window_size(900, 800)
    driver.get(path)

    assert "Steam" in driver.title

    # try to scroll down several times then slight pause before scrolling again
    # count // 10 because there's 10 reviews per page
    for x in range(count // 10):
        
        if x == 0:
            previous_page = len(driver.page_source)
        
        # try to get the before height
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        
        # trying to scroll down
        time.sleep(1)
        
        current_page = len(driver.page_source)
        
        if current_page == previous_page:
            print "seems like hit the end of the page, breaking now"
            break
        else:
            previous_page = current_page
        
        

    # At this point data should be loaded by the page
    # start processing!
    big_results = driver.find_elements_by_class_name("apphub_Card")

    # find the div with the review text
    text_results = driver.find_elements_by_class_name("apphub_CardTextContent")

    # find the div with thumbs up/down
    y_labels =  driver.find_elements_by_class_name("title")

    print
    print "Number of divs found:", len(big_results)
    print
    print

    user_data = []
    user_set = set()
    
    # step through all off the resulting divs
    # use an index because we access multiple lists
    # at the same place
    for idx in xrange(len(big_results)):

        # full review div element
        big_result = big_results[idx]
        text_result = text_results[idx]
        y_label = y_labels[idx]

        # extract html from selenium element
        big_data = big_result.get_attribute("innerHTML")
        text_review = text_result.get_attribute("innerHTML")
        raw_text_review = big_result.find_element_by_class_name("apphub_CardTextContent").get_attribute("innerHTML")

        # extract data
        text_review = extract_review_from_subdiv(raw_text_review)
        user_name = extract_user_from_div(big_data)
        rating = extract_rating_from_div(y_label)
        
        # consolidate data
        user_info = {"user":user_name, 
                     "rating":rating, 
                     "review":text_review}
    
        # add data to list
        user_data.append(user_info)
        
        user_set.add((user_name, rating, text_review))

        # log what was found on screen
        print "Rating: {} User: {:<20} Review: {:<40}".format(user_info["rating"], 
                                                              user_info["user"], 
                                                              user_info["review"][:50])
    
    # close the web page
    driver.close()  
    
    print "list len:", len(user_data)
    print "set len:", len(user_set)
        
    return user_data

In [18]:
game_results = get_game_reviews(638850, 1250)

seems like hit the end of the page, breaking now

Number of divs found: 13


Rating: 1 User: TomGirlGamer         Review: <div class="received_compensation">Product receive
Rating: 1 User: 76561198397151566    Review: <div class="early_access_review">Early Access Revi
Rating: 1 User: WarmTummyRubs        Review: <div class="early_access_review">Early Access Revi
Rating: 1 User: 76561197965334713    Review: <div class="early_access_review">Early Access Revi
Rating: 1 User: 76561198168764957    Review: <div class="received_compensation">Product receive
Rating: 1 User: 76561198136846931    Review: <div class="early_access_review">Early Access Revi
Rating: 1 User: everenchanted        Review: <div class="early_access_review">Early Access Revi
Rating: 1 User: celticsting          Review: <div class="early_access_review">Early Access Revi
Rating: 1 User: 76561198089978432    Review: <div class="received_compensation">Product receive
Rating: 1 User: DukusMaximus         Review: <div class="ea

In [193]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait

# we can always use more time
import time

# path = "http://steamcommunity.com/app/413150/reviews/"

# prob intentionally scrap top reviews for positive and negative to attempt to balance classes
path = "http://steamcommunity.com/app/413150/positivereviews/?p=1&browsefilter=toprated"
neg_path = "http://steamcommunity.com/app/413150/negativereviews/?p=1&browsefilter=toprated"

driver = webdriver.Chrome()
driver.get(path)

assert "Steam" in driver.title

# <div class="apphub_CardTextContent">

# elem.clear()
# elem.send_keys("pycon")
# elem.send_keys(Keys.RETURN)
# assert "No results found." not in driver.page_source
# driver.close()

info_list = []

# try to scroll down several times then slight pause before scrolling again
for x in range(100):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)
    
    
# At this point data should be loaded by the page
# start processing!
big_results = driver.find_elements_by_class_name("apphub_Card")
results = driver.find_elements_by_class_name("apphub_CardContentMain")

# find the div with the review text
text_results = driver.find_elements_by_class_name("apphub_CardTextContent")

# find the div with thumbs up/down
y_labels =  driver.find_elements_by_class_name("title")

print
print "Number of divs found:", len(results)
print
print

user_data = []
user_set = set()
# step through all off the resulting divs
# use an index because we access multiple lists
# at the same place
for idx in xrange(len(big_results)):

    big_result = big_results[idx]
    result = results[idx]
    text_result = text_results[idx]
    y_label = y_labels[idx]

    big_data = big_result.get_attribute("innerHTML")
    data = result.get_attribute("innerHTML")
    text_review = text_result.get_attribute("innerHTML")

#     print big_data[:500]
#     for i in range(5):
#         print

    raw_text_review = big_result.find_element_by_class_name("apphub_CardTextContent").get_attribute("innerHTML")

    text_review = extract_review_from_subdiv(raw_text_review)
    user_name = extract_user_from_div(big_data)
    rating = extract_rating_from_div(y_label)

#     print "#############################"
#     print "### Review ##################"
#     print "#############################"
#     print
#     print text_review
#     print
#     print "#############################"
#     print "### User Id #################"
#     print "#############################"
#     print
#     print user_name
#     print
#     print "#############################"
#     print "### Rating ##################"
#     print "#############################"
#     print
#     print rating
#     print
#     print "#############################"

    user_info = {"user":user_name, "rating":rating, "review":text_review}
    
    user_data.append(user_info)
    user_set.add(user_name, rating, text_review)

    print repr(user_info)
    print
    print

    # print
    # print data
    # print
    # print text_review



print
print
print
print "number of user_data[] found:", len(user_data)
print "number of user_set() found:", len(user_set)


    
    

WebDriverException: Message: chrome not reachable
  (Session info: chrome=58.0.3029.110)
  (Driver info: chromedriver=2.30.477691 (6ee44a7247c639c0703f291d320bdf05c1531b57),platform=Linux 4.8.0-56-generic x86_64)


In [43]:
hrefs = big_data.split("<a href=")

#print big_data
for i in range(len(hrefs)):
    print i
#     print hrefs[i]
    if "http://steamcommunity.com/id/" in hrefs[i]:
        print str(hrefs[i].split(">")[0].strip('"'))
        break

# for item in hrefs[0]:
#     print item
#     print
#     print "<<<<<<<<<<<<<<<<<<<<<<<<<<<<<>>>>>>>>>>>>>>>>>>>>>>>>>>"
#     print

0
1
http://steamcommunity.com/id/koltira/


In [117]:
other_user_name = big_results[1].get_attribute("innerHTML")

print other_user_name

# find the profile by splitting by links
links = other_user_name.split("<a href=")

# step through the different links until we (hopefully) find what we want
for i in range(len(links)):
    # see which link we're examining (debugging)
#         print i
#         print links[i]

    # check to see if the desired url is in this link split
    if "http://steamcommunity.com/id/" in links[i]:
        #break out the string that we care about
        path = links[i].split(">")[0].strip('"').encode('ascii', 'ignore')

        # http://steamcommunity.com/id/koltira/
        user_name = path[:-1].split("/")[-1]

        # mission complete, return the username
        print user_name
        
    if "http://steamcommunity.com/profiles/" in links[i]:
                #break out the string that we care about
        path = links[i].split(">")[0].strip('"').encode('ascii', 'ignore')

        # http://steamcommunity.com/id/koltira/
        user_name = path[:-1].split("/")[-1]

        # mission complete, return the username
        print user_name


	<div class="apphub_CardContentMain" style="height: 287px;">
		<div class="apphub_UserReviewCardContent">
			<div class="found_helpful">
				75 of 90 people (83%) found this review helpful<br>4 people found this review funny			</div>

			<div class="vote_header">
								<div class="reviewInfo">
					<div class="thumb">
						<img src="http://steamcommunity-a.akamaihd.net/public/shared/images/userreviews/icon_thumbsUp.png?v=1" width="44" height="44">
					</div>

										<div class="title">Recommended</div>
															<div class="hours">27.3 hrs on record</div>
									</div>
				<div style="clear: left"></div>
			</div>

			<div class="apphub_CardTextContent">
				<div class="date_posted">Posted: July 2</div>
												Just started playing this game, but its a whole lot of fun, and a steal @ 8.99 on the summer sale.  A definite must buy.			</div>
		</div>
		<div class="UserReviewCardContent_Footer">
			<div class="gradient">&nbsp;</div>
					</div>
	</div>

		<div class="app

In [85]:
raw_review = text_review.split("/div>")[1].encode("ascii", "ignore")

print repr(text_review)

test = '\n\t\t\t\t\t\t\t\t\t\t\t\tI had 20 hours into this game within 2 days of buying it please send help. Then I went to the store and, then, I bought a Darwin.\t\t\t'

print repr(raw_review)
review = " ".join(test.split())

print
print review

u'\n\t\t\t\t<div class="date_posted">Posted: July 2</div>\n\t\t\t\t\t\t\t\t\t\t\t\ti had 20 hours into this game within 2 days of buying it please send help\t\t\t'
'\n\t\t\t\t\t\t\t\t\t\t\t\ti had 20 hours into this game within 2 days of buying it please send help\t\t\t'

I had 20 hours into this game within 2 days of buying it please send help. Then I went to the store and, then, I bought a Darwin.


In [102]:
print y_label.get_attribute("innerHTML")

Recommended


In [187]:
# list of apps to scrape reviews from
list_of_apps = ["413150", "367520", "286160", "246620", "257850", "105600", "211820", "311690", "233450", "250760"]








In [188]:
boink = {"a":1, "b":2}

In [192]:
boink.keys().count("a")

1