In [1]:
from urllib.request import Request
from urllib.request import urlretrieve
from urllib.request import urlopen
from bs4 import BeautifulSoup
import time
import pandas as pd
from datetime import datetime
import csv
import unicodedata

In [2]:
#helpers and a function to get all relevant info about games from a single listing page
def _parse_price(text):
    """Turn a price label into a float: drops spaces and 'zł', uses '.' as the decimal mark."""
    return float(text.replace(" ", "").replace("zł", "").replace(",", "."))


def _parse_offers(text):
    """Turn an offer-count label into a float by stripping the surrounding Polish words."""
    #order matters: "ofert" must go before the leftover "y"/"a" endings are removed
    for token in ("ofert", "y", "a", "od", " "):
        text = text.replace(token, "")
    return float(text)


def _clean_tags(text):
    """Collapse the layout whitespace in the tag line and reduce it to plain ASCII."""
    text = text.replace('            ', '').replace('\n', '').replace(' ', '', 1)
    #getting rid of special Polish characters: NFD splits off the diacritics and the
    #ASCII encode then drops them (characters with no ASCII base are dropped entirely)
    ascii_bytes = unicodedata.normalize('NFD', text).encode('ascii', 'ignore')
    #decode instead of str(bytes): the bytes repr hack removed real apostrophes from tags
    #and left stray double quotes whenever a tag contained one
    return ascii_bytes.decode('ascii')


def getinfo(url):
    """Scrape one listing page and return a row of data for every game on it.

    Parameters
    ----------
    url : str
        Address of the listing page to download.

    Returns
    -------
    list of list
        One ``[name, price, offers, tags, day]`` entry per game found, where
        ``price`` and ``offers`` are floats, ``tags`` is an ASCII string and
        ``day`` is the date of the scrape formatted as ``YYYY/MM/DD``.

    Raises
    ------
    urllib.error.URLError
        If the page cannot be downloaded.
    AttributeError
        If a game entry lacks one of the expected HTML elements.
    ValueError
        If a price or offer count cannot be converted to a number.
    """
    #opening the url with BeautifulSoup
    user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
    headers = {'User-Agent': user_agent}
    request = Request(url, None, headers)
    #the with-block closes the connection even if reading or parsing fails
    with urlopen(request) as response:
        bs = BeautifulSoup(response.read(), 'html.parser')

    #creating list of all games on single page using BeautifulSoup syntax
    games_list = bs.find_all('div', {'class':'col-md-12 col-xs-12 col-sm-12 item'})

    #one timestamp per page, so every row of a page carries the same day
    day = datetime.now().strftime('%Y/%m/%d')

    #iterating over the list of games to get all relevant info using BeautifulSoup syntax
    price_list = []
    for game in games_list:
        #each container is looked up once and then reused for its child elements
        price_box = game.find('div', {'class':'col-md-2 col-sm-2 col-xs-2 text-right oswald pc'})
        title_box = game.find('div', {'class':'col-md-7 col-sm-4 col-xs-6 nopadding'})

        price = _parse_price(price_box.find('p', {'class':'prc'}).get_text())
        offers = _parse_offers(price_box.find('p', {'class':'prc-text'}).get_text())
        name = title_box.h4.a.get_text()
        tags = _clean_tags(title_box.find('div', {'class':'ptsans zero trader pc'}).get_text())

        price_list.append([name, price, offers, tags, day])
    return price_list

In [3]:
#iterating over "last activity" pages to get most recent data
LISTING_URL = 'https://bazar.lowcygier.pl/?type%5B0%5D=sell&platform=&payment%5B0%5D=1&game_type=&game_genre=&title=&game_id=&sort=-last_activity&per-page=100&page='

final_list = []
#remembering the previous page instead of downloading it a second time for the
#comparison: this halves the number of requests and avoids fetching page 0
previous_page = None
for i in range(1, 200):
    getinfo_list = getinfo(LISTING_URL + str(i))
    #breaking the loop if the page is the same as the previous one
    #(previous_page is None on the first pass, so page 1 is never treated as a repeat)
    if getinfo_list == previous_page:
        print("Completed!")
        break
    #extend appends in place instead of rebuilding the whole list on every page
    final_list.extend(getinfo_list)
    previous_page = getinfo_list
    #adding some visual feedback
    if i % 10 == 0:
        print(i)
    #let's not kill the server!
    time.sleep(1)

10
20
30
40
50
60
70
80
Completed!


In [4]:
#building the DataFrame of freshly scraped rows, labelled from 1 upwards
column_names = ['Game', 'Price', 'Offers', 'Tags', 'Day']
row_labels = range(1, len(final_list) + 1)
new = pd.DataFrame(final_list, index=row_labels, columns=column_names)

In [5]:
#loading current database (the first CSV column holds the saved index)
old = pd.read_csv('games_full_data.csv', index_col = 0)
#adding new data below the old rows and renumbering the index from 0;
#pd.concat is used because DataFrame.append was removed in pandas 2.0
final = pd.concat([old, new], ignore_index = True)

In [6]:
#checking for multiplied entries and deleting them: for every (Game, Day) pair only
#the first row is kept and the surviving rows keep their index labels.
#drop_duplicates does this in one vectorised pass, instead of scanning a growing
#list and copying the whole frame for every dropped row
final = final.drop_duplicates(subset = ['Game', 'Day'], keep = 'first')

In [7]:
#summary statistics of the combined data, as a sanity check before saving
final.describe()

Unnamed: 0,Price,Offers
count,25232.0,25232.0
mean,3.970338,2.712349
std,6.188498,3.046495
min,0.5,1.0
25%,0.79,1.0
50%,1.895,2.0
75%,4.0,3.0
max,55.0,61.0


In [8]:
#overwriting the old file with the combined, de-duplicated data;
#this replaces the same 'games_full_data.csv' that was loaded earlier with index_col = 0
final.to_csv('games_full_data.csv')

In [13]:
#previewing the first 10 rows of the saved data
final.head(10)

Unnamed: 0,Game,Price,Offers,Tags,Day
0,Zombie Night Terror,1.0,12.0,"Niezalezne, Akcja, Strategiczne",2019/05/02
1,Aaero,0.5,15.0,"Niezalezne, Akcja",2019/05/02
2,Dandara,0.5,49.0,"Niezalezne, Akcja",2019/05/02
3,Sword of the Stars: The Pit - Osmium Edition,9.99,1.0,"Niezalezne, RPG, Strategiczne, Rekreacyjne",2019/05/02
4,Wandersong,0.99,37.0,"Niezalezne, Przygodowe",2019/05/02
5,Just Cause 3 XXL Edition,13.98,18.0,"Akcja, RPG, Przygodowe",2019/05/02
6,This War of Mine - The Little Ones DLC,18.0,1.0,"Niezalezne, Symulacje, Przygodowe",2019/05/02
7,Absolver,3.68,27.0,"Niezalezne, Akcja, Przygodowe",2019/05/02
8,Full Metal Furies,2.49,10.0,"Niezalezne, Akcja, RPG, Przygodowe",2019/05/02
9,Worms Ultimate Mayhem - Deluxe Edition,10.4,7.0,Strategiczne,2019/05/02
