<a href="https://colab.research.google.com/github/Cecax27/DS-Stardew-Valley-Crops-Profit/blob/main/notebooks/Stardew_valley_web_scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Web Scraper on Stardew Valley wiki

This notebook is to make web scraping on the Stardew Valley Wiki. We need to extract information about the crops, and the products that you can create with them.

The url of the website is https://stardewvalleywiki.com. You can visit it to find a lot of information about the game.

## Importing libraries

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from pprint import pprint

## Declaring constants

In [2]:
home_url = 'https://stardewvalleywiki.com'
crops_url = 'https://stardewvalleywiki.com/Crops'

## Get crops urls list

In [3]:
# Making the request
crops_page = requests.get(crops_url)

# Pasing the HTML with BeatifulSoup
crops_soup = BeautifulSoup(crops_page.text, 'lxml')

#Finding the urls
crops_names = crops_soup.find('div', attrs = {'class' : 'mw-parser-output'}).find_all('h3')[10:-1]

#Making a links list
crops_links_list = [home_url + crop_name.find_all('a')[1].get('href') for crop_name in crops_names]
crops_names_list = [crop_name.get_text().strip() for crop_name in crops_names]

print(f'{len(crops_links_list)} links founded')

43 links founded


## Declaring functions to extract data

In [4]:
# Function to extract information about the seed

def get_seed_information(link) -> dict:

  # Getting data to work
  try:
    s = requests.get(link)
    raw_information = BeautifulSoup(s.text, 'lxml')
  except Exception as e:
    print('An error ocurred making the request:')
    print(e)
    print('\n')
    return dict()

  # Dictionary to save the data
  price_dict = dict()

  # Finding data
  for row in raw_information.find('table').find_all('tr'):

    title = row.find('td').get_text().strip().replace(':', '').lower().replace(' ', '_')

    if title == 'general_store':
      try:
        price_dict[title+'_price'] = int(row.find_all('td')[1].find_all('span')[1].get_text()[:-1])
      except:
        price_dict[title+'_price'] = np.nan

    elif title == 'jojamart':
      try:
        price_dict[title+'_price'] = int(row.find_all('td')[1].find_all('span')[1].get_text()[:-1])
      except:
        price_dict[title+'_price'] = np.nan

    elif title == 'oasis':
      price_dict[title+'_price'] = int(row.find_all('td')[1].find_all('span')[1].get_text()[:-1])

    elif title == 'traveling_cart':
      try:
        price_dict[title+'_price_min'] = int(row.find_all('td')[1].get_text().split('"')[-1].split('g')[0].replace(',','').split('–')[0])
        price_dict[title+'_price_max'] = int(row.find_all('td')[1].get_text().split('"')[-1].split('g')[0].replace(',','').split('–')[1])
      except:
        price_dict[title+'_price_min'] = np.nan
        price_dict[title+'_price_max'] = np.nan

    elif title == 'night_market(winter_15)':
      price_dict[title.split('(')[0]+'_price'] = int(row.find_all('td')[1].find_all('span')[1].get_text()[:-1])

    elif title == 'other':
      price_dict[row.find_all('td')[1].find('a').get_text().strip().lower().replace(' ', '_')+'_price'] = int(row.find_all('td')[1].find_all('span')[-1].get_text().split('g')[0])

  return price_dict

In [5]:
# Testing
pprint(get_seed_information('https://stardewvalleywiki.com/Starfruit_Seeds'))

{'general_store_price': nan,
 'jojamart_price': nan,
 'oasis_price': 400,
 'traveling_cart_price_max': 1000,
 'traveling_cart_price_min': 600}


In [23]:
from re import S
def get_crop_information(link) -> dict:

  # Getting data to work
  try:
    request = requests.get(link)
    s = BeautifulSoup(request.text, 'lxml')
    raw_information = s.find('table').find_all('tr')
  except Exception as e:
    print('An error ocurred making the request:')
    print(e)
    print('\n')
    return dict()

  # Constants
  prices_sufix = ['regular', 'silver', 'gold', 'iridium']

  # Dictionary to save the data
  inf_dict = {
      'name' : raw_information[0].get_text().strip(),
      'description' : raw_information[2].get_text().strip()
  }

  # Finding data
  for data in raw_information[4:]:

    field = data.find('td').get_text().strip().lower().replace(' ', '_')

    if '\t' in field or ':' in field or '/' in field or field == '' or field[0].isdigit():
      continue

    if field == 'sell_price' or field == 'sell_prices':
      try:
        prices = data.find('table').find_all('td')
        prices = list(map(lambda i: i.get_text().split('g')[0], prices))[1:]
        prices = list(filter(lambda i: i != '', prices))
        prices = list(map(lambda i: int(i.replace(',', '')), prices))
        for index, price in enumerate(prices):
          inf_dict[field+'_'+prices_sufix[index]] = price
        continue
      except Exception as e:
        try:
          index = raw_information.index(data) + 2
          prices = raw_information[index].find('table').find_all('td')
          prices = list(map(lambda i: i.get_text().split('g')[0].replace(',',''), prices))[1:]
          prices = list(filter(lambda i: i != '', prices))
          prices = list(map(lambda i: int(i), prices))
          for index, price in enumerate(prices):
            inf_dict[field+'_'+prices_sufix[index]] = price
          continue
        except Exception as e:
          td = data.find_all('td')
          if td:
            span = td[-1].find_all('span')
            if span:
              price = int(span[-1].get_text().split('g')[0].strip())
              for index in range(4):
                inf_dict[field+'_'+prices_sufix[index]] = price
              continue
          print(f'An error ocurred extracting the data "{field}" from {link}:')
          print(e)
          print('\n')
        continue

    if field == 'seed':
      try:
        seed_link = data.find_all('td')[1].find('a').get('href')
        if seed_link:
          url = home_url + seed_link
          inf_dict.update(get_seed_information(url))
      except Exception as e:
        print(f'An error ocurred extracting the data "{field}" from {link}:')
        print(e)
        print('\n')
    try:
      inf_dict[field] = data.find_all('td')[1].get_text().strip().replace(' • ', ',')
    except:
      pass

  # Found if it regrowth or not
  try:
    table = s.find('table', attrs = {'class': 'wikitable roundedborder'})
    row = table.find_all('tr')[2]
    cell = row.find_all('td')[-1]
    if 'Regrowth' in cell.get_text():
      days = int(cell.get_text().split(' ')[1])
      inf_dict['regrowth_time'] = days
    else:
      inf_dict['regrowth_time'] = np.nan
  except:
    inf_dict['regrowth_time'] = np.nan

  return inf_dict

In [24]:
# Testing
pprint(get_crop_information('https://stardewvalleywiki.com/Green_Bean'))

{'base': 'Artisan (+40%)',
 'description': 'A juicy little bean with a cool, crisp snap.',
 'general_store_price': 60,
 'growth_time': '10 days',
 'jojamart_price': 75,
 'name': 'Green Bean',
 'night_market_price': 60,
 'regrowth_time': 3,
 'season': 'Spring',
 'seed': 'Bean Starter',
 'sell_prices_gold': 60,
 'sell_prices_iridium': 80,
 'sell_prices_regular': 40,
 'sell_prices_silver': 50,
 'traveling_cart_price_max': 1000,
 'traveling_cart_price_min': 100,
 'xp': '9 Farming XP'}


## Extracting all the data

I'll call my extract data functions with my links list. Also I'll save the data in a Pandas DataFrame.

In [25]:
# Extracting data
df_data = [get_crop_information(link) for link in crops_links_list]

# Extracting DataFrame columns
columns = set()
for sublist in [list(item.keys()) for item in df_data]:
  for element in sublist:
    columns.add(element)
columns = list(columns)

# Making the DataFrame
df = pd.DataFrame(df_data, columns = columns)

print(f'DataFrame created with {df.shape[0]} rows and {df.shape[1]} columns\n')
print('Columns:')
print(df.dtypes)

DataFrame created with 43 rows and 26 columns

Columns:
growth_time                  object
source                       object
sell_prices_silver          float64
seed                         object
sell_prices_gold            float64
description                  object
general_store_price         float64
night_market_price          float64
sell_price_regular          float64
egg_festival_price          float64
sell_price_silver           float64
traveling_cart_price_min    float64
sell_prices_iridium         float64
sell_price_iridium          float64
artisan_sell_price           object
sell_price_gold             float64
sell_prices_regular         float64
traveling_cart_price_max    float64
regrowth_time               float64
base                         object
oasis_price                 float64
xp                           object
energy                       object
season                       object
jojamart_price              float64
name                         object
dtype: o

In [26]:
print(f'DataFrame created with {df.shape[0]} rows and {df.shape[1]} columns\n')
print('Columns:')
print(df.dtypes)

DataFrame created with 43 rows and 26 columns

Columns:
growth_time                  object
source                       object
sell_prices_silver          float64
seed                         object
sell_prices_gold            float64
description                  object
general_store_price         float64
night_market_price          float64
sell_price_regular          float64
egg_festival_price          float64
sell_price_silver           float64
traveling_cart_price_min    float64
sell_prices_iridium         float64
sell_price_iridium          float64
artisan_sell_price           object
sell_price_gold             float64
sell_prices_regular         float64
traveling_cart_price_max    float64
regrowth_time               float64
base                         object
oasis_price                 float64
xp                           object
energy                       object
season                       object
jojamart_price              float64
name                         object
dtype: o

Now, I'll save my data in crops_raw_data.csv file.

In [27]:
df.to_csv('crops_raw_data.csv')