<a href="https://colab.research.google.com/github/Cecax27/DS-Stardew-Valley-Crops-Profit/blob/main/notebooks/Stardew_valley_web_scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Web Scraper on Stardew Valley wiki

This notebook is to make web scraping on the Stardew Valley Wiki. We need to extract information about the crops, and the products that you can create with them.

The url of the website is https://stardewvalleywiki.com. You can visit it to find a lot of information about the game.

## Importing libraries

In [42]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from pprint import pprint

## Declaring constants

In [3]:
home_url = 'https://stardewvalleywiki.com'
crops_url = 'https://stardewvalleywiki.com/Crops'

## Get crops urls list

In [19]:
# Making the request
crops_page = requests.get(crops_url)

# Pasing the HTML with BeatifulSoup
crops_soup = BeautifulSoup(crops_page.text, 'lxml')

#Finding the urls
crops_names = crops_soup.find('div', attrs = {'class' : 'mw-parser-output'}).find_all('h3')[10:-1]

#Making a links list
crops_links_list = [home_url + crop_name.find_all('a')[1].get('href') for crop_name in crops_names]
crops_names_list = [crop_name.get_text().strip() for crop_name in crops_names]

print(f'{len(crops_links_list)} links founded')

43 links founded


## Declaring functions to extract data

In [39]:
# Function to extract information about the seed

def get_seed_information(link) -> dict:

  # Getting data to work
  try:
    s = requests.get(link)
    raw_information = BeautifulSoup(s.text, 'lxml')
  except Exception as e:
    print('An error ocurred making the request:')
    print(e)
    print('/n')
    return dict()

  # Dictionary to save the data
  price_dict = dict()

  # Finding data
  for row in raw_information.find('table').find_all('tr'):

    title = row.find('td').get_text().strip().replace(':', '').lower().replace(' ', '_')

    if title == 'general_store':
      try:
        price_dict[title+'_price'] = int(row.find_all('td')[1].find_all('span')[1].get_text()[:-1])
      except:
        price_dict[title+'_price'] = np.nan

    elif title == 'jojamart':
      try:
        price_dict[title+'_price'] = int(row.find_all('td')[1].find_all('span')[1].get_text()[:-1])
      except:
        price_dict[title+'_price'] = np.nan

    elif title == 'oasis':
      price_dict[title+'_price'] = int(row.find_all('td')[1].find_all('span')[1].get_text()[:-1])

    elif title == 'traveling_cart':
      price_dict[title+'_price_min'] = int(row.find_all('td')[1].get_text().split('"')[-1].split('g')[0].replace(',','').split('–')[0])
      price_dict[title+'_price_max'] = int(row.find_all('td')[1].get_text().split('"')[-1].split('g')[0].replace(',','').split('–')[1])

    elif title == 'night_market(winter_15)':
      price_dict[title.split('(')[0]+'_price'] = int(row.find_all('td')[1].find_all('span')[1].get_text()[:-1])

  return price_dict

In [43]:
# Testing
pprint(get_seed_information('https://stardewvalleywiki.com/Starfruit_Seeds'))

{'general_store_price': nan,
 'jojamart_price': nan,
 'oasis_price': 400,
 'traveling_cart_price_max': 1000,
 'traveling_cart_price_min': 600}


In [44]:
def get_crop_information(link) -> dict:

  # Getting data to work
  try:
    s = requests.get(link)
    raw_information = BeautifulSoup(s.text, 'lxml').find('table').find_all('tr')
  except Exception as e:
    print('An error ocurred making the request:')
    print(e)
    print('/n')
    return dict()

  # Constants
  prices_sufix = ['regular', 'silver', 'gold', 'iridium']

  # Dictionary to save the data
  inf_dict = {
      'name' : raw_information[0].get_text().strip(),
      'description' : raw_information[2].get_text().strip()
  }

  # Finding data
  for data in raw_information[4:]:

    field = data.find('td').get_text().strip().lower().replace(' ', '_')

    if '\t' in field or ':' in field or '/' in field or field == '' or field[0].isdigit():
      continue

    if field == 'sell_price' or field == 'sell_prices':
      try:
        prices = data.find('table').find_all('td')
        prices = list(map(lambda i: i.get_text().split('g')[0], prices))[1:]
        prices = list(filter(lambda i: i != '', prices))
        prices = list(map(lambda i: int(i), prices))
        for index, price in enumerate(prices):
          inf_dict[field+'_'+prices_sufix[index]] = price
        continue
      except Exception as e:
        try:
          index = raw_information.index(data) + 2
          prices = raw_information[index].find('table').find_all('td')
          prices = list(map(lambda i: i.get_text().split('g')[0].replace(',',''), prices))[1:]
          prices = list(filter(lambda i: i != '', prices))
          prices = list(map(lambda i: int(i), prices))
          for index, price in enumerate(prices):
            inf_dict[field+'_'+prices_sufix[index]] = price
          continue
        except Exception as e:
          print(e)
        continue
    if field == 'seed':
      try:
        url = home_url + data.find_all('td')[1].find('a').get('href')
        inf_dict.update(get_seed_information(url))
      except Exception as e:
        print(e)
    try:
      inf_dict[field] = data.find_all('td')[1].get_text().strip().replace(' • ', ',')
    except:
      pass

  return inf_dict

In [46]:
# Testing
pprint(get_crop_information('https://stardewvalleywiki.com/Blue_Jazz'))

{'base': 'Artisan (+40%)',
 'description': 'The flower grows in a sphere to invite as many butterflies as '
                'possible.',
 'general_store_price': 30,
 'growth_time': '7 days',
 'jojamart_price': 37,
 'name': 'Blue Jazz',
 'night_market_price': 30,
 'season': 'Spring',
 'seed': 'Jazz Seeds',
 'sell_prices_gold': 75,
 'sell_prices_iridium': 100,
 'sell_prices_regular': 50,
 'sell_prices_silver': 62,
 'traveling_cart_price_max': 1000,
 'traveling_cart_price_min': 100,
 'xp': '10 Farming XP'}


Extracting all the data

In [49]:
df_data = [get_crop_information(link) for link in crops_links_list]
columns = [list(item.keys()) for item in df_data]
columns = set()
for sublist in [list(item.keys()) for item in df_data]:
  for element in sublist:
    columns.add(element)
columns = list(columns)
df = pd.DataFrame(df_data, columns = columns)

can only concatenate str (not "NoneType") to str
invalid literal for int() with base 10: 'Not Sold\n'
invalid literal for int() with base 10: 'Not sold\n'
list index out of range
invalid literal for int() with base 10: 'Not sold\n'
invalid literal for int() with base 10: 'Not sold\n'
'NoneType' object has no attribute 'find_all'


In [51]:
name_column = df.pop('name')
df.insert(0, 'name', name_column)
df.sort_values('name')
#Joining the sell_price columns

df['sell_price_regular'] = df['sell_price_regular'].combine_first(df['sell_prices_regular'])
df = df.drop('sell_prices_regular', axis=1)

df['sell_price_silver'] = df['sell_price_silver'].combine_first(df['sell_prices_silver'])
df = df.drop('sell_prices_silver', axis=1)

df['sell_price_gold'] = df['sell_price_gold'].combine_first(df['sell_prices_gold'])
df = df.drop('sell_prices_gold', axis=1)

df['sell_price_iridium'] = df['sell_price_iridium'].combine_first(df['sell_prices_iridium'])
df = df.drop('sell_prices_iridium', axis=1)

# Create separate columns for each season
seasons = ['Spring', 'Summer', 'Fall', 'Winter']
for season in seasons:
    df[season.lower()] = df['season'].str.contains(season, case=False)

# Fill the values with True or False
df = df.replace({True: 'True', False: 'False'})
df = df.drop('season', axis=1)

# Print the resulting dataframe
df.head()

KeyError: ignored

In [52]:
df

Unnamed: 0,name,description,sell_price_gold,traveling_cart_price_max,traveling_cart_price_min,source,sell_price_silver,energy,base,xp,...,sell_price_iridium,jojamart_price,sell_price_regular,growth_time,seed,artisan_sell_price,spring,summer,fall,winter
0,Blue Jazz,The flower grows in a sphere to invite as many...,75.0,1000.0,100.0,,62.0,,Artisan (+40%),10 Farming XP,...,100.0,37.0,50.0,7 days,Jazz Seeds,,True,False,False,False
1,Cauliflower,"Valuable, but slow-growing. Despite its pale c...",262.0,1000.0,120.0,,218.0,,Artisan (+40%),23 Farming XP,...,350.0,100.0,175.0,12 days,Cauliflower Seeds,,True,False,False,False
2,Coffee Bean,Plant in spring or summer to grow a coffee pla...,22.0,,,"Dust Sprite,Traveling Cart",18.0,Inedible,,4 Farming XP per harvest,...,30.0,,15.0,10 days,Coffee Bean,150g,True,True,False,False
3,Garlic,Adds a wonderful zestiness to dishes. High qua...,90.0,1000.0,100.0,,75.0,,Artisan (+40%),12 Farming XP,...,120.0,,60.0,4 days,Garlic Seeds,,True,False,False,False
4,Green Bean,"A juicy little bean with a cool, crisp snap.",60.0,1000.0,100.0,,50.0,,Artisan (+40%),9 Farming XP,...,80.0,75.0,40.0,10 days,Bean Starter,,True,False,False,False
5,Kale,The waxy leaves are great in soups and stir frys.,165.0,1000.0,105.0,,137.0,,Artisan (+40%),17 Farming XP,...,220.0,87.0,110.0,6 days,Kale Seeds,,True,False,False,False
6,Parsnip,A spring tuber closely related to the carrot. ...,52.0,1000.0,100.0,,43.0,,Artisan (+40%),8 Farming XP,...,70.0,25.0,35.0,4 days,Parsnip Seeds,,True,False,False,False
7,Potato,A widely cultivated tuber.,120.0,1000.0,100.0,,100.0,,Artisan (+40%),14 Farming XP,...,160.0,62.0,80.0,6 days,Potato Seeds,,True,False,False,False
8,Rhubarb,"The stalks are extremely tart, but make a grea...",330.0,1000.0,150.0,,275.0,Inedible,Artisan (+40%),26 Farming XP,...,440.0,,220.0,13 days,Rhubarb Seeds,,True,False,False,False
9,Strawberry,"A sweet, juicy favorite with an appealing red ...",180.0,,,,150.0,,Artisan (+40%),18 Farming XP,...,240.0,,120.0,8 days,Strawberry Seeds,,True,False,False,False


In [None]:
df['growth_time'] = (
    df
    .growth_time
    .fillna('0')
    .apply(lambda i: i.split(' ')[0])
    .astype(int)
)

In [None]:
df.to_csv('crops.csv')

**Deleting the Ginger Island crops**

I deleted the Ginger Island crops because you can't buy the seed in the stores. You can trade they but it's more difficult to compare the price of the trade with gold. The Ginger Island crops are:

* Taro roots
* Pineapple

Also, I deleted some crops in similar situations, like:

* Tea Leaves
* Sweet Gem Berry
* Mixed Seeds
* Fiber
* Cactus Fruit
* Strawberry
* Coffee Bean




In [None]:
df = df.drop(df[df['name'] == 'Taro Root'].index)
df = df.drop(df[df['name'] == 'Pineapple'].index)

df = df.drop(df[df['name'] == 'Tea Leaves'].index)
df = df.drop(df[df['name'] == 'Sweet Gem Berry'].index)
df = df.drop(df[df['name'] == 'Mixed Seeds'].index)
df = df.drop(df[df['name'] == 'Fiber'].index)
df = df.drop(df[df['name'] == 'Cactus Fruit'].index)
df = df.drop(df[df['name'] == 'Strawberry'].index)
df = df.drop(df[df['name'] == 'Coffee Bean'].index)

In [None]:
df.columns

Index(['name', 'sell_price_gold', 'growth_time', 'traveling_cart_price_max',
       'seed', 'energy', 'base', 'night_market_price', 'source',
       'sell_price_regular', 'sell_price_iridium', 'oasis_price', 'xp',
       'general_store_price', 'sell_price_silver', 'jojamart_price',
       'artisan_sell_price', 'description', 'traveling_cart_price_min',
       'spring', 'summer', 'fall', 'winter', 'price_min', 'gold_per_day'],
      dtype='object')

In [None]:
df['price_min'] = df[['night_market_price', 'oasis_price', 'general_store_price', 'jojamart_price', 'traveling_cart_price_min']].min(axis=1)

In [None]:
df[['name', 'sell_price_regular', 'growth_time', 'price_min']].head()

Unnamed: 0,name,sell_price_regular,growth_time,price_min
0,Blue Jazz,50.0,7,30.0
1,Cauliflower,175.0,12,80.0
3,Garlic,60.0,4,40.0
4,Green Bean,40.0,10,60.0
5,Kale,110.0,6,70.0


In [None]:
df['gold_per_day'] = ( df['sell_price_iridium'] - df['price_min'])  / df['growth_time']

In [None]:
df[df['spring'] == 'True'][['name', 'sell_price_iridium', 'growth_time', 'price_min', 'gold_per_day']].sort_values('gold_per_day', ascending = False)

Unnamed: 0,name,sell_price_iridium,growth_time,price_min,gold_per_day
35,Ancient Fruit,1100.0,28,100.0,35.714286
8,Rhubarb,440.0,13,100.0,26.153846
5,Kale,220.0,6,70.0,25.0
1,Cauliflower,350.0,12,80.0,22.5
3,Garlic,120.0,4,40.0,20.0
7,Potato,160.0,6,50.0,18.333333
6,Parsnip,70.0,4,20.0,12.5
0,Blue Jazz,100.0,7,30.0,10.0
10,Tulip,60.0,6,20.0,6.666667
11,Unmilled Rice,60.0,8,40.0,2.5
