# Scrape Wallstreet online

In [6]:
import requests
from bs4 import BeautifulSoup
from time import sleep
from random import randint
import pandas as pd
from datetime import datetime, date, timedelta
import re
import sqlite3

In [10]:
def scrape_futureprice():
    '''Scrape the wheat future price table from wallstreet-online.

    Downloads the wheat commodity page, extracts the rows of the contract
    table and returns them as a cleaned DataFrame. The function is
    best-effort: problems are reported with ``print`` instead of being
    raised, and the frame is returned in whatever state was reached.

    Returns
    -------
    pandas.DataFrame
        After successful cleaning the columns are ``kontrakt``, ``price``
        (float), ``absolut_inc``, ``perc_inc``, ``date_fullfillment``
        (datetime), ``date_price`` (datetime), ``currency`` and
        ``commodity_id``. The frame is empty when the page could not be
        fetched or no table rows matched the selector.
    '''
    url = 'https://www.wallstreet-online.de/rohstoffe/hu0002045586-weizen-1-tonne-1000-kg-europe-preis'
    raw_columns = ['Kontrakt', 'letzer_kurs', 'absolut', 'perf_perc', 'Fälligkeit', 'Vergleich', 'date_scraped']
    # NOTE(review): positional selector; it silently matches nothing if the
    # page layout changes - confirm against the live page.
    row_selector = 'div:nth-child(11) > table > tbody >tr'
    # Decimal number with a comma separator, e.g. "333,10". The capturing
    # group makes re.split return [text_before, number, text_after, ...].
    number_pattern = re.compile(r'(\d+,\d+)')

    # --- Fetch -----------------------------------------------------------
    soup = None
    try:
        # A timeout keeps the notebook from hanging forever on a stalled server.
        page = requests.get(url, timeout=30)
        soup = BeautifulSoup(page.content, features="lxml")
        print(f'Server respsonse: {page}')
    except Exception as err:
        print(f'Could not retrieve data from Wallstreet online: {err}')

    # --- Table to DataFrame ----------------------------------------------
    records = []
    if soup is not None:
        # The price date is recorded as the day before the scrape.
        date_scraped = date.today() - timedelta(days=1)
        try:
            for table_row in soup.select(row_selector):
                cells = [cell.get_text() for cell in table_row.select('td')]
                if len(cells) != len(raw_columns) - 1:
                    raise ValueError(
                        f'expected {len(raw_columns) - 1} cells per row, got {len(cells)}'
                    )
                records.append(cells + [date_scraped])
            print("Managed to load data into dataframe 'future_price_today'")
        except Exception as err:
            print(f"Error loading data into dataframe 'future_price_today': {err}")
        if not records:
            print(f"Warning: no table rows matched selector '{row_selector}'")
    # Build the frame once instead of concatenating row by row.
    future_price_today = pd.DataFrame(records, columns=raw_columns)

    # --- Data cleaning ---------------------------------------------------
    try:
        # Split "<number><currency>" cells into number and trailing text.
        # NOTE(review): text before the number (e.g. a sign) is not kept.
        absolut_parts = [number_pattern.split(text) for text in future_price_today['absolut']]
        future_price_today['currency'] = [parts[2] for parts in absolut_parts]
        future_price_today['absolut'] = [parts[1] for parts in absolut_parts]
        future_price_today['letzer_kurs'] = [
            number_pattern.split(text)[1] for text in future_price_today['letzer_kurs']
        ]

        future_price_today = future_price_today.drop(columns=['Vergleich'])
        future_price_today = future_price_today.rename(columns={
            'Kontrakt': 'kontrakt',
            'letzer_kurs': 'price',
            'absolut': 'absolut_inc',
            'perf_perc': 'perc_inc',
            'Fälligkeit': 'date_fullfillment',
            'date_scraped': 'date_price',
        })

        # Change datatype from string / date to datetime.
        future_price_today['date_fullfillment'] = pd.to_datetime(future_price_today['date_fullfillment'], format='%d.%m.%Y')
        future_price_today['date_price'] = pd.to_datetime(future_price_today['date_price'], format='%Y-%m-%d')

        # Decimal comma -> decimal point, then convert to a numeric dtype.
        future_price_today['price'] = pd.to_numeric(
            future_price_today['price'].str.replace(',', '.', regex=False)
        )
        future_price_today['commodity_id'] = 2 # commodity-id 2 = wheat
        print(f"Data loaded from wallstreet online:\n {future_price_today}")
    except Exception as err:
        print(f"Data cleaning was not succesful: {err}")
    return future_price_today

future_df = scrape_futureprice()

Server respsonse: <Response [200]>
Managed to load data into dataframe 'future_price_today'
Data loaded from wallstreet online:/n Empty DataFrame
Columns: [kontrakt, price, absolut_inc, perc_inc, date_fullfillment, date_price, currency, commodity_id]
Index: []


In [14]:
# Show column dtypes and non-null counts of the scraped frame.
# NOTE(review): the recorded output below reports 10 rows, while the recorded
# output of the scrape cell shows an empty frame, and the execution counts are
# not sequential - the outputs presumably come from different runs. Re-run the
# notebook top to bottom to confirm.
future_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   kontrakt           10 non-null     object        
 1   price              10 non-null     float64       
 2   absolut_inc        10 non-null     object        
 3   perc_inc           10 non-null     object        
 4   date_fullfillment  10 non-null     datetime64[ns]
 5   date_price         10 non-null     datetime64[ns]
 6   currency           10 non-null     object        
 7   commodity_id       10 non-null     int64         
dtypes: datetime64[ns](2), float64(1), int64(1), object(4)
memory usage: 768.0+ bytes


In [15]:
# Simulate a random price history for the scraped wheat contracts.
# For each base contract one synthetic row is created per day, going
# backwards from the contract's date_price (day 0 is date_price itself);
# the price is the scraped price plus a random integer offset.
# NOTE(review): re-running this cell appends another batch of simulated rows.
N_SIM_CONTRACTS = 9    # NOTE(review): recorded frame has 10 rows; row 9 gets no history - confirm intent
N_SIM_DAYS = 200       # number of simulated days per contract
MAX_PRICE_JITTER = 60  # price offset is drawn from [-60, 60]

simulated_rows = []
# Cap at the number of available rows so a short or empty frame does not
# raise IndexError.
for i in range(min(N_SIM_CONTRACTS, len(future_df))):
    base_row = future_df.iloc[i]
    for day in range(N_SIM_DAYS):
        sim_row = base_row.copy()
        sim_row["date_price"] = base_row["date_price"] - timedelta(days=day)
        sim_row["price"] = base_row["price"] + randint(-MAX_PRICE_JITTER, MAX_PRICE_JITTER)
        simulated_rows.append(sim_row)

# Concatenate once instead of once per simulated row; the simulated rows keep
# the index label of the contract row they were derived from.
if simulated_rows:
    future_df = pd.concat([future_df, pd.DataFrame(simulated_rows)], axis=0)
future_df

Unnamed: 0,kontrakt,price,absolut_inc,perc_inc,date_fullfillment,date_price,currency,commodity_id
0,Mai 2022,333.1,2540,"-7,09 %",2022-05-23,2022-05-18,GBP,2
1,Jul 2022,341.1,1745,"-4,87 %",2022-07-07,2022-05-18,GBP,2
2,Nov 2022,340.1,1140,"-3,24 %",2022-11-23,2022-05-18,GBP,2
3,Jan 2023,342.6,1090,"-3,08 %",2023-01-23,2022-05-18,GBP,2
4,Mär 2023,343.85,1040,"-2,94 %",2023-03-23,2022-05-18,GBP,2
...,...,...,...,...,...,...,...,...
8,Jan 2024,236.15,590,"-2,06 %",2024-01-23,2021-11-04,GBP,2
8,Jan 2024,320.15,590,"-2,06 %",2024-01-23,2021-11-03,GBP,2
8,Jan 2024,338.15,590,"-2,06 %",2024-01-23,2021-11-02,GBP,2
8,Jan 2024,265.15,590,"-2,06 %",2024-01-23,2021-11-01,GBP,2


In [19]:
# Keep only the columns that are selected for the price table, in that order.
price_table_columns = ['commodity_id', 'date_fullfillment', 'date_price', 'price', 'currency']
future_df_clean = future_df.loc[:, price_table_columns]

In [10]:
import sqlite3
def future_price_to_sql(future_price_today, db_path='contrcalc.db'):
    '''Append a price DataFrame to ``price_table`` in the SQLite database.

    Parameters
    ----------
    future_price_today : pandas.DataFrame
        Rows to append; the DataFrame index is not written.
    db_path : str, optional
        Path of the SQLite database file. Defaults to ``'contrcalc.db'``.

    Returns
    -------
    None
    '''
    con = sqlite3.connect(db_path)
    try:
        future_price_today.to_sql('price_table', con, if_exists='append', index=False)
        con.commit()
    finally:
        # Always release the connection, even if the insert failed.
        con.close()
    return
# NOTE(review): `tosql_df` is not defined anywhere in the visible notebook, so
# this call raises NameError on a fresh kernel. Presumably `future_df_clean`
# was meant - confirm before re-running.
future_price_to_sql(tosql_df)

In [17]:
import pandas as pd
# Read the complete price_table from the SQLite file and use its price_id
# column as the DataFrame index. The connection stays open in `con`.
con = sqlite3.connect('contrcalc.db')
price_df = pd.read_sql(sql='Select * FROM price_table', con=con, index_col='price_id')

In [18]:
price_df

Unnamed: 0_level_0,commodity_id,date_fullfillment,date_price,price,currency
price_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,2,2022-05-23 00:00:00,2022-05-17 00:00:00,358.50,GBP
2,2,2022-07-07 00:00:00,2022-05-17 00:00:00,358.55,GBP
3,2,2022-11-23 00:00:00,2022-05-17 00:00:00,351.50,GBP
4,2,2023-01-23 00:00:00,2022-05-17 00:00:00,353.50,GBP
5,2,2023-03-23 00:00:00,2022-05-17 00:00:00,354.25,GBP
...,...,...,...,...,...
6906,2,2024-01-23 00:00:00,2021-11-04 00:00:00,298.15,GBP
6907,2,2024-01-23 00:00:00,2021-11-03 00:00:00,300.15,GBP
6908,2,2024-01-23 00:00:00,2021-11-02 00:00:00,290.15,GBP
6909,2,2024-01-23 00:00:00,2021-11-01 00:00:00,235.15,GBP


In [20]:
# Stack the freshly scraped/simulated prices below the rows loaded from the db.
# NOTE(review): price_df is indexed by price_id while future_df_clean keeps its
# own row labels, so index values can collide after this concat. The date
# columns read from SQLite are presumably strings whereas future_df_clean holds
# datetimes - confirm, since a later duplicate check would then not match rows
# across the two sources.
price_df = pd.concat([price_df, future_df_clean], axis=0)

In [22]:
# Drop fully identical rows, keeping the first occurrence of each.
price_df = price_df.drop_duplicates(keep='first')

In [20]:
import pandas as pd

In [46]:
# Load the wheat price series from the local file 'wheat_df' (parsed as CSV)
# and display it.
# NOTE(review): the recorded output shows an 'Unnamed: 0' column, which
# suggests the file was saved together with its index - confirm.
wheat_df = pd.read_csv('wheat_df')
wheat_df

Unnamed: 0.1,Unnamed: 0,date,price
0,0,2021-12-31,247.123269
1,1,2022-01-01,243.810245
2,2,2022-01-02,243.885781
3,3,2022-01-03,243.904026
4,4,2022-01-04,242.068998
...,...,...,...
120,120,2022-05-07,384.904240
121,121,2022-05-08,384.904240
122,122,2022-05-09,385.118540
123,123,2022-05-10,376.465201


In [49]:
from datetime import date
# Column inputs for a price_table-shaped frame built from wheat_df.
# NOTE(review): populate_prices() further down repeats these exact steps;
# this scratch version is presumably obsolete - confirm.
price = wheat_df['price']
date_price = wheat_df['date']
price_id = wheat_df['Unnamed: 0']  # column named 'Unnamed: 0' in the CSV is reused as the id
date_fullfillment = date.today()  # single value: today's date
currency = 'GBP'
commodity_id = 2  # 2 = wheat (same id the scraper assigns)

In [50]:
# Map each price_table column name to its values; commodity_id,
# date_fullfillment and currency are single values, the rest are wheat_df columns.
fake_prices = {'price_id':price_id, 'commodity_id':commodity_id, 'date_fullfillment':date_fullfillment, 'date_price':date_price, 'price':price, 'currency':currency}

In [52]:
# Build the frame from the column mapping; pandas repeats the single values
# (commodity_id, date_fullfillment, currency) on every row.
price_table_df = pd.DataFrame(fake_prices)

In [58]:
def populate_prices(csv_path='wheat_df', db_path='contrcalc.db'):
    '''Load the wheat price CSV and store it as ``price_table`` in the database.

    An existing ``price_table`` is replaced, not appended to. Every row gets
    commodity id 2, currency ``'GBP'`` and today's date as
    ``date_fullfillment``; ``price_id`` is taken from the CSV column
    ``'Unnamed: 0'``.

    Parameters
    ----------
    csv_path : str, optional
        CSV file with the columns ``'Unnamed: 0'``, ``'date'`` and
        ``'price'``. Defaults to ``'wheat_df'``.
    db_path : str, optional
        Path of the SQLite database file. Defaults to ``'contrcalc.db'``.

    Returns
    -------
    str
        ``'Sucessfuly saved to db'`` on success, ``'Failed to store to db'``
        when connecting or writing raised an exception.

    Raises
    ------
    Exceptions from reading the CSV (missing file or columns) are not caught.
    '''
    wheat_df = pd.read_csv(csv_path)
    price_table_df = pd.DataFrame({
        'price_id': wheat_df['Unnamed: 0'],
        'commodity_id': 2,
        'date_fullfillment': date.today(),
        'date_price': wheat_df['date'],
        'price': wheat_df['price'],
        'currency': 'GBP',
    })

    con = None
    try:
        con = sqlite3.connect(db_path)
        print('connected to db')

        # price_id is a regular column, so the DataFrame index is not written.
        price_table_df.to_sql('price_table', con, if_exists='replace', index=False)
        con.commit()
        print('price_df stored to db')

        output_message = 'Sucessfuly saved to db'
        print(output_message)
    except Exception as err:
        # Stay best-effort, but report what went wrong instead of hiding it.
        output_message = 'Failed to store to db'
        print(f'{output_message}: {err}')
    finally:
        if con is not None:
            con.close()
    return output_message

connected to db
price_df stored to db
Sucessfuly saved to db
