In [53]:
import requests
import json
import pandas as pd
from selenium import webdriver
from bs4 import BeautifulSoup
import time
import urllib
import datetime
import os
from pathlib import Path

In [60]:
# VARIABLES

# DCurl stores DukasCopy URL
DCurl = "https://www.dukascopy.com/swiss/english/home/?utm_source=freeserv"

In [95]:
def validatePath(path):
    """
    Checks if path refers to a valid directory. If not an exception is raised.
    """
    
    try:
        os.path.isfile(filename)
    except:
        raise NotADirectoryError("The specified path does not exist or is invalid. The data will be collected and stored in the current working directory in the file: ")

In [46]:
# Converts epoch time to GMT Time

def convertFromEpoch(t):
    """
    Returns the GMT representation of a given epoch time.
    
    Parameters:
    t (int): The epoch time which is to be converted into GMT time
    """
    
    return time.strftime('%Y-%d-%m %H:%M:%S', time.gmtime(t))

print(convertFromEpoch(1578448320))

2020-08-01 01:52:00


In [47]:
# Fetches the time from DukasCopy Webpage and converts into readable format

def getDukasTime(soup):
    """
    Returns the extracted time as given on the DukasCopy webpage from where the DukasCopy data is being extracted from.
    
    Parameters:
    
    soup (soup): The BeautifulSoup object containing the parsed HTML representation of the DukasCopy webpage.
    
    
    """
    tdata = soup.find_all("span", {"id": "timeUpdate"})
    dtxt = tdata[0].text
    
    date_time_obj = datetime.datetime.strptime(dtxt, ' %a, %d %b %Y %H:%M:%S GMT')
    
    return date_time_obj

In [69]:
def updateData(NewFrame, filename):
    """
    Updates the pickle file specified bt "filepath" by appending new data from the "NewFrame" dataframe to it.
    
    Parameters:
    
    NewFrame (DataFrame): The dataframe which is to be appended to the end of the pickle file to update the pickle file with the new data.
    filepath (str): The filepath of the pickle file where new dataframe data is to be appended.
    
    """
    if (not filename.endswith(".pkl")):
        raise Exception("The filename specified must end with .pkl extension. The filename provided was: " + filename)
    
    if not os.path.isfile(filename):
        pd.to_pickle(NewFrame, filename)
        
    else:   
        df = pd.read_pickle(filename)
        df = df.append(NewFrame, sort=False)
        df.to_pickle(filename)

In [101]:
# Function to fetch Gold Data from MetalsAPI. Returns a new DataFrame with most recent data

def getMetalsApiData():
    """
    Returns a DataFrame object containing the most recent Gold Data from MetalsAPI. 
    Makes a request to the MetalsAPI to fetch the Gold Data before formatting it and returning it as a Pandas DataFrame.
    
    Returns:
    DataFrame: A pandas DataFrame containing the most recent Gold Data from MetalsAPI.
    """
    
    requestUrl = "http://metals-api.com/api/latest?access_key=anbne4hpic0ev3c2ymdyscbk5emkoni00qurxhpyrcvbuh2bu1j1yugbqbl17avu&base=USD&symbols = XAU"
    
    try:
        MetalsApiResponse = requests.get(requestUrl)
    except:
        raise Exception("Connection to MetalsAPI could not be made.")
    
    # Just making sure it's fine
    print("MetalsAPI Connection " + str(MetalsApiResponse.status_code))
    
    # Get the result json
    result = MetalsApiResponse.json()
    
    # Store results in dataframe
    df = pd.DataFrame(result)
    df["timestamp"] = df["timestamp"].map(convertFromEpoch)
    
    # Store only gold data and make time stamp the index
    df = df.loc[df.index == "XAU"]
    df = df.set_index("timestamp")
    
    return df

In [110]:
# Function to fetch Gold Data from DukasCopy. Returns a new DataFrame with most recent data.

def getDukasCopyData(pathToChromeDriver):
    """
    Returns a DataFrame object containing the most recent Gold Data from the DukasCopy website.
    Scrapes the data from the DukasCopy website before formatting and returning the data as a pandas DataFrame.
    
    Parameters:
    pathToChromeDriver (str): Path to the chromedriver executable to be used to open web pages.
    
    Returns:
    DataFrame: A pandas DataFrame containing the most recent Gold Data from the DukasCopy website.
    """
    
    browser = webdriver.Chrome(pathToChromeDriver)
    browser.get(DCurl)
    
    DChtml = browser.page_source
    soup = BeautifulSoup(DChtml, "lxml")
    
    # Finding the exact table with all the required data
    data = soup.find_all("table", {"id": "list"})
    
    # Creating and Modifying dataframe with the table 
    DCDataFrame = pd.read_html(str(data))[0]
    
    # Add a timestamp to the data
    DCDataFrame["timestamp"] = getDukasTime(soup)
    
    # Make the timestamp the index
    DCDataFrame = DCDataFrame.set_index("timestamp")
    
    # Filter dataframe only to Gold Data
    DCDataFrame = DCDataFrame[DCDataFrame["Live"] == "XAU/USD"]
    
    browser.quit()
    return DCDataFrame

In [111]:
# Fetches Dukas data and saves it to the pickle file.

def fetchAndSaveDukas(filename, chromedriverpath):
    """
    Fetches the most recent Gold Data from the DukasCopy website and updates the pickle file specified by "dukasFilepath" with the newest data.
    If no pickle file with the specified name exists, a new one will be created.
    If no filepath is specified, the current working directory will be used and data will be stored in the pickle file .
    
    Parameters:
    dukasFilepath(str): The filepath to the pickle file where the fetched data will be stored.
    """
    df = getDukasCopyData(chromedriverpath)
    updateData(df, filename)
    
# Fetches MetalsAPI data and saves it to the pickle file.
def fetchAndSaveMetals(filename):
    """
    Fetches the most recent Gold Data from MetalsAPI and updates the pickle file specified by "metalsFilepath" with the newest data.
    If no pickle file with the specified name exists, a new one will be created.
    If no filepath is specified, the default file path "ScrapedData/MetalsAPIGoldData.pkl" will be used.
    
    Parameters:
    metalsFilepath(str): The filepath to the pickle file where the fetched data will be stored.
    """
    
    df = getMetalsApiData() # dont forget to remove the dummy thing
    updateData(df, metalsFilepath)

In [118]:
def mainDriver(dukaspath="DukasGoldData.pkl", metalsApipath="MetalsAPIGoldData.pkl", chromedriverpath="chromedriver.exe"):
    try:
        validatePath(dukaspath)
    except NotADirectoryError as e:
        dukaspath = "DukasGoldData.pkl"
        print(repr(e) + dukaspath)
        
    try:
        validatePath(metalsApipath)
    except NotADirectoryError as e:
        metalsApipath = "MetalsAPIGoldData.pkl"
        print(repr(e) + metalsApipath)
    
    for i in range(5):
        try:
            fetchAndSaveDukas(dukaspath, chromedriverpath)
            fetchAndSaveMetals(metalsApipath)
        except Exception as e:
            print(repr(e))

In [119]:
mainDriver()

NotADirectoryError('The specified path does not exist or is invalid. The data will be collected and stored in the current working directory in the file: ')DukasGoldData.pkl
NotADirectoryError('The specified path does not exist or is invalid. The data will be collected and stored in the current working directory in the file: ')MetalsAPIGoldData.pkl
MetalsAPI Connection 200
NameError("name 'metalsFilepath' is not defined")
MetalsAPI Connection 200
NameError("name 'metalsFilepath' is not defined")


KeyboardInterrupt: 

In [116]:
df = pd.read_pickle("DukasGoldData.pkl")

In [117]:
df

Unnamed: 0_level_0,Live,Bid,Ask,Spread
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-01-09 08:15:37,XAU/USD,1545.57,1545.81,23.7
2020-01-09 08:15:48,XAU/USD,1545.52,1545.76,23.7
2020-01-09 08:16:00,XAU/USD,1545.18,1545.44,25.7
2020-01-09 08:16:11,XAU/USD,1545.2,1545.46,25.7
2020-01-09 08:16:22,XAU/USD,1544.98,1545.26,27.7
