# Coinmarketcap - BTC Web Scraper

In [32]:
import urllib.request
import warnings
from bs4 import BeautifulSoup
import pandas as pd

We start by connecting to the CoinMarketCap URL and parsing the page with BeautifulSoup. To update the data being scraped, change the `end` date in the URL to the current date.

In [33]:
# Historical-data page for Bitcoin. The `start`/`end` query parameters appear
# to be YYYYMMDD dates; move `end` forward to refresh the scrape.
url = "https://coinmarketcap.com/currencies/bitcoin/historical-data/?start=20130428&end=20180226"

# Parse inside a `with` block so the HTTP response is closed as soon as
# BeautifulSoup has read it, rather than staying open for the life of the kernel.
with urllib.request.urlopen(url) as page:
    soup = BeautifulSoup(page, "lxml")

The price data that we want is in the `td` tags. Each table row has seven cells, so we take every seventh cell to fill each column of a pandas DataFrame.

In [34]:
# Every table cell on the page, in document order.
all_data = soup.find_all('td')

# One name per cell in a table row, in the order the cells appear.
cols = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Market Cap']
cells_per_row = len(cols)

# The cells form a flat sequence, so the column at position `offset` is
# every `cells_per_row`-th cell starting from that offset.
df = pd.DataFrame()
for offset, column in enumerate(cols):
    df[column] = all_data[offset::cells_per_row]

We look at the head of the dataframe to check that all of the data is in the right columns. Now that we know it is, we can start cleaning the data up.

In [35]:
# Only the first 1519 rows are kept: the rows below them contain no volume data.
N_ROWS_WITH_VOLUME = 1519

# .copy() makes `df` an independent frame rather than a slice of the full
# table, so later column assignments on it do not raise SettingWithCopyWarning.
df = df.iloc[:N_ROWS_WITH_VOLUME].copy()
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Market Cap
0,"<td class=""text-left"">Feb 25, 2018</td>","<td data-format-fiat="""" data-format-value=""979...","<td data-format-fiat="""" data-format-value=""992...","<td data-format-fiat="""" data-format-value=""940...","<td data-format-fiat="""" data-format-value=""966...","<td data-format-market-cap="""" data-format-valu...","<td data-format-market-cap="""" data-format-valu..."
1,"<td class=""text-left"">Feb 24, 2018</td>","<td data-format-fiat="""" data-format-value=""102...","<td data-format-fiat="""" data-format-value=""105...","<td data-format-fiat="""" data-format-value=""954...","<td data-format-fiat="""" data-format-value=""981...","<td data-format-market-cap="""" data-format-valu...","<td data-format-market-cap="""" data-format-valu..."
2,"<td class=""text-left"">Feb 23, 2018</td>","<td data-format-fiat="""" data-format-value=""993...","<td data-format-fiat="""" data-format-value=""104...","<td data-format-fiat="""" data-format-value=""973...","<td data-format-fiat="""" data-format-value=""103...","<td data-format-market-cap="""" data-format-valu...","<td data-format-market-cap="""" data-format-valu..."
3,"<td class=""text-left"">Feb 22, 2018</td>","<td data-format-fiat="""" data-format-value=""106...","<td data-format-fiat="""" data-format-value=""110...","<td data-format-fiat="""" data-format-value=""993...","<td data-format-fiat="""" data-format-value=""100...","<td data-format-market-cap="""" data-format-valu...","<td data-format-market-cap="""" data-format-valu..."
4,"<td class=""text-left"">Feb 21, 2018</td>","<td data-format-fiat="""" data-format-value=""113...","<td data-format-fiat="""" data-format-value=""114...","<td data-format-fiat="""" data-format-value=""104...","<td data-format-fiat="""" data-format-value=""106...","<td data-format-market-cap="""" data-format-valu...","<td data-format-market-cap="""" data-format-valu..."


Note that I keep only the first 1519 rows, since the rows below them contain no volume data.

We start by cleaning up the numeric columns. To do this, we convert each column from type object to string, extract the part of the string we are interested in using the split method, and finally convert the data from string to numeric.

In [36]:
# Columns whose cells carry a numeric value inside the <td> markup.
num_list = ['Open', 'High', 'Low', 'Close', 'Volume', 'Market Cap']

for column in num_list:
    # Render each <td> as its HTML string.
    cell_html = df[column].astype(str)
    # The number is the tag's second attribute value: the piece after the
    # second '="' and before the '">' that closes the opening tag.
    value_text = (
        cell_html
        .str.split('="').str.get(2)
        .str.split('">').str.get(0)
    )
    df[column] = pd.to_numeric(value_text)

We then clean up the Date column. To do this, we convert it from type object to string, extract the part of the string we are interested in using the split method, and finally convert the data from string to datetime.

In [37]:
# Render each <td> as its HTML string, keep the text between the '">' that
# ends the opening tag and the closing '</td>', then parse it as a date.
date_html = df['Date'].astype(str)
date_text = (
    date_html
    .str.split('">').str.get(1)
    .str.split('</td>').str.get(0)
)
df['Date'] = pd.to_datetime(date_text)

We now look at our dataframe using the head and info methods. Everything looks ok!

In [38]:
# Preview the first rows again now that every column has been converted.
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Market Cap
0,2018-02-25,9796.42,9923.22,9407.06,9664.73,5706940000.0,165407000000.0
1,2018-02-24,10287.7,10597.2,9546.97,9813.07,6917930000.0,173682000000.0
2,2018-02-23,9937.07,10487.3,9734.56,10301.1,7739500000.0,167746000000.0
3,2018-02-22,10660.4,11039.1,9939.09,10005.0,8040080000.0,179936000000.0
4,2018-02-21,11372.2,11418.5,10479.1,10690.4,9405340000.0,191927000000.0


In [39]:
# Summarise the index range, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1519 entries, 0 to 1518
Data columns (total 7 columns):
Date          1519 non-null datetime64[ns]
Open          1519 non-null float64
High          1519 non-null float64
Low           1519 non-null float64
Close         1519 non-null float64
Volume        1519 non-null float64
Market Cap    1519 non-null float64
dtypes: datetime64[ns](1), float64(6)
memory usage: 83.1 KB
