# Webscraping bvc.cv with beautifulsoup4 

### Import libraries and make request to bvc historico de cotações page

In [48]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import plotly.express as px
from pathlib import Path

# Download the BVC "historico de cotacoes" page. The timeout stops the cell from
# hanging forever on a stalled connection, and raise_for_status() surfaces an
# HTTP error here instead of as a confusing parsing failure further down.
response = requests.get('https://bvc.cv/pagina/historico-cotacoes-bvc-62', timeout=30)
response.raise_for_status()
html_text = response.text
soup = BeautifulSoup(html_text, 'lxml')

# The cells are read from the <tbody class="text-right"> element; fail with a
# clear message if the page layout changed and that element is no longer there.
quotes_body = soup.find('tbody', class_='text-right')
if quotes_body is None:
    raise ValueError("Could not find <tbody class='text-right'> in the BVC quotes page")
epfds = quotes_body.find_all('td')

# Text of every table cell, in document order. Named cell_texts rather than
# `list` so the built-in list type is not shadowed for the rest of the notebook.
cell_texts = [epfd.text for epfd in epfds]

# Convert the flat list to a list of lists (entidade, preco_fecho, dates)
list_epfd = [cell_texts[i:i+3] for i in range(0, len(cell_texts), 3)]
# Convert the list of lists to a dataframe
df_hist_cot = pd.DataFrame(list_epfd, columns=['entidade', 'preco_fecho', 'dates'])



In [49]:
# Preview the first scraped rows.
print(df_hist_cot.head())
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
df_hist_cot.info()

  entidade preco_fecho       dates
0      SCT        8910  2024-05-31
1      ENA       11000  2024-05-31
2    CAIXA        5368  2024-05-31
3      BCA        7224  2024-05-31
4      SCT       10480  2024-04-30
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 816 entries, 0 to 815
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   entidade     816 non-null    object
 1   preco_fecho  816 non-null    object
 2   dates        816 non-null    object
dtypes: object(3)
memory usage: 19.2+ KB
None


### Convert data types

In [50]:
# Both columns were scraped as text (object dtype); cast the closing price to
# float and parse the dates so they can be compared and summarised as values.
df_hist_cot = df_hist_cot.astype({'preco_fecho': float})
df_hist_cot['dates'] = pd.to_datetime(df_hist_cot['dates'])
# info() prints its own report and returns None, so it is not wrapped in print().
df_hist_cot.info()
print(df_hist_cot)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 816 entries, 0 to 815
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   entidade     816 non-null    object        
 1   preco_fecho  816 non-null    float64       
 2   dates        816 non-null    datetime64[ns]
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 19.2+ KB
None
    entidade  preco_fecho      dates
0        SCT       8910.0 2024-05-31
1        ENA      11000.0 2024-05-31
2      CAIXA       5368.0 2024-05-31
3        BCA       7224.0 2024-05-31
4        SCT      10480.0 2024-04-30
..       ...          ...        ...
811      BCA       3900.0 2006-02-08
812      SCT       6500.0 2006-02-01
813      BCA       4000.0 2006-01-16
814      BCA       3500.0 2006-01-12
815      BCA       3500.0 2005-12-21

[816 rows x 3 columns]


## Clean the data

Check for duplicate dates

In [51]:
# Flag every row whose (entidade, dates) pair occurs more than once.
# keep=False marks all members of each duplicate group, not only the repeats.
is_repeated_quote = df_hist_cot.duplicated(subset=['entidade', 'dates'], keep=False)
duplicateRows = df_hist_cot.loc[is_repeated_quote]
duplicateRows

Unnamed: 0,entidade,preco_fecho,dates
101,BCA,1920.0,2021-12-28
102,BCA,1900.0,2021-12-28
188,ENA,2772.0,2019-05-02
189,ENA,2772.0,2019-05-02


ENA has two identical `preco_fecho` values for the same date. This causes the `preco_fecho` of that day to be the sum of the duplicate values. <br>
BCA has two different `preco_fecho` values on the same date. This causes the same issue: the `preco_fecho` of that day will be the sum of the duplicate values.

In [52]:
# Drop rows that are exact copies of an earlier row (all columns equal, first
# occurrence kept). This resolves the ENA case, where both rows are identical.
df_hist_cot = df_hist_cot.drop_duplicates()


Check for duplicate rows again

In [53]:
# List every row that still shares its (entidade, dates) pair with another row.
duplicateRows = df_hist_cot.loc[
    lambda quotes: quotes.duplicated(subset=['entidade', 'dates'], keep=False)
]
duplicateRows

Unnamed: 0,entidade,preco_fecho,dates
101,BCA,1920.0,2021-12-28
102,BCA,1900.0,2021-12-28


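One of the two BCA rows dated 2021-12-28 should in fact be dated 2021-12-29: two different closing prices for the same day is obviously not right, so I will edit the date manually. The table is listed newest-first, so the row that gets the later date is the first-listed one (index 101).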

In [54]:
# Re-date one of the two BCA rows that share 2021-12-28.
# The mask is restricted to BCA on that exact day so the hard-coded replacement
# date can never be written onto some other duplicated (entidade, dates) pair
# that a later scrape might contain. keep='last' flags every duplicate except
# the last-listed one, i.e. here the row that appears first in the table.
is_misdated_bca = (
    df_hist_cot['entidade'].eq('BCA')
    & df_hist_cot['dates'].eq(pd.Timestamp('2021-12-28'))
    & df_hist_cot.duplicated(['entidade', 'dates'], keep='last')
)
# Assign a real Timestamp (not a string) to keep the column datetime64.
df_hist_cot.loc[is_misdated_bca, 'dates'] = pd.Timestamp('2021-12-29')


In [55]:
# Show the rows recorded for each of the two affected days, later day first,
# to confirm the manual date correction landed on the intended row.
for day in ('2021-12-29', '2021-12-28'):
    print(df_hist_cot[df_hist_cot['dates'] == day])

    entidade  preco_fecho      dates
100    CAIXA       2735.0 2021-12-29
101      BCA       1920.0 2021-12-29
    entidade  preco_fecho      dates
102      BCA       1900.0 2021-12-28


### Start EDA

In [56]:
# Summary statistics (count, mean, std, min, quartiles, max) over the whole
# cleaned frame; as the cell's last expression the result is rendered as a table.
df_hist_cot.describe()

Unnamed: 0,preco_fecho
count,815.0
mean,4362.446626
std,1928.764959
min,1428.0
25%,3000.0
50%,3900.0
75%,6000.0
max,14950.0


### Create dataframes for each entidade

In [57]:
# Split the cleaned quotes into one frame per listed entidade and print the
# summary statistics of each, in the order SCT, BCA, CAIXA, ENA.
quotes_by_entidade = {
    ticker: df_hist_cot.loc[df_hist_cot['entidade'].eq(ticker)]
    for ticker in ('SCT', 'BCA', 'CAIXA', 'ENA')
}

for ticker, quotes in quotes_by_entidade.items():
    print(f'_________{ticker}_________')
    print(quotes.describe())

# Keep the individual frames available under their original names.
df_SCT = quotes_by_entidade['SCT']
df_BCA = quotes_by_entidade['BCA']
df_CAIXA = quotes_by_entidade['CAIXA']
df_ENA = quotes_by_entidade['ENA']

_________SCT_________
        preco_fecho
count    121.000000
mean    5902.272727
std     1976.553608
min     3000.000000
25%     4890.000000
50%     5850.000000
75%     6500.000000
max    14950.000000
_________BCA_________
       preco_fecho
count   279.000000
mean   2994.283154
std     680.576202
min    1428.000000
25%    2825.000000
50%    3100.000000
75%    3200.000000
max    7224.000000
_________CAIXA_________
       preco_fecho
count    87.000000
mean   3169.908046
std    1088.591790
min    2070.000000
25%    2495.000000
50%    2910.000000
75%    3225.000000
max    6500.000000
_________ENA_________
        preco_fecho
count    328.000000
mean    5274.487805
std     1853.160982
min     2000.000000
25%     4000.000000
50%     4615.000000
75%     6741.750000
max    13680.000000


In [58]:
# Persist the cleaned quote history as CSV (without the index column).
filepath = Path('data/bvc_quotes_history.csv')
# Create the output directory if it is missing; to_csv does not create parent
# directories, so the export would otherwise fail on a fresh checkout.
filepath.parent.mkdir(parents=True, exist_ok=True)
df_hist_cot.to_csv(filepath, index=False)