# Web scraping bvc.cv with BeautifulSoup 4

### Import libraries and request the BVC "histórico de cotações" (price history) page

In [31]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import plotly.express as px
from pathlib import Path

# Fetch the BVC price-history page. The timeout keeps the notebook from hanging
# on a stalled connection, and raise_for_status() stops an HTTP error page from
# being parsed as if it were the quotes table.
BVC_URL = 'https://bvc.cv/pagina/historico-cotacoes-bvc-62'
response = requests.get(BVC_URL, timeout=30)
response.raise_for_status()
html_text = response.text
soup = BeautifulSoup(html_text, 'lxml')

# The cells are read from the <tbody class="text-right"> element. Fail with a
# clear message (rather than an AttributeError on None) if it is not found,
# e.g. because the page layout changed.
tbody = soup.find('tbody', class_='text-right')
if tbody is None:
    raise ValueError("No <tbody class='text-right'> element found at " + BVC_URL)
epfds = tbody.find_all('td')

# Flat list with the text of every <td>, in document order.
lista = [epfd.text for epfd in epfds]

# The flat list is regrouped into consecutive runs of three cells
# (entity, closing price, date). A cell count that is not a multiple of three
# would shift every following row, so stop instead of building a misaligned frame.
COLUNAS = ['entidade', 'preco_fecho', 'dates']
if len(lista) % len(COLUNAS) != 0:
    raise ValueError(
        f"Expected a multiple of {len(COLUNAS)} table cells, got {len(lista)}"
    )

# Convert the flat list into a list of [entidade, preco_fecho, dates] rows
lista_epfd = [lista[i:i+3] for i in range(0, len(lista), 3)]
# Convert the list of rows into a DataFrame (all columns are still strings here)
df_hist_cot = pd.DataFrame(lista_epfd, columns=COLUNAS)



In [32]:
# Quick look at the scraped frame: first rows, then column dtypes and null counts.
print(df_hist_cot.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called bare; wrapping it in print() would append a stray "None" line.
df_hist_cot.info()

  entidade preco_fecho       dates
0      SCT        6500  2022-03-01
1      SCT        6500  2022-02-25
2      SCT        6500  2022-02-23
3      SCT        6500  2022-02-15
4      BCA        1900  2022-02-11
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 729 entries, 0 to 728
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   entidade     729 non-null    object
 1   preco_fecho  729 non-null    object
 2   dates        729 non-null    object
dtypes: object(3)
memory usage: 17.2+ KB
None


### Convert dtypes

In [33]:
# Cast the scraped string columns in a single step:
# closing price -> float, date -> datetime64. Column order is unchanged because
# assign() overwrites existing columns in place.
df_hist_cot = df_hist_cot.assign(
    preco_fecho=lambda frame: frame['preco_fecho'].astype(float),
    dates=lambda frame: pd.to_datetime(frame['dates']),
)
# info() prints its own report and returns None, so it is not wrapped in print()
# (that would emit an extra "None" line).
df_hist_cot.info()
print(df_hist_cot)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 729 entries, 0 to 728
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   entidade     729 non-null    object        
 1   preco_fecho  729 non-null    float64       
 2   dates        729 non-null    datetime64[ns]
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 17.2+ KB
None
    entidade  preco_fecho      dates
0        SCT       6500.0 2022-03-01
1        SCT       6500.0 2022-02-25
2        SCT       6500.0 2022-02-23
3        SCT       6500.0 2022-02-15
4        BCA       1900.0 2022-02-11
..       ...          ...        ...
724      BCA       3900.0 2006-02-08
725      SCT       6500.0 2006-02-01
726      BCA       4000.0 2006-01-16
727      BCA       3500.0 2006-01-12
728      BCA       3500.0 2005-12-21

[729 rows x 3 columns]


### Start EDA

In [34]:
# Descriptive statistics for the whole frame; the bare name on the last line
# lets Jupyter render the resulting table as the cell output.
estatisticas = df_hist_cot.describe()
estatisticas

Unnamed: 0,preco_fecho
count,729.0
mean,4162.112483
std,1607.83429
min,1428.0
25%,3000.0
50%,3740.0
75%,5500.0
max,7200.0


In [35]:
# Summary statistics, one table per listed entity.
entidades = df_hist_cot['entidade'].unique()
print(entidades)

# Build the per-entity frames from the tickers actually present in the data
# (in order of first appearance) instead of repeating the same filter for a
# hard-coded list, so a newly listed entity is picked up automatically.
df_por_entidade = {
    entidade: df_hist_cot[df_hist_cot['entidade'] == entidade]
    for entidade in entidades
}
for df_entidade in df_por_entidade.values():
    print(df_entidade.describe())

# Keep the original per-ticker names available for later cells. A ticker that
# is absent from the scrape yields an empty frame, as the plain filter did.
df_vazio = df_hist_cot.iloc[0:0]
df_SCT = df_por_entidade.get('SCT', df_vazio)
df_BCA = df_por_entidade.get('BCA', df_vazio)
df_CAIXA = df_por_entidade.get('CAIXA', df_vazio)
df_ENA = df_por_entidade.get('ENA', df_vazio)

['SCT' 'BCA' 'CAIXA' 'ENA']
       preco_fecho
count   104.000000
mean   5388.221154
std    1155.557680
min    3000.000000
25%    4890.000000
50%    5500.000000
75%    6500.000000
max    7150.000000
       preco_fecho
count   254.000000
mean   2941.519685
std     576.908653
min    1428.000000
25%    2900.000000
50%    3100.000000
75%    3200.000000
max    4000.000000
       preco_fecho
count    69.000000
mean   3079.492754
std    1181.119280
min    2070.000000
25%    2400.000000
50%    2800.000000
75%    3000.000000
max    6500.000000
       preco_fecho
count   302.000000
mean   5013.821192
std    1558.021277
min    2000.000000
25%    4000.000000
50%    4410.000000
75%    6500.000000
max    7200.000000


### Start trying out plotly

In [37]:
# Persist the typed frame as CSV in the working directory; index=False leaves
# the RangeIndex out of the file.
filepath = Path('bvc.csv')
df_hist_cot.to_csv(path_or_buf=filepath, index=False)