# Web Scraping e Análise Exploratória de Dados

### Web Scraping com Selenium

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import json
import lxml.html as lh
from selenium import webdriver
from selenium.webdriver.firefox.options import Options

In [2]:
# acessando a pagina 
url = "https://globoesporte.globo.com/futebol/brasileirao-serie-a/"
option = Options()
option.headless = True
driver = webdriver.Firefox(options=option)
driver.get(url)
driver.implicitly_wait(10)

In [3]:
# Obtendo a 1ª tabela com os nomes dos times
element = driver.find_element_by_xpath('//*[@id="classificacao__wrapper"]/article/section[1]/div/table[1]')
html_content = element.get_attribute('outerHTML')

soup = BeautifulSoup(html_content, 'html.parser')
html_time_names= soup.find(name='table')
df_time_names = pd.read_html(str(html_time_names))[0]
time_names = df_time_names['Classificação.1'].values

# print do nome dos clubs
print(time_names)

['FlamengoFLA' 'InternacionalINT' 'Atlético-MGCAM' 'São PauloSAO'
 'FluminenseFLU' 'GrêmioGRE' 'PalmeirasPAL' 'SantosSAN' 'Athletico-PRCAP'
 'BragantinoBGT' 'CearáCEA' 'CorinthiansCOR' 'Atlético-GOACG' 'BahiaBAH'
 'SportSPT' 'FortalezaFOR' 'VascoVAS' 'GoiásGOI' 'CoritibaCFC'
 'BotafogoBOT']


In [4]:
# Obtendo a 2ª tabela com os valores de cada time
element1 = driver.find_element_by_xpath('//*[@id="classificacao__wrapper"]/article/section[1]/div/table[2]')
html_content1 = element1.get_attribute('outerHTML')

soup = BeautifulSoup(html_content1, 'html.parser')
html_time_data= soup.find(name='table')

df_time_data1 = pd.read_html(str(html_time_data))[0]
df_time = df_time_data1[['P', 'J', 'V', 'E', 'D', 'GP', 'GC', 'SG', '%']]
df_time['Clubs'] = time_names
df_time['Classificação'] = range(1,21)
df_br = df_time[['Classificação','Clubs','P', 'J', 'V', 'E', 'D', 'GP', 'GC', 'SG', '%']]


### Análise Exploratória

In [5]:
# Tabelas de classificação Brasileirão 2020
df_br.set_index('Classificação')

Unnamed: 0_level_0,Clubs,P,J,V,E,D,GP,GC,SG,%
Classificação,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,FlamengoFLA,71,38,21,8,9,68,48,20,62.3
2,InternacionalINT,70,38,20,10,8,61,35,26,61.4
3,Atlético-MGCAM,68,38,20,8,10,64,45,19,59.6
4,São PauloSAO,66,38,18,12,8,59,41,18,57.9
5,FluminenseFLU,64,38,18,10,10,55,42,13,56.1
6,GrêmioGRE,59,38,14,17,7,53,40,13,51.8
7,PalmeirasPAL,58,38,15,13,10,51,37,14,50.9
8,SantosSAN,54,38,14,12,12,52,51,1,47.4
9,Athletico-PRCAP,53,38,15,8,15,38,36,2,46.5
10,BragantinoBGT,53,38,13,14,11,50,40,10,46.5


In [6]:
# Descrição dos dados considerando exceto coluna: "Classificação "
df_br.iloc[:,1:].describe()

Unnamed: 0,P,J,V,E,D,GP,GC,SG,%
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,51.6,38.0,13.6,10.8,13.6,47.2,47.2,0.0,45.27
std,12.713027,0.0,4.309109,2.587419,4.604346,11.34437,8.401754,16.685954,11.143849
min,27.0,38.0,5.0,6.0,7.0,31.0,35.0,-30.0,23.7
25%,41.75,38.0,11.5,9.5,10.0,37.75,40.75,-13.0,36.6
50%,52.5,38.0,13.5,10.5,12.5,49.0,45.0,1.5,46.05
75%,60.25,38.0,15.75,12.0,17.25,54.25,51.75,13.25,52.875
max,71.0,38.0,21.0,17.0,21.0,68.0,63.0,26.0,62.3


In [7]:
# Melhores ataques
df_br_ataque = df_br[df_br['GP'] > df_br['GP'].mean()]
df_br_ataque = df_br_ataque[['Clubs', 'GP']]
df_br_ataque.index=range(1,len(df_br_ataque)+1)
df_br_ataque[0:3]

Unnamed: 0,Clubs,GP
1,FlamengoFLA,68
2,InternacionalINT,61
3,Atlético-MGCAM,64


In [8]:
# Piores defesas 
df_br_ataque = df_br[df_br['GC'] > df_br['GC'].mean()]
df_br_ataque = df_br_ataque[['Clubs', 'GC']]
df_br_ataque.index=range(1,len(df_br_ataque)+1)
df_br_ataque[0:]

Unnamed: 0,Clubs,GC
1,FlamengoFLA,48
2,SantosSAN,51
3,CearáCEA,51
4,BahiaBAH,59
5,SportSPT,50
6,VascoVAS,56
7,GoiásGOI,63
8,CoritibaCFC,54
9,BotafogoBOT,62
