# Tutorial de Web Scraping con Python

Importar paquetes importantes

In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

Especificar el URL del que obtendremos el html

In [2]:
# Download the results page and keep its raw bytes in `html`.
# Reading the body inside a `with` block closes the HTTP connection right away
# and leaves `html` as plain bytes, so the parsing cell can be re-run without
# hitting an already-consumed response stream.
with urlopen("http://www.hubertiming.com/results/2018Resolution") as response:
    html = response.read()

Convertir el html en un objeto de beautiful soup

In [3]:
# Parse the downloaded page with the lxml parser and print the resulting
# document tree as text.
bsobj = BeautifulSoup(markup=html, features="lxml")
print(bsobj)

<!DOCTYPE html>
<html>
<head>
<meta content="Race results for the 2018 Resolution Relay!" property="og:title"/>
<meta content="Results of the 2018 Resolution Relay at Boring Station Trailhead Park in Boring , OR." property="og:description"/>
<meta content="https://www.hubertiming.com/results/resolutionRelay2018.jpg" property="og:image"/>
<meta content="https://www.hubertiming.com/results/2018Resolution" property="og:url"/>
<meta content="width=device-width, initial-scale=1, shrink-to-fit=no" name="viewport"/>
<title>2018 Resolution Relay Half Marathon 4 Person Team Race Results</title>
<link crossorigin="anonymous" href="https://stackpath.bootstrapcdn.com/bootstrap/4.1.1/css/bootstrap.min.css" integrity="sha384-WskhaSGFgHYWDcbwN70/dfYBj47jz9qbsMId/iRN3ewGhXQFZCSftd1LZCfmhktB" rel="stylesheet"/>
<link href="https://maxcdn.bootstrapcdn.com/font-awesome/4.7.0/css/font-awesome.min.css" rel="stylesheet"/>
<link href="//cdn.rawgit.com/noelboss/featherlight/1.7.11/release/featherlight.min.css

Almacenar la información objetivo utilizando el 'tag' correspondiente

In [4]:
# Collect every table row (<tr>) of the parsed page.
# `find_all` is the Beautiful Soup 4 method name; `findAll` is only kept as a
# legacy Beautiful Soup 3 alias.
table_rows = bsobj.find_all('tr')
print(table_rows)

[<tr colspan="2"><b>4 Person Team:</b></tr>, <tr><td>Finishers:</td><td>26</td></tr>, <tr><td>Male:</td><td>3</td></tr>, <tr><td>Female:</td><td>8</td></tr>, <tr><td>Other:</td><td>15</td></tr>, <tr class="header">
<th>Place</th>
<th>Bib</th>
<th>Name</th>
<th>Gender</th>
<th>Chip Time</th>
<th>Chip Pace</th>
<th>Division</th>
<th>Division Place</th>
<th>Time to Start</th>
<th>Gun Time</th>
</tr>, <tr>
<td>1</td>
<td>419</td>
<td>CPDC+TRINITY+SHON=SUCCESS </td>
<td>M</td>
<td>1:17:13</td>
<td>5:53</td>
<td>Male</td>
<td>1 of 3</td>
<td>0:00</td>
<td>1:17:13</td>
</tr>, <tr>
<td>2</td>
<td>426</td>
<td>TROUTDALE TROUTS </td>
<td>M</td>
<td>1:20:23</td>
<td>6:08</td>
<td>Male</td>
<td>2 of 3</td>
<td>0:00</td>
<td>1:20:23</td>
</tr>, <tr>
<td>3</td>
<td>423</td>
<td>COUGAR RAIDER PILOTS RUN </td>
<td>M</td>
<td>1:31:56</td>
<td>7:01</td>
<td>Male</td>
<td>3 of 3</td>
<td>0:00</td>
<td>1:31:56</td>
</tr>, <tr>
<td>4</td>
<td>414</td>
<td>THERAPEUTIC ASSOCIATES </td>
<td>I</td>
<td>1:32:14

Pero en realidad lo que necesitamos es tener todas estas filas en una lista para poder convertirla en un dataframe

In [5]:
# Print the data cells (<td>) of each row. Rows without any <td> cell
# (for example a header row built from <th> cells) print as an empty list.
# `find_all` replaces the legacy `findAll` alias.
for row in table_rows:
    each_row = row.find_all('td')
    print(each_row)

[]
[<td>Finishers:</td>, <td>26</td>]
[<td>Male:</td>, <td>3</td>]
[<td>Female:</td>, <td>8</td>]
[<td>Other:</td>, <td>15</td>]
[]
[<td>1</td>, <td>419</td>, <td>CPDC+TRINITY+SHON=SUCCESS </td>, <td>M</td>, <td>1:17:13</td>, <td>5:53</td>, <td>Male</td>, <td>1 of 3</td>, <td>0:00</td>, <td>1:17:13</td>]
[<td>2</td>, <td>426</td>, <td>TROUTDALE TROUTS </td>, <td>M</td>, <td>1:20:23</td>, <td>6:08</td>, <td>Male</td>, <td>2 of 3</td>, <td>0:00</td>, <td>1:20:23</td>]
[<td>3</td>, <td>423</td>, <td>COUGAR RAIDER PILOTS RUN </td>, <td>M</td>, <td>1:31:56</td>, <td>7:01</td>, <td>Male</td>, <td>3 of 3</td>, <td>0:00</td>, <td>1:31:56</td>]
[<td>4</td>, <td>414</td>, <td>THERAPEUTIC ASSOCIATES </td>, <td>I</td>, <td>1:32:14</td>, <td>7:02</td>, <td>Mixed</td>, <td>1 of 15</td>, <td>0:00</td>, <td>1:32:14</td>]
[<td>5</td>, <td>420</td>, <td>BIRTHDAY BUDDIES </td>, <td>I</td>, <td>1:41:29</td>, <td>7:44</td>, <td>Mixed</td>, <td>2 of 15</td>, <td>0:00</td>, <td>1:41:29</td>]
[<td>6</td>, <td

Este texto no nos sirve porque contiene las etiquetas del html; vamos a removerlas con Beautiful Soup

A continuación realizaremos dos procesos:

1) Remover las etiquetas html.

2) Crear una lista vacía a la cual iremos pegando cada fila de texto limpia.

In [6]:
# Build `lists_of_rows`: one plain-text string per table row, in page order.
# Rows without <td> cells are kept (as the string '[]') so positions in this
# list still match positions in `table_rows`.
lists_of_rows = []

for row in table_rows:
    # `find_all` replaces the legacy `findAll` alias.
    each_row = row.find_all('td')
    # str() of the cell list looks like "[<td>1</td>, <td>419</td>, ...]".
    # Re-parsing that string and calling get_text() drops the tags but keeps
    # the surrounding brackets and the comma separators.
    str_row = str(each_row)
    row_text = BeautifulSoup(str_row, "lxml").get_text()
    lists_of_rows.append(row_text)

print(lists_of_rows)

['[]', '[Finishers:, 26]', '[Male:, 3]', '[Female:, 8]', '[Other:, 15]', '[]', '[1, 419, CPDC+TRINITY+SHON=SUCCESS , M, 1:17:13, 5:53, Male, 1 of 3, 0:00, 1:17:13]', '[2, 426, TROUTDALE TROUTS , M, 1:20:23, 6:08, Male, 2 of 3, 0:00, 1:20:23]', '[3, 423, COUGAR RAIDER PILOTS RUN , M, 1:31:56, 7:01, Male, 3 of 3, 0:00, 1:31:56]', '[4, 414, THERAPEUTIC ASSOCIATES , I, 1:32:14, 7:02, Mixed, 1 of 15, 0:00, 1:32:14]', '[5, 420, BIRTHDAY BUDDIES , I, 1:41:29, 7:44, Mixed, 2 of 15, 0:00, 1:41:29]', '[6, 406, KICKING BUTTS & TAKING NAMES , F, 1:44:16, 7:57, Female, 1 of 8, 0:00, 1:44:16]', '[7, 417, BE TEAM , I, 1:50:36, 8:26, Mixed, 3 of 15, 0:00, 1:50:36]', '[8, 404, QUADZUKI , I, 1:50:44, 8:27, Mixed, 4 of 15, 0:00, 1:50:44]', '[9, 415, MIAMI2020 , I, 1:51:11, 8:29, Mixed, 5 of 15, 0:00, 1:51:11]', '[10, 402, HARD TO BEAT , F, 1:55:30, 8:49, Female, 2 of 8, 0:00, 1:55:30]', '[11, 425, THE BIG DOG JOGGERS , I, 1:56:39, 8:54, Mixed, 6 of 15, 0:00, 1:56:39]', '[12, 408, GANG GREEN ... AND GOLD 

El siguiente paso es convertir esta lista en un dataframe de pandas

In [7]:
import pandas as pd
import numpy as np

In [8]:
# Build a single-column DataFrame from the row strings, skipping the first
# five entries (the empty title row and the Finishers/Male/Female/Other
# summary rows). The entry at position 5 is the '[]' left by the header row,
# so it becomes row 0 of the frame.
data = pd.DataFrame(data=lists_of_rows[5:])
print(data)

                                                    0
0                                                  []
1   [1, 419, CPDC+TRINITY+SHON=SUCCESS , M, 1:17:1...
2   [2, 426, TROUTDALE TROUTS , M, 1:20:23, 6:08, ...
3   [3, 423, COUGAR RAIDER PILOTS RUN , M, 1:31:56...
4   [4, 414, THERAPEUTIC ASSOCIATES , I, 1:32:14, ...
5   [5, 420, BIRTHDAY BUDDIES , I, 1:41:29, 7:44, ...
6   [6, 406, KICKING BUTTS & TAKING NAMES , F, 1:4...
7   [7, 417, BE TEAM , I, 1:50:36, 8:26, Mixed, 3 ...
8   [8, 404, QUADZUKI , I, 1:50:44, 8:27, Mixed, 4...
9   [9, 415, MIAMI2020 , I, 1:51:11, 8:29, Mixed, ...
10  [10, 402, HARD TO BEAT , F, 1:55:30, 8:49, Fem...
11  [11, 425, THE BIG DOG JOGGERS , I, 1:56:39, 8:...
12  [12, 408, GANG GREEN ... AND GOLD , I, 1:57:40...
13  [13, 407, MY LITTLE BRONIES , I, 1:58:35, 9:03...
14  [14, 411, WINE IN THE NEW YEAR , I, 2:01:28, 9...
15  [15, 401, LUCKY CHARM , I, 2:05:25, 9:34, Mixe...
16  [16, 424, MOSTLY MCKNIGHTS , I, 2:07:01, 9:41,...
17  [17, 421, I QUIT!!! , F,