## Proyecto Web Scraping

In [26]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [27]:
url = 'https://en.wikipedia.org/wiki/Gun_death_and_violence_in_the_United_States_by_state'
response = requests.get(url)


In [42]:
#Extraer informacion de la URL
if response:
    soup = BeautifulSoup(response.text, "html.parser")


In [43]:
#Extraer tabla
tabla = soup.find("table")

In [44]:
headers=[] #Lista para guardar encabezados
for th in tabla.find_all("th"): #Traer cada "th" en la tabla
    headers.append(th.text.strip()) #Convertir a texto y quitar espacios adelante y atras y guardar.


In [45]:
data = [] #Lista para guardar las filas de la tabla
for fila in tabla.find_all("tr"): #En cada fila buscar los "tr"
    celdas = fila.find_all("td") #Y dentro de los "tr" buscar los "td"
    celdas = [cell.text.strip() for cell in celdas] #List comprehension, convierte a texto y quita espacios
    data.append(celdas) #Guardar cada celda en la lista data

In [46]:
#Sacamos primer elemto de la tabla ya que esta en blanco
data_copy = data.copy()
del data_copy[0] #Para borrar un elemento de la lista [0] es el primero

In [101]:
#Creacion del Dataframe
df = pd.DataFrame(data=data_copy, columns=headers)

In [103]:
#Remover el "%" de la columna "% gun at home"
df.columns = df.columns.str.strip()
df["%  gun at  home"] = df["%  gun at  home"].str.replace("%", "", regex=False)

# Limpiar los datos de las columnas y luego convertirlas a numéricas
df.loc[:, df.columns != "location"] = df.loc[:, df.columns != "location"].apply(lambda col: pd.to_numeric(col.str.replace(r'[^\d.]', '', regex=True), errors='coerce')).astype(float)


In [105]:
#Dividir "%  gun at  home" entre 100
df["%  gun at  home"] = df["%  gun at  home"] / 100

In [106]:
#Convertir columnas de Object a tipo Float
for column in df.select_dtypes(include=['object']).columns:
    if column != 'Location':
        df[column] = df[column].astype(float)

In [107]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52 entries, 0 to 51
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Location             0 non-null      object 
 1   Gun  suicide  rate   52 non-null     float64
 2   Suicide  rate        52 non-null     float64
 3   Gun  homicide  rate  50 non-null     float64
 4   Homicide  rate       52 non-null     float64
 5   %  gun at  home      50 non-null     float64
dtypes: float64(5), object(1)
memory usage: 2.6+ KB


In [100]:
df


Unnamed: 0,Location,Gun suicide rate,Suicide rate,Gun homicide rate,Homicide rate,% gun at home
0,,7.9,14.5,6.3,7.8,
1,,23.7,32.8,1.7,2.8,0.61
2,,21.6,31.7,2.7,4.2,0.65
3,,19.4,30.0,4.2,6.7,0.57
4,,14.4,25.2,10.9,14.5,0.36
5,,13.8,22.0,6.4,8.6,0.55
6,,13.8,20.4,1.5,2.2,0.58
7,,13.2,20.1,2.6,3.1,0.53
8,,13.1,22.0,6.3,8.4,0.33
9,,12.9,20.4,9.3,11.1,0.52
