# Análisis Exploratorio de Muertes por COVID-19 en Estados Unidos

#### Este cuaderno es un análisis exploratorio de datos (EDA por sus siglas en inglés significa "Exploratory Data Analysis" ) sobre las muertes por COVID-19 en Estados Unidos. 

#### Se utilizarán diversas visualizaciones y técnicas para examinar y comprender los datos, buscando patrones interesantes y describiendo la historia que cuentan estos datos. 

# Librerías estándar de análisis de datos


In [14]:
import pandas as pd
import numpy as np

# Librerías de visualización

In [15]:
import matplotlib.pyplot as plt
import seaborn as sns
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import ColumnDataSource, HoverTool

# URL del dataset de muertes por COVID-19 en Estados Unidos

In [16]:
data_url = "https://api.covidtracking.com/v1/states/daily.json"

# Importar el dataset

In [17]:
df_death = pd.read_json(data_url)

# Visualización de las primeras filas del dataset con información de fecha y muertes

In [5]:
df_death['date'] = pd.to_datetime(df_death['date'], format='%Y%m%d')
df_death['death'] = df_death['death'].apply(lambda x: str(x).rstrip('.0'))
df_death['deathIncrease'] = df_death['deathIncrease'].apply(lambda x: 'NoIncrease' if x == 0 else x)
df_death[['date', 'state', 'death', 'deathIncrease']].sort_values(by='date', ascending=False).head(10)


Unnamed: 0,date,state,death,deathIncrease
0,2021-03-07,AK,305,NoIncrease
29,2021-03-07,MT,1381,NoIncrease
31,2021-03-07,ND,1478,NoIncrease
32,2021-03-07,NE,2113,NoIncrease
33,2021-03-07,NH,1184,3
34,2021-03-07,NJ,23574,17
35,2021-03-07,NM,3808,12
36,2021-03-07,NV,5037,1
37,2021-03-07,NY,39029,59
38,2021-03-07,OH,17656,NoIncrease


# Información básica del dataset

In [10]:
df_death.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20780 entries, 0 to 20779
Data columns (total 56 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   date                         20780 non-null  datetime64[ns]
 1   state                        20780 non-null  object        
 2   positive                     20592 non-null  float64       
 3   probableCases                9271 non-null   float64       
 4   negative                     13290 non-null  float64       
 5   pending                      2138 non-null   float64       
 6   totalTestResultsSource       20780 non-null  object        
 7   totalTestResults             20614 non-null  float64       
 8   hospitalizedCurrently        17339 non-null  float64       
 9   hospitalizedCumulative       12382 non-null  float64       
 10  inIcuCurrently               11636 non-null  float64       
 11  inIcuCumulative              3789 non-nul

# Preparación de datos

## Convertir la columna 'date' al formato datetime


In [11]:
df_death['date'] = pd.to_datetime(df_death['date'], format='%Y%m%d')

# Descripción de las variables contenidas en el dataset:
#### Fecha: Fecha del registro.
#### Estado: Estado de Estados Unidos.
#### FIPS: Código de FIPS del estado.
#### Casos totales confirmados: Total de casos confirmados de COVID-19 en el estado.
#### Muertes totales: Total de muertes por COVID-19 en el estado.

## Análisis estadístico básic


In [7]:
df_death.describe()


Unnamed: 0,date,positive,probableCases,negative,pending,totalTestResults,hospitalizedCurrently,hospitalizedCumulative,inIcuCurrently,inIcuCumulative,...,total,totalTestResultsIncrease,posNeg,dataQualityGrade,hospitalizedIncrease,commercialScore,negativeRegularScore,negativeScore,positiveScore,score
count,20780,20592.0,9271.0,13290.0,2138.0,20614.0,17339.0,12382.0,11636.0,3789.0,...,20780.0,20780.0,20780.0,0.0,20780.0,20780.0,20780.0,20780.0,20780.0,20780.0
mean,2020-09-02 18:13:30.779595776,165156.0,21729.123719,848224.6,1659.862956,2186936.0,1190.576965,9262.762478,359.621176,1934.191607,...,706320.8,17508.38821,706150.0,,37.36078,0.0,0.0,0.0,0.0,0.0
min,2020-01-13 00:00:00,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,6.0,...,0.0,-130545.0,0.0,,-12257.0,0.0,0.0,0.0,0.0,0.0
25%,2020-06-02 00:00:00,5753.75,773.5,53941.25,43.0,104049.8,166.5,985.25,60.0,501.0,...,25779.75,1205.75,25658.75,,0.0,0.0,0.0,0.0,0.0,0.0
50%,2020-09-03 00:00:00,46064.5,4491.0,305972.0,212.5,655267.0,531.0,4472.0,172.0,1295.0,...,193725.5,6125.0,193581.0,,0.0,0.0,0.0,0.0,0.0,0.0
75%,2020-12-05 00:00:00,177958.0,19553.0,1056611.0,1295.75,2264766.0,1279.0,12248.5,380.0,2451.0,...,773740.8,19086.5,773740.8,,36.0,0.0,0.0,0.0,0.0,0.0
max,2021-03-07 00:00:00,3501394.0,365961.0,10186940.0,64400.0,49646010.0,22851.0,82237.0,5225.0,9263.0,...,11256790.0,473076.0,11248250.0,,16373.0,0.0,0.0,0.0,0.0,0.0
std,,326785.2,45471.459778,1344501.0,4671.028548,4436508.0,2060.041207,12620.544081,594.83115,1953.329983,...,1300339.0,33586.022461,1299997.0,,208.237151,0.0,0.0,0.0,0.0,0.0


# Número de estados distintos en el dataset


In [18]:
num_estados = df_death['state'].nunique()
print(f"Número de estados distintos en el dataset: {num_estados}")


Número de estados distintos en el dataset: 56


# Filtrar estados con un número insignificante de muertes para mejorar la visualización

In [19]:
min_deaths_threshold = 1000
df_filtered = df_death[df_death['death'] >= min_deaths_threshold]

# Obtener los 10 estados con más muertes

In [13]:
# Filtrar el DataFrame para eliminar las filas con valores no numéricos en la columna 'death'
df_death = df_death[pd.to_numeric(df_death['death'], errors='coerce').notnull()]

# Luego, continuar con el filtrado por umbral de muertes
min_deaths_threshold = 1000
df_filtered = df_death[df_death['death'].astype(float) >= min_deaths_threshold]


# Visualización de datos


## Gráfico de líneas del aumento diario de muertes por estado


In [20]:
output_notebook()

p = figure(x_axis_type="datetime", plot_width=800, plot_height=500, title="Aumento Diario de Muertes por COVID-19 en los 10 Estados con Más Muertes")
p.grid.grid_line_alpha = 0.3
p.xaxis.axis_label = 'Fecha'

for state in top_10_states:
    df_state = df_death[df_death['state'] == state]
    p.line(df_state['date'], df_state['death'], line_width=2, legend_label=state)

p.legend.location = "top_left"
p.legend.title = 'Estado'
show(p)

AttributeError: unexpected attribute 'plot_width' to figure, similar attributes are outer_width, width or min_width