# Mini Datathon F5 #

En este análisis, exploraremos las métricas clave de mortalidad, hospitalizaciones, casos positivos y vacunados por estados en Estados Unidos. Estos datos nos ayudarán a comprender el impacto de la pandemia, evaluar la eficacia de las medidas tomadas y monitorear el progreso de la vacunación en el país.


In [16]:
# IMPORTAMOS BIBLIOTECAS

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import requests

# DATA LOADING

# API endpoint queried for the daily per-state JSON records
url = "https://api.covidtracking.com/v1/states/daily.json"

# Request the JSON payload. The explicit timeout keeps the notebook from
# hanging indefinitely if the server stops responding (requests has no
# default timeout).
response = requests.get(url, timeout=30)

# Only build the DataFrame when the request succeeded (status code 200)
if response.status_code == 200:
    # Load the decoded JSON payload into a pandas DataFrame
    data = response.json()
    df = pd.DataFrame(data)

    # Show the DataFrame
    display(df)
else:
    # NOTE: `df` is not defined on this path, so any later cell that uses it
    # will fail until the download succeeds.
    print("No se pudieron obtener los datos JSON. Código de estado:", response.status_code)




Unnamed: 0,date,state,positive,probableCases,negative,pending,totalTestResultsSource,totalTestResults,hospitalizedCurrently,hospitalizedCumulative,...,dataQualityGrade,deathIncrease,hospitalizedIncrease,hash,commercialScore,negativeRegularScore,negativeScore,positiveScore,score,grade
0,20210307,AK,56886.0,,,,totalTestsViral,1731628.0,33.0,1293.0,...,,0,0,dc4bccd4bb885349d7e94d6fed058e285d4be164,0,0,0,0,0,
1,20210307,AL,499819.0,107742.0,1931711.0,,totalTestsPeopleViral,2323788.0,494.0,45976.0,...,,-1,0,997207b430824ea40b8eb8506c19a93e07bc972e,0,0,0,0,0,
2,20210307,AR,324818.0,69092.0,2480716.0,,totalTestsViral,2736442.0,335.0,14926.0,...,,22,11,50921aeefba3e30d31623aa495b47fb2ecc72fae,0,0,0,0,0,
3,20210307,AS,0.0,,2140.0,,totalTestsViral,2140.0,,,...,,0,0,f77912d0b80d579fbb6202fa1a90554fc4dc1443,0,0,0,0,0,
4,20210307,AZ,826454.0,56519.0,3073010.0,,totalTestsViral,7908105.0,963.0,57907.0,...,,5,44,0437a7a96f4471666f775e63e86923eb5cbd8cdf,0,0,0,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20775,20200117,WA,0.0,,,,totalTestEncountersViral,,,,...,,0,0,7cefac6b3681020741ca30f45399a7b22f2e45b4,0,0,0,0,0,
20776,20200116,WA,0.0,,,,totalTestEncountersViral,,,,...,,0,0,650501e005a5ee86d93c5f32dda56735ea2af967,0,0,0,0,0,
20777,20200115,WA,0.0,,,,totalTestEncountersViral,,,,...,,0,0,4987e61aad88182abfe641033b597304c2153d4f,0,0,0,0,0,
20778,20200114,WA,0.0,,,,totalTestEncountersViral,,,,...,,0,0,1881c8a2f0d337b22066b4f05df06eb2259e8d57,0,0,0,0,0,


## Análisis exploratorio preliminar

Dataset con **20780** entradas y **56 campos**, de los cuales 14 están marcados como "deprecated" (a desestimar) en la documentación del proveedor de datos (https://covidtracking.com/data/api, sección "Historic values for all states").

En el caso de campos con muchos datos incompletos, se valorará el método para lidiar con ellos (eliminación de filas o columnas, imputación, etc.) en caso necesario. El campo **dataQualityGrade** puede ser eliminado porque no contiene ningún valor.


In [17]:
# Columns to discard: the fields the provider documents as deprecated,
# followed by dataQualityGrade, which the exploratory notes report as empty
columnas_eliminar = [
    'checkTimeEt',
    'commercialScore',
    'dateChecked',
    'dateModified',
    'grade',
    'hash',
    'hospitalized',
    'negativeIncrease',
    'negativeRegularScore',
    'negativeScore',
    'posNeg',
    'positiveScore',
    'score',
    'total',
    'dataQualityGrade',
]

# Build a new dataframe without those columns; `df` itself is not modified
df1 = df.drop(columnas_eliminar, axis=1)

# Dataset overview (dtypes and non-null counts per column)
df1.info()




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20780 entries, 0 to 20779
Data columns (total 41 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   date                         20780 non-null  int64  
 1   state                        20780 non-null  object 
 2   positive                     20592 non-null  float64
 3   probableCases                9271 non-null   float64
 4   negative                     13290 non-null  float64
 5   pending                      2138 non-null   float64
 6   totalTestResultsSource       20780 non-null  object 
 7   totalTestResults             20614 non-null  float64
 8   hospitalizedCurrently        17339 non-null  float64
 9   hospitalizedCumulative       12382 non-null  float64
 10  inIcuCurrently               11636 non-null  float64
 11  inIcuCumulative              3789 non-null   float64
 12  onVentilatorCurrently        9126 non-null   float64
 13  onVentilatorCumu

In [18]:
# Convert `date` to datetime. The column holds YYYYMMDD codes stored as
# integers (int64 in the df1.info() output), hence the explicit format.
fechas = pd.to_datetime(df1['date'], format='%Y%m%d')
df1['date'] = fechas
df1.head()

Unnamed: 0,date,state,positive,probableCases,negative,pending,totalTestResultsSource,totalTestResults,hospitalizedCurrently,hospitalizedCumulative,...,negativeTestsPeopleAntibody,totalTestsPeopleAntigen,positiveTestsPeopleAntigen,totalTestsAntigen,positiveTestsAntigen,fips,positiveIncrease,totalTestResultsIncrease,deathIncrease,hospitalizedIncrease
0,2021-03-07,AK,56886.0,,,,totalTestsViral,1731628.0,33.0,1293.0,...,,,,,,2,0,0,0,0
1,2021-03-07,AL,499819.0,107742.0,1931711.0,,totalTestsPeopleViral,2323788.0,494.0,45976.0,...,,,,,,1,408,2347,-1,0
2,2021-03-07,AR,324818.0,69092.0,2480716.0,,totalTestsViral,2736442.0,335.0,14926.0,...,,481311.0,81803.0,,,5,165,3380,22,11
3,2021-03-07,AS,0.0,,2140.0,,totalTestsViral,2140.0,,,...,,,,,,60,0,0,0,0
4,2021-03-07,AZ,826454.0,56519.0,3073010.0,,totalTestsViral,7908105.0,963.0,57907.0,...,,,,,,4,1335,45110,5,44


In [19]:
# Basic statistical summary (count, mean, min, quartiles, max, std) of the
# columns pandas includes in describe()
# NOTE(review): the *Increase columns show negative minimums in the output —
# presumably retroactive corrections by the states; confirm before aggregating
df1.describe()

Unnamed: 0,date,positive,probableCases,negative,pending,totalTestResults,hospitalizedCurrently,hospitalizedCumulative,inIcuCurrently,inIcuCumulative,...,positiveTestsPeopleAntibody,negativeTestsPeopleAntibody,totalTestsPeopleAntigen,positiveTestsPeopleAntigen,totalTestsAntigen,positiveTestsAntigen,positiveIncrease,totalTestResultsIncrease,deathIncrease,hospitalizedIncrease
count,20780,20592.0,9271.0,13290.0,2138.0,20614.0,17339.0,12382.0,11636.0,3789.0,...,1094.0,972.0,999.0,633.0,3421.0,2233.0,20780.0,20780.0,20780.0,20780.0
mean,2020-09-02 18:13:30.779595776,165156.0,21729.123719,848224.6,1659.862956,2186936.0,1190.576965,9262.762478,359.621176,1934.191607,...,20516.966179,188710.969136,168188.246246,25259.048973,308919.6,31837.241379,1383.849519,17508.38821,24.790712,37.36078
min,2020-01-13 00:00:00,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,6.0,...,0.0,1.0,3.0,3.0,1.0,0.0,-7757.0,-130545.0,-201.0,-12257.0
25%,2020-06-02 00:00:00,5753.75,773.5,53941.25,43.0,104049.8,166.5,985.25,60.0,501.0,...,3155.5,54874.0,37675.5,2682.0,20047.0,1085.0,65.0,1205.75,0.0,0.0
50%,2020-09-03 00:00:00,46064.5,4491.0,305972.0,212.5,655267.0,531.0,4472.0,172.0,1295.0,...,11956.0,100282.0,144130.0,17763.0,123384.0,13661.0,435.0,6125.0,6.0,0.0
75%,2020-12-05 00:00:00,177958.0,19553.0,1056611.0,1295.75,2264766.0,1279.0,12248.5,380.0,2451.0,...,19059.0,261121.0,255251.0,47012.0,432727.0,49010.0,1335.25,19086.5,24.0,36.0
max,2021-03-07 00:00:00,3501394.0,365961.0,10186940.0,64400.0,49646010.0,22851.0,82237.0,5225.0,9263.0,...,178979.0,816231.0,580372.0,81803.0,2664340.0,211546.0,71734.0,473076.0,2559.0,16373.0
std,,326785.2,45471.459778,1344501.0,4671.028548,4436508.0,2060.041207,12620.544081,594.83115,1953.329983,...,29267.559775,200218.073746,143748.073365,24139.253458,423285.4,41929.65807,3023.558742,33586.022461,60.162742,208.237151
