## ETAPA 1: Limpieza y preparación de datos
### Rango temporal: 1 mes de datos (Febrero 2023).
### 1. Cargar y visualizar los primeros 5 registros del archivo 02-01-2023.csv y hacer el merge para los 28 días de archivos



In [None]:
import requests
import pandas as pd
from io import StringIO

# Download every JHU CSSE daily report for the chosen period, merge them into
# `combined_df`, and save the merged frame to `output_filename`.
base_url = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/"

# One report per calendar day of February 2023. Using explicit dates instead of
# a hard-coded day range keeps the month prefix and the day count in sync.
report_dates = pd.date_range("2023-02-01", "2023-02-28", freq="D")

# Without a timeout, requests.get() can block indefinitely on a stalled connection.
request_timeout_s = 30

dfs = []
failed_files = []

for report_date in report_dates:
    # Upstream files are named MM-DD-YYYY.csv.
    file_name = f"{report_date:%m-%d-%Y}.csv"
    file_url = f"{base_url}{file_name}"

    try:
        response = requests.get(file_url, timeout=request_timeout_s)
        response.raise_for_status()
        daily_df = pd.read_csv(StringIO(response.text))
    except requests.exceptions.RequestException as e:
        # Network/HTTP failure: record it and continue with the remaining days.
        print(f"Could not load {file_name}: {e}")
        failed_files.append(file_name)
    except Exception as e:
        # Best-effort download: a file that cannot be parsed must not abort the run.
        print(f"An unexpected error occurred while processing {file_name}: {e}")
        failed_files.append(file_name)
    else:
        dfs.append(daily_df)
        print(f"Successfully loaded {file_name}")

if dfs:
    combined_df = pd.concat(dfs, ignore_index=True)
else:
    # Always bind combined_df so that re-running this cell never reuses a
    # stale frame left over from a previous successful run.
    combined_df = pd.DataFrame()
    print("No dataframes were loaded.")

if failed_files:
    print("\nFiles that could not be loaded:")
    for failed_name in failed_files:
        print(failed_name)

# NOTE(review): the file name says "marzo" but the data loaded above is
# February 2023. The name is kept unchanged because other cells may read this
# file or use `output_filename`; confirm before renaming.
output_filename = 'covid_marzo_2023.csv'
if not combined_df.empty:
    combined_df.to_csv(output_filename, index=False)
    print(f"DataFrame guardado como '{output_filename}'")
else:
    print("No hay un DataFrame combinado para guardar.")

Successfully loaded 02-01-2023.csv
Successfully loaded 02-02-2023.csv
Successfully loaded 02-03-2023.csv
Successfully loaded 02-04-2023.csv
Successfully loaded 02-05-2023.csv
Successfully loaded 02-06-2023.csv
Successfully loaded 02-07-2023.csv
Successfully loaded 02-08-2023.csv
Successfully loaded 02-09-2023.csv
Successfully loaded 02-10-2023.csv
Successfully loaded 02-11-2023.csv
Successfully loaded 02-12-2023.csv
Successfully loaded 02-13-2023.csv
Successfully loaded 02-14-2023.csv
Successfully loaded 02-15-2023.csv
Successfully loaded 02-16-2023.csv
Successfully loaded 02-17-2023.csv
Successfully loaded 02-18-2023.csv
Successfully loaded 02-19-2023.csv
Successfully loaded 02-20-2023.csv
Successfully loaded 02-21-2023.csv
Successfully loaded 02-22-2023.csv
Successfully loaded 02-23-2023.csv
Successfully loaded 02-24-2023.csv
Successfully loaded 02-25-2023.csv
Successfully loaded 02-26-2023.csv
Successfully loaded 02-27-2023.csv
Successfully loaded 02-28-2023.csv
DataFrame guardado c

In [None]:
# Preview the first five rows of the merged frame.
combined_df.head(n=5)

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key,Incident_Rate,Case_Fatality_Ratio
0,,,,Afghanistan,2023-02-02 04:20:54,33.93911,67.709953,208552,7882,,,Afghanistan,535.733079,3.779393
1,,,,Albania,2023-02-02 04:20:54,41.1533,20.1683,334177,3596,,,Albania,11612.238516,1.076076
2,,,,Algeria,2023-02-02 04:20:54,28.0339,1.6596,271385,6881,,,Algeria,618.879236,2.535512
3,,,,Andorra,2023-02-02 04:20:54,42.5063,1.5218,47839,165,,,Andorra,61915.485666,0.344907
4,,,,Angola,2023-02-02 04:20:54,-11.2027,17.8739,105184,1931,,,Angola,320.036336,1.835831


### 2. Mostrar el número total de filas y columnas del DataFrame


#### Número de filas

In [None]:
# Number of rows in the merged frame.
len(combined_df)

112448

#### Número de columnas

In [None]:
# Number of columns in the merged frame.
len(combined_df.columns)

14

### 3. Describir los tipos de datos (dtypes) y convertir las columnas necesarias (por ejemplo, fechas)


In [None]:
# Print each column's name, non-null count and dtype for the merged frame.
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112448 entries, 0 to 112447
Data columns (total 14 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   FIPS                 91504 non-null   float64
 1   Admin2               91616 non-null   object 
 2   Province_State       107436 non-null  object 
 3   Country_Region       112448 non-null  object 
 4   Last_Update          112448 non-null  object 
 5   Lat                  109900 non-null  float64
 6   Long_                109900 non-null  float64
 7   Confirmed            112448 non-null  int64  
 8   Deaths               112448 non-null  int64  
 9   Recovered            0 non-null       float64
 10  Active               0 non-null       float64
 11  Combined_Key         112448 non-null  object 
 12  Incident_Rate        109816 non-null  float64
 13  Case_Fatality_Ratio  111274 non-null  float64
dtypes: float64(7), int64(2), object(5)
memory usage: 12.0+ MB


In [None]:
# Parse the textual timestamps so Last_Update becomes a datetime column.
parsed_last_update = pd.to_datetime(combined_df['Last_Update'])
combined_df['Last_Update'] = parsed_last_update

In [None]:
# Print the column summary again to check the dtype of Last_Update after the conversion.
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112448 entries, 0 to 112447
Data columns (total 14 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   FIPS                 91504 non-null   float64       
 1   Admin2               91616 non-null   object        
 2   Province_State       107436 non-null  object        
 3   Country_Region       112448 non-null  object        
 4   Last_Update          112448 non-null  datetime64[ns]
 5   Lat                  109900 non-null  float64       
 6   Long_                109900 non-null  float64       
 7   Confirmed            112448 non-null  int64         
 8   Deaths               112448 non-null  int64         
 9   Recovered            0 non-null       float64       
 10  Active               0 non-null       float64       
 11  Combined_Key         112448 non-null  object        
 12  Incident_Rate        109816 non-null  float64       
 13  Case_Fatality_

### 4. Detectar y mostrar valores nulos o faltantes por columna


In [None]:
# Missing values per column, which is what this step asks for.
null_counts = combined_df.isnull().sum()
print(null_counts)

# Rows that contain at least one missing value, kept for inspection.
combined_df_nan = combined_df[combined_df.isnull().any(axis = 1)]
combined_df_nan.head()

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key,Incident_Rate,Case_Fatality_Ratio
0,,,,Afghanistan,2023-02-02 04:20:54,33.93911,67.709953,208552,7882,,,Afghanistan,535.733079,3.779393
1,,,,Albania,2023-02-02 04:20:54,41.1533,20.1683,334177,3596,,,Albania,11612.238516,1.076076
2,,,,Algeria,2023-02-02 04:20:54,28.0339,1.6596,271385,6881,,,Algeria,618.879236,2.535512
3,,,,Andorra,2023-02-02 04:20:54,42.5063,1.5218,47839,165,,,Andorra,61915.485666,0.344907
4,,,,Angola,2023-02-02 04:20:54,-11.2027,17.8739,105184,1931,,,Angola,320.036336,1.835831


### 5. Eliminar columnas irrelevantes (por ejemplo, códigos FIPS o coordenadas si no se usarán)


In [None]:
# Build the working frame without the columns that will not be used.
unused_columns = ['FIPS', 'Lat', 'Long_', 'Recovered', 'Active']
df = combined_df.drop(columns = unused_columns)
df.head()

Unnamed: 0,Admin2,Province_State,Country_Region,Last_Update,Confirmed,Deaths,Combined_Key,Incident_Rate,Case_Fatality_Ratio
0,,,Afghanistan,2023-02-02 04:20:54,208552,7882,Afghanistan,535.733079,3.779393
1,,,Albania,2023-02-02 04:20:54,334177,3596,Albania,11612.238516,1.076076
2,,,Algeria,2023-02-02 04:20:54,271385,6881,Algeria,618.879236,2.535512
3,,,Andorra,2023-02-02 04:20:54,47839,165,Andorra,61915.485666,0.344907
4,,,Angola,2023-02-02 04:20:54,105184,1931,Angola,320.036336,1.835831


### 6. Estandarizar nombres de columnas (formato snake_case)


In [None]:
# Normalise column labels to snake_case: lowercase first, then spaces to underscores.
lowercased_columns = df.columns.str.lower()
df.columns = lowercased_columns.str.replace(' ', '_')
df

Unnamed: 0,admin2,province_state,country_region,last_update,confirmed,deaths,combined_key,incident_rate,case_fatality_ratio
0,,,Afghanistan,2023-02-02 04:20:54,208552,7882,Afghanistan,535.733079,3.779393
1,,,Albania,2023-02-02 04:20:54,334177,3596,Albania,11612.238516,1.076076
2,,,Algeria,2023-02-02 04:20:54,271385,6881,Algeria,618.879236,2.535512
3,,,Andorra,2023-02-02 04:20:54,47839,165,Andorra,61915.485666,0.344907
4,,,Angola,2023-02-02 04:20:54,105184,1931,Angola,320.036336,1.835831
...,...,...,...,...,...,...,...,...,...
112443,,,West Bank and Gaza,2023-03-01 04:21:02,703228,5708,West Bank and Gaza,13784.956961,0.811686
112444,,,Winter Olympics 2022,2023-03-01 04:21:02,535,0,Winter Olympics 2022,,0.000000
112445,,,Yemen,2023-03-01 04:21:02,11945,2159,Yemen,40.048994,18.074508
112446,,,Zambia,2023-03-01 04:21:02,343012,4057,Zambia,1865.822568,1.182757


### 7. Homogeneizar nombres de países (ej. “US” → “United States”)


In [None]:
# Map abbreviated country labels to their full names.
country_aliases = {'US': 'United States'}
df['country_region'] = df['country_region'].replace(country_aliases)
df


Unnamed: 0,admin2,province_state,country_region,last_update,confirmed,deaths,combined_key,incident_rate,case_fatality_ratio
0,,,Afghanistan,2023-02-02 04:20:54,208552,7882,Afghanistan,535.733079,3.779393
1,,,Albania,2023-02-02 04:20:54,334177,3596,Albania,11612.238516,1.076076
2,,,Algeria,2023-02-02 04:20:54,271385,6881,Algeria,618.879236,2.535512
3,,,Andorra,2023-02-02 04:20:54,47839,165,Andorra,61915.485666,0.344907
4,,,Angola,2023-02-02 04:20:54,105184,1931,Angola,320.036336,1.835831
...,...,...,...,...,...,...,...,...,...
112443,,,West Bank and Gaza,2023-03-01 04:21:02,703228,5708,West Bank and Gaza,13784.956961,0.811686
112444,,,Winter Olympics 2022,2023-03-01 04:21:02,535,0,Winter Olympics 2022,,0.000000
112445,,,Yemen,2023-03-01 04:21:02,11945,2159,Yemen,40.048994,18.074508
112446,,,Zambia,2023-03-01 04:21:02,343012,4057,Zambia,1865.822568,1.182757


### 8. Convertir la columna Last_Update al formato YYYY-MM-DD

In [None]:
# Re-parse last_update (unparseable values are coerced to NaT) and keep only
# the calendar date as a YYYY-MM-DD string.
df['last_update'] = (
    pd.to_datetime(df['last_update'], errors = 'coerce')
    .dt.strftime('%Y-%m-%d')
)

df.head()

Unnamed: 0,admin2,province_state,country_region,last_update,confirmed,deaths,combined_key,incident_rate,case_fatality_ratio
0,,,Afghanistan,2023-02-02,208552,7882,Afghanistan,535.733079,3.779393
1,,,Albania,2023-02-02,334177,3596,Albania,11612.238516,1.076076
2,,,Algeria,2023-02-02,271385,6881,Algeria,618.879236,2.535512
3,,,Andorra,2023-02-02,47839,165,Andorra,61915.485666,0.344907
4,,,Angola,2023-02-02,105184,1931,Angola,320.036336,1.835831


### 9. Crear una columna active_cases = Confirmed - Deaths - Recovered

In [None]:
# Recovered has no non-null values in this data and was dropped in step 5,
# so active cases are computed as confirmed minus deaths only.
df['active_cases'] = df['confirmed'].sub(df['deaths'])
df

Unnamed: 0,admin2,province_state,country_region,last_update,confirmed,deaths,combined_key,incident_rate,case_fatality_ratio,active_cases
0,,,Afghanistan,2023-02-02,208552,7882,Afghanistan,535.733079,3.779393,200670
1,,,Albania,2023-02-02,334177,3596,Albania,11612.238516,1.076076,330581
2,,,Algeria,2023-02-02,271385,6881,Algeria,618.879236,2.535512,264504
3,,,Andorra,2023-02-02,47839,165,Andorra,61915.485666,0.344907,47674
4,,,Angola,2023-02-02,105184,1931,Angola,320.036336,1.835831,103253
...,...,...,...,...,...,...,...,...,...,...
112443,,,West Bank and Gaza,2023-03-01,703228,5708,West Bank and Gaza,13784.956961,0.811686,697520
112444,,,Winter Olympics 2022,2023-03-01,535,0,Winter Olympics 2022,,0.000000,535
112445,,,Yemen,2023-03-01,11945,2159,Yemen,40.048994,18.074508,9786
112446,,,Zambia,2023-03-01,343012,4057,Zambia,1865.822568,1.182757,338955


### 10. Guardar el DataFrame limpio como covid_clean_febrero2023.csv e indicar su tamaño en MB

In [None]:
import os

# Save the cleaned frame and report the on-disk size of that same file.
clean_filename = 'covid_clean_febrero2023.csv'
df.to_csv(clean_filename, index = False)
print(f"DataFrame guardado como '{clean_filename}' ")

# Measure the file just written. `output_filename` refers to the raw merged
# CSV saved in step 1, so using it here would report the wrong file's size.
file_size_mb = os.path.getsize(clean_filename) / (1024 * 1024)
print(f"Tamaño del archivo: {file_size_mb:.2f} MB")

DataFrame guardado como 'covid_clean_febrero2023.csv' 
Tamaño del archivo: 15.01 MB
