############################################################
# 📊 COVID-19 Global Data Analysis / Análisis Global COVID-19
# Final Project - R Notebook
# Author: Fernando López Arenas
############################################################

In [11]:
# --- Libraries / Librerías ---
library(httr)   # HTTP requests / Solicitudes HTTP
library(rvest)  # Web scraping / Extracción de datos HTML

############################################################
# Task 1: Get Wiki page / Obtener página Wiki
############################################################

In [12]:
# Request the Wikipedia template page "COVID-19 testing by country".
#
# Takes no arguments. Issues an HTTP GET against the Wikipedia index.php
# endpoint, passing the template title as the `title` query parameter.
#
# Returns the httr response object as-is; the HTTP status is not checked
# here, so callers should inspect it before parsing the body.
get_wiki_covid19_page <- function() {
  GET(
    url = "https://en.wikipedia.org/w/index.php",
    query = list(title = "Template:COVID-19_testing_by_country")
  )
}

# Fetch the page once and keep the httr response for the parsing step.
resp <- get_wiki_covid19_page()
# Printing the response shows its URL, status, content type, size, and a
# preview of the HTML body.
print(resp)

Response [https://en.wikipedia.org/w/index.php?title=Template%3ACOVID-19_testing_by_country]
  Date: 2025-12-14 17:36
  Status: 200
  Content-Type: text/html; charset=UTF-8
  Size: 456 kB
<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-fea...
<head>
<meta charset="UTF-8">
<title>Template:COVID-19 testing by country - Wikipedia</title>
<script>(function(){var className="client-js vector-feature-language-in-heade...
RLSTATE={"ext.globalCssJs.user.styles":"ready","site.styles":"ready","user.st...
<script>(RLQ=window.RLQ||[]).push(function(){mw.loader.impl(function(){return...
}];});});</script>
<link rel="stylesheet" href="/w/load.php?lang=en&amp;modules=ext.cite.styles%...
...


############################################################
# Task 2: Extract table / Extraer tabla
############################################################

In [13]:
# Extract the body of the HTTP response; the result is handed to rvest as
# the document root.
root <- content(resp)
# Select every <table> element found in the page.
table_nodes <- html_nodes(root, "table")
# Convert the table nodes to data frames; the result is indexed as a list,
# one entry per table.
covid19_table <- html_table(table_nodes, fill = TRUE)

# Preview the second table, which holds the per-country testing data.
head(covid19_table[[2]])

Country or region,Date[a],Tested,Units[b],Confirmed(cases),"Confirmed /tested,%","Tested /population,%","Confirmed /population,%",Ref.
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
Afghanistan,17 Dec 2020,154767,samples,49621,32.1,0.4,0.13,[1]
Albania,18 Feb 2021,428654,samples,96838,22.6,15.0,3.4,[2]
Algeria,2 Nov 2020,230553,samples,58574,25.4,0.53,0.13,[3][4]
Andorra,23 Feb 2022,300307,samples,37958,12.6,387.0,49.0,[5]
Angola,2 Feb 2021,399228,samples,20981,5.3,1.3,0.067,[6]
Antigua and Barbuda,6 Mar 2021,15268,samples,832,5.4,15.9,0.86,[7]


############################################################
# Task 3: Preprocess & Export / Preprocesar y Exportar
############################################################

In [14]:
# Clean the raw "COVID-19 testing by country" table scraped from Wikipedia.
#
# Args:
#   df: Data frame whose columns are, in order: `Country or region`, date,
#     tested, `Units[b]`, confirmed, confirmed/tested, tested/population,
#     confirmed/population and `Ref.`. Only `Country or region`, `Units[b]`
#     and `Ref.` are looked up by name; the rest are renamed by position.
#   max_rows: Maximum number of leading rows to keep after the "World"
#     aggregate row is removed. Defaults to 172, the value that was
#     previously hard-coded, so rows beyond it (e.g. trailing non-country
#     rows) are dropped exactly as before.
#
# Returns:
#   A data frame with columns country and date (factors) and tested,
#   confirmed, confirmed_tested_ratio, tested_population_ratio,
#   confirmed_population_ratio (numeric).
#
# Stops with an error if, after dropping the unit and reference columns,
# the table does not have exactly seven columns.
preprocess_covid_data_frame <- function(df, max_rows = 172) {
  # Drop the worldwide aggregate row. %in% never yields NA, so a missing
  # country name cannot turn into an all-NA row the way `==` would.
  df <- df[!(df$`Country or region` %in% "World"), ]

  # Keep at most `max_rows` rows. head() never pads with all-NA rows when the
  # table is shorter than the limit, unlike df[1:max_rows, ].
  df <- head(df, max_rows)

  # Remove the columns that are not used in the analysis.
  df$`Units[b]` <- NULL
  df$Ref. <- NULL

  # Renaming is positional, so fail loudly if the table layout has changed
  # instead of silently mislabelling columns.
  new_names <- c("country", "date", "tested", "confirmed",
                 "confirmed_tested_ratio", "tested_population_ratio",
                 "confirmed_population_ratio")
  if (ncol(df) != length(new_names)) {
    stop("Expected ", length(new_names), " columns after dropping ",
         "`Units[b]` and `Ref.`, found ", ncol(df), call. = FALSE)
  }
  names(df) <- new_names

  # Convert types. Counts keep digits only (drops thousands separators and
  # footnote brackets); ratios additionally keep the decimal point.
  df$country <- as.factor(df$country)
  df$date <- as.factor(df$date)
  df$tested <- as.numeric(gsub("[^0-9]", "", df$tested))
  df$confirmed <- as.numeric(gsub("[^0-9]", "", df$confirmed))
  df$confirmed_tested_ratio <-
    as.numeric(gsub("[^0-9.]", "", df$confirmed_tested_ratio))
  df$tested_population_ratio <-
    as.numeric(gsub("[^0-9.]", "", df$tested_population_ratio))
  df$confirmed_population_ratio <-
    as.numeric(gsub("[^0-9.]", "", df$confirmed_population_ratio))

  df
}

# Clean the second scraped table (the per-country testing data) and show
# per-column summary statistics.
cleaned_covid_data <- preprocess_covid_data_frame(covid19_table[[2]])
summary(cleaned_covid_data)

# Export the cleaned data to a CSV file in the working directory, without
# writing row names.
write.csv(cleaned_covid_data, file = "cleaned_covid_data.csv", row.names = FALSE)

                country             date         tested         
 Afghanistan        :  1   2 Feb 2023 :  6   Min.   :     3880  
 Albania            :  1   1 Feb 2023 :  4   1st Qu.:   512037  
 Algeria            :  1   31 Jan 2023:  4   Median :  3029859  
 Andorra            :  1   1 Mar 2021 :  3   Mean   : 31377219  
 Angola             :  1   23 Jul 2021:  3   3rd Qu.: 12386725  
 Antigua and Barbuda:  1   29 Jan 2023:  3   Max.   :929349291  
 (Other)            :166   (Other)    :149                      
   confirmed        confirmed_tested_ratio tested_population_ratio
 Min.   :       0   Min.   : 0.00          Min.   :   0.0065      
 1st Qu.:   37839   1st Qu.: 5.00          1st Qu.:   9.4750      
 Median :  281196   Median :10.05          Median :  46.9500      
 Mean   : 2508340   Mean   :11.25          Mean   : 175.5043      
 3rd Qu.: 1278105   3rd Qu.:15.25          3rd Qu.: 156.5000      
 Max.   :90749469   Max.   :46.80          Max.   :3223.0000      
           

############################################################
# Task 4: Subset rows / Subconjunto de filas
############################################################

In [15]:
# Show rows 5 through 10, restricted to the country and confirmed columns.
cleaned_covid_data[5:10, c("country", "confirmed")]

country,confirmed
<fct>,<dbl>
Angola,20981
Antigua and Barbuda,832
Argentina,9060495
Armenia,422963
Australia,10112229
Austria,5789991


############################################################
# Task 5: Worldwide positive ratio / Ratio positivo mundial
############################################################

In [16]:
# Worldwide totals: each column is summed independently, skipping NA
# entries in that column.
total_confirmed <- sum(cleaned_covid_data[["confirmed"]], na.rm = TRUE)
total_tested <- sum(cleaned_covid_data[["tested"]], na.rm = TRUE)

# Overall share of tests that were confirmed cases (a fraction, not a
# percentage).
positive_ratio <- total_confirmed / total_tested

# Display the three results.
total_confirmed
total_tested
positive_ratio

############################################################
# Task 6: Country list / Lista de países
############################################################

In [None]:
# Country names as a plain character vector (the column is a factor).
countries <- as.character(cleaned_covid_data$country)
# Ascending (A to Z) and descending (Z to A) orderings of the names.
countries_sorted_AtoZ <- sort(countries)
countries_sorted_ZtoA <- sort(countries, decreasing = TRUE)
# Show the first ten names of the descending ordering.
countries_sorted_ZtoA[1:10]

############################################################
# Task 7: Regex pattern / Patrón con regex
############################################################

In [18]:
# Keep the country names that start with "United" followed by at least one
# more character; value = TRUE returns the names rather than their indices.
matched_countries <- grep("^United.+", countries, value = TRUE)
matched_countries

############################################################
# Task 8 & 9: Compare two countries / Comparar dos países
############################################################

In [19]:
# Report which of two countries has the higher confirmed/population ratio.
#
# Args:
#   df: Data frame with a `country` column and a numeric
#     `confirmed_population_ratio` column.
#   c1, c2: Names of the two countries to compare, as they appear in
#     `df$country`.
#
# Returns:
#   The message that was printed, invisibly. The message names the country
#   with the higher ratio, or states that the two ratios are the same.
#
# Stops with an error if either country does not match exactly one row or
# if its ratio is missing.
compare_countries <- function(df, c1, c2) {
  # Look up a single, non-missing ratio for one country. which() ignores NA
  # country names so they can never count as a match.
  ratio_for <- function(name) {
    value <- df$confirmed_population_ratio[which(df$country == name)]
    if (length(value) != 1L || is.na(value)) {
      stop("Expected exactly one non-missing confirmed/population ratio ",
           "for country: ", name, call. = FALSE)
    }
    value
  }

  ratio1 <- ratio_for(c1)
  ratio2 <- ratio_for(c2)

  if (ratio1 > ratio2) {
    msg <- paste(c1, "has higher confirmed/population ratio than", c2)
  } else if (ratio2 > ratio1) {
    msg <- paste(c2, "has higher confirmed/population ratio than", c1)
  } else {
    # Equal ratios: neither country is "higher".
    msg <- paste(c1, "and", c2, "have the same confirmed/population ratio")
  }
  print(msg)
}

# Compare the confirmed/population ratios of India and the United States.
compare_countries(cleaned_covid_data, "India", "United States")

[1] "India has higher confirmed/population ratio than United States"


############################################################
# Task 10: Threshold filter / Filtro por umbral
############################################################

In [20]:
# Countries whose confirmed/population ratio is below the threshold; the
# threshold is on the same scale as the ratio column.
threshold <- 1
# which() turns the logical mask into row positions and drops NA
# comparisons, so a missing ratio cannot inject an all-NA row into the
# result the way a bare logical index would.
low_confirmed_population_ratio <- cleaned_covid_data[
  which(cleaned_covid_data$confirmed_population_ratio < threshold), ]
head(low_confirmed_population_ratio)

country,date,tested,confirmed,confirmed_tested_ratio,tested_population_ratio,confirmed_population_ratio
<fct>,<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Afghanistan,17 Dec 2020,154767,49621,32.1,0.4,0.13
Algeria,2 Nov 2020,230553,58574,25.4,0.53,0.13
Angola,2 Feb 2021,399228,20981,5.3,1.3,0.067
Antigua and Barbuda,6 Mar 2021,15268,832,5.4,15.9,0.86
Bangladesh,24 Jul 2021,7417714,1151644,15.5,4.5,0.7
Benin,4 May 2021,595112,7884,1.3,5.1,0.067
