# En este archivo voy a proceder a limpiar y corregir los datos, así como de añadir nuevos que aporten valor al análisis.

# Empezamos con la colección de los casos

In [1]:
from pymongo import MongoClient
from dotenv import load_dotenv
import pandas as pd
import os

In [13]:
load_dotenv()
username=os.getenv("MONGO_USER")
password=os.getenv("MONGO_PASS")

In [14]:
url = f"mongodb+srv://{username}:{password}@cluster0.gnfmn.mongodb.net"

In [15]:
client = MongoClient(url)

In [16]:
db = client.get_database("Covid")

In [17]:
covid_cases = db["Covid_Cases"]

In [18]:
data = pd.DataFrame(covid_cases.find({}))

In [19]:
data.head()

Unnamed: 0,_id,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,4/2/21,4/3/21,4/4/21,4/5/21,4/6/21,4/7/21,4/8/21,4/9/21,4/10/21,Province/State
0,61c9dcc250d5b7208c17eaf7,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,56572,56595,56676,56717,56779,56873,56943,57019,57144,
1,61c9dcc250d5b7208c17eaf8,Albania,41.1533,20.1683,0,0,0,0,0,0,...,125842,126183,126531,126795,126936,127192,127509,127795,128155,
2,61c9dcc250d5b7208c17eaf9,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,117429,117524,117622,117739,117879,118004,118116,118251,118378,
3,61c9dcc250d5b7208c17eafa,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,12115,12174,12231,12286,12328,12363,12409,12456,12497,
4,61c9dcc250d5b7208c17eafb,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,22467,22579,22631,22717,22885,23010,23108,23242,23331,


In [20]:
#250 countries
data.shape

(274, 450)

In [10]:
#No hay duplicados
data.duplicated().sum()

0

In [11]:
data.isna().sum()

_id                 0
Country/Region      0
Lat                 1
Long                1
1/22/20             0
                 ... 
4/7/21              0
4/8/21              0
4/9/21              0
4/10/21             0
Province/State    189
Length: 450, dtype: int64

In [21]:
#Comprobamos que casi el 70% de los valores de Province/State son nulos por tanto los voy a eliminar porque tampoco aportan
#información valiosa al estudio por paises
(data.isna().sum()/data.shape[0])*100

_id                0.000000
Country/Region     0.000000
Lat                0.364964
Long               0.364964
1/22/20            0.000000
                    ...    
4/7/21             0.000000
4/8/21             0.000000
4/9/21             0.000000
4/10/21            0.000000
Province/State    68.978102
Length: 450, dtype: float64

In [22]:
data = data.drop(columns=["Province/State"])

In [23]:
#La columna id tampoco es necesaria asi que la elimino también
data = data.drop(columns=["_id"])

In [24]:
#A lo hora de hacer calculos interesa que los valores de contagios sean numeros y no strings por tanto hay que cambiar el type
data.dtypes

Country/Region    object
Lat               object
Long              object
1/22/20           object
1/23/20           object
                   ...  
4/6/21            object
4/7/21            object
4/8/21            object
4/9/21            object
4/10/21           object
Length: 448, dtype: object

In [15]:
data.columns

Index(['_id', 'Country/Region', 'Lat', 'Long', '1/22/20', '1/23/20', '1/24/20',
       '1/25/20', '1/26/20', '1/27/20',
       ...
       '4/1/21', '4/2/21', '4/3/21', '4/4/21', '4/5/21', '4/6/21', '4/7/21',
       '4/8/21', '4/9/21', '4/10/21'],
      dtype='object', length=449)

In [25]:
data.iloc[:,3]

0      0
1      0
2      0
3      0
4      0
      ..
269    0
270    0
271    0
272    0
273    0
Name: 1/22/20, Length: 274, dtype: object

In [27]:
for i in range(3,448):
    data.iloc[:,i] = data.iloc[:,i].apply(pd.to_numeric)

In [73]:
data.dtypes

Country/Region    object
Lat               object
Long              object
1/22/20            int64
1/23/20            int64
                   ...  
4/6/21             int64
4/7/21             int64
4/8/21             int64
4/9/21             int64
4/10/21            int64
Length: 448, dtype: object

In [29]:
# Como hay varios países repetidos, deberíamos agrupar los valores de casos para tener solo una fila por país. Hay siete países
#con más de dos filas
(data["Country/Region"].value_counts() > 1).sum()

7

In [30]:
data["Country/Region"].value_counts().iloc[:7]

China             33
Canada            16
France            12
United Kingdom    12
Australia          8
Netherlands        5
Denmark            3
Name: Country/Region, dtype: int64

In [31]:
# Los países repetidos son China, Canada, France, United Kingdom, Australia, Netherlands and Denmark.
data.head()

Unnamed: 0,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,...,4/1/21,4/2/21,4/3/21,4/4/21,4/5/21,4/6/21,4/7/21,4/8/21,4/9/21,4/10/21
0,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,0,...,56517,56572,56595,56676,56717,56779,56873,56943,57019,57144
1,Albania,41.1533,20.1683,0,0,0,0,0,0,0,...,125506,125842,126183,126531,126795,126936,127192,127509,127795,128155
2,Algeria,28.0339,1.6596,0,0,0,0,0,0,0,...,117304,117429,117524,117622,117739,117879,118004,118116,118251,118378
3,Andorra,42.5063,1.5218,0,0,0,0,0,0,0,...,12053,12115,12174,12231,12286,12328,12363,12409,12456,12497
4,Angola,-11.2027,17.8739,0,0,0,0,0,0,0,...,22399,22467,22579,22631,22717,22885,23010,23108,23242,23331


In [50]:
columnas = ["China", "Canada", "France", "United Kingdom", "Australia", "Netherlands", "Denmark"]

In [32]:
data.loc[data["Country/Region"] == "Denmark"]

Unnamed: 0,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,...,4/1/21,4/2/21,4/3/21,4/4/21,4/5/21,4/6/21,4/7/21,4/8/21,4/9/21,4/10/21
101,Denmark,61.8926,-6.9118,0,0,0,0,0,0,0,...,661,661,661,661,661,661,661,661,661,661
102,Denmark,71.7069,-42.6043,0,0,0,0,0,0,0,...,31,31,31,31,31,31,31,31,31,31
103,Denmark,56.2639,9.5018,0,0,0,0,0,0,0,...,231265,231973,232718,233318,233797,234317,234931,235648,236346,237101


In [33]:
data.loc[data["Country/Region"] == "Denmark"].iloc[:,447].sum()

237793

In [34]:
data.loc[data["Country/Region"] == "Denmark"].iloc[0:1,3]

101    0
Name: 1/22/20, dtype: int64

In [35]:
data.loc[data["Country/Region"] == "Denmark"].iloc[:,3]

101    0
102    0
103    0
Name: 1/22/20, dtype: int64

In [36]:
#Denmark 61.8926 -6.9118
#Netherlands 12.5211 -69.9683
#Australia -35.4735 149.0124
#United Kingdom 18.2206 -63.0686
#France 46.2276 2.2137
#Canada 51.2538 -85.3232
#China 30.9756 112.2707

In [111]:
dinamarca = data.loc[data["Country/Region"] == "Denmark"].groupby("Country/Region").sum()

In [37]:
# Aqui agrupo las filas por pais y cada columna con los casos totales
data_cleanish = data.groupby("Country/Region",as_index=False).sum()

In [38]:
data_cleanish.loc[data_cleanish["Country/Region"] == "Denmark"]

Unnamed: 0,Country/Region,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,...,4/1/21,4/2/21,4/3/21,4/4/21,4/5/21,4/6/21,4/7/21,4/8/21,4/9/21,4/10/21
47,Denmark,0,0,0,0,0,0,0,0,0,...,231957,232665,233410,234010,234489,235009,235623,236340,237038,237793


In [39]:
data_aux = data

In [40]:
data_aux

Unnamed: 0,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,...,4/1/21,4/2/21,4/3/21,4/4/21,4/5/21,4/6/21,4/7/21,4/8/21,4/9/21,4/10/21
0,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,0,...,56517,56572,56595,56676,56717,56779,56873,56943,57019,57144
1,Albania,41.1533,20.1683,0,0,0,0,0,0,0,...,125506,125842,126183,126531,126795,126936,127192,127509,127795,128155
2,Algeria,28.0339,1.6596,0,0,0,0,0,0,0,...,117304,117429,117524,117622,117739,117879,118004,118116,118251,118378
3,Andorra,42.5063,1.5218,0,0,0,0,0,0,0,...,12053,12115,12174,12231,12286,12328,12363,12409,12456,12497
4,Angola,-11.2027,17.8739,0,0,0,0,0,0,0,...,22399,22467,22579,22631,22717,22885,23010,23108,23242,23331
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
269,Vietnam,14.058324,108.277199,0,2,2,2,2,2,2,...,2617,2620,2626,2631,2637,2648,2659,2668,2683,2692
270,West Bank and Gaza,31.9522,35.2332,0,0,0,0,0,0,0,...,244645,246893,248482,251288,253922,256461,259133,262017,264395,265897
271,Yemen,15.552727,48.516388,0,0,0,0,0,0,0,...,4531,4620,4697,4798,4881,4975,5047,5133,5233,5276
272,Zambia,-13.133897,27.849332,0,0,0,0,0,0,0,...,88549,88730,88800,88930,89009,89071,89386,89592,89783,89918


In [41]:
data_aux.loc[data_aux["Country/Region"] == "Australia"]

Unnamed: 0,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,...,4/1/21,4/2/21,4/3/21,4/4/21,4/5/21,4/6/21,4/7/21,4/8/21,4/9/21,4/10/21
8,Australia,-35.4735,149.0124,0,0,0,0,0,0,0,...,123,123,123,123,123,123,123,123,123,123
9,Australia,-33.8688,151.2093,0,0,0,0,3,4,4,...,5296,5299,5300,5303,5310,5316,5318,5320,5324,5330
10,Australia,-12.4634,130.8456,0,0,0,0,0,0,0,...,109,111,112,112,112,112,112,112,112,112
11,Australia,-27.4698,153.0251,0,0,0,0,0,0,0,...,1485,1488,1489,1492,1491,1497,1500,1501,1502,1502
12,Australia,-34.9285,138.6007,0,0,0,0,0,0,0,...,658,658,659,661,661,662,663,665,665,666
13,Australia,-42.8821,147.3272,0,0,0,0,0,0,0,...,234,234,234,234,234,234,234,234,234,234
14,Australia,-37.8136,144.9631,0,0,0,0,1,1,1,...,20484,20484,20484,20484,20484,20484,20484,20484,20485,20485
15,Australia,-31.9505,115.8605,0,0,0,0,0,0,0,...,944,944,947,948,950,951,951,951,951,953


In [42]:
#Para las lat y lon, voy a eliminar las filas extra de cada país y me voy a quedar con las coordenadas donde mas casos haya.
#United Kingdom
for i in range(253,264):
    data_aux = data_aux.drop(index=i)

In [43]:
#Netherlands
for i in range(191,195):
    data_aux = data_aux.drop(index=i)

In [44]:
#France
for i in range(118,129):
    data_aux = data_aux.drop(index=i)

In [45]:
#Denmark
for i in range(101,103):
    data_aux = data_aux.drop(index=i)

In [46]:
#China 1
for i in range(58,71):
    data_aux = data_aux.drop(index=i)

In [47]:
#China 2
for i in range(72,91):
    data_aux = data_aux.drop(index=i)

In [48]:
#Canada 1
for i in range(39,49):
    data_aux = data_aux.drop(index=i)

In [49]:
#Canada 2
for i in range(50,55):
    data_aux = data_aux.drop(index=i)

In [50]:
#Australia 1
for i in range(8,14):
    data_aux = data_aux.drop(index=i)

In [51]:
#Australia 2
data_aux = data_aux.drop(index=15)

In [52]:
data_aux = data_aux.reset_index(drop=True)

In [53]:
data_cleanish

Unnamed: 0,Country/Region,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,...,4/1/21,4/2/21,4/3/21,4/4/21,4/5/21,4/6/21,4/7/21,4/8/21,4/9/21,4/10/21
0,Afghanistan,0,0,0,0,0,0,0,0,0,...,56517,56572,56595,56676,56717,56779,56873,56943,57019,57144
1,Albania,0,0,0,0,0,0,0,0,0,...,125506,125842,126183,126531,126795,126936,127192,127509,127795,128155
2,Algeria,0,0,0,0,0,0,0,0,0,...,117304,117429,117524,117622,117739,117879,118004,118116,118251,118378
3,Andorra,0,0,0,0,0,0,0,0,0,...,12053,12115,12174,12231,12286,12328,12363,12409,12456,12497
4,Angola,0,0,0,0,0,0,0,0,0,...,22399,22467,22579,22631,22717,22885,23010,23108,23242,23331
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187,Vietnam,0,2,2,2,2,2,2,2,2,...,2617,2620,2626,2631,2637,2648,2659,2668,2683,2692
188,West Bank and Gaza,0,0,0,0,0,0,0,0,0,...,244645,246893,248482,251288,253922,256461,259133,262017,264395,265897
189,Yemen,0,0,0,0,0,0,0,0,0,...,4531,4620,4697,4798,4881,4975,5047,5133,5233,5276
190,Zambia,0,0,0,0,0,0,0,0,0,...,88549,88730,88800,88930,89009,89071,89386,89592,89783,89918


In [54]:
#Ahora añado lat y lon de data_aux a data_cleanish
data_cleanish["Lat"] = data_aux["Lat"]
data_cleanish["Long"] = data_aux["Long"]

In [55]:
data_cleanish

Unnamed: 0,Country/Region,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,...,4/3/21,4/4/21,4/5/21,4/6/21,4/7/21,4/8/21,4/9/21,4/10/21,Lat,Long
0,Afghanistan,0,0,0,0,0,0,0,0,0,...,56595,56676,56717,56779,56873,56943,57019,57144,33.93911,67.709953
1,Albania,0,0,0,0,0,0,0,0,0,...,126183,126531,126795,126936,127192,127509,127795,128155,41.1533,20.1683
2,Algeria,0,0,0,0,0,0,0,0,0,...,117524,117622,117739,117879,118004,118116,118251,118378,28.0339,1.6596
3,Andorra,0,0,0,0,0,0,0,0,0,...,12174,12231,12286,12328,12363,12409,12456,12497,42.5063,1.5218
4,Angola,0,0,0,0,0,0,0,0,0,...,22579,22631,22717,22885,23010,23108,23242,23331,-11.2027,17.8739
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187,Vietnam,0,2,2,2,2,2,2,2,2,...,2626,2631,2637,2648,2659,2668,2683,2692,14.058324,108.277199
188,West Bank and Gaza,0,0,0,0,0,0,0,0,0,...,248482,251288,253922,256461,259133,262017,264395,265897,31.9522,35.2332
189,Yemen,0,0,0,0,0,0,0,0,0,...,4697,4798,4881,4975,5047,5133,5233,5276,15.552727,48.516388
190,Zambia,0,0,0,0,0,0,0,0,0,...,88800,88930,89009,89071,89386,89592,89783,89918,-13.133897,27.849332


In [56]:
lat_column = data_cleanish.pop("Lat")
long_column = data_cleanish.pop("Long")

In [57]:
data_cleanish.insert(1, "Lat", lat_column)
data_cleanish.insert(2, "Long", long_column)

In [58]:
data_cleanish

Unnamed: 0,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,...,4/1/21,4/2/21,4/3/21,4/4/21,4/5/21,4/6/21,4/7/21,4/8/21,4/9/21,4/10/21
0,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,0,...,56517,56572,56595,56676,56717,56779,56873,56943,57019,57144
1,Albania,41.1533,20.1683,0,0,0,0,0,0,0,...,125506,125842,126183,126531,126795,126936,127192,127509,127795,128155
2,Algeria,28.0339,1.6596,0,0,0,0,0,0,0,...,117304,117429,117524,117622,117739,117879,118004,118116,118251,118378
3,Andorra,42.5063,1.5218,0,0,0,0,0,0,0,...,12053,12115,12174,12231,12286,12328,12363,12409,12456,12497
4,Angola,-11.2027,17.8739,0,0,0,0,0,0,0,...,22399,22467,22579,22631,22717,22885,23010,23108,23242,23331
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187,Vietnam,14.058324,108.277199,0,2,2,2,2,2,2,...,2617,2620,2626,2631,2637,2648,2659,2668,2683,2692
188,West Bank and Gaza,31.9522,35.2332,0,0,0,0,0,0,0,...,244645,246893,248482,251288,253922,256461,259133,262017,264395,265897
189,Yemen,15.552727,48.516388,0,0,0,0,0,0,0,...,4531,4620,4697,4798,4881,4975,5047,5133,5233,5276
190,Zambia,-13.133897,27.849332,0,0,0,0,0,0,0,...,88549,88730,88800,88930,89009,89071,89386,89592,89783,89918


In [61]:
#Tengo que pasar el dataframe a diccionario para subir los datos a mongodb
data_checked = data_cleanish.to_dict("records")
data_checked

[{'Country/Region': 'Afghanistan',
  'Lat': '33.93911',
  'Long': '67.709953',
  '1/22/20': 0,
  '1/23/20': 0,
  '1/24/20': 0,
  '1/25/20': 0,
  '1/26/20': 0,
  '1/27/20': 0,
  '1/28/20': 0,
  '1/29/20': 0,
  '1/30/20': 0,
  '1/31/20': 0,
  '2/1/20': 0,
  '2/2/20': 0,
  '2/3/20': 0,
  '2/4/20': 0,
  '2/5/20': 0,
  '2/6/20': 0,
  '2/7/20': 0,
  '2/8/20': 0,
  '2/9/20': 0,
  '2/10/20': 0,
  '2/11/20': 0,
  '2/12/20': 0,
  '2/13/20': 0,
  '2/14/20': 0,
  '2/15/20': 0,
  '2/16/20': 0,
  '2/17/20': 0,
  '2/18/20': 0,
  '2/19/20': 0,
  '2/20/20': 0,
  '2/21/20': 0,
  '2/22/20': 0,
  '2/23/20': 0,
  '2/24/20': 1,
  '2/25/20': 1,
  '2/26/20': 1,
  '2/27/20': 1,
  '2/28/20': 1,
  '2/29/20': 1,
  '3/1/20': 1,
  '3/2/20': 1,
  '3/3/20': 2,
  '3/4/20': 4,
  '3/5/20': 4,
  '3/6/20': 4,
  '3/7/20': 4,
  '3/8/20': 5,
  '3/9/20': 7,
  '3/10/20': 8,
  '3/11/20': 11,
  '3/12/20': 12,
  '3/13/20': 13,
  '3/14/20': 15,
  '3/15/20': 16,
  '3/16/20': 18,
  '3/17/20': 20,
  '3/18/20': 24,
  '3/19/20': 25,
  

In [59]:
covid_cases_checked = db["Covid_Cases_Checked"]

In [62]:
for i in data_checked:
    covid_cases_checked.insert_one(i)

# Ahora seguimos con las muertes

In [63]:
covid_deaths = db["Covid_Deaths"]

In [64]:
data2 = pd.DataFrame(covid_deaths.find({}))

In [65]:
data2.head()

Unnamed: 0,_id,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,4/2/21,4/3/21,4/4/21,4/5/21,4/6/21,4/7/21,4/8/21,4/9/21,4/10/21,Province/State
0,61c9ddde50d5b7208c17ec0b,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,2495,2496,2497,2508,2512,2512,2516,2521,2521,
1,61c9ddde50d5b7208c17ec0c,Albania,41.1533,20.1683,0,0,0,0,0,0,...,2247,2256,2265,2274,2283,2291,2297,2304,2310,
2,61c9ddde50d5b7208c17ec0d,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,3099,3102,3105,3108,3112,3116,3119,3123,3126,
3,61c9ddde50d5b7208c17ec0e,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,116,117,117,117,117,119,120,120,120,
4,61c9ddde50d5b7208c17ec0f,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,538,540,542,543,547,547,549,549,550,


In [67]:
data2.shape

(274, 450)

In [68]:
data2 = data2.drop(columns=["Province/State"])
data2= data2.drop(columns=["_id"])

In [71]:
for i in range(3,448):
    data2.iloc[:,i] = data2.iloc[:,i].apply(pd.to_numeric)

In [72]:
data2.dtypes

Country/Region    object
Lat               object
Long              object
1/22/20            int64
1/23/20            int64
                   ...  
4/6/21             int64
4/7/21             int64
4/8/21             int64
4/9/21             int64
4/10/21            int64
Length: 448, dtype: object

In [76]:
data2["Country/Region"].value_counts().iloc[:7]

China             33
Canada            16
France            12
United Kingdom    12
Australia          8
Netherlands        5
Denmark            3
Name: Country/Region, dtype: int64

In [77]:
data_cleanish2 = data2.groupby("Country/Region",as_index=False).sum()

In [78]:
data_aux2= data2

In [82]:
for i in range(253,264):
    data_aux2 = data_aux2.drop(index=i)

In [84]:
for i in range(191,195):
    data_aux2 = data_aux2.drop(index=i)

In [85]:
for i in range(118,129):
    data_aux2 = data_aux2.drop(index=i)

In [86]:
for i in range(101,103):
    data_aux2 = data_aux2.drop(index=i)

In [87]:
for i in range(58,71):
    data_aux2 = data_aux2.drop(index=i)
for i in range(72,91):
    data_aux2 = data_aux2.drop(index=i)
for i in range(39,49):
    data_aux2 = data_aux2.drop(index=i)
for i in range(50,55):
    data_aux2 = data_aux2.drop(index=i)
for i in range(8,14):
    data_aux2 = data_aux2.drop(index=i)
data_aux2 = data_aux2.drop(index=15)

In [88]:
data_aux2 = data_aux2.reset_index(drop=True)

In [89]:
data_cleanish2["Lat"] = data_aux2["Lat"]
data_cleanish2["Long"] = data_aux2["Long"]

In [90]:
lat_column2 = data_cleanish2.pop("Lat")
long_column2 = data_cleanish2.pop("Long")
data_cleanish2.insert(1, "Lat", lat_column2)
data_cleanish2.insert(2, "Long", long_column2)

In [91]:
data_cleanish2

Unnamed: 0,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,...,4/1/21,4/2/21,4/3/21,4/4/21,4/5/21,4/6/21,4/7/21,4/8/21,4/9/21,4/10/21
0,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,0,...,2489,2495,2496,2497,2508,2512,2512,2516,2521,2521
1,Albania,41.1533,20.1683,0,0,0,0,0,0,0,...,2241,2247,2256,2265,2274,2283,2291,2297,2304,2310
2,Algeria,28.0339,1.6596,0,0,0,0,0,0,0,...,3096,3099,3102,3105,3108,3112,3116,3119,3123,3126
3,Andorra,42.5063,1.5218,0,0,0,0,0,0,0,...,115,116,117,117,117,117,119,120,120,120
4,Angola,-11.2027,17.8739,0,0,0,0,0,0,0,...,538,538,540,542,543,547,547,549,549,550
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187,Vietnam,14.058324,108.277199,0,0,0,0,0,0,0,...,35,35,35,35,35,35,35,35,35,35
188,West Bank and Gaza,31.9522,35.2332,0,0,0,0,0,0,0,...,2645,2661,2681,2706,2716,2735,2753,2781,2812,2838
189,Yemen,15.552727,48.516388,0,0,0,0,0,0,0,...,906,916,932,946,955,976,986,1004,1022,1031
190,Zambia,-13.133897,27.849332,0,0,0,0,0,0,0,...,1212,1215,1215,1220,1222,1224,1224,1225,1226,1226


In [92]:
data_checked2 = data_cleanish2.to_dict("records")
data_checked2 

[{'Country/Region': 'Afghanistan',
  'Lat': '33.93911',
  'Long': '67.709953',
  '1/22/20': 0,
  '1/23/20': 0,
  '1/24/20': 0,
  '1/25/20': 0,
  '1/26/20': 0,
  '1/27/20': 0,
  '1/28/20': 0,
  '1/29/20': 0,
  '1/30/20': 0,
  '1/31/20': 0,
  '2/1/20': 0,
  '2/2/20': 0,
  '2/3/20': 0,
  '2/4/20': 0,
  '2/5/20': 0,
  '2/6/20': 0,
  '2/7/20': 0,
  '2/8/20': 0,
  '2/9/20': 0,
  '2/10/20': 0,
  '2/11/20': 0,
  '2/12/20': 0,
  '2/13/20': 0,
  '2/14/20': 0,
  '2/15/20': 0,
  '2/16/20': 0,
  '2/17/20': 0,
  '2/18/20': 0,
  '2/19/20': 0,
  '2/20/20': 0,
  '2/21/20': 0,
  '2/22/20': 0,
  '2/23/20': 0,
  '2/24/20': 0,
  '2/25/20': 0,
  '2/26/20': 0,
  '2/27/20': 0,
  '2/28/20': 0,
  '2/29/20': 0,
  '3/1/20': 0,
  '3/2/20': 0,
  '3/3/20': 0,
  '3/4/20': 0,
  '3/5/20': 0,
  '3/6/20': 0,
  '3/7/20': 0,
  '3/8/20': 0,
  '3/9/20': 0,
  '3/10/20': 0,
  '3/11/20': 0,
  '3/12/20': 0,
  '3/13/20': 0,
  '3/14/20': 0,
  '3/15/20': 0,
  '3/16/20': 0,
  '3/17/20': 0,
  '3/18/20': 0,
  '3/19/20': 0,
  '3/20/20'

In [93]:
covid_deaths_checked = db["Covid_Deaths_Checked"]

In [94]:
for i in data_checked2:
    covid_deaths_checked.insert_one(i)

# Por último los datos de recuperados

In [95]:
covid_recovered = db["Covid_Recovered"]

In [96]:
data3 = pd.DataFrame(covid_recovered.find({}))

In [97]:
data3.head()

Unnamed: 0,_id,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,4/2/21,4/3/21,4/4/21,4/5/21,4/6/21,4/7/21,4/8/21,4/9/21,4/10/21,Province/State
0,61c9de0350d5b7208c17ed1f,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,51798,51802,51885,51902,51928,51940,51956,51961,51962,
1,61c9de0350d5b7208c17ed20,Albania,41.1533,20.1683,0,0,0,0,0,0,...,92500,93173,93842,94431,95035,95600,96129,96672,97206,
2,61c9de0350d5b7208c17ed21,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,81729,81813,81896,81994,82096,82192,82289,82392,82493,
3,61c9de0350d5b7208c17ed22,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,11401,11428,11474,11523,11570,11616,11692,11732,11770,
4,61c9de0350d5b7208c17ed23,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,20867,20871,20879,21452,21489,21545,21557,21589,21890,


In [98]:
data3.shape

(259, 450)

In [99]:
data3 = data3.drop(columns=["Province/State"])
data3= data3.drop(columns=["_id"])

In [100]:
for i in range(3,448):
    data3.iloc[:,i] = data3.iloc[:,i].apply(pd.to_numeric)

In [101]:
data3.dtypes

Country/Region    object
Lat               object
Long              object
1/22/20            int64
1/23/20            int64
                   ...  
4/6/21             int64
4/7/21             int64
4/8/21             int64
4/9/21             int64
4/10/21            int64
Length: 448, dtype: object

In [104]:
data3["Country/Region"].value_counts().iloc[:6]

China             33
United Kingdom    12
France            12
Australia          8
Netherlands        5
Denmark            3
Name: Country/Region, dtype: int64

In [105]:
data_cleanish3 = data3.groupby("Country/Region",as_index=False).sum()

In [121]:
data3.loc[data3["Country/Region"] == "Australia"]

Unnamed: 0,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,...,4/1/21,4/2/21,4/3/21,4/4/21,4/5/21,4/6/21,4/7/21,4/8/21,4/9/21,4/10/21
8,Australia,-35.4735,149.0124,0,0,0,0,0,0,0,...,120,120,120,120,120,120,120,120,120,120
9,Australia,-33.8688,151.2093,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,Australia,-12.4634,130.8456,0,0,0,0,0,0,0,...,106,106,106,106,106,106,106,107,107,107
11,Australia,-27.4698,153.0251,0,0,0,0,0,0,0,...,1344,1353,1353,1353,1366,1366,1366,1376,1381,1381
12,Australia,-34.9285,138.6007,0,0,0,0,0,0,0,...,641,641,642,644,649,649,651,651,654,657
13,Australia,-42.8821,147.3272,0,0,0,0,0,0,0,...,221,221,221,221,221,221,221,221,221,221
14,Australia,-37.8136,144.9631,0,0,0,0,0,0,0,...,19663,19663,19663,19663,19663,19663,19664,19664,19664,19664
15,Australia,-31.9505,115.8605,0,0,0,0,0,0,0,...,919,921,922,925,923,924,928,932,933,933


In [110]:
data_aux3= data3

In [111]:
for i in range(238,249):
    data_aux3 = data_aux3.drop(index=i)

In [113]:
for i in range(176,180):
    data_aux3 = data_aux3.drop(index=i)

In [115]:
for i in range(103,114):
    data_aux3 = data_aux3.drop(index=i)

In [117]:
for i in range(86,88):
    data_aux3 = data_aux3.drop(index=i)

In [119]:
for i in range(43,56):
    data_aux3 = data_aux3.drop(index=i)

In [120]:
for i in range(57,76):
    data_aux3 = data_aux3.drop(index=i)

In [122]:
for i in range(8,14):
    data_aux3 = data_aux3.drop(index=i)

In [123]:
data_aux3 = data_aux3.drop(index=15)

In [124]:
data_aux3 = data_aux3.reset_index(drop=True)

In [125]:
data_cleanish3["Lat"] = data_aux3["Lat"]
data_cleanish3["Long"] = data_aux3["Long"]

In [126]:
lat_column3 = data_cleanish3.pop("Lat")
long_column3 = data_cleanish3.pop("Long")
data_cleanish3.insert(1, "Lat", lat_column3)
data_cleanish3.insert(2, "Long", long_column3)

In [127]:
data_cleanish3

Unnamed: 0,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,...,4/1/21,4/2/21,4/3/21,4/4/21,4/5/21,4/6/21,4/7/21,4/8/21,4/9/21,4/10/21
0,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,0,...,51788,51798,51802,51885,51902,51928,51940,51956,51961,51962
1,Albania,41.1533,20.1683,0,0,0,0,0,0,0,...,91875,92500,93173,93842,94431,95035,95600,96129,96672,97206
2,Algeria,28.0339,1.6596,0,0,0,0,0,0,0,...,81632,81729,81813,81896,81994,82096,82192,82289,82392,82493
3,Andorra,42.5063,1.5218,0,0,0,0,0,0,0,...,11365,11401,11428,11474,11523,11570,11616,11692,11732,11770
4,Angola,-11.2027,17.8739,0,0,0,0,0,0,0,...,20508,20867,20871,20879,21452,21489,21545,21557,21589,21890
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187,Vietnam,14.058324,108.277199,0,0,0,0,0,0,0,...,2359,2383,2383,2383,2416,2422,2429,2429,2429,2429
188,West Bank and Gaza,31.9522,35.2332,0,0,0,0,0,0,0,...,217224,217224,220418,222007,223249,224552,226090,227988,229876,231288
189,Yemen,15.552727,48.516388,0,0,0,0,0,0,0,...,1682,1691,1715,1738,1772,1822,1886,1946,1987,2027
190,Zambia,-13.133897,27.849332,0,0,0,0,0,0,0,...,84698,84825,85017,85068,85178,85338,85409,85446,85559,86813


In [128]:
data_checked3 = data_cleanish3.to_dict("records")
data_checked3 

[{'Country/Region': 'Afghanistan',
  'Lat': '33.93911',
  'Long': '67.709953',
  '1/22/20': 0,
  '1/23/20': 0,
  '1/24/20': 0,
  '1/25/20': 0,
  '1/26/20': 0,
  '1/27/20': 0,
  '1/28/20': 0,
  '1/29/20': 0,
  '1/30/20': 0,
  '1/31/20': 0,
  '2/1/20': 0,
  '2/2/20': 0,
  '2/3/20': 0,
  '2/4/20': 0,
  '2/5/20': 0,
  '2/6/20': 0,
  '2/7/20': 0,
  '2/8/20': 0,
  '2/9/20': 0,
  '2/10/20': 0,
  '2/11/20': 0,
  '2/12/20': 0,
  '2/13/20': 0,
  '2/14/20': 0,
  '2/15/20': 0,
  '2/16/20': 0,
  '2/17/20': 0,
  '2/18/20': 0,
  '2/19/20': 0,
  '2/20/20': 0,
  '2/21/20': 0,
  '2/22/20': 0,
  '2/23/20': 0,
  '2/24/20': 0,
  '2/25/20': 0,
  '2/26/20': 0,
  '2/27/20': 0,
  '2/28/20': 0,
  '2/29/20': 0,
  '3/1/20': 0,
  '3/2/20': 0,
  '3/3/20': 0,
  '3/4/20': 0,
  '3/5/20': 0,
  '3/6/20': 0,
  '3/7/20': 0,
  '3/8/20': 0,
  '3/9/20': 0,
  '3/10/20': 0,
  '3/11/20': 0,
  '3/12/20': 0,
  '3/13/20': 0,
  '3/14/20': 0,
  '3/15/20': 0,
  '3/16/20': 1,
  '3/17/20': 1,
  '3/18/20': 1,
  '3/19/20': 1,
  '3/20/20'

In [129]:
covid_recovered_checked = db["Covid_Recovered_Checked"]

In [130]:
for i in data_checked3:
    covid_recovered_checked.insert_one(i)