# Importing libraries

In [275]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Death prediction in Colombia due to covid-19.

In [276]:
df = pd.read_csv('Casos_positivos_de_COVID-19_en_Colombia._20241128.csv')

  df = pd.read_csv('Casos_positivos_de_COVID-19_en_Colombia._20241128.csv')


## 1. Data Collection

The dataset that we found is too big (more than 6.3M of records) and the features are written in spanish, which doesn't help us for cleaning it.

We will then rename columns, and use a small sample (more precisely 1% of the original dataset) to export it to another file.

Doing so, we will have readable input variables in english, and a file that will allow us to use less computing ressources.

### Renaming columns

In [277]:
column_mapping = {
    "fecha reporte web": "report_date",
    "ID de caso": "case_id",
    "Fecha de notificación": "notification_date",
    "Código DIVIPOLA departamento": "department_code",
    "Nombre departamento": "department_name",
    "Código DIVIPOLA municipio": "municipality_code",
    "Nombre municipio": "municipality_name",
    "Edad": "age",
    "Unidad de medida de edad": "age_unit",
    "Sexo": "gender",
    "Tipo de contagio": "contagion_type",
    "Ubicación del caso": "case_location",
    "Estado": "status",
    "Código ISO del país": "country_iso_code",
    "Nombre del país": "country_name",
    "Recuperado": "recovered",
    "Fecha de inicio de síntomas": "symptom_start_date",
    "Fecha de muerte": "death_date",
    "Fecha de diagnóstico": "diagnosis_date",
    "Fecha de recuperación": "recovery_date",
    "Tipo de recuperación": "recovery_type",
    "Pertenencia étnica": "ethnic_affiliation",
    "Nombre del grupo étnico": "ethnic_group_name",
}
df.rename(columns=column_mapping, inplace=True)

### Keeping 1% of dataset using Random Sampling Technique (from pandas library), and exporting it to another file

In [278]:
df_sample = df.sample(frac=0.01, random_state=42)

In [279]:
df_sample.to_csv('covid19_sample.csv', index=False)

## 2. Data Understanding

In [280]:
df = pd.read_csv('covid19_sample.csv')

In [281]:
df.shape

(63910, 23)

In [282]:
df

Unnamed: 0,report_date,case_id,notification_date,department_code,department_name,municipality_code,municipality_name,age,age_unit,gender,contagion_type,case_location,status,country_iso_code,country_name,recovered,symptom_start_date,death_date,diagnosis_date,recovery_date,recovery_type,ethnic_affiliation,ethnic_group_name
0,2020-08-01 00:00:00,299403,2020-07-15 00:00:00,5,ANTIOQUIA,5001,MEDELLIN,23,1,M,Comunitaria,Casa,Leve,,,Recuperado,2020-07-10 00:00:00,,2020-07-31 00:00:00,2020-08-09 00:00:00,Tiempo,6.0,
1,2020-09-19 00:00:00,751888,2020-09-14 00:00:00,68,SANTANDER,68307,GIRON,47,1,F,Comunitaria,Casa,Leve,,,Recuperado,2020-09-11 00:00:00,,2020-09-18 00:00:00,2020-09-28 00:00:00,Tiempo,6.0,
2,2021-01-14 00:00:00,1833100,2021-01-01 00:00:00,81,ARAUCA,81001,ARAUCA,65,1,F,Relacionado,Casa,Leve,,,Recuperado,2020-12-29 00:00:00,,2021-01-12 00:00:00,2021-01-15 00:00:00,Tiempo,6.0,
3,2021-09-15 00:00:00,4933309,2021-08-04 00:00:00,25,CUNDINAMARCA,25754,SOACHA,25,1,F,Comunitaria,Casa,Leve,,,Recuperado,2021-07-31 00:00:00,,2021-08-15 00:00:00,2021-09-16 00:00:00,Tiempo,6.0,
4,2021-02-01 00:00:00,2098903,2021-01-24 00:00:00,11,BOGOTA,11001,BOGOTA,35,1,M,Comunitaria,Casa,Leve,,,Recuperado,2021-01-22 00:00:00,,2021-01-31 00:00:00,2021-02-07 00:00:00,Tiempo,6.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63905,2021-04-23 00:00:00,2728432,2021-04-10 00:00:00,25,CUNDINAMARCA,25754,SOACHA,19,1,F,Comunitaria,Casa,Leve,,,Recuperado,2021-04-06 00:00:00,,2021-04-21 00:00:00,2021-04-24 00:00:00,Tiempo,6.0,
63906,2020-09-15 00:00:00,723841,2020-09-10 00:00:00,25,CUNDINAMARCA,25899,ZIPAQUIRA,68,1,F,Comunitaria,Casa,Leve,,,Recuperado,2020-09-05 00:00:00,,2020-09-13 00:00:00,2020-09-26 00:00:00,Tiempo,6.0,
63907,2021-02-17 00:00:00,2206812,2021-02-14 00:00:00,11,BOGOTA,11001,BOGOTA,54,1,M,Relacionado,Casa,Leve,,,Recuperado,2021-02-09 00:00:00,,2021-02-14 00:00:00,2021-02-23 00:00:00,Tiempo,6.0,
63908,2021-06-16 00:00:00,3807449,2021-06-01 00:00:00,13001,CARTAGENA,13001,CARTAGENA,33,1,F,Comunitaria,Casa,Leve,,,Recuperado,2021-05-29 00:00:00,,2021-06-12 00:00:00,2021-06-17 00:00:00,PCR,5.0,


In [283]:
df.head()

Unnamed: 0,report_date,case_id,notification_date,department_code,department_name,municipality_code,municipality_name,age,age_unit,gender,contagion_type,case_location,status,country_iso_code,country_name,recovered,symptom_start_date,death_date,diagnosis_date,recovery_date,recovery_type,ethnic_affiliation,ethnic_group_name
0,2020-08-01 00:00:00,299403,2020-07-15 00:00:00,5,ANTIOQUIA,5001,MEDELLIN,23,1,M,Comunitaria,Casa,Leve,,,Recuperado,2020-07-10 00:00:00,,2020-07-31 00:00:00,2020-08-09 00:00:00,Tiempo,6.0,
1,2020-09-19 00:00:00,751888,2020-09-14 00:00:00,68,SANTANDER,68307,GIRON,47,1,F,Comunitaria,Casa,Leve,,,Recuperado,2020-09-11 00:00:00,,2020-09-18 00:00:00,2020-09-28 00:00:00,Tiempo,6.0,
2,2021-01-14 00:00:00,1833100,2021-01-01 00:00:00,81,ARAUCA,81001,ARAUCA,65,1,F,Relacionado,Casa,Leve,,,Recuperado,2020-12-29 00:00:00,,2021-01-12 00:00:00,2021-01-15 00:00:00,Tiempo,6.0,
3,2021-09-15 00:00:00,4933309,2021-08-04 00:00:00,25,CUNDINAMARCA,25754,SOACHA,25,1,F,Comunitaria,Casa,Leve,,,Recuperado,2021-07-31 00:00:00,,2021-08-15 00:00:00,2021-09-16 00:00:00,Tiempo,6.0,
4,2021-02-01 00:00:00,2098903,2021-01-24 00:00:00,11,BOGOTA,11001,BOGOTA,35,1,M,Comunitaria,Casa,Leve,,,Recuperado,2021-01-22 00:00:00,,2021-01-31 00:00:00,2021-02-07 00:00:00,Tiempo,6.0,


In [284]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63910 entries, 0 to 63909
Data columns (total 23 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   report_date         63910 non-null  object 
 1   case_id             63910 non-null  int64  
 2   notification_date   63910 non-null  object 
 3   department_code     63910 non-null  int64  
 4   department_name     63910 non-null  object 
 5   municipality_code   63910 non-null  int64  
 6   municipality_name   63910 non-null  object 
 7   age                 63910 non-null  int64  
 8   age_unit            63910 non-null  int64  
 9   gender              63910 non-null  object 
 10  contagion_type      63910 non-null  object 
 11  case_location       63489 non-null  object 
 12  status              63489 non-null  object 
 13  country_iso_code    39 non-null     float64
 14  country_name        39 non-null     object 
 15  recovered           63543 non-null  object 
 16  symp

In [285]:
df.describe()

Unnamed: 0,case_id,department_code,municipality_code,age,age_unit,country_iso_code,ethnic_affiliation
count,63910.0,63910.0,63910.0,63910.0,63910.0,39.0,63887.0
mean,3193952.0,1316.857659,28378.870365,39.963417,1.005461,633.74359,5.910702
std,1845261.0,5813.507711,26235.642794,18.511392,0.080398,253.76924,0.594951
min,59.0,5.0,5001.0,1.0,1.0,56.0,1.0
25%,1591566.0,11.0,11001.0,27.0,1.0,484.0,6.0
50%,3191206.0,17.0,11001.0,38.0,1.0,724.0,6.0
75%,4795385.0,68.0,50573.0,53.0,1.0,840.0,6.0
max,6390919.0,47001.0,99773.0,107.0,3.0,862.0,6.0


In [286]:
# Checking for missing values
df.isnull().sum()

report_date               0
case_id                   0
notification_date         0
department_code           0
department_name           0
municipality_code         0
municipality_name         0
age                       0
age_unit                  0
gender                    0
contagion_type            0
case_location           421
status                  421
country_iso_code      63871
country_name          63871
recovered               367
symptom_start_date     5028
death_date            62021
diagnosis_date           27
recovery_date          1862
recovery_type          1864
ethnic_affiliation       23
ethnic_group_name     63045
dtype: int64

In [287]:
# Not sure about it, but it seems like the column 'death_date' has a lot of class inbalancing
print(f"Percentage of deaths: {round(int(df[df['death_date'].notna()]['death_date'].count()) / int(df['case_id'].count()) * 100)} %")

Percentage of deaths: 3 %


## 3. Data Pre-processing

#### We can see that we have a lot of irrelevant data, missing values, class inbalancing, date columns to be replaced, categorical variables to be encoded, and so on.

So let's clear our data...

### Dropping/filling missing values

df.dropna(axis=0, how='any', inplace=True)

df.dropna(axis=1, how='all', inplace=True)

### OR

df['age'].fillna(df['age'].median(), inplace=True)

and so on...


### Converting date columns

In [288]:
date_columns = ["report_date", "notification_date", "symptom_start_date", "death_date", "diagnosis_date", "recovery_date"]
for col in date_columns:
    df[col] = pd.to_datetime(df[col], errors='coerce')

### Encoding categorical variables

In [289]:
df = pd.get_dummies(df, columns=["gender", "contagion_type", "status", "recovered"], drop_first=False)

### Filtering relevant features

In [290]:
relevant_features = [
    "age", "gender_M", "gender_F", "contagion_type_Comunitaria", 
    "status_Leve", "status_Fallecido", "recovered_Recuperado", "death_date"
]
df = df[relevant_features]


### Renaming columns

In [291]:
column_mapping = {
    "contagion_type_Comunitaria": "contagion_type_Community", 
    "status_Leve": "status_Alive",
    "status_Fallecido": "status_Dead",
    "recovered_Recuperado": "recovered_Recovered"
}
df.rename(columns=column_mapping, inplace=True)

In [292]:
scaler = StandardScaler()
df['age'] = scaler.fit_transform(df[['age']])

In [293]:
df.head()

Unnamed: 0,age,gender_M,gender_F,contagion_type_Community,status_Alive,status_Dead,recovered_Recovered,death_date
0,-0.916384,True,False,True,True,False,True,NaT
1,0.380125,False,True,True,True,False,True,NaT
2,1.352507,False,True,False,True,False,True,NaT
3,-0.808342,False,True,True,True,False,True,NaT
4,-0.26813,True,False,True,True,False,True,NaT
