In [17]:
# Importação do csv
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
from sklearn.metrics import silhouette_score
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN
from sklearn.neighbors import NearestNeighbors
import numpy as np
import umap
# Load the CSV at ./data/brazil_covid19_cities.csv into the working DataFrame.
df = pd.read_csv(filepath_or_buffer='./data/brazil_covid19_cities.csv')

# 1. Tratamento Inicial

In [18]:
# Rename the `name` column to `city`; every other column keeps its label.
df = df.rename({'name': 'city'}, axis='columns')

In [19]:
# Parse `date` into datetime values, then order the rows by city and,
# within each city, by date.
df = (
    df
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .sort_values(by=['city', 'date'])
)

In [20]:
# Make sure `cases` and `deaths` are numeric: values that cannot be parsed
# become NaN (errors='coerce'). Rows missing the city or either count are
# then discarded.
df = (
    df
    .assign(
        cases=pd.to_numeric(df['cases'], errors='coerce'),
        deaths=pd.to_numeric(df['deaths'], errors='coerce'),
    )
    .dropna(subset=['city', 'cases', 'deaths'])
)

In [21]:
# Build one feature row per (city, state) pair.
# Named aggregation ties each output column directly to its source column and
# statistic, so no positional renaming of the columns is needed afterwards.
# NOTE(review): the per-group maximum is labelled "total", which presumably
# relies on `cases`/`deaths` being cumulative counts — confirm against the data.
features_city = df.groupby(['city', 'state']).agg(
    total_cases=('cases', 'max'),
    mean_cases=('cases', 'mean'),
    std_cases=('cases', 'std'),
    total_deaths=('deaths', 'max'),
    mean_deaths=('deaths', 'mean'),
    std_deaths=('deaths', 'std'),
    days_recorded=('date', 'count'),  # number of non-null dates in the group
)

features_city.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,total_cases,mean_cases,std_cases,total_deaths,mean_deaths,std_deaths,days_recorded
city,state,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Abadia de Goiás,GO,1351.0,558.153664,426.181817,28,7.565012,7.530658,423
Abadia dos Dourados,MG,375.0,118.761229,119.465862,14,2.687943,4.138253,423
Abadiânia,GO,400.0,147.659574,105.898626,27,10.801418,8.029761,423
Abaetetuba,PA,7659.0,3646.444444,2098.725146,210,104.628842,48.581654,423
Abaeté,MG,1019.0,248.574468,312.806079,26,5.224586,6.855251,423


In [22]:
# Add the `death_rate` column: total deaths divided by total cases.
# A row whose `total_cases` is 0 yields NaN (0/0) or inf (deaths/0) here.
features_city['death_rate'] = features_city['total_deaths'].div(
    features_city['total_cases']
)


In [23]:
# Case growth.
# `new_cases` is the row-to-row difference of `cases` inside each
# (city, state) group; differences that cannot be computed (such as the first
# row of a group) are filled with 0.
# NOTE(review): the difference follows the current row order, so this assumes
# the rows are already sorted by date within each city.
df['new_cases'] = (
    df.groupby(['city', 'state'])['cases']
    .diff()
    .fillna(0)
)

# Average of `new_cases` per (city, state), attached to the feature table
# through its (city, state) index.
growth = (
    df.groupby(['city', 'state'])['new_cases']
    .agg('mean')
    .rename('mean_daily_growth')
)
features_city = features_city.join(growth)

In [24]:
# Day of the first case.
# For each (city, state), take the earliest date with `cases` > 0 and express
# it as a number of days counted from the earliest date in the whole dataset.
# Groups that never have `cases` > 0 are absent from this series, so the join
# below leaves NaN for them.
first_case = (
    df.loc[df['cases'] > 0]
    .groupby(['city', 'state'])['date']
    .min()
    .sub(df['date'].min())
    .dt.days
    .rename('days_until_first_case')
)

features_city = features_city.join(first_case)

In [27]:
# Remove every row that has a missing value in any column.
features_city = features_city.dropna(axis=0, how='any')

In [None]:
# Reset the index: the current index levels become regular columns and the
# rows get a fresh integer index.
features_city = features_city.reset_index(drop=False)

In [34]:
# Remove the `city` column from the feature table.
features_city = features_city.drop(columns=['city'])

In [35]:
# One-hot encode `state`: the column is replaced by one indicator column per
# distinct state value (named `state_<value>`).
features_city = pd.get_dummies(data=features_city, columns=['state'])

In [36]:
# Finish the treated DataFrame: from here on `df` is an independent copy of
# the feature table, replacing the raw records previously bound to `df`.
df = features_city.copy(deep=True)

In [37]:
# Display the treated DataFrame as the cell output.
df

Unnamed: 0,total_cases,mean_cases,std_cases,total_deaths,mean_deaths,std_deaths,days_recorded,death_rate,mean_daily_growth,days_until_first_case,...,state_PR,state_RJ,state_RN,state_RO,state_RR,state_RS,state_SC,state_SE,state_SP,state_TO
0,1351.0,558.153664,426.181817,28,7.565012,7.530658,423,0.020725,3.193853,55,...,False,False,False,False,False,False,False,False,False,False
1,375.0,118.761229,119.465862,14,2.687943,4.138253,423,0.037333,0.886525,42,...,False,False,False,False,False,False,False,False,False,False
2,400.0,147.659574,105.898626,27,10.801418,8.029761,423,0.067500,0.945626,63,...,False,False,False,False,False,False,False,False,False,False
3,7659.0,3646.444444,2098.725146,210,104.628842,48.581654,423,0.027419,18.106383,4,...,False,False,False,False,False,False,False,False,False,False
4,1019.0,248.574468,312.806079,26,5.224586,6.855251,423,0.025515,2.408983,16,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5565,202.0,53.307329,58.444733,3,0.754137,1.095478,423,0.014851,0.477541,51,...,False,False,False,False,False,True,False,False,False,False
5566,283.0,88.886525,90.399673,6,1.465721,1.509255,423,0.021201,0.669031,60,...,True,False,False,False,False,False,False,False,False,False
5567,235.0,60.338061,81.674372,2,0.846336,0.955170,423,0.008511,0.555556,75,...,False,False,False,False,False,False,False,False,False,False
5568,6049.0,2496.177305,1930.441608,112,42.179669,32.105806,423,0.018515,14.300236,19,...,False,False,False,False,False,False,False,False,False,False


# 2. Redução de Dimensionalidade

## 2.1 Aplicação de PCA

## 2.2 Aplicação de t-SNE

## 2.3 Aplicação de UMAP

# 3. Clusterização

## 3.1 Aplicação de K-Means

## 3.2 Aplicação de DBSCAN

# 4. Discussão de resultados