In [10]:
import pandas as pd

In [11]:
df = pd.read_csv("../base-dados/eventos-climaticos.csv")

In [12]:
# Selecionar colunas essenciais
df_simplificado = df[[
    'Disaster Type', 'Start Year', 'Total Deaths', 'No. Injured',
    'Total Affected', 'Total Damage (\'000 US$)', 'Magnitude', 'Country'
]].copy()

# Criar coluna 'Nivel de Risco' com base em regras simples
def classificar_risco(row):
    impacto = 0
    if pd.notnull(row['Total Deaths']):
        impacto += row['Total Deaths'] * 5
    if pd.notnull(row['No. Injured']):
        impacto += row['No. Injured'] * 2
    if pd.notnull(row['Total Affected']):
        impacto += row['Total Affected'] * 1
    if pd.notnull(row['Total Damage (\'000 US$)']):
        impacto += row['Total Damage (\'000 US$)'] * 0.1

    if impacto == 0:
        return 'Muito Baixo'
    elif impacto <= 100:
        return 'Baixo'
    elif impacto <= 1000:
        return 'Médio'
    elif impacto <= 10000:
        return 'Alto'
    else:
        return 'Muito Alto'

df_simplificado['Nivel de Risco'] = df_simplificado.apply(classificar_risco, axis=1)

# Verificar as primeiras linhas da nova tabela
df_simplificado.head()


Unnamed: 0,Disaster Type,Start Year,Total Deaths,No. Injured,Total Affected,Total Damage ('000 US$),Magnitude,Country,Nivel de Risco
0,Drought,2001,,,100000.0,,,Djibouti,Muito Alto
1,Drought,2000,,,2000000.0,,,Sudan,Muito Alto
2,Drought,2000,21.0,,1200000.0,,,Somalia,Muito Alto
3,Road,2000,14.0,11.0,11.0,,,Angola,Médio
4,Flood,2000,31.0,,70000.0,10000.0,47000.0,Angola,Muito Alto


In [13]:
df_simplificado.to_csv("../base-dados/eventos-climaticos-limpos.csv")

In [14]:
df_model = pd.read_csv("../base-dados/eventos-climaticos-limpos.csv")

numerical_cols = [
    'Start Year', 'Total Deaths', 'No. Injured',
    'Total Affected', 'Total Damage (\'000 US$)', 'Magnitude'
]

df_model[numerical_cols] = df_model[numerical_cols].fillna(0)

df_model.head()

Unnamed: 0.1,Unnamed: 0,Disaster Type,Start Year,Total Deaths,No. Injured,Total Affected,Total Damage ('000 US$),Magnitude,Country,Nivel de Risco
0,0,Drought,2001,0.0,0.0,100000.0,0.0,0.0,Djibouti,Muito Alto
1,1,Drought,2000,0.0,0.0,2000000.0,0.0,0.0,Sudan,Muito Alto
2,2,Drought,2000,21.0,0.0,1200000.0,0.0,0.0,Somalia,Muito Alto
3,3,Road,2000,14.0,11.0,11.0,0.0,0.0,Angola,Médio
4,4,Flood,2000,31.0,0.0,70000.0,10000.0,47000.0,Angola,Muito Alto


In [15]:
df_model = df_model.drop("Unnamed: 0", axis=1)

In [16]:
df_model

Unnamed: 0,Disaster Type,Start Year,Total Deaths,No. Injured,Total Affected,Total Damage ('000 US$),Magnitude,Country,Nivel de Risco
0,Drought,2001,0.0,0.0,100000.0,0.0,0.0,Djibouti,Muito Alto
1,Drought,2000,0.0,0.0,2000000.0,0.0,0.0,Sudan,Muito Alto
2,Drought,2000,21.0,0.0,1200000.0,0.0,0.0,Somalia,Muito Alto
3,Road,2000,14.0,11.0,11.0,0.0,0.0,Angola,Médio
4,Flood,2000,31.0,0.0,70000.0,10000.0,47000.0,Angola,Muito Alto
...,...,...,...,...,...,...,...,...,...
16237,Road,2025,21.0,0.0,0.0,0.0,0.0,Mexico,Médio
16238,Miscellaneous accident (General),2025,21.0,0.0,0.0,0.0,0.0,India,Médio
16239,Storm,2025,28.0,108.0,15108.0,0.0,0.0,United States of America,Muito Alto
16240,Drought,2025,0.0,0.0,4400000.0,0.0,0.0,Somalia,Muito Alto


In [17]:
df_model.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16242 entries, 0 to 16241
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Disaster Type            16242 non-null  object 
 1   Start Year               16242 non-null  int64  
 2   Total Deaths             16242 non-null  float64
 3   No. Injured              16242 non-null  float64
 4   Total Affected           16242 non-null  float64
 5   Total Damage ('000 US$)  16242 non-null  float64
 6   Magnitude                16242 non-null  float64
 7   Country                  16242 non-null  object 
 8   Nivel de Risco           16242 non-null  object 
dtypes: float64(5), int64(1), object(3)
memory usage: 1.1+ MB


In [18]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import joblib
from sklearn.metrics import accuracy_score

# Definir X e y
y = df_model["Nivel de Risco"]
X = df_model.drop("Nivel de Risco", axis=1)

# Definir colunas
categorical_cols = ['Disaster Type', 'Country']
numeric_cols = ['Start Year', 'Total Deaths', 'No. Injured', 'Total Affected', "Total Damage ('000 US$)", 'Magnitude']

# Pré-processamento
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', StandardScaler(), numeric_cols)
    ]
)

# Pipeline: pré-processamento + modelo
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier())
])

# Split dos dados
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Treinar pipeline
pipeline.fit(X_train, y_train)

# Avaliar
y_pred = pipeline.predict(X_test)
print("Acurácia:", accuracy_score(y_test, y_pred))

# Salvar pipeline
joblib.dump(pipeline, "../app/modelo_pipeline.pkl")


Acurácia: 0.9874415168677666


['../app/modelo_pipeline.pkl']

# Gráficos

In [19]:
import os
os.makedirs('../dataviz/', exist_ok=True)

In [20]:
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(12,6))
sns.countplot(y='Disaster Type', data=df, order=df['Disaster Type'].value_counts().head(10).index)
plt.title('Top 10 Tipos de Desastre mais Frequentes')
plt.xlabel('Quantidade de Eventos')
plt.ylabel('Tipo de Desastre')
plt.tight_layout()
plt.savefig('../dataviz/tipos_desastre.png')
plt.close()


In [21]:
plt.figure(figsize=(14,6))
sns.countplot(x='Start Year', data=df)
plt.xticks(rotation=90)
plt.title('Número de Desastres por Ano')
plt.xlabel('Ano')
plt.ylabel('Quantidade de Desastres')
plt.tight_layout()
plt.savefig('../dataviz/desastres_por_ano.png')
plt.close()


In [22]:
top_paises = df['Country'].value_counts().head(10)

plt.figure(figsize=(12,6))
sns.barplot(x=top_paises.values, y=top_paises.index, palette="viridis")
plt.title('Top 10 Países com mais Desastres')
plt.xlabel('Quantidade de Desastres')
plt.ylabel('País')
plt.tight_layout()
plt.savefig('../dataviz/paises_mais_desastres.png')
plt.close()



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=top_paises.values, y=top_paises.index, palette="viridis")


In [23]:
df_mortes = df.groupby('Disaster Type')['Total Deaths'].mean().sort_values(ascending=False).head(10)

plt.figure(figsize=(12,6))
sns.barplot(x=df_mortes.values, y=df_mortes.index, palette="magma")
plt.title('Média de Mortes por Tipo de Desastre')
plt.xlabel('Média de Mortes')
plt.ylabel('Tipo de Desastre')
plt.tight_layout()
plt.savefig('../dataviz/media_mortes_por_tipo.png')
plt.close()



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=df_mortes.values, y=df_mortes.index, palette="magma")


In [24]:
df_danos = df.groupby('Disaster Type')['Total Damage, Adjusted (\'000 US$)'].sum().sort_values(ascending=False).head(10)

plt.figure(figsize=(12,6))
sns.barplot(x=df_danos.values, y=df_danos.index, palette="rocket")
plt.title('Danos Financeiros Totais por Tipo de Desastre (em mil US$ ajustados)')
plt.xlabel('Danos Totais (\'000 US$)')
plt.ylabel('Tipo de Desastre')
plt.tight_layout()
plt.savefig('../dataviz/danos_financeiros.png')
plt.close()



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=df_danos.values, y=df_danos.index, palette="rocket")


In [25]:
df_model["Country"].unique()

array(['Djibouti', 'Sudan', 'Somalia', 'Angola', 'Bangladesh', 'Brazil',
       'China', 'Egypt', 'Guatemala', 'Iran (Islamic Republic of)',
       'Indonesia', 'Mozambique', 'Nigeria', 'Malawi', 'Peru',
       'South Africa', 'India', 'United States of America', 'Norway',
       'Côte d’Ivoire', 'Romania', 'Bulgaria', 'Libya', 'Switzerland',
       'Réunion', 'Philippines', 'Spain', 'Pakistan', 'Germany',
       'Ethiopia', 'Morocco', 'Botswana', 'Russian Federation',
       'Mongolia', 'Congo', 'Afghanistan', 'United Republic of Tanzania',
       'Lesotho', 'Kenya', 'Uganda', 'Viet Nam', 'Italy', 'Thailand',
       'Bosnia and Herzegovina', 'Iceland', 'Namibia', 'Colombia',
       'Zimbabwe', 'Israel', 'Jordan', 'Eswatini', 'Australia', 'Zambia',
       'Paraguay', 'Madagascar', 'Tuvalu', 'Argentina',
       'Democratic Republic of the Congo', 'Hungary',
       'Bolivia (Plurinational State of)', 'Chad', 'Sierra Leone',
       'Austria', 'Niger', 'Malaysia', 'Cameroon', 'Sri Lanka', 

In [26]:
df_model["Disaster Type"].unique()

array(['Drought', 'Road', 'Flood', 'Extreme temperature',
       'Fire (Miscellaneous)', 'Volcanic activity', 'Storm', 'Wildfire',
       'Earthquake', 'Rail', 'Air', 'Collapse (Industrial)',
       'Collapse (Miscellaneous)', 'Fire (Industrial)',
       'Explosion (Miscellaneous)', 'Epidemic', 'Water',
       'Mass movement (wet)', 'Explosion (Industrial)', 'Chemical spill',
       'Gas leak', 'Infestation', 'Miscellaneous accident (General)',
       'Poisoning', 'Mass movement (dry)',
       'Industrial accident (General)', 'Radiation', 'Oil spill',
       'Impact', 'Animal incident', 'Glacial lake outburst flood'],
      dtype=object)

In [27]:
df_model["Start Year"].min()
df_model["Start Year"].max()

np.int64(2025)

In [28]:
df_model[df_model["Magnitude"] > 0]

Unnamed: 0,Disaster Type,Start Year,Total Deaths,No. Injured,Total Affected,Total Damage ('000 US$),Magnitude,Country,Nivel de Risco
4,Flood,2000,31.0,0.0,70000.0,10000.0,47000.0,Angola,Muito Alto
5,Extreme temperature,2000,49.0,0.0,0.0,0.0,6.4,Bangladesh,Médio
11,Storm,2000,3.0,0.0,5500.0,0.0,120.0,Iran (Islamic Republic of),Alto
14,Flood,2000,800.0,0.0,4500000.0,419200.0,1980.0,Mozambique,Muito Alto
20,Flood,2000,26.0,0.0,70000.0,0.0,350000.0,Brazil,Muito Alto
...,...,...,...,...,...,...,...,...,...
16188,Earthquake,2025,3804.0,4824.0,287614.0,0.0,7.7,Myanmar,Muito Alto
16189,Earthquake,2025,54.0,38.0,2351.0,0.0,7.7,Thailand,Alto
16200,Earthquake,2025,1.0,0.0,145.0,0.0,5.8,Tajikistan,Médio
16215,Earthquake,2025,0.0,236.0,236.0,0.0,6.2,Türkiye,Médio


In [29]:
df_model["Country"].unique()

array(['Djibouti', 'Sudan', 'Somalia', 'Angola', 'Bangladesh', 'Brazil',
       'China', 'Egypt', 'Guatemala', 'Iran (Islamic Republic of)',
       'Indonesia', 'Mozambique', 'Nigeria', 'Malawi', 'Peru',
       'South Africa', 'India', 'United States of America', 'Norway',
       'Côte d’Ivoire', 'Romania', 'Bulgaria', 'Libya', 'Switzerland',
       'Réunion', 'Philippines', 'Spain', 'Pakistan', 'Germany',
       'Ethiopia', 'Morocco', 'Botswana', 'Russian Federation',
       'Mongolia', 'Congo', 'Afghanistan', 'United Republic of Tanzania',
       'Lesotho', 'Kenya', 'Uganda', 'Viet Nam', 'Italy', 'Thailand',
       'Bosnia and Herzegovina', 'Iceland', 'Namibia', 'Colombia',
       'Zimbabwe', 'Israel', 'Jordan', 'Eswatini', 'Australia', 'Zambia',
       'Paraguay', 'Madagascar', 'Tuvalu', 'Argentina',
       'Democratic Republic of the Congo', 'Hungary',
       'Bolivia (Plurinational State of)', 'Chad', 'Sierra Leone',
       'Austria', 'Niger', 'Malaysia', 'Cameroon', 'Sri Lanka', 