In [2]:
import pandas as pd

In [3]:
# Load the source events CSV into the working DataFrame.
df = pd.read_csv(filepath_or_buffer="../base-dados/eventos-climaticos.csv")

In [4]:
# Keep only the essential columns; .copy() gives an independent frame so that
# adding columns later does not touch `df`.
df_simplificado = df.loc[:, [
    "Disaster Type",
    "Start Year",
    "Total Deaths",
    "No. Injured",
    "Total Affected",
    "Total Damage ('000 US$)",
    "Magnitude",
    "Country",
]].copy()

# Build the 'Nivel de Risco' (risk level) label from simple weighted rules
def classificar_risco(row):
    """Classify a single event record into a risk-level label.

    A weighted impact score is accumulated from the loss fields of the
    record; fields that are null are skipped. The score is then bucketed
    into one of five labels.

    Args:
        row: Record indexable by 'Total Deaths', 'No. Injured',
            'Total Affected' and "Total Damage ('000 US$)" (for example a
            DataFrame row passed by ``DataFrame.apply(..., axis=1)``).

    Returns:
        str: 'Muito Baixo' when the score is exactly 0, 'Baixo' when it is
        at most 100, 'Médio' at most 1000, 'Alto' at most 10000, and
        'Muito Alto' otherwise.
    """
    # (column, weight) pairs, applied in this order.
    pesos = (
        ('Total Deaths', 5),
        ('No. Injured', 2),
        ('Total Affected', 1),
        ("Total Damage ('000 US$)", 0.1),
    )

    impacto = 0
    for coluna, peso in pesos:
        valor = row[coluna]
        if pd.notnull(valor):
            impacto += valor * peso

    # A score of exactly zero gets its own bucket, checked before the
    # inclusive upper bounds below.
    if impacto == 0:
        return 'Muito Baixo'

    # Inclusive upper bounds, checked in ascending order.
    for limite, nivel in ((100, 'Baixo'), (1000, 'Médio'), (10000, 'Alto')):
        if impacto <= limite:
            return nivel
    return 'Muito Alto'

# Label every record by applying the classifier row by row.
df_simplificado["Nivel de Risco"] = df_simplificado.apply(
    classificar_risco, axis="columns"
)

# Preview the first rows of the labelled table
df_simplificado.head(n=5)


Unnamed: 0,Disaster Type,Start Year,Total Deaths,No. Injured,Total Affected,Total Damage ('000 US$),Magnitude,Country,Nivel de Risco
0,Drought,2001,,,100000.0,,,Djibouti,Muito Alto
1,Drought,2000,,,2000000.0,,,Sudan,Muito Alto
2,Drought,2000,21.0,,1200000.0,,,Somalia,Muito Alto
3,Road,2000,14.0,11.0,11.0,,,Angola,Médio
4,Flood,2000,31.0,,70000.0,10000.0,47000.0,Angola,Muito Alto


In [5]:
# Persist the labelled table. The DataFrame index is written as well (pandas
# default), so the file carries an extra unnamed leading column.
df_simplificado.to_csv(path_or_buf="../base-dados/eventos-climaticos-limpos.csv")

In [6]:
# Numeric columns whose missing values are replaced by 0 below.
numerical_cols = [
    "Start Year",
    "Total Deaths",
    "No. Injured",
    "Total Affected",
    "Total Damage ('000 US$)",
    "Magnitude",
]

# Reload the labelled table from disk and zero-fill the numeric gaps;
# non-numeric columns are left as they are.
df_model = pd.read_csv(filepath_or_buffer="../base-dados/eventos-climaticos-limpos.csv")
df_model = df_model.fillna(value={coluna: 0 for coluna in numerical_cols})

df_model.head()

Unnamed: 0.1,Unnamed: 0,Disaster Type,Start Year,Total Deaths,No. Injured,Total Affected,Total Damage ('000 US$),Magnitude,Country,Nivel de Risco
0,0,Drought,2001,0.0,0.0,100000.0,0.0,0.0,Djibouti,Muito Alto
1,1,Drought,2000,0.0,0.0,2000000.0,0.0,0.0,Sudan,Muito Alto
2,2,Drought,2000,21.0,0.0,1200000.0,0.0,0.0,Somalia,Muito Alto
3,3,Road,2000,14.0,11.0,11.0,0.0,0.0,Angola,Médio
4,4,Flood,2000,31.0,0.0,70000.0,10000.0,47000.0,Angola,Muito Alto


In [7]:
# Remove the leftover index column created by the CSV round-trip.
# errors="ignore" makes this cell safe to re-run: because it rebinds
# `df_model`, a second execution would otherwise raise KeyError once the
# column is already gone.
df_model = df_model.drop(columns="Unnamed: 0", errors="ignore")

In [8]:
# Display the prepared table (the rendered output is truncated by pandas).
df_model

Unnamed: 0,Disaster Type,Start Year,Total Deaths,No. Injured,Total Affected,Total Damage ('000 US$),Magnitude,Country,Nivel de Risco
0,Drought,2001,0.0,0.0,100000.0,0.0,0.0,Djibouti,Muito Alto
1,Drought,2000,0.0,0.0,2000000.0,0.0,0.0,Sudan,Muito Alto
2,Drought,2000,21.0,0.0,1200000.0,0.0,0.0,Somalia,Muito Alto
3,Road,2000,14.0,11.0,11.0,0.0,0.0,Angola,Médio
4,Flood,2000,31.0,0.0,70000.0,10000.0,47000.0,Angola,Muito Alto
...,...,...,...,...,...,...,...,...,...
16237,Road,2025,21.0,0.0,0.0,0.0,0.0,Mexico,Médio
16238,Miscellaneous accident (General),2025,21.0,0.0,0.0,0.0,0.0,India,Médio
16239,Storm,2025,28.0,108.0,15108.0,0.0,0.0,United States of America,Muito Alto
16240,Drought,2025,0.0,0.0,4400000.0,0.0,0.0,Somalia,Muito Alto


In [9]:
# Print column dtypes and non-null counts for the prepared table.
df_model.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16242 entries, 0 to 16241
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Disaster Type            16242 non-null  object 
 1   Start Year               16242 non-null  int64  
 2   Total Deaths             16242 non-null  float64
 3   No. Injured              16242 non-null  float64
 4   Total Affected           16242 non-null  float64
 5   Total Damage ('000 US$)  16242 non-null  float64
 6   Magnitude                16242 non-null  float64
 7   Country                  16242 non-null  object 
 8   Nivel de Risco           16242 non-null  object 
dtypes: float64(5), int64(1), object(3)
memory usage: 1.1+ MB


In [10]:
# Features: every column except the label, with the two categorical columns
# expanded into one-hot indicator columns.
X = pd.get_dummies(
    df_model.drop(columns="Nivel de Risco"),
    columns=["Disaster Type", "Country"],
)
# Target: the risk-level label.
y = df_model["Nivel de Risco"]

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [11]:
# Hold out 25% of the rows for evaluation (the library default, made
# explicit). The fixed seed makes the split, and therefore the reported
# accuracy, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [12]:
# Fit a decision tree on the training split. The fixed seed makes tree
# construction deterministic, so the fitted model can be reproduced.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

In [13]:
# Predict the held-out rows and compute the fraction labelled correctly.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

In [14]:
# Show the held-out accuracy.
# NOTE(review): the label is computed by classificar_risco directly from
# 'Total Deaths', 'No. Injured', 'Total Affected' and "Total Damage ('000 US$)",
# all of which remain in X, so a near-perfect score is expected here and says
# little about generalization.
accuracy

0.9913814331445456

In [15]:
# Serialize the fitted classifier to the app folder.
# NOTE(review): only the estimator object is saved; the one-hot column layout
# of X is not stored with it. Confirm the consumer rebuilds the same columns
# in the same order before calling predict.
pd.to_pickle(clf, "../app/modelo.pkl")