<a href="https://colab.research.google.com/github/Alilson2/Projeto_IA/blob/main/ProjetoIA_ver_David.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Se precisar gerar os dados execute desde o começo, se não precisar basta executar a partir da célula que carrega o arquivo teste.csv

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import xarray as xr # Ler arquivos netcdf
from google.colab import drive
import seaborn as sns

!pip install cartopy
import cartopy

import cartopy.crs as ccrs # Escolha do sistema de coordenadas
from cartopy.mpl.gridliner import LONGITUDE_FORMATTER, LATITUDE_FORMATTER
import cartopy.feature as cfeature
import matplotlib.patches as mpatches # Desenhar geometria em um mapa

In [None]:
import os
import xarray as xr

# --- Clonar repositório se não existir ---
if not os.path.exists("Projeto_IA"):
    !git clone https://github.com/Alilson2/Projeto_IA.git
else:
    print("📁 Repositório 'Projeto_IA' já existe — pulando o clone.")

# --- Verificar se a pasta foi criada ---
if not os.path.exists("Projeto_IA"):
    raise FileNotFoundError("❌ A pasta 'Projeto_IA' não foi encontrada. O clone pode ter falhado.")
else:
    print("\n✅ Repositório clonado com sucesso!\n")
    print("Arquivos dentro da pasta Projeto_IA:\n", os.listdir("Projeto_IA"))

# --- Localizar arquivos .nc ---
arquivos_nc = [f for f in os.listdir("Projeto_IA") if f.endswith(".nc")]
if not arquivos_nc:
    raise FileNotFoundError("❌ Nenhum arquivo .nc encontrado na pasta Projeto_IA!")
else:
    print("\n📂 Arquivo(s) NetCDF encontrado(s):")
    for f in arquivos_nc:
        print(" -", f)

# --- Montar lista de caminhos ---
ARQUIVO = [os.path.join("Projeto_IA", f) for f in arquivos_nc]

# --- Função para corrigir longitude ---
def corrigir_longitude(ds):
    for coord in ["longitude", "lon"]:
        if coord in ds.coords:
            ds = ds.assign_coords({coord: ((ds[coord] + 180) % 360) - 180})
            ds = ds.sortby(coord)
    return ds

# --- Abrir arquivos com segurança (nova sintaxe) ---
try:
    dados = xr.open_mfdataset(
        ARQUIVO,
        combine='by_coords',
        parallel=True,           # usa múltiplos núcleos
        preprocess=corrigir_longitude,
        combine_attrs='override' # 🟢 substitui o antigo compat='override'
    )
except ValueError as e:
    print("\n⚠️ Erro na combinação — tentando modo 'nested' (concat por tempo)...")
    dados = xr.open_mfdataset(
        ARQUIVO,
        combine='nested',
        concat_dim='valid_time',  # ajuste se sua dimensão temporal tiver outro nome
        parallel=True,
        preprocess=corrigir_longitude,
        combine_attrs='override'
    )

print("\n✅ Dataset carregado com sucesso!\n")
dados

In [None]:
df = dados.to_dataframe()
df = df.dropna()

In [None]:
df

In [None]:

# --- 2️⃣ Verificar se há a coordenada temporal ---
if "valid_time" not in dados.coords:
    raise ValueError("❌ O dataset não contém uma coordenada temporal chamada 'valid_time'.")

# --- 3️⃣ Converter o eixo temporal para pandas.DatetimeIndex ---
tempo = pd.to_datetime(dados["valid_time"].values)

# --- 4️⃣ Criar DataFrame com componentes temporais ---
df_tempo = pd.DataFrame({
    "timestamp": tempo,
    "timestamp_segundos": tempo.view("int64"),   # segundos desde 1970
    "ano": tempo.year,
    "mes": tempo.month,
    "dia": tempo.day,
    "hora": tempo.hour,
    "minuto": tempo.minute,
    "segundo": tempo.second,
    "dia_semana": tempo.dayofweek,
    "dia_do_ano": tempo.dayofyear
})

#print(df_tempo.head())

# Exemplo: seleciona uma variável e um período
# Sort the dataset by valid_time before slicing
dados_sorted = dados.sortby('valid_time')
dados_filtrado = dados_sorted

# Converte para pandas sem estourar RAM
df_panda = dados_filtrado.to_dataframe().reset_index()
df_panda = df_panda.dropna()

# Junta com df_tempo
df_final = pd.merge(
    df_panda,
    df_tempo,
    left_on='valid_time',
    right_on='timestamp',
    how='left'
)

df_panda

In [None]:
print(np.unique(df_panda['latitude'].values))
print(np.unique(df_panda['longitude'].values))
print(len(np.unique(df_panda['latitude'].values)))
print(len(np.unique(df_panda['longitude'].values)))

In [None]:
tempo = np.unique(df_panda['valid_time'].values)
tempo

In [None]:
from tqdm import tqdm

df_panda.keys()

In [None]:
import numpy as np
import pandas as pd

def calcula_vapor_umidade(data):
    # Temperatura e ponto de orvalho em Celsius
    t2m_celsius = data["t2m"] - 273.15
    d2m_celsius = data["d2m"] - 273.15

    # 1) Pressão de vapor de saturação (es) em Pa
    es = 610.94 * np.exp(17.625 * t2m_celsius / (243.04 + t2m_celsius))

    # 2) Pressão de vapor real (ev) em Pa
    ev = 610.94 * np.exp(17.625 * d2m_celsius / (243.04 + d2m_celsius))

    # 3) Umidade Relativa (%)
    RH = 100 * ev / es

    # 4) Déficit de Pressão de Vapor (kPa)
    VPD = (es - ev) / 1000

    # Retorna como DataFrame para fácil concatenação
    return pd.DataFrame({
        "es": es,
        "ev": ev,
        "RH": RH,
        "VPD": VPD
    })

# --- Aplicar ao DataFrame original ---
resultados = calcula_vapor_umidade(df_panda)

# Adiciona novas colunas ao df_panda
df_panda = pd.concat([df_panda.reset_index(drop=True), resultados.reset_index(drop=True)], axis=1)


In [None]:
df_panda

In [None]:
mask = df_panda['valid_time'].values == tempo[13]
df_panda['d2m'].values[mask]

In [None]:
#RODE APENAS SE NÃO TIVER OS DADOS PRONTOS
from tqdm import tqdm
import numpy as np
import pandas as pd

def criar_dados(df_panda):
  data = []

  # Lista de variáveis a processar
  variaveis = ['d2m', 't2m', 'u10', 'v10', 'slhf', 'sshf', 'ssrd', 'sp', 'e', 'tp', 'es', 'ev', 'RH', 'VPD']

  for t in tqdm(tempo, desc="Processando tempos"):
      # Filtra as linhas com o tempo atual
      mask = df_panda['valid_time'].values == t

      # Dicionário para armazenar os resultados deste tempo
      stats = {'tempo': t}

      for var in variaveis:
          valores = df_panda[var].values[mask]

          # Se for precipitação ou evaporação, somar; caso contrário, calcular média/mín/máx
          if var in ['e', 'tp']:
              stats[f'{var}_sum'] = np.sum(valores)
          else:
              stats[f'{var}_mean'] = np.mean(valores)
              stats[f'{var}_min'] = np.min(valores)
              stats[f'{var}_max'] = np.max(valores)

      data.append(stats)

  # Converte para DataFrame
  df_resumo = pd.DataFrame(data)
  return df_resumo

In [None]:
#RODE APENAS SE NÃO TIVER OS DADOS PRONTOS
#df_resumo = criar_dados(df_panda)
#df_resumo.to_csv('teste.csv', index=False, encoding='utf-8')

In [48]:
url = 'https://raw.githubusercontent.com/Alilson2/Projeto_IA/main/teste.csv'
df_resumo = pd.read_csv(url)

In [49]:
df_resumo

Unnamed: 0,tempo,d2m_mean,d2m_min,d2m_max,t2m_mean,t2m_min,t2m_max,u10_mean,u10_min,u10_max,...,ev_mean,ev_min,ev_max,RH_mean,RH_min,RH_max,VPD_mean,VPD_min,VPD_max,data
0,2020-01-01 00:00:00,294.35715,290.97913,296.40527,296.00278,294.86487,298.34375,-1.671670,-2.459788,-0.666270,...,2520.350811,2037.708369,2847.425769,90.793529,64.118763,98.381075,0.260865,0.043199,1.147642,2020-01-01
1,2020-01-01 01:00:00,294.31824,292.10450,296.99230,295.47670,294.04517,297.61250,-1.653717,-2.406607,-0.626455,...,2513.622883,2186.621767,2949.978385,93.364740,72.843492,99.174344,0.181176,0.020972,0.825427,2020-01-01
2,2020-01-01 02:00:00,294.07920,292.05470,296.87305,295.05322,293.46850,297.44598,-1.352155,-2.164139,-0.337479,...,2476.886533,2179.834825,2928.885907,94.335643,78.891616,99.628635,0.149498,0.009513,0.613241,2020-01-01
3,2020-01-01 03:00:00,293.72958,291.85193,296.77880,294.66718,292.97913,297.28674,-0.911181,-1.818641,0.163568,...,2424.424361,2152.395124,2912.312676,94.491535,82.287300,99.735729,0.140987,0.006596,0.484505,2020-01-01
4,2020-01-01 04:00:00,293.31630,291.23560,296.54980,294.25015,292.60730,297.04760,-0.436056,-1.570677,0.646486,...,2364.066126,2070.838234,2872.380601,94.464307,86.122503,99.761469,0.136942,0.006171,0.368936,2020-01-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40915,2024-12-31 19:00:00,293.14930,291.33215,296.68494,296.53550,294.87085,298.68298,-1.418195,-3.344849,-0.476501,...,2339.956264,2083.433215,2895.885534,81.531515,69.042965,93.255137,0.533534,0.178391,0.981484,2024-12-31
40916,2024-12-31 20:00:00,292.78330,291.22803,296.32056,296.05594,294.44482,298.54200,-1.870405,-3.758713,-0.823227,...,2285.904956,2069.853848,2832.886955,81.975957,70.452359,91.763051,0.506006,0.237884,0.923120,2024-12-31
40917,2024-12-31 21:00:00,292.50880,290.82605,295.66480,295.75460,294.49835,298.20270,-2.341373,-4.136810,-1.276398,...,2246.497457,2018.157133,2722.529167,81.984778,73.628952,89.547855,0.494364,0.297049,0.800050,2024-12-31
40918,2024-12-31 22:00:00,292.40010,291.03040,295.14220,295.11040,293.89362,297.56177,-2.170078,-3.658921,-1.319138,...,2230.803441,2044.293711,2637.298946,84.657122,77.262895,90.000381,0.404759,0.261509,0.649820,2024-12-31


In [50]:
df_resumo['tempo'] = pd.to_datetime(df_resumo['tempo'])
df_resumo['data'] = df_resumo['tempo'].dt.date
df_diario = (
    df_resumo
    .groupby('data')
    .agg({
        'd2m_mean': 'mean',
        'd2m_min': 'min',
        'd2m_max': 'max',
        't2m_mean': 'mean',
        't2m_min': 'min',
        't2m_max': 'max',
        'u10_mean': 'mean',
        'u10_min': 'min',
        'u10_max': 'max',
        'v10_mean': 'mean',
        'v10_min': 'min',
        'v10_max': 'max',
        'slhf_mean': 'mean',
        'sshf_mean': 'mean',
        'ssrd_mean': 'mean',
        'sp_mean': 'mean',
        'e_sum': 'sum',
        'tp_sum': 'sum',
        'es_mean': 'mean',
        'ev_mean': 'mean',
        'RH_mean': 'mean',
        'VPD_mean': 'mean',
        'es_min': 'min',
        'es_max': 'max',
        'ev_min': 'min',
        'ev_max': 'max',
        'RH_min': 'min',
        'RH_max': 'max',
        'VPD_min': 'min',
        'VPD_max': 'max'
    })
    .reset_index()
)
df_diario

Unnamed: 0,data,d2m_mean,d2m_min,d2m_max,t2m_mean,t2m_min,t2m_max,u10_mean,u10_min,u10_max,...,RH_mean,VPD_mean,es_min,es_max,ev_min,ev_max,RH_min,RH_max,VPD_min,VPD_max
0,2020-01-01,292.666795,287.50293,300.11768,298.356312,291.25793,305.37952,0.457941,-3.303340,3.354943,...,73.248825,0.988804,2073.746157,4810.687691,1632.416453,3552.257718,36.539641,99.761469,0.006171,2.871634
1,2020-01-02,293.406096,289.85010,299.29200,296.250915,292.52783,301.23670,0.799427,-1.667494,3.300402,...,84.489326,0.456947,2245.063660,3792.636542,1897.334838,3383.532998,61.348478,99.467196,0.012338,1.377248
2,2020-01-03,292.296292,290.12110,296.03943,294.345430,291.69934,299.01294,0.123197,-1.924273,3.471870,...,88.564353,0.303775,2131.945548,3328.116254,1930.229150,2785.105443,66.349927,99.955850,0.001040,1.110049
3,2020-01-04,291.970090,289.90393,296.24402,293.869149,290.25757,298.85193,-0.046169,-1.935276,2.982586,...,89.195695,0.279918,1946.983749,3296.503784,1903.829757,2819.807683,66.990279,99.653280,0.007672,1.088167
4,2020-01-05,292.203271,289.97754,295.42230,294.359055,290.07240,300.24730,-0.298716,-2.821115,2.675707,...,88.213359,0.336362,1924.280393,3579.405823,1912.742182,2682.684612,68.176722,99.899298,0.002168,1.133866
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1700,2024-12-27,293.573809,291.73694,297.27440,294.762819,292.62744,299.18200,0.843993,-1.466258,2.657155,...,93.104706,0.185341,2259.012237,3361.595559,2136.968422,3000.400694,78.312507,99.979259,0.000492,0.708451
1701,2024-12-28,293.463451,291.09558,299.76220,295.803074,292.18982,301.21270,-0.034025,-2.530714,2.490721,...,87.274697,0.382776,2198.292509,3787.338552,2052.693486,3478.738771,62.492942,99.234654,0.018421,1.321146
1702,2024-12-29,292.879512,290.00660,298.67175,294.771741,290.07495,300.46387,-1.349646,-4.243759,0.127316,...,89.542975,0.297529,1924.593083,3625.159429,1916.269933,3261.440122,63.196452,99.822453,0.003849,1.298997
1703,2024-12-30,292.186342,290.18005,296.72522,294.175350,290.29907,299.28467,-1.247885,-4.130829,1.023910,...,88.843421,0.300020,1952.104337,3382.068245,1937.451890,2902.924929,67.430386,99.527032,0.009406,1.026971


In [51]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error, r2_score, PredictionErrorDisplay
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier


In [52]:
df_diario['tp_sum'] = df_diario['tp_sum'].shift(-1) #determina previsão de chuva no dia seguinte IMPORTANTE: rodar apenas uma unica vez
df_diario = df_diario.dropna()

In [53]:
#X = df_diario.drop(['data', 'tp_sum'], axis=1)
#y = df_diario['tp_sum']

ind_sem_chuva = np.where(df_diario['tp_sum']<=0.01)[0]
ind_com_chuva = np.where(df_diario['tp_sum']>0.01)[0]

scaler = StandardScaler()
# Drop the 'data' column as it's not numerical
X_scaled = scaler.fit_transform(df_diario.drop('data', axis=1))
nomes = df_diario.drop('data', axis=1).columns
df_scaled = pd.DataFrame(X_scaled, columns=nomes)

df_diario_sem_chuva = df_scaled.iloc[ind_sem_chuva]
df_diario_com_chuva = df_scaled.iloc[ind_com_chuva]

df_diario_sem_chuva['tp_sum'] = 0
df_diario_com_chuva['tp_sum'] = 1

X_sem_chuva = df_diario_sem_chuva.drop(['tp_sum'], axis=1)
y_sem_chuva = df_diario_sem_chuva['tp_sum']

X_com_chuva = df_diario_com_chuva.drop(['tp_sum'], axis=1)
y_com_chuva = df_diario_com_chuva['tp_sum']

X_train_sem_chuva, X_test_sem_chuva, y_train_sem_chuva, y_test_sem_chuva = train_test_split(X_sem_chuva, y_sem_chuva, test_size=0.2, random_state=42)
X_train_com_chuva, X_test_com_chuva, y_train_com_chuva, y_test_com_chuva = train_test_split(X_com_chuva, y_com_chuva, test_size=0.2, random_state=42)

X_train = pd.concat([X_train_sem_chuva, X_train_com_chuva])
X_test = pd.concat([X_test_sem_chuva, X_test_com_chuva])
y_train = pd.concat([y_train_sem_chuva, y_train_com_chuva])
y_test = pd.concat([y_test_sem_chuva, y_test_com_chuva])

#

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_diario_sem_chuva['tp_sum'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_diario_com_chuva['tp_sum'] = 1


In [54]:
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [55]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print("Acurácia:", accuracy_score(y_test, y_pred))
print("\nMatriz de confusão:\n", confusion_matrix(y_test, y_pred))
print("\nRelatório de classificação:\n", classification_report(y_test, y_pred))

Acurácia: 0.9237536656891495

Matriz de confusão:
 [[ 16  19]
 [  7 299]]

Relatório de classificação:
               precision    recall  f1-score   support

           0       0.70      0.46      0.55        35
           1       0.94      0.98      0.96       306

    accuracy                           0.92       341
   macro avg       0.82      0.72      0.76       341
weighted avg       0.92      0.92      0.92       341

