In [126]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.tree import DecisionTreeClassifier
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample
from sklearn.neighbors import LocalOutlierFactor

In [127]:
data = pd.read_csv('data.csv', delimiter=';')

In [128]:
data.drop('Radiacao Global', axis=1, inplace=True)
data.drop('Cidade', axis=1, inplace=True)

In [129]:
print(data.shape[0])
data = data.dropna()
data.reset_index(drop=True, inplace=True)
print(data.shape[0])
print(data.isna().sum())

248121
200023
Precipitacao Total            0
Vai Chover Amanha             0
Pressao Media                 0
Pressao Maxima                0
Pressao Minima                0
Temperatura Media             0
Temperatura Orvalho Media     0
Temperatura Maxima            0
Temperatura Minima            0
Temperatura Orvalho Maxima    0
Temperatura Orvalho Minima    0
Umidade Maxima                0
Umidade Minima                0
Umidade Media                 0
Direcao Vento                 0
Rajada Maxima de Vento        0
Vento Velocidade Media        0
Codigo                        0
Latitude                      0
Longitude                     0
Data                          0
dtype: int64


In [130]:
data['Data'] = pd.to_datetime(data['Data'])
data['Ano'] = data['Data'].dt.year
data['Mes'] = data['Data'].dt.month
data['Dia'] = data['Data'].dt.day
data.drop('Data', axis=1, inplace=True)


In [131]:
encoder = OneHotEncoder(sparse=False)
encoded_data = encoder.fit_transform(data[['Codigo']])
encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(['Codigo']))
data = pd.concat([data.drop(['Codigo'], axis=1), encoded_df], axis=1)

In [132]:
scaler = RobustScaler()
data[['Pressao Maxima','Pressao Maxima','Temperatura Maxima','Temperatura Minima','Temperatura Orvalho Maxima','Temperatura Orvalho Minima','Umidade Minima',
      'Umidade Maxima','Precipitacao Total', 'Pressao Media', 'Temperatura Media', 'Temperatura Orvalho Media', 'Umidade Media', 'Direcao Vento',
      'Rajada Maxima de Vento','Vento Velocidade Media', 'Latitude', 'Longitude', 'Ano', 'Mes', 'Dia']] = scaler.fit_transform(
          data[['Pressao Maxima','Pressao Maxima','Temperatura Maxima','Temperatura Minima','Temperatura Orvalho Maxima','Temperatura Orvalho Minima',
                'Umidade Minima', 'Umidade Maxima','Precipitacao Total', 'Pressao Media', 'Temperatura Media', 'Temperatura Orvalho Media', 'Umidade Media', 
                'Direcao Vento', 'Rajada Maxima de Vento', 'Vento Velocidade Media', 'Latitude', 'Longitude', 'Ano', 'Mes', 'Dia']])

In [133]:
X = data.drop('Vai Chover Amanha', axis=1)
y = data['Vai Chover Amanha']

In [134]:
lof = LocalOutlierFactor(contamination=0.1)
yhat = lof.fit_predict(X)
mask = yhat != -1
X = X[mask]
y = y[mask]

In [135]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [136]:
train_data = pd.concat([X_train, y_train], axis=1)

majority_class_data = train_data[train_data['Vai Chover Amanha'] == 'Nao']
minority_class_data = train_data[train_data['Vai Chover Amanha'] == 'Sim']
upsampled_miNaority_class = resample(minority_class_data, replace=True, n_samples=len(majority_class_data))
train_data = pd.concat([majority_class_data, upsampled_miNaority_class], axis=0)

X_train = train_data.drop('Vai Chover Amanha', axis=1)
y_train = train_data['Vai Chover Amanha']

print(train_data['Vai Chover Amanha'].value_counts())

Nao    104086
Sim    104086
Name: Vai Chover Amanha, dtype: int64


In [137]:
def get_mlp(data_x, data_y):
    mlp = MLPClassifier(hidden_layer_sizes=(50,100,100, 50), activation='relu', max_iter=50, random_state=42)
    mlp.fit(data_x, data_y)
    return mlp

In [144]:
def get_dt(data_x, data_y):
    dt = DecisionTreeClassifier(splitter='best', max_depth=None, criterion='gini')
    dt.fit(data_x, data_y)
    return dt

In [139]:
def get_rf(data_x, data_y):
    rf = RandomForestClassifier(class_weight='balanced', random_state=42)
    rf.fit(X_train, y_train)
    return rf

In [140]:
def plot_result_scikit(model, data_x, data_y):
    print("plot_result_scikit")
    y_pred = model.predict(data_x)
    accuracy = accuracy_score(data_y, y_pred),
    #mse = mean_squared_error(y_test, y_pred)
    FP = np.sum((y_pred == 'Sim') & (data_y == 'Nao'))
    FN = np.sum((y_pred == 'Nao') & (data_y == 'Sim'))
    VP = np.sum((y_pred == 'Sim') & (data_y == 'Sim'))
    VN = np.sum((y_pred == 'Nao') & (data_y == 'Nao'))

    print(f"Accuracy: {accuracy} | FP:{FP} | FN:{FN} | VP:{VP} | VN:{VN}")

In [145]:
scikit_model = get_dt(X_train, y_train)
plot_result_scikit(scikit_model, X_train,y_train) 
plot_result_scikit(scikit_model, X_test,y_test)

plot_result_scikit
Accuracy: (1.0,) | FP:0 | FN:0 | VP:104086 | VN:104086
plot_result_scikit
Accuracy: (0.6855071658704588,) | FP:5601 | FN:5722 | VP:4131 | VN:20550
