In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.tree import DecisionTreeClassifier
import numpy as np
from sklearn.preprocessing import OneHotEncoder, PowerTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample
from sklearn.neighbors import LocalOutlierFactor
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator, TransformerMixin
#from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import SimpleImputer
from imblearn.pipeline import Pipeline
from imblearn import FunctionSampler
import seaborn as sns
from sklearn.cluster import DBSCAN
from sklearn.metrics import precision_recall_curve, auc

from sklearn.model_selection import cross_val_score
import optuna

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB, ComplementNB
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis
from sklearn.ensemble import ExtraTreesClassifier, BaggingClassifier
from sklearn.linear_model import SGDClassifier

from sklearn.metrics import precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, KMeansSMOTE, SVMSMOTE, ADASYN, RandomOverSampler
from sklearn.feature_selection import SelectKBest, f_classif, chi2, SelectPercentile, SelectFromModel, SequentialFeatureSelector, SelectFromModel
from sklearn.decomposition import PCA
from scipy import stats
from imblearn.base import SamplerMixin
from sklearn.ensemble import VotingClassifier

In [3]:
class ManualFeatureSelectorTransformer(BaseEstimator, TransformerMixin):
    """Drop the identifier columns 'Cidade' and 'Codigo' from a DataFrame.

    The transformer is stateless: ``fit`` learns nothing and ``transform``
    always removes the same two columns.
    """

    # Columns removed by ``transform``.
    COLUMNS_TO_DROP = ['Cidade', 'Codigo']

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (kept for pipeline compatibility)."""
        return self

    def transform(self, X):
        """Return a new frame without the identifier columns.

        The input frame is left untouched, so callers no longer need to pass
        a defensive copy.

        Raises:
            KeyError: if either column is missing from ``X``.
        """
        return X.drop(columns=self.COLUMNS_TO_DROP)

In [4]:
class ImputerTransformer(BaseEstimator, TransformerMixin):
    """Fill missing values: mean for numeric columns, most frequent value for object columns.

    The column groups are determined once, in ``fit``, and reused by
    ``transform`` so that both calls always operate on the same columns.
    """

    def __init__(self):
        self.imputer_numeric = SimpleImputer(strategy='mean')
        self.imputer_categorical = SimpleImputer(strategy='most_frequent')

    def fit(self, X, y=None):
        """Learn the fill values from ``X``.

        Args:
            X: DataFrame to learn from.
            y: ignored.

        Returns:
            self
        """
        # Remember which columns belong to each imputer.
        self.numeric_cols_ = list(X.select_dtypes(include=['number']).columns)
        self.categorical_cols_ = list(X.select_dtypes(include=['object']).columns)

        # An imputer cannot be fitted on zero columns, so skip empty groups.
        if self.numeric_cols_:
            self.imputer_numeric.fit(X[self.numeric_cols_])
        if self.categorical_cols_:
            self.imputer_categorical.fit(X[self.categorical_cols_])
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the fitted fill values applied.

        The input frame is not modified.
        """
        X = X.copy()
        if self.numeric_cols_:
            X[self.numeric_cols_] = self.imputer_numeric.transform(X[self.numeric_cols_])
        if self.categorical_cols_:
            X[self.categorical_cols_] = self.imputer_categorical.transform(X[self.categorical_cols_])
        return X

In [5]:
class DateTransformer(BaseEstimator, TransformerMixin):
    """Replace the 'Data' column with numeric 'Ano', 'Mes' and 'Dia' columns."""

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (the transformer is stateless)."""
        return self

    def transform(self, X):
        """Return a new frame where 'Data' is split into year, month and day.

        'Data' is parsed with ``pd.to_datetime`` and then dropped; the three
        derived columns are appended at the end. The input frame is not
        modified.

        Raises:
            KeyError: if ``X`` has no 'Data' column.
        """
        dates = pd.to_datetime(X['Data'])
        return X.drop(columns='Data').assign(
            Ano=dates.dt.year,
            Mes=dates.dt.month,
            Dia=dates.dt.day,
        )

In [6]:
class RobustScalerTransformer(BaseEstimator, TransformerMixin):
    """Robust-scale the weather features, with a wider quantile range for rainfall.

    All columns in ``FEATURE_COLUMNS`` share one ``RobustScaler`` with default
    settings. 'Precipitacao Total' gets its own scaler with
    ``quantile_range=(6.5, 93.5)``. The frame must already contain 'Ano',
    'Mes' and 'Dia', so this step has to run after the date has been split.
    """

    # Columns scaled together by the default RobustScaler (order matters:
    # fit and transform must see the columns in the same order).
    FEATURE_COLUMNS = [
        'Pressao Maxima', 'Pressao Minima', 'Temperatura Maxima', 'Temperatura Minima',
        'Temperatura Orvalho Maxima', 'Temperatura Orvalho Minima',
        'Umidade Minima', 'Umidade Maxima', 'Pressao Media', 'Temperatura Media',
        'Temperatura Orvalho Media', 'Umidade Media',
        'Direcao Vento 0H', 'Direcao Vento 6H', 'Direcao Vento 12H', 'Direcao Vento 18H',
        'Rajada Maxima de Vento 0H', 'Rajada Maxima de Vento 6H',
        'Rajada Maxima de Vento 12H', 'Rajada Maxima de Vento 18H',
        'Vento Velocidade Media 0H', 'Vento Velocidade Media 6H',
        'Vento Velocidade Media 12H', 'Vento Velocidade Media 18H',
        'Latitude', 'Longitude', 'Ano', 'Mes', 'Dia', 'Radiacao Global', 'Altitude',
    ]
    # Column scaled separately.
    RAINFALL_COLUMN = 'Precipitacao Total'

    def __init__(self):
        self.rainfall_scaler = RobustScaler(quantile_range=(6.5, 93.5))
        self.scaler = RobustScaler()

    def fit(self, X, y=None):
        """Fit both scalers on ``X`` and return ``self``; ``y`` is ignored."""
        self.scaler.fit(X[self.FEATURE_COLUMNS])
        self.rainfall_scaler.fit(X[[self.RAINFALL_COLUMN]])
        return self

    def transform(self, X):
        """Return a scaled copy of ``X``; the input frame is not modified."""
        X = X.copy()
        X[self.FEATURE_COLUMNS] = self.scaler.transform(X[self.FEATURE_COLUMNS])
        # The scaler returns an (n, 1) array; flatten it for the single column.
        X[self.RAINFALL_COLUMN] = self.rainfall_scaler.transform(X[[self.RAINFALL_COLUMN]]).ravel()
        return X

In [7]:
class SimpleRobustScalerTransformer(BaseEstimator, TransformerMixin):
    """Robust-scale the weather features and rainfall with default settings.

    All columns in ``FEATURE_COLUMNS`` share one default ``RobustScaler``;
    'Precipitacao Total' is scaled by a second default ``RobustScaler``. The
    frame must already contain 'Ano', 'Mes' and 'Dia'.
    """

    # Columns scaled together (order matters: fit and transform must see the
    # columns in the same order).
    FEATURE_COLUMNS = [
        'Pressao Maxima', 'Pressao Minima', 'Temperatura Maxima', 'Temperatura Minima',
        'Temperatura Orvalho Maxima', 'Temperatura Orvalho Minima',
        'Umidade Minima', 'Umidade Maxima', 'Pressao Media', 'Temperatura Media',
        'Temperatura Orvalho Media', 'Umidade Media',
        'Direcao Vento 0H', 'Direcao Vento 6H', 'Direcao Vento 12H', 'Direcao Vento 18H',
        'Rajada Maxima de Vento 0H', 'Rajada Maxima de Vento 6H',
        'Rajada Maxima de Vento 12H', 'Rajada Maxima de Vento 18H',
        'Vento Velocidade Media 0H', 'Vento Velocidade Media 6H',
        'Vento Velocidade Media 12H', 'Vento Velocidade Media 18H',
        'Latitude', 'Longitude', 'Ano', 'Mes', 'Dia', 'Radiacao Global', 'Altitude',
    ]
    # Column scaled separately.
    RAINFALL_COLUMN = 'Precipitacao Total'

    def __init__(self):
        self.rainfall_scaler = RobustScaler()
        self.scaler = RobustScaler()

    def fit(self, X, y=None):
        """Fit both scalers on ``X`` and return ``self``; ``y`` is ignored."""
        self.scaler.fit(X[self.FEATURE_COLUMNS])
        self.rainfall_scaler.fit(X[[self.RAINFALL_COLUMN]])
        return self

    def transform(self, X):
        """Return a scaled copy of ``X``; the input frame is not modified."""
        X = X.copy()
        X[self.FEATURE_COLUMNS] = self.scaler.transform(X[self.FEATURE_COLUMNS])
        # The scaler returns an (n, 1) array; flatten it for the single column.
        X[self.RAINFALL_COLUMN] = self.rainfall_scaler.transform(X[[self.RAINFALL_COLUMN]]).ravel()
        return X

In [8]:
def handleOutliersZIndex(X, threshold = 3, columns = None):
    """Replace z-score outliers with the mean of the remaining values.

    For each column in ``columns`` the z-score is computed over the
    non-missing values; entries whose absolute z-score exceeds ``threshold``
    are overwritten with the mean of the non-outlying, non-missing values of
    that column. 'Precipitacao Total' is always processed as well, but zeros
    are additionally left out of both the z-score and the replacement mean.
    Missing values (and rainfall zeros) are never changed.

    Args:
        X: DataFrame holding the processed columns. It is modified in place.
        threshold: absolute z-score above which a value counts as an outlier.
        columns: columns to process besides 'Precipitacao Total'. Defaults to
            the built-in list of weather measurements.

    Returns:
        The same ``X`` object, after modification.
    """
    if columns is None:
        columns = ['Pressao Maxima','Pressao Minima','Temperatura Maxima','Temperatura Minima','Temperatura Orvalho Maxima','Temperatura Orvalho Minima',
                    'Umidade Minima', 'Umidade Maxima', 'Pressao Media', 'Temperatura Media', 'Temperatura Orvalho Media', 'Umidade Media',
                    'Direcao Vento 0H','Direcao Vento 6H','Direcao Vento 12H','Direcao Vento 18H', 'Rajada Maxima de Vento 0H', 'Rajada Maxima de Vento 6H',
                    'Rajada Maxima de Vento 12H', 'Rajada Maxima de Vento 18H', 'Vento Velocidade Media 0H', 'Vento Velocidade Media 6H', 'Vento Velocidade Media 12H',
                    'Vento Velocidade Media 18H', 'Radiacao Global', 'Altitude']

    def _replace_outliers(column, candidates):
        # `candidates` is a boolean array marking the rows that take part in
        # the z-score. Only that one column is extracted, not the whole frame.
        values = X.loc[candidates, column].to_numpy(dtype=float)
        if values.size == 0:
            return
        is_outlier = np.abs(stats.zscore(values)) > threshold
        if not is_outlier.any():
            return
        # Map outlier positions (relative to the candidates) back to frame rows.
        row_mask = np.zeros(len(X), dtype=bool)
        row_mask[np.flatnonzero(candidates)[is_outlier]] = True
        # The replacement is a float mean, so make sure the column can hold it.
        if not pd.api.types.is_float_dtype(X[column]):
            X[column] = X[column].astype(float)
        X.loc[row_mask, column] = values[~is_outlier].mean()

    for column in columns:
        _replace_outliers(column, X[column].notna().to_numpy())

    rainfall = X['Precipitacao Total']
    _replace_outliers('Precipitacao Total', ((rainfall != 0) & rainfall.notna()).to_numpy())

    return X

In [9]:
def handleOutliersSimpleZIndex(X, threshold = 3, columns = None):
    """Replace z-score outliers with the mean of the remaining values.

    Every column in ``columns`` is treated the same way: the z-score is
    computed over the non-missing values, and entries whose absolute z-score
    exceeds ``threshold`` are overwritten with the mean of the non-outlying,
    non-missing values of that column. Missing values are never changed.

    Args:
        X: DataFrame holding the processed columns. It is modified in place.
        threshold: absolute z-score above which a value counts as an outlier.
        columns: columns to process. Defaults to the built-in list of weather
            measurements, including 'Precipitacao Total'.

    Returns:
        The same ``X`` object, after modification.
    """
    if columns is None:
        columns = ['Pressao Maxima','Pressao Minima','Temperatura Maxima','Temperatura Minima','Temperatura Orvalho Maxima','Temperatura Orvalho Minima',
                    'Umidade Minima', 'Umidade Maxima', 'Pressao Media', 'Temperatura Media', 'Temperatura Orvalho Media', 'Umidade Media',
                    'Direcao Vento 0H','Direcao Vento 6H','Direcao Vento 12H','Direcao Vento 18H', 'Rajada Maxima de Vento 0H', 'Rajada Maxima de Vento 6H',
                    'Rajada Maxima de Vento 12H', 'Rajada Maxima de Vento 18H', 'Vento Velocidade Media 0H', 'Vento Velocidade Media 6H', 'Vento Velocidade Media 12H',
                    'Vento Velocidade Media 18H', 'Radiacao Global', 'Altitude', 'Precipitacao Total']

    for column in columns:
        present = X[column].notna().to_numpy()
        # Only this one column is extracted, not the whole frame.
        values = X.loc[present, column].to_numpy(dtype=float)
        if values.size == 0:
            continue
        is_outlier = np.abs(stats.zscore(values)) > threshold
        if not is_outlier.any():
            continue
        # Map outlier positions (relative to the present rows) back to frame rows.
        row_mask = np.zeros(len(X), dtype=bool)
        row_mask[np.flatnonzero(present)[is_outlier]] = True
        # The replacement is a float mean, so make sure the column can hold it.
        if not pd.api.types.is_float_dtype(X[column]):
            X[column] = X[column].astype(float)
        X.loc[row_mask, column] = values[~is_outlier].mean()

    return X

In [10]:
def get_mlp(data_x, data_y):
    """Fit and return an MLP classifier with hidden layers (10, 20, 20, 10).

    Uses ReLU activation, at most 250 iterations and a fixed seed.
    """
    classifier = MLPClassifier(
        hidden_layer_sizes=(10, 20, 20, 10),
        activation='relu',
        max_iter=250,
        random_state=42,
    )
    # fit() returns the fitted estimator itself.
    return classifier.fit(data_x, data_y)

In [11]:
def get_dt(data_x, data_y):
    """Fit and return a class-balanced, unlimited-depth decision tree (gini)."""
    # random_state is fixed, like in the other seeded builders, so repeated
    # runs produce the same tree.
    dt = DecisionTreeClassifier(class_weight='balanced', splitter='best', max_depth=None, criterion='gini', random_state=42)
    dt.fit(data_x, data_y)
    return dt

In [12]:
def get_rf(data_x, data_y):
    """Fit and return a class-balanced random forest (100 trees, max depth 15)."""
    forest_params = {
        'class_weight': 'balanced',
        'max_depth': 15,
        'n_estimators': 100,
        'random_state': 42,
    }
    # fit() returns the fitted estimator itself.
    return RandomForestClassifier(**forest_params).fit(data_x, data_y)

In [13]:
def get_sgd(data_x, data_y):
    """Fit and return a class-balanced SGD linear classifier with a fixed seed."""
    classifier = SGDClassifier(
        max_iter=3000,
        tol=1e-3,
        class_weight='balanced',
        random_state=42,
    )
    # fit() returns the fitted estimator itself.
    return classifier.fit(data_x, data_y)

In [14]:
def get_logistic_regression(data_x, data_y):
    """Fit and return a class-balanced logistic regression (up to 5000 iterations)."""
    classifier = LogisticRegression(
        class_weight='balanced',
        max_iter=5000,
        random_state=42,
    )
    # fit() returns the fitted estimator itself.
    return classifier.fit(data_x, data_y)

In [15]:
def get_bagging(data_x, data_y):
    """Fit and return a bagging ensemble of 20 class-balanced decision trees.

    Each base tree uses gini impurity and a maximum depth of 15.
    """
    base_classifier = DecisionTreeClassifier(class_weight='balanced', splitter='best', max_depth=15, criterion='gini', random_state=42)
    # The base tree is passed positionally: the keyword for it was renamed
    # from `base_estimator` to `estimator` in newer scikit-learn releases,
    # while it has remained the first positional parameter.
    bagging = BaggingClassifier(base_classifier, n_estimators=20, random_state=42)
    bagging.fit(data_x, data_y)
    return bagging

In [16]:
def get_linear_svc(data_x, data_y):
    """Fit and return a class-balanced linear support vector classifier."""
    classifier = LinearSVC(class_weight='balanced', random_state=42)
    # fit() returns the fitted estimator itself.
    return classifier.fit(data_x, data_y)

In [17]:
def get_knn(data_x, data_y):
    """Fit and return a k-nearest-neighbours classifier with k = 20."""
    # fit() returns the fitted estimator itself.
    return KNeighborsClassifier(n_neighbors=20).fit(data_x, data_y)

In [18]:
def get_nb(data_x, data_y):
    """Fit and return a Bernoulli naive Bayes classifier with default settings."""
    # fit() returns the fitted estimator itself.
    return BernoulliNB().fit(data_x, data_y)

In [19]:
def get_ada(data_x, data_y):
    """Fit and return an AdaBoost ensemble of 100 depth-5 class-balanced trees.

    Boosting uses the SAMME algorithm and a fixed seed.
    """
    dt = DecisionTreeClassifier(class_weight='balanced', splitter='best', max_depth=5, criterion='gini')
    # The base tree is passed positionally: the keyword for it was renamed
    # from `base_estimator` to `estimator` in newer scikit-learn releases,
    # while it has remained the first positional parameter.
    # NOTE(review): the `algorithm` argument is reported as deprecated by
    # recent scikit-learn releases - confirm against the installed version.
    ada = AdaBoostClassifier(dt, n_estimators=100, algorithm="SAMME", random_state=42)
    ada.fit(data_x, data_y)
    return ada

In [20]:
def get_ensemble(data_x, data_y):
    """Fit and return a hard-voting ensemble of an MLP, a random forest and bagged trees.

    The members are deliberately small (MLP capped at 15 iterations, 20 trees
    in the forest and in the bagging ensemble).
    """
    mlp = MLPClassifier(hidden_layer_sizes=(10,20,20,10), activation='relu', max_iter=15, random_state=42)
    rf = RandomForestClassifier(class_weight='balanced', max_depth=15, n_estimators=20, random_state=42)
    base_classifier = DecisionTreeClassifier(class_weight='balanced', splitter='best', max_depth=15, criterion='gini', random_state=42)
    # The base tree is passed positionally: the keyword for it was renamed
    # from `base_estimator` to `estimator` in newer scikit-learn releases,
    # while it has remained the first positional parameter.
    bagging = BaggingClassifier(base_classifier, n_estimators=20, random_state=42)
    ensemble = VotingClassifier(estimators=[('mlp', mlp), ('rf', rf), ('bagging', bagging)], voting='hard')
    ensemble.fit(data_x, data_y)
    return ensemble

In [21]:
def get_lda(data_x, data_y):
    """Fit and return a linear discriminant analysis classifier with default settings."""
    # fit() returns the fitted estimator itself.
    return LinearDiscriminantAnalysis().fit(data_x, data_y)

In [22]:
def plot_result_scikit(model, data_x, data_y):
    """Print accuracy, precision, recall and F1 of ``model`` on a labelled set.

    Two lines are printed: a labelled summary and a tab-separated line with
    the same four numbers. Decimal points are written as commas.
    Precision, recall and F1 treat the label 'Sim' as the positive class.

    Args:
        model: fitted estimator exposing ``predict``.
        data_x: features passed to ``model.predict``.
        data_y: true labels, compared against 'Sim'.
    """
    def _fmt(value):
        # Decimal comma instead of decimal point.
        return str(value).replace('.', ',')

    predictions = model.predict(data_x)
    # Plain scalar: a trailing comma here previously turned the accuracy into
    # a 1-tuple, which leaked "(...,)" into the first printed line.
    accuracy = accuracy_score(data_y, predictions)
    y_true = (data_y == 'Sim').astype(int)
    y_pred = (predictions == 'Sim').astype(int)

    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    print(f"Accuracy: {_fmt(accuracy)} | Precision:{_fmt(precision)} | Recall:{_fmt(recall)} | F1-score:{_fmt(f1)}")
    print(f"{_fmt(accuracy)}\t{_fmt(precision)}\t{_fmt(recall)}\t{_fmt(f1)}")

In [None]:
data = pd.read_csv('data.csv', delimiter=';')

print(data.shape[0])
# Drop rows whose target column ('Vai Chover Amanha') is missing.
data = data.dropna(subset=['Vai Chover Amanha'])

data.reset_index(drop=True, inplace=True)
print(data.shape[0])

X = data.drop('Vai Chover Amanha', axis=1)
y = data['Vai Chover Amanha']

# Hold-out split: 15% test, then 17.7% of the remaining 85% (about 15% of the
# total) as validation, leaving about 70% for training.
raw_X_train, X_test, raw_y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
raw_X_train_2, X_test_2, raw_y_train_2, y_test_2 = train_test_split(raw_X_train, raw_y_train, test_size=0.177, random_state=42)

selector = SelectKBest(k=28)  # Feature selection with SelectKBest
# Alternative: feature selection with SelectFromModel
#rf = RandomForestClassifier(class_weight='balanced', max_depth=20, n_estimators=100, random_state=42)
#selector = SelectFromModel(rf)

pipeline = Pipeline(steps=[
    ('manual',ManualFeatureSelectorTransformer()),  # drop the code and city columns
    ('imputer',ImputerTransformer()),  # fill the remaining missing values
    ('date', DateTransformer()),  # split the date column into day, month and year
    ('scaler', RobustScalerTransformer()),  # scale the features
    #('scaler', SimpleRobustScalerTransformer()),  # alternative scaler
    #('PCA', PCA(n_components=21)),
    ('selector', selector),
    ])

# Alternatives that treat outliers before fitting the pipeline:
#X_train = pipeline.fit_transform(handleOutliersSimpleZIndex(raw_X_train_2.copy(),3), raw_y_train_2.copy()) # simple approach
#X_train = pipeline.fit_transform(handleOutliersZIndex(raw_X_train_2.copy(),3), raw_y_train_2.copy()) # approach that ignores zero rainfall

X_train = pipeline.fit_transform(raw_X_train_2.copy(), raw_y_train_2.copy())
y_train = raw_y_train_2.copy()

X_test_2 = pipeline.transform(X_test_2.copy())

# Training split as seen by the models before oversampling. Transformed once
# and reused for every model's "training" report.
X_train_eval = pipeline.transform(raw_X_train_2.copy())

# Strategies for the class imbalance
os = RandomOverSampler(random_state=42)
#os = SMOTE(random_state=42, k_neighbors=10)
#os = BorderlineSMOTE(random_state=42, k_neighbors=10)
X_train, y_train = os.fit_resample(X_train.copy(), y_train.copy())

# Train each model on the oversampled data and report on three sets:
# the original training split, the oversampled training data, and the
# validation split (printed under the "test:" label).
for model_name, build_model in [
    ("rf", get_rf),
    ("ada", get_ada),
    ("logistic_regression", get_logistic_regression),
    ("mlp", get_mlp),
]:
    scikit_model = build_model(X_train.copy(), y_train.copy())
    print(model_name)
    print("training:")
    plot_result_scikit(scikit_model, X_train_eval.copy(), raw_y_train_2.copy())
    print("over sampled training:")
    plot_result_scikit(scikit_model, X_train.copy(), y_train.copy())
    print("test:")
    plot_result_scikit(scikit_model, X_test_2.copy(), y_test_2.copy())

In [None]:
data = pd.read_csv('data.csv', delimiter=';')

print(data.shape[0])
# Drop rows whose target column ('Vai Chover Amanha') is missing.
data = data.dropna(subset=['Vai Chover Amanha'])

data.reset_index(drop=True, inplace=True)
print(data.shape[0])

X = data.drop('Vai Chover Amanha', axis=1)
y = data['Vai Chover Amanha']

# 15% of the data is held out as the test set; the models below are trained
# on the remaining 85% (raw_X_train).
raw_X_train, X_test, raw_y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
# Validation split; not used further in this cell, kept so the variables stay defined.
raw_X_train_2, X_test_2, raw_y_train_2, y_test_2 = train_test_split(raw_X_train, raw_y_train, test_size=0.177, random_state=42)

pipeline = Pipeline(steps=[
    ('manual',ManualFeatureSelectorTransformer()),  # drop the code and city columns
    ('imputer',ImputerTransformer()),  # fill the remaining missing values
    ('date', DateTransformer()),  # split the date column into day, month and year
    ('scaler', RobustScalerTransformer()),  # scale the features
    ])

X_train = pipeline.fit_transform(raw_X_train.copy(), raw_y_train.copy())
y_train = raw_y_train.copy()

X_test = pipeline.transform(X_test.copy())

# Training split as seen by the models before oversampling. Transformed once
# and reused for every model's "training" report.
X_train_eval = pipeline.transform(raw_X_train.copy())

# Strategy for the class imbalance
os = RandomOverSampler(random_state=42)
X_train, y_train = os.fit_resample(X_train.copy(), y_train.copy())

# Train each model on the oversampled data and report on the original
# training split, the oversampled training data and the test split.
for model_name, build_model in [
    ("lda", get_lda),
    ("bagging", get_bagging),
    ("rf", get_rf),
    ("ada", get_ada),
    ("logistic_regression", get_logistic_regression),
    ("mlp", get_mlp),
    ("ensemble", get_ensemble),
    ("sgd", get_sgd),
    ("svc", get_linear_svc),
    ("nb", get_nb),
    ("knn", get_knn),
]:
    scikit_model = build_model(X_train.copy(), y_train.copy())
    print(model_name)
    print("training:")
    plot_result_scikit(scikit_model, X_train_eval.copy(), raw_y_train.copy())
    print("over sampled training:")
    plot_result_scikit(scikit_model, X_train.copy(), y_train.copy())
    print("test:")
    plot_result_scikit(scikit_model, X_test.copy(), y_test.copy())

In [None]:
data = pd.read_csv('data.csv', delimiter=';')

print(data.shape[0])
# Keep only rows with a known target, then renumber the index from zero.
data = data.dropna(subset=['Vai Chover Amanha']).reset_index(drop=True)
print(data.shape[0])

# Separate the target from the features.
y = data['Vai Chover Amanha']
X = data.drop(columns='Vai Chover Amanha')

# Stratified 85% / 15% train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, stratify=y, random_state=42
)

    

def objective(trial):
    """Optuna objective: validation F1 ('Sim' as positive class) of an MLP pipeline.

    Samples the MLP architecture and training hyperparameters from ``trial``,
    fits the full preprocessing + oversampling + MLP pipeline on a stratified
    part of the global ``X_train`` / ``y_train`` and scores it on the held-out
    part. Also prints accuracy, precision, recall and F1 for the trial.
    """
    # Architecture: 1-5 hidden layers, each with 10-300 units in steps of 10.
    layer_count = trial.suggest_int('n_layers', 1, 5)
    hidden_sizes = tuple(
        trial.suggest_int(f'n_units_{layer_idx}', 1, 30) * 10
        for layer_idx in range(layer_count)
    )

    activation = trial.suggest_categorical('activation', ['identity', 'logistic', 'tanh', 'relu'])
    solver = trial.suggest_categorical('solver', ['sgd', 'adam'])
    alpha = trial.suggest_float('alpha', 1e-5, 1e-1, log=True)
    learning_rate_init = trial.suggest_float('learning_rate_init', 1e-5, 1e-1, log=True)
    # Sampled in tens: the effective iteration cap is max_iter * 10.
    max_iter = trial.suggest_int('max_iter', 30, 40)

    classifier = MLPClassifier(
        hidden_layer_sizes=hidden_sizes,
        activation=activation,
        solver=solver,
        max_iter=max_iter * 10,
        alpha=alpha,
        learning_rate_init=learning_rate_init,
        random_state=42,
    )
    pipeline = Pipeline(steps=[
        ('manual', ManualFeatureSelectorTransformer()),
        ('imputer', ImputerTransformer()),
        ('date', DateTransformer()),
        ('scaler', RobustScalerTransformer()),
        ('os', RandomOverSampler(random_state=42)),
        ('model', classifier),
    ])

    # Same stratified, seeded validation split for every trial.
    X_train_2, X_val, y_train_2, y_val = train_test_split(
        X_train.copy(), y_train.copy(), stratify=y_train, test_size=0.177, random_state=42
    )
    pipeline.fit(X_train_2, y_train_2)

    val_pred = pipeline.predict(X_val)
    accuracy = accuracy_score(y_val, val_pred)

    # Binary encoding with 'Sim' as the positive class.
    y_true = (y_val == 'Sim').astype(int)
    y_pred = (val_pred == 'Sim').astype(int)

    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    print(f"Accuracy: {str(accuracy).replace('.',',')} | Precision:{str(precision).replace('.',',')} | Recall:{str(recall).replace('.',',')} | F1-score:{str(f1).replace('.',',')}")
    return f1

# TPE is Optuna's default sampler; giving it a seed makes the sequence of
# sampled hyperparameters repeatable across runs.
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=50)

# Dump every trial with its score and sampled parameters.
for trial in study.trials:
    print(f"Trial {trial.number}:")
    print(f"  Value: {trial.value}")
    print("  Params:")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")
    print()

print("\n\n\nBest trial:")
trial = study.best_trial

print(f" Val Value: {trial.value}")
print(" Val Params:")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

# Rebuild the hidden-layer sizes by explicit index, so the result does not
# depend on the ordering of the params dict. Units were sampled in tens.
layers = [trial.params[f'n_units_{i}'] * 10 for i in range(trial.params['n_layers'])]

print(f"layers: {layers}")

# Retrain the best configuration on the full training split.
pipeline = Pipeline(steps=[
    ('manual',ManualFeatureSelectorTransformer()),
    ('imputer',ImputerTransformer()),
    ('date', DateTransformer()),
    ('scaler', RobustScalerTransformer()),
    ('os', RandomOverSampler(random_state=42)),
    ('model', MLPClassifier(hidden_layer_sizes=tuple(layers), activation=trial.params['activation'], solver=trial.params['solver'], max_iter=trial.params['max_iter']*10, alpha=trial.params['alpha'], learning_rate_init=trial.params['learning_rate_init'], random_state=42)),
    ])

# Work on copies so X_train / X_test are never altered by pipeline steps.
pipeline.fit(X_train.copy(), y_train.copy())

y_pred = pipeline.predict(X_test.copy())
accuracy = accuracy_score(y_test, y_pred)

# Binary encoding with 'Sim' as the positive class.
y_true  = (y_test == 'Sim').astype(int)
y_pred  = (y_pred == 'Sim').astype(int)

precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

print(f"Test: Accuracy: {str(accuracy).replace('.',',')} | Precision:{str(precision).replace('.',',')} | Recall:{str(recall).replace('.',',')} | F1-score:{str(f1).replace('.',',')}")


In [None]:
data = pd.read_csv('data.csv', delimiter=';')
# Drop rows whose target column ('Vai Chover Amanha') is missing.
data = data.dropna(subset=['Vai Chover Amanha'])
data.reset_index(drop=True, inplace=True)

X = data.drop('Vai Chover Amanha', axis=1)
y = data['Vai Chover Amanha']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, stratify=y, random_state=42)

pipeline = Pipeline(steps=[
    ('manual',ManualFeatureSelectorTransformer()),
    ('imputer',ImputerTransformer()),
    ('date', DateTransformer()),
    ('scaler', RobustScalerTransformer()),
    ('os', RandomOverSampler(random_state=42)),
    ('model', MLPClassifier(hidden_layer_sizes=(40,20,300), activation='tanh', solver='adam', max_iter=350, alpha=0.00026300311127326896, learning_rate_init=0.00015373773782206807, random_state=42)),
    ])

# Fit on copies so the original split is never altered by pipeline steps.
pipeline.fit(X_train.copy(), y_train.copy())


def _fmt_metric(value):
    """Render a metric with a decimal comma instead of a decimal point."""
    return str(value).replace('.', ',')


def _print_metrics(accuracy, precision, recall, f1):
    """Print the labelled summary line and the tab-separated line."""
    print(f"Test: Accuracy: {_fmt_metric(accuracy)} | Precision:{_fmt_metric(precision)} | Recall:{_fmt_metric(recall)} | F1-score:{_fmt_metric(f1)}")
    # The accuracy is a plain number here, so it is printed in full; slicing
    # it with [1:-2] used to cut off its first and last two characters.
    print(f"{_fmt_metric(accuracy)}\t{_fmt_metric(precision)}\t{_fmt_metric(recall)}\t{_fmt_metric(f1)}")


def _report_test_subset(X_subset, y_subset):
    """Print the metrics of ``pipeline`` on one slice of the test set."""
    if len(X_subset) == 0:
        # scikit-learn estimators reject empty input, so report and move on.
        print("Test: no samples")
        return
    subset_pred = pipeline.predict(X_subset.copy())
    subset_accuracy = accuracy_score(y_subset, subset_pred)

    # Binary encoding with 'Sim' as the positive class.
    subset_true = (y_subset == 'Sim').astype(int)
    subset_pred = (subset_pred == 'Sim').astype(int)

    _print_metrics(
        subset_accuracy,
        precision_score(subset_true, subset_pred),
        recall_score(subset_true, subset_pred),
        f1_score(subset_true, subset_pred),
    )


# Metrics per calendar year. The dates are parsed once, outside the loop.
test_years = pd.to_datetime(X_test['Data']).dt.year
for i in range(2001,2025):
    print(i)
    in_year = test_years == i
    f_X_test = X_test.loc[in_year].reset_index(drop=True)
    f_y_test = y_test.loc[in_year].reset_index(drop=True)
    _report_test_subset(f_X_test, f_y_test)

# Metrics per city.
for cidade in data['Cidade'].unique():
    print(cidade)
    in_city = X_test['Cidade'] == cidade
    f_X_test = X_test.loc[in_city].reset_index(drop=True)
    f_y_test = y_test.loc[in_city].reset_index(drop=True)
    _report_test_subset(f_X_test, f_y_test)

# Metrics on the whole test set.
y_pred = pipeline.predict(X_test.copy())
accuracy = accuracy_score(y_test, y_pred)

y_true  = (y_test == 'Sim').astype(int)
y_pred  = (y_pred == 'Sim').astype(int)

precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

# Confusion-matrix counts: VP/VN = true positives/negatives,
# FP/FN = false positives/negatives.
FP = np.sum((y_pred == 1) & (y_true == 0))
VP = np.sum((y_pred == 1) & (y_true == 1))
FN = np.sum((y_pred == 0) & (y_true == 1))
VN = np.sum((y_pred == 0) & (y_true == 0))

print(f"VP: {VP} - VN: {VN} - FP: {FP} - FN: {FN}")
print(f"{VP}\t{VN}\t{FP}\t{FN}")
_print_metrics(accuracy, precision, recall, f1)

# Precision-recall curve for the 'Sim' class.
class_labels = pipeline.classes_
class_index = np.where(class_labels == "Sim")[0][0]

y_probs = pipeline.predict_proba(X_test.copy())[:, class_index]

y_true  = (y_test.copy() == 'Sim').astype(int)

precision, recall, _ = precision_recall_curve(y_true, y_probs)
pr_auc = auc(recall, precision)
print('auc: ', pr_auc)

plt.figure(figsize=(8, 6))
plt.plot(recall, precision, marker='.')
plt.xlabel('Recall')
plt.ylabel('Precisão')
plt.grid(True)
plt.show()

In [None]:
# Load the semicolon-separated dataset. Rows without a target label cannot be
# used for supervised training or evaluation, so they are dropped.
data = pd.read_csv('data.csv', delimiter=';')
data = data.dropna(subset=['Vai Chover Amanha'])
data.reset_index(drop=True, inplace=True)

X = data.drop('Vai Chover Amanha', axis=1)
y = data['Vai Chover Amanha']

# Stratified 85/15 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, stratify=y, random_state=42)

# Notebook-defined preprocessing steps, random oversampling, then an MLP with
# fixed hyperparameters.
pipeline = Pipeline(steps=[
    ('manual',ManualFeatureSelectorTransformer()),
    ('imputer',ImputerTransformer()),
    ('date', DateTransformer()),
    ('scaler', RobustScalerTransformer()),
    ('os', RandomOverSampler(random_state=42)),
    ('model', MLPClassifier(hidden_layer_sizes=(100,70,270,160,280), activation='logistic', solver='adam', max_iter=310, alpha=0.0003629543804893746, learning_rate_init=0.00044484921922101567, random_state=42)),
    ])

pipeline.fit(X_train, y_train)


def _report_metrics(true_labels, predicted_labels, positive='Sim'):
    """Print and return accuracy, precision, recall and F1 for one prediction set.

    Accuracy is computed on the raw labels. Precision, recall and F1 are
    computed after mapping `positive` to 1 and every other label to 0.
    Two lines are printed: a labelled summary and a tab-separated row, both
    with decimal commas in place of decimal points.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``.
    """
    accuracy = accuracy_score(true_labels, predicted_labels)
    y_true_bin = (true_labels == positive).astype(int)
    y_pred_bin = (predicted_labels == positive).astype(int)
    precision = precision_score(y_true_bin, y_pred_bin)
    recall = recall_score(y_true_bin, y_pred_bin)
    f1 = f1_score(y_true_bin, y_pred_bin)

    acc_s, prec_s, rec_s, f1_s = (
        str(score).replace('.', ',') for score in (accuracy, precision, recall, f1)
    )
    print(f"Test: Accuracy: {acc_s} | Precision:{prec_s} | Recall:{rec_s} | F1-score:{f1_s}")
    # The full accuracy string is emitted on purpose: slicing it with [1:-2]
    # drops the leading digit and the last two digits ('1,0' becomes '').
    print(f"{acc_s}\t{prec_s}\t{rec_s}\t{f1_s}")
    return accuracy, precision, recall, f1


# Per-year breakdown. The 'Data' column is parsed once, outside the loop.
test_years = pd.to_datetime(X_test['Data']).dt.year
for i in range(2001,2025):
    print(i)
    year_mask = test_years == i
    f_X_test = X_test.loc[year_mask].reset_index(drop=True)
    f_y_test = y_test.loc[year_mask].reset_index(drop=True)
    if f_X_test.empty:
        # No test rows for this year: there is nothing to score.
        print("no test samples")
        continue
    _report_metrics(f_y_test, pipeline.predict(f_X_test))

# Per-city breakdown of the test metrics.
for cidade in data['Cidade'].unique():
    print(cidade)
    city_mask = X_test['Cidade'] == cidade
    f_X_test = X_test.loc[city_mask].reset_index(drop=True)
    f_y_test = y_test.loc[city_mask].reset_index(drop=True)
    if f_X_test.empty:
        # The city list comes from the full dataset, so a city may have no
        # rows in the test split; there is nothing to score in that case.
        print("no test samples")
        continue
    _report_metrics(f_y_test, pipeline.predict(f_X_test))

# Metrics and confusion-matrix counts on the whole test set.
y_pred_labels = pipeline.predict(X_test.copy())
y_true  = (y_test == 'Sim').astype(int)
y_pred  = (y_pred_labels == 'Sim').astype(int)

FP = np.sum((y_pred == 1) & (y_true == 0))  # false positives
VP = np.sum((y_pred == 1) & (y_true == 1))  # true positives
FN = np.sum((y_pred == 0) & (y_true == 1))  # false negatives
VN = np.sum((y_pred == 0) & (y_true == 0))  # true negatives

print(f"VP: {VP} - VN: {VN} - FP: {FP} - FN: {FN}")
print(f"{VP}\t{VN}\t{FP}\t{FN}")
accuracy, precision, recall, f1 = _report_metrics(y_test, y_pred_labels)

# Precision-recall curve for the 'Sim' class, using its predicted probability.
class_labels = pipeline.classes_
class_index = np.where(class_labels == "Sim")[0][0]

y_probs = pipeline.predict_proba(X_test.copy())[:, class_index]

y_true  = (y_test.copy() == 'Sim').astype(int)

# From here on `precision` and `recall` hold the curve arrays, not the scalars.
precision, recall, _ = precision_recall_curve(y_true, y_probs)
pr_auc = auc(recall, precision)
print('auc: ', pr_auc)

plt.figure(figsize=(8, 6))
plt.plot(recall, precision, marker='.')
plt.xlabel('Recall')
plt.ylabel('Precisão')
plt.grid(True)
plt.show()

In [None]:
# Read the raw semicolon-separated dataset and report how many rows it has.
data = pd.read_csv('data.csv', sep=';')
print(len(data.index))

# Keep only rows that carry a target label, renumber the index from zero and
# report the remaining row count.
data = data.dropna(subset=['Vai Chover Amanha']).reset_index(drop=True)
print(len(data.index))

# The target is 'Vai Chover Amanha'; every other column is a feature.
y = data['Vai Chover Amanha']
X = data.drop(columns='Vai Chover Amanha')

# Stratified 85/15 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, stratify=y, random_state=42
)

    

def objective(trial):
    """Optuna objective: validation F1 of an AdaBoost pipeline for one trial.

    Samples the boosting algorithm, learning rate, number of estimators (the
    sampled integer is multiplied by 10), split criterion and tree depth,
    fits the pipeline on 82.3% of the module-level `X_train` / `y_train`
    and scores it on the remaining 17.7%.

    Returns:
        F1-score of the 'Sim' class on the validation split.
    """
    boost_algorithm = trial.suggest_categorical('algorithm', ['SAMME', 'SAMME.R'])
    boost_rate = trial.suggest_float('learning_rate', 0.25, 2)
    estimator_tens = trial.suggest_int('n_estimators', 5, 30)
    split_criterion = trial.suggest_categorical('criterion', ['gini', 'entropy'])
    tree_depth = trial.suggest_int('max_depth', 3, 8)

    weak_learner = DecisionTreeClassifier(
        class_weight='balanced',
        splitter='best',
        max_depth=tree_depth,
        criterion=split_criterion,
        random_state=42,
    )
    booster = AdaBoostClassifier(
        base_estimator=weak_learner,
        n_estimators=estimator_tens * 10,
        algorithm=boost_algorithm,
        learning_rate=boost_rate,
        random_state=42,
    )

    candidate = Pipeline(steps=[
        ('manual', ManualFeatureSelectorTransformer()),
        ('imputer', ImputerTransformer()),
        ('date', DateTransformer()),
        ('scaler', RobustScalerTransformer()),
        ('os', RandomOverSampler(random_state=42)),
        ('model', booster),
    ])

    # Fixed-seed stratified split, so every trial sees the same validation rows.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train.copy(), y_train.copy(), stratify=y_train, test_size=0.177, random_state=42
    )
    candidate.fit(X_fit, y_fit)

    val_labels = candidate.predict(X_val)
    val_accuracy = accuracy_score(y_val, val_labels)

    # Binary view of the labels: 'Sim' is the positive class.
    val_true = (y_val == 'Sim').astype(int)
    val_pred = (val_labels == 'Sim').astype(int)

    val_precision = precision_score(val_true, val_pred)
    val_recall = recall_score(val_true, val_pred)
    val_f1 = f1_score(val_true, val_pred)

    acc_s, prec_s, rec_s, f1_s = [
        str(score).replace('.', ',')
        for score in (val_accuracy, val_precision, val_recall, val_f1)
    ]
    print(f"Accuracy: {acc_s} | Precision:{prec_s} | Recall:{rec_s} | F1-score:{f1_s}")
    return val_f1

# Maximise the validation F1 returned by `objective` over 50 trials.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Dump every trial: number, objective value and sampled parameters.
for trial in study.trials:
    param_lines = [f"    {name}: {setting}" for name, setting in trial.params.items()]
    print("\n".join([f"Trial {trial.number}:", f"  Value: {trial.value}", "  Params:", *param_lines]))
    print()

print("\n\n\nBest trial:")
trial = study.best_trial
best = trial.params

print(f" Val Value: {trial.value}")
print(" Val Params:")
for name in best:
    print(f"    {name}: {best[name]}")

# Rebuild the model with the best parameters and refit on the full training split.
dt = DecisionTreeClassifier(
    class_weight='balanced',
    splitter='best',
    max_depth=best['max_depth'],
    criterion=best['criterion'],
    random_state=42,
)
ada = AdaBoostClassifier(
    base_estimator=dt,
    n_estimators=best['n_estimators'] * 10,
    algorithm=best['algorithm'],
    learning_rate=best['learning_rate'],
    random_state=42,
)

pipeline = Pipeline(steps=[
    ('manual', ManualFeatureSelectorTransformer()),
    ('imputer', ImputerTransformer()),
    ('date', DateTransformer()),
    ('scaler', RobustScalerTransformer()),
    ('os', RandomOverSampler(random_state=42)),
    ('model', ada),
])
pipeline.fit(X_train, y_train)

# Score the refitted pipeline on the held-out test split.
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Binary view of the labels: 'Sim' is the positive class.
y_true = (y_test == 'Sim').astype(int)
y_pred = (y_pred == 'Sim').astype(int)

precision, recall, f1 = [
    metric(y_true, y_pred) for metric in (precision_score, recall_score, f1_score)
]

acc_s, prec_s, rec_s, f1_s = [
    str(score).replace('.', ',') for score in (accuracy, precision, recall, f1)
]
print(f"Test: Accuracy: {acc_s} | Precision:{prec_s} | Recall:{rec_s} | F1-score:{f1_s}")


In [None]:
# Load the semicolon-separated dataset. Rows without a target label cannot be
# used for supervised training or evaluation, so they are dropped.
data = pd.read_csv('data.csv', delimiter=';')
data = data.dropna(subset=['Vai Chover Amanha'])
data.reset_index(drop=True, inplace=True)

X = data.drop('Vai Chover Amanha', axis=1)
y = data['Vai Chover Amanha']

# Stratified 85/15 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, stratify=y, random_state=42)

# AdaBoost over depth-8 decision trees with fixed hyperparameters.
dt = DecisionTreeClassifier(class_weight='balanced', splitter='best', max_depth=8, criterion='gini', random_state=42)
# NOTE(review): newer scikit-learn releases rename `base_estimator` to
# `estimator` — confirm against the installed version before upgrading.
ada = AdaBoostClassifier(base_estimator=dt, n_estimators=16*10, algorithm='SAMME', learning_rate=0.645003613327735, random_state=42)

# Notebook-defined preprocessing steps, random oversampling, then the model.
pipeline = Pipeline(steps=[
    ('manual',ManualFeatureSelectorTransformer()),
    ('imputer',ImputerTransformer()),
    ('date', DateTransformer()),
    ('scaler', RobustScalerTransformer()),
    ('os', RandomOverSampler(random_state=42)),
    ('model', ada),
    ])

pipeline.fit(X_train, y_train)


def _report_metrics(true_labels, predicted_labels, positive='Sim'):
    """Print and return accuracy, precision, recall and F1 for one prediction set.

    Accuracy is computed on the raw labels. Precision, recall and F1 are
    computed after mapping `positive` to 1 and every other label to 0.
    Two lines are printed: a labelled summary and a tab-separated row, both
    with decimal commas in place of decimal points.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``.
    """
    accuracy = accuracy_score(true_labels, predicted_labels)
    y_true_bin = (true_labels == positive).astype(int)
    y_pred_bin = (predicted_labels == positive).astype(int)
    precision = precision_score(y_true_bin, y_pred_bin)
    recall = recall_score(y_true_bin, y_pred_bin)
    f1 = f1_score(y_true_bin, y_pred_bin)

    acc_s, prec_s, rec_s, f1_s = (
        str(score).replace('.', ',') for score in (accuracy, precision, recall, f1)
    )
    print(f"Test: Accuracy: {acc_s} | Precision:{prec_s} | Recall:{rec_s} | F1-score:{f1_s}")
    # The full accuracy string is emitted on purpose: slicing it with [1:-2]
    # drops the leading digit and the last two digits ('1,0' becomes '').
    print(f"{acc_s}\t{prec_s}\t{rec_s}\t{f1_s}")
    return accuracy, precision, recall, f1


# Per-year breakdown. The 'Data' column is parsed once, outside the loop.
test_years = pd.to_datetime(X_test['Data']).dt.year
for i in range(2001,2025):
    print(i)
    year_mask = test_years == i
    f_X_test = X_test.loc[year_mask].reset_index(drop=True)
    f_y_test = y_test.loc[year_mask].reset_index(drop=True)
    if f_X_test.empty:
        # No test rows for this year: there is nothing to score.
        print("no test samples")
        continue
    _report_metrics(f_y_test, pipeline.predict(f_X_test))

# Per-city breakdown of the test metrics.
for cidade in data['Cidade'].unique():
    print(cidade)
    city_mask = X_test['Cidade'] == cidade
    f_X_test = X_test.loc[city_mask].reset_index(drop=True)
    f_y_test = y_test.loc[city_mask].reset_index(drop=True)
    if f_X_test.empty:
        # The city list comes from the full dataset, so a city may have no
        # rows in the test split; there is nothing to score in that case.
        print("no test samples")
        continue
    _report_metrics(f_y_test, pipeline.predict(f_X_test))

# Metrics and confusion-matrix counts on the whole test set.
y_pred_labels = pipeline.predict(X_test.copy())
y_true  = (y_test == 'Sim').astype(int)
y_pred  = (y_pred_labels == 'Sim').astype(int)

FP = np.sum((y_pred == 1) & (y_true == 0))  # false positives
VP = np.sum((y_pred == 1) & (y_true == 1))  # true positives
FN = np.sum((y_pred == 0) & (y_true == 1))  # false negatives
VN = np.sum((y_pred == 0) & (y_true == 0))  # true negatives

print(f"VP: {VP} - VN: {VN} - FP: {FP} - FN: {FN}")
print(f"{VP}\t{VN}\t{FP}\t{FN}")
accuracy, precision, recall, f1 = _report_metrics(y_test, y_pred_labels)

# Precision-recall curve for the 'Sim' class, using its predicted probability.
class_labels = pipeline.classes_
class_index = np.where(class_labels == "Sim")[0][0]

y_probs = pipeline.predict_proba(X_test.copy())[:, class_index]

y_true  = (y_test.copy() == 'Sim').astype(int)

# From here on `precision` and `recall` hold the curve arrays, not the scalars.
precision, recall, _ = precision_recall_curve(y_true, y_probs)
pr_auc = auc(recall, precision)
print('auc: ', pr_auc)

plt.figure(figsize=(8, 6))
plt.plot(recall, precision, marker='.')
plt.xlabel('Recall')
plt.ylabel('Precisão')
plt.grid(True)
plt.show()

In [None]:
# Read the raw semicolon-separated dataset and report how many rows it has.
data = pd.read_csv('data.csv', sep=';')
print(len(data.index))

# Keep only rows that carry a target label, renumber the index from zero and
# report the remaining row count.
data = data.dropna(subset=['Vai Chover Amanha']).reset_index(drop=True)
print(len(data.index))

# The target is 'Vai Chover Amanha'; every other column is a feature.
y = data['Vai Chover Amanha']
X = data.drop(columns='Vai Chover Amanha')

# Stratified 85/15 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, stratify=y, random_state=42
)

    

def objective(trial):
    """Optuna objective: validation F1 of an SGD-classifier pipeline for one trial.

    Samples the loss, penalty, regularisation strength `alpha` and stopping
    tolerance `tol`, fits the pipeline on 82.3% of the module-level
    `X_train` / `y_train` and scores it on the remaining 17.7%.

    Returns:
        F1-score of the 'Sim' class on the validation split.
    """
    sgd_loss = trial.suggest_categorical(
        'loss',
        ['hinge', 'modified_huber', 'perceptron', 'squared_hinge', 'squared_error',
         'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
    )
    sgd_penalty = trial.suggest_categorical('penalty', ['l2', 'l1', 'elasticnet'])
    sgd_alpha = trial.suggest_float('alpha', 1e-5, 1e-3)
    sgd_tol = trial.suggest_float('tol', 1e-4, 1e-2)

    classifier = SGDClassifier(
        max_iter=4000,
        tol=sgd_tol,
        alpha=sgd_alpha,
        penalty=sgd_penalty,
        loss=sgd_loss,
        class_weight='balanced',
        random_state=42,
    )
    candidate = Pipeline(steps=[
        ('manual', ManualFeatureSelectorTransformer()),
        ('imputer', ImputerTransformer()),
        ('date', DateTransformer()),
        ('scaler', RobustScalerTransformer()),
        ('os', RandomOverSampler(random_state=42)),
        ('model', classifier),
    ])

    # Fixed-seed stratified split, so every trial sees the same validation rows.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train.copy(), y_train.copy(), stratify=y_train, test_size=0.177, random_state=42
    )
    candidate.fit(X_fit, y_fit)

    val_labels = candidate.predict(X_val)
    val_accuracy = accuracy_score(y_val, val_labels)

    # Binary view of the labels: 'Sim' is the positive class.
    val_true = (y_val == 'Sim').astype(int)
    val_pred = (val_labels == 'Sim').astype(int)

    val_precision = precision_score(val_true, val_pred)
    val_recall = recall_score(val_true, val_pred)
    val_f1 = f1_score(val_true, val_pred)

    acc_s, prec_s, rec_s, f1_s = [
        str(score).replace('.', ',')
        for score in (val_accuracy, val_precision, val_recall, val_f1)
    ]
    print(f"Accuracy: {acc_s} | Precision:{prec_s} | Recall:{rec_s} | F1-score:{f1_s}")
    return val_f1

# Maximise the validation F1 returned by `objective` over 50 trials.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Dump every trial: number, objective value and sampled parameters.
for trial in study.trials:
    param_lines = [f"    {name}: {setting}" for name, setting in trial.params.items()]
    print("\n".join([f"Trial {trial.number}:", f"  Value: {trial.value}", "  Params:", *param_lines]))
    print()

print("\n\n\nBest trial:")
trial = study.best_trial
best = trial.params

print(f" Val Value: {trial.value}")
print(" Val Params:")
for name in best:
    print(f"    {name}: {best[name]}")

# Rebuild the model with the best parameters and refit on the full training split.
best_sgd = SGDClassifier(
    max_iter=4000,
    tol=best['tol'],
    alpha=best['alpha'],
    penalty=best['penalty'],
    loss=best['loss'],
    class_weight='balanced',
    random_state=42,
)
pipeline = Pipeline(steps=[
    ('manual', ManualFeatureSelectorTransformer()),
    ('imputer', ImputerTransformer()),
    ('date', DateTransformer()),
    ('scaler', RobustScalerTransformer()),
    ('os', RandomOverSampler(random_state=42)),
    ('model', best_sgd),
])
pipeline.fit(X_train, y_train)

# Score the refitted pipeline on the held-out test split.
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Binary view of the labels: 'Sim' is the positive class.
y_true = (y_test == 'Sim').astype(int)
y_pred = (y_pred == 'Sim').astype(int)

precision, recall, f1 = [
    metric(y_true, y_pred) for metric in (precision_score, recall_score, f1_score)
]

acc_s, prec_s, rec_s, f1_s = [
    str(score).replace('.', ',') for score in (accuracy, precision, recall, f1)
]
print(f"Test: Accuracy: {acc_s} | Precision:{prec_s} | Recall:{rec_s} | F1-score:{f1_s}")


In [None]:
# Load the semicolon-separated dataset. Rows without a target label cannot be
# used for supervised training or evaluation, so they are dropped.
data = pd.read_csv('data.csv', delimiter=';')
data = data.dropna(subset=['Vai Chover Amanha'])
data.reset_index(drop=True, inplace=True)

X = data.drop('Vai Chover Amanha', axis=1)
y = data['Vai Chover Amanha']

# Stratified 85/15 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, stratify=y, random_state=42)

# Notebook-defined preprocessing steps, random oversampling, then an SGD
# classifier with fixed hyperparameters.
pipeline = Pipeline(steps=[
    ('manual',ManualFeatureSelectorTransformer()),
    ('imputer',ImputerTransformer()),
    ('date', DateTransformer()),
    ('scaler', RobustScalerTransformer()),
    ('os', RandomOverSampler(random_state=42)),
    ('model', SGDClassifier(max_iter=4000, tol=0.002461489467454204, alpha=0.0005825682471131291, penalty='l1', loss="modified_huber", class_weight='balanced', random_state=42)),
    ])

pipeline.fit(X_train, y_train)


def _report_metrics(true_labels, predicted_labels, positive='Sim'):
    """Print and return accuracy, precision, recall and F1 for one prediction set.

    Accuracy is computed on the raw labels. Precision, recall and F1 are
    computed after mapping `positive` to 1 and every other label to 0.
    Two lines are printed: a labelled summary and a tab-separated row, both
    with decimal commas in place of decimal points.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``.
    """
    accuracy = accuracy_score(true_labels, predicted_labels)
    y_true_bin = (true_labels == positive).astype(int)
    y_pred_bin = (predicted_labels == positive).astype(int)
    precision = precision_score(y_true_bin, y_pred_bin)
    recall = recall_score(y_true_bin, y_pred_bin)
    f1 = f1_score(y_true_bin, y_pred_bin)

    acc_s, prec_s, rec_s, f1_s = (
        str(score).replace('.', ',') for score in (accuracy, precision, recall, f1)
    )
    print(f"Test: Accuracy: {acc_s} | Precision:{prec_s} | Recall:{rec_s} | F1-score:{f1_s}")
    # The full accuracy string is emitted on purpose: slicing it with [1:-2]
    # drops the leading digit and the last two digits ('1,0' becomes '').
    print(f"{acc_s}\t{prec_s}\t{rec_s}\t{f1_s}")
    return accuracy, precision, recall, f1


# Per-year breakdown. The 'Data' column is parsed once, outside the loop.
test_years = pd.to_datetime(X_test['Data']).dt.year
for i in range(2001,2025):
    print(i)
    year_mask = test_years == i
    f_X_test = X_test.loc[year_mask].reset_index(drop=True)
    f_y_test = y_test.loc[year_mask].reset_index(drop=True)
    if f_X_test.empty:
        # No test rows for this year: there is nothing to score.
        print("no test samples")
        continue
    _report_metrics(f_y_test, pipeline.predict(f_X_test))

# Per-city breakdown of the test metrics.
for cidade in data['Cidade'].unique():
    print(cidade)
    city_mask = X_test['Cidade'] == cidade
    f_X_test = X_test.loc[city_mask].reset_index(drop=True)
    f_y_test = y_test.loc[city_mask].reset_index(drop=True)
    if f_X_test.empty:
        # The city list comes from the full dataset, so a city may have no
        # rows in the test split; there is nothing to score in that case.
        print("no test samples")
        continue
    _report_metrics(f_y_test, pipeline.predict(f_X_test))

# Metrics and confusion-matrix counts on the whole test set.
y_pred_labels = pipeline.predict(X_test.copy())
y_true  = (y_test == 'Sim').astype(int)
y_pred  = (y_pred_labels == 'Sim').astype(int)

FP = np.sum((y_pred == 1) & (y_true == 0))  # false positives
VP = np.sum((y_pred == 1) & (y_true == 1))  # true positives
FN = np.sum((y_pred == 0) & (y_true == 1))  # false negatives
VN = np.sum((y_pred == 0) & (y_true == 0))  # true negatives

print(f"VP: {VP} - VN: {VN} - FP: {FP} - FN: {FN}")
print(f"{VP}\t{VN}\t{FP}\t{FN}")
accuracy, precision, recall, f1 = _report_metrics(y_test, y_pred_labels)

# Precision-recall curve for the 'Sim' class, using its predicted probability.
class_labels = pipeline.classes_
class_index = np.where(class_labels == "Sim")[0][0]

y_probs = pipeline.predict_proba(X_test.copy())[:, class_index]

y_true  = (y_test.copy() == 'Sim').astype(int)

# From here on `precision` and `recall` hold the curve arrays, not the scalars.
precision, recall, _ = precision_recall_curve(y_true, y_probs)
pr_auc = auc(recall, precision)
print('auc: ', pr_auc)

plt.figure(figsize=(8, 6))
plt.plot(recall, precision, marker='.')
plt.xlabel('Recall')
plt.ylabel('Precisão')
plt.grid(True)
plt.show()

In [None]:
# Cumulative explained variance of the preprocessed features.
data = pd.read_csv('data.csv', delimiter=';')

# Rows without a target label are discarded before preprocessing.
data = data.dropna(subset=['Vai Chover Amanha'])
data.reset_index(drop=True, inplace=True)

X = data.drop('Vai Chover Amanha', axis=1)
y = data['Vai Chover Amanha']

# Preprocessing only: no sampler and no model in this pipeline.
pipeline = Pipeline(steps=[
    ('manual',ManualFeatureSelectorTransformer()),
    ('imputer',ImputerTransformer()),
    ('date', DateTransformer()),
    ('scaler', RobustScalerTransformer()),
])

X_train = pipeline.fit_transform(X, y)

# Fit PCA once and accumulate the per-component ratios, instead of refitting
# for every component count. At most 32 components are reported, capped by the
# data shape so the fit cannot request more components than exist.
max_components = min(32, *np.shape(X_train))
# svd_solver='full' makes the leading ratios exact, so each prefix sum is the
# explained variance of a PCA truncated to that many components.
pca = PCA(n_components=max_components, svd_solver='full')
pca.fit(X_train)

nums = np.arange(max_components + 1)
# Entry k is the variance explained by the first k components (0.0 for k == 0).
var_ratio = [0.0, *np.cumsum(pca.explained_variance_ratio_)]

print(nums)
for num in nums:
    print(num, str(var_ratio[num]).replace('.',','))

In [None]:
# Summary statistics of the dataset after preprocessing and scaling.
data = pd.read_csv('data.csv', delimiter=';')

# Rows without a target label are discarded before preprocessing.
data = data.dropna(subset=['Vai Chover Amanha'])
data.reset_index(drop=True, inplace=True)

X = data.drop('Vai Chover Amanha', axis=1)
y = data['Vai Chover Amanha']

# Preprocessing only: no sampler and no model in this pipeline.
pipeline = Pipeline(steps=[
    ('manual',ManualFeatureSelectorTransformer()),
    ('imputer',ImputerTransformer()),
    ('date', DateTransformer()),
    ('scaler', RobustScalerTransformer()),
])

X_train = pipeline.fit_transform(X, y)

# The columns are described in three groups so each printed table stays narrow.
# 'Precipitacao Total' appears once; listing it twice duplicates the column in
# the describe() output.
precip_pressure_temp_cols = [
    'Precipitacao Total', 'Pressao Maxima', 'Pressao Minima', 'Temperatura Maxima',
    'Temperatura Minima', 'Temperatura Orvalho Maxima', 'Temperatura Orvalho Minima',
    'Umidade Minima', 'Umidade Maxima', 'Pressao Media', 'Temperatura Media',
    'Temperatura Orvalho Media',
]
humidity_wind_cols = [
    'Umidade Media', 'Direcao Vento 0H', 'Direcao Vento 6H', 'Direcao Vento 12H',
    'Direcao Vento 18H', 'Rajada Maxima de Vento 0H', 'Rajada Maxima de Vento 6H',
    'Rajada Maxima de Vento 12H', 'Rajada Maxima de Vento 18H',
]
wind_speed_misc_cols = [
    'Vento Velocidade Media 0H', 'Vento Velocidade Media 6H', 'Vento Velocidade Media 12H',
    'Vento Velocidade Media 18H', 'Radiacao Global', 'Altitude',
]

print(X_train[precip_pressure_temp_cols].describe())

print(X_train[humidity_wind_cols].describe())

print(X_train[wind_speed_misc_cols].describe())


In [None]:
# Missing-value analysis, broken down by year.
data = pd.read_csv('data.csv', sep=';')

# These columns are left out of the missing-value counts.
data = data.drop(columns=['Radiacao Global', 'Cidade', 'Codigo', 'Latitude', 'Longitude'])

# Replace the date column by its year ('Ano').
data = data.assign(Ano=pd.to_datetime(data['Data']).dt.year).drop(columns='Data')


def _print_yearly_counts(title, count_rows):
    """Print `title`, one tab-separated (year, count) row per year, then the total.

    `count_rows` receives the rows of `data` belonging to one year (2001 to
    2024) and returns the count to report for that year.

    Returns:
        The sum of the yearly counts.
    """
    print(title)
    total = 0
    for year in range(2001, 2025):
        yearly = count_rows(data.loc[data['Ano'] == year])
        total += yearly
        print(f'{year}\t{yearly}')
    print(f'Total\t{total}')
    return total


# Rows with at least one missing value (the year column itself is ignored).
counter = _print_yearly_counts(
    'any:', lambda rows: rows.drop('Ano', axis=1).isna().any(axis=1).sum()
)

# Rows in which every value is missing (the year column itself is ignored).
counter = _print_yearly_counts(
    'all:', lambda rows: rows.drop('Ano', axis=1).isna().all(axis=1).sum()
)

# Rows whose target label is missing.
counter = _print_yearly_counts(
    'i:', lambda rows: rows['Vai Chover Amanha'].isna().sum()
)

In [None]:
# Outlier analysis using the z-score.
# Only complete rows are kept, with the index renumbered from zero.
data = pd.read_csv('data.csv', sep=';').dropna().reset_index(drop=True)

columns = [
    'Pressao Maxima', 'Pressao Minima', 'Temperatura Maxima', 'Temperatura Minima',
    'Temperatura Orvalho Maxima', 'Temperatura Orvalho Minima', 'Umidade Minima',
    'Umidade Maxima', 'Precipitacao Total', 'Pressao Media', 'Temperatura Media',
    'Temperatura Orvalho Media', 'Umidade Media', 'Direcao Vento 0H', 'Direcao Vento 6H',
    'Direcao Vento 12H', 'Direcao Vento 18H', 'Rajada Maxima de Vento 0H',
    'Rajada Maxima de Vento 6H', 'Rajada Maxima de Vento 12H', 'Rajada Maxima de Vento 18H',
    'Vento Velocidade Media 0H', 'Vento Velocidade Media 6H', 'Vento Velocidade Media 12H',
    'Vento Velocidade Media 18H', 'Radiacao Global', 'Altitude',
]


def _count_zscore_outliers(values, threshold=3):
    """Count the entries of `values` whose absolute z-score exceeds `threshold`."""
    return (np.abs(stats.zscore(values)) > threshold).sum()


# One tab-separated (column, count) row per column, then the grand total.
counter = 0
for column in columns:
    num_outliers = _count_zscore_outliers(data[column])
    print(f"{column}\t{num_outliers}")
    counter += num_outliers
print(counter)

# Repeat for precipitation using only the rows where it is non-zero.
column = 'Precipitacao Total'
data = data.loc[data[column] != 0]

num_outliers = _count_zscore_outliers(data[column])
print(f"{column}\t{num_outliers}")

In [None]:
# General summary statistics of the raw data (complete rows only).
data = pd.read_csv('data.csv', delimiter=';')

data = data.dropna()
data.reset_index(drop=True, inplace=True)

# The columns are described in three groups so each printed table stays narrow.
# 'Precipitacao Total' appears once; listing it twice duplicates the column in
# the describe() output.
precip_pressure_temp_cols = [
    'Precipitacao Total', 'Pressao Maxima', 'Pressao Minima', 'Temperatura Maxima',
    'Temperatura Minima', 'Temperatura Orvalho Maxima', 'Temperatura Orvalho Minima',
    'Umidade Minima', 'Umidade Maxima', 'Pressao Media', 'Temperatura Media',
    'Temperatura Orvalho Media',
]
humidity_wind_cols = [
    'Umidade Media', 'Direcao Vento 0H', 'Direcao Vento 6H', 'Direcao Vento 12H',
    'Direcao Vento 18H', 'Rajada Maxima de Vento 0H', 'Rajada Maxima de Vento 6H',
    'Rajada Maxima de Vento 12H', 'Rajada Maxima de Vento 18H',
]
wind_speed_misc_cols = [
    'Vento Velocidade Media 0H', 'Vento Velocidade Media 6H', 'Vento Velocidade Media 12H',
    'Vento Velocidade Media 18H', 'Radiacao Global', 'Altitude',
]

print(data[precip_pressure_temp_cols].describe())

print(data[humidity_wind_cols].describe())

print(data[wind_speed_misc_cols].describe())

In [None]:
# Number of rows per class of the target attribute.
data = pd.read_csv('data.csv', sep=';')

target_counts = data['Vai Chover Amanha'].value_counts()
print(target_counts)

In [None]:
data = pd.read_csv('data.csv', sep=';')

# Look for outliers in latitude, longitude and altitude: for each station
# code, take the most frequent value of each attribute and count the rows of
# that station that differ from it.
for value in data.dropna()['Codigo'].unique():
    station_rows = data.loc[data['Codigo'] == value]

    modes = {}
    mismatches = {}
    for attribute in ('Latitude', 'Longitude', 'Altitude'):
        modes[attribute] = station_rows[attribute].mode()[0]
        mismatches[attribute] = (station_rows[attribute] != modes[attribute]).sum()

    print(
        "cod - ", value,
        " lat: ", modes['Latitude'], " outliers: ", mismatches['Latitude'],
        " - long: ", modes['Longitude'], " outliers: ", mismatches['Longitude'],
        " - alt: ", modes['Altitude'], " outliers: ", mismatches['Altitude'],
    )
