In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.tree import DecisionTreeClassifier
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample
from sklearn.neighbors import LocalOutlierFactor
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator, TransformerMixin
#from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import SimpleImputer
from imblearn.pipeline import Pipeline
from imblearn import FunctionSampler

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis
from sklearn.ensemble import ExtraTreesClassifier, BaggingClassifier
from sklearn.linear_model import SGDClassifier

from sklearn.metrics import precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, KMeansSMOTE, SVMSMOTE, ADASYN, RandomOverSampler
from sklearn.feature_selection import SelectKBest, f_classif, chi2, SelectPercentile, SelectFromModel, SequentialFeatureSelector, SelectFromModel
from sklearn.decomposition import PCA
from scipy import stats
from imblearn.base import SamplerMixin
from sklearn.ensemble import VotingClassifier

In [3]:
class ManualFeatureSelectorTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that removes the hand-picked 'Cidade' and 'Codigo' columns.

    The transformer is stateless: ``fit`` learns nothing and ``transform``
    returns a new frame without the listed columns. The input frame is left
    untouched so callers can safely reuse it.
    """

    # Columns removed by transform(); a KeyError is raised if one is missing.
    COLUMNS_TO_DROP = ['Cidade', 'Codigo']

    def fit(self, X, y=None):
        """Log the input shape and return ``self`` (nothing is fitted)."""
        print("fit - ManualFeatureSelectorTransformer - begin", X.shape)
        print("fit - ManualFeatureSelectorTransformer - end", X.shape)
        return self

    def transform(self, X):
        """Return a copy of ``X`` without the columns in ``COLUMNS_TO_DROP``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing at least the 'Cidade' and 'Codigo' columns.

        Returns
        -------
        pandas.DataFrame
            New frame with those columns removed.
        """
        print("transform - ManualFeatureSelectorTransformer", X.shape)
        # drop() without inplace=True returns a new frame, so the caller's
        # DataFrame is not modified as a side effect of running the pipeline.
        X = X.drop(columns=self.COLUMNS_TO_DROP)
        print("transform - ManualFeatureSelectorTransformer - end", X.shape)
        return X

In [4]:
class ImputerTransformer(BaseEstimator, TransformerMixin):
    """Fill missing values: column mean for numeric data, mode for object data.

    The numeric / object column groups are determined once in ``fit`` and
    reused in ``transform`` so that the imputers always see the same columns
    they were fitted on, even if a dtype differs in the frame being transformed.
    """

    def __init__(self):
        self.imputer_numeric = SimpleImputer(strategy='mean')
        self.imputer_categorical = SimpleImputer(strategy='most_frequent')

    def fit(self, X, y=None):
        """Fit both imputers on the numeric and object columns of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Training frame.
        y : ignored

        Returns
        -------
        self
        """
        print("fit - ImputerTransformer - begin", X.shape)
        # Remember the column groups seen at fit time (trailing underscore
        # follows the scikit-learn convention for fitted attributes).
        self.numeric_cols_ = list(X.select_dtypes(include=['number']).columns)
        self.categorical_cols_ = list(X.select_dtypes(include=['object']).columns)

        # SimpleImputer cannot be fitted on a frame with zero columns, so a
        # missing group is simply skipped.
        if self.numeric_cols_:
            self.imputer_numeric.fit(X[self.numeric_cols_])
        if self.categorical_cols_:
            self.imputer_categorical.fit(X[self.categorical_cols_])
        print("fit - ImputerTransformer - end", X.shape)
        return self

    def transform(self, X):
        """Return a copy of ``X`` with missing values imputed.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame with the same columns that were passed to ``fit``.

        Returns
        -------
        pandas.DataFrame
            New frame; the input is not modified.
        """
        print("transform - ImputerTransformer - begin", X.shape)
        X = X.copy()  # do not write imputed values back into the caller's frame
        if self.numeric_cols_:
            X[self.numeric_cols_] = self.imputer_numeric.transform(X[self.numeric_cols_])
        if self.categorical_cols_:
            X[self.categorical_cols_] = self.imputer_categorical.transform(X[self.categorical_cols_])
        print("transform - ImputerTransformer - end", X.shape)
        return X

In [5]:
class DateTransformer(BaseEstimator, TransformerMixin):
    """Replace the 'Data' column by three numeric columns: 'Ano', 'Mes', 'Dia'.

    The transformer is stateless; ``fit`` only logs the input shape.
    """

    def fit(self, X, y=None):
        """Log the input shape and return ``self`` (nothing is fitted)."""
        print("fit - DateTransformer - begin", X.shape)
        print("fit - DateTransformer - end", X.shape)
        return self

    def transform(self, X):
        """Return a new frame where 'Data' is split into year, month and day.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame with a 'Data' column parseable by ``pandas.to_datetime``.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` without 'Data' and with 'Ano', 'Mes', 'Dia'
            appended (in that order). The input frame is not modified.
        """
        print("transform - DateTransformer", X.shape)
        dates = pd.to_datetime(X['Data'])
        # drop()/assign() both return new frames, so the caller's DataFrame
        # keeps its original 'Data' column.
        X = X.drop(columns='Data').assign(
            Ano=dates.dt.year,
            Mes=dates.dt.month,
            Dia=dates.dt.day,
        )
        print("transform - DateTransformer - end", X.shape)
        return X

In [6]:
class OneHotEncoderTransformer(BaseEstimator, TransformerMixin):
    """One-hot encode the 'Codigo' column and append the indicator columns."""

    def __init__(self):
        # scikit-learn renamed `sparse` to `sparse_output` (1.2) and later
        # removed the old name; try the new spelling first and fall back so
        # the notebook runs on either side of the rename.
        try:
            self.encoder = OneHotEncoder(sparse_output=False)
        except TypeError:
            self.encoder = OneHotEncoder(sparse=False)

    def fit(self, X, y=None):
        """Learn the categories present in ``X['Codigo']``.

        Returns
        -------
        self
        """
        print("fit - OneHotEncoderTransformer - begin", X.shape)
        self.encoder.fit(X[['Codigo']])
        print("fit - OneHotEncoderTransformer - end", X.shape)
        return self

    def transform(self, X):
        """Return a new frame with 'Codigo' replaced by its one-hot columns.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing a 'Codigo' column.

        Returns
        -------
        pandas.DataFrame
            Frame with a fresh 0..n-1 index, without 'Codigo', followed by one
            indicator column per category learned in ``fit``. The input frame
            (including its index) is not modified.
        """
        print("transform - OneHotEncoderTransformer - begin", X.shape)
        # Work on a re-indexed copy: the encoded block is built with the same
        # index so the column-wise concat lines rows up one-to-one.
        X = X.reset_index(drop=True)
        encoded_data = self.encoder.transform(X[['Codigo']])
        encoded_df = pd.DataFrame(
            encoded_data,
            columns=self.encoder.get_feature_names_out(['Codigo']),
            index=X.index,
        )
        X = pd.concat([X.drop(columns=['Codigo']), encoded_df], axis=1)
        print("transform - OneHotEncoderTransformer - end", X.shape)
        return X

In [7]:
class RobustScalerTransformer(BaseEstimator, TransformerMixin):
    """Robust-scale the weather features, with a dedicated scaler for rainfall.

    'Precipitacao Total' is scaled with a wider quantile range (6.5-93.5)
    than the default used for all other feature columns.
    """

    # Single source of truth for the columns handled by the default scaler;
    # fit() and transform() must use exactly the same list and order.
    FEATURE_COLUMNS = [
        'Pressao Maxima', 'Pressao Minima', 'Temperatura Maxima', 'Temperatura Minima',
        'Temperatura Orvalho Maxima', 'Temperatura Orvalho Minima',
        'Umidade Minima', 'Umidade Maxima', 'Pressao Media', 'Temperatura Media',
        'Temperatura Orvalho Media', 'Umidade Media',
        'Direcao Vento', 'Rajada Maxima de Vento', 'Vento Velocidade Media',
        'Latitude', 'Longitude', 'Ano', 'Mes', 'Dia', 'Radiacao Global',
    ]
    # Column scaled separately by `rainfall_scaler`.
    RAINFALL_COLUMN = 'Precipitacao Total'

    def __init__(self):
        self.rainfall_scaler = RobustScaler(quantile_range=(6.5, 93.5))
        self.scaler = RobustScaler()

    def fit(self, X, y=None):
        """Fit both scalers on ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing every column in ``FEATURE_COLUMNS`` plus
            ``RAINFALL_COLUMN``.
        y : ignored

        Returns
        -------
        self
        """
        print("fit - RobustScalerTransformer - begin", X.shape)
        self.scaler.fit(X[self.FEATURE_COLUMNS])
        self.rainfall_scaler.fit(X[[self.RAINFALL_COLUMN]])
        print("fit - RobustScalerTransformer - end", X.shape)
        return self

    def transform(self, X):
        """Return a scaled copy of ``X``; the input frame is not modified.

        Returns
        -------
        pandas.DataFrame
            Same columns as ``X`` with the feature and rainfall columns
            replaced by their scaled values.
        """
        print("transform - RobustScalerTransformer - begin", X.shape)
        X = X.copy()
        X[self.FEATURE_COLUMNS] = self.scaler.transform(X[self.FEATURE_COLUMNS])
        # transform() returns an (n, 1) array; flatten it before assigning it
        # to a single column.
        X[self.RAINFALL_COLUMN] = self.rainfall_scaler.transform(X[[self.RAINFALL_COLUMN]]).ravel()
        print("transform - RobustScalerTransformer - end", X.shape)
        return X

In [8]:
def removeLocalOutlier(X, y):
    """Drop the samples that LocalOutlierFactor labels as outliers.

    A ``LocalOutlierFactor`` with ``contamination=0.025`` is fitted on ``X``;
    rows predicted as ``-1`` are removed from both ``X`` and ``y``.

    Returns
    -------
    tuple
        ``(X, y)`` restricted to the rows that were kept.
    """
    print("fit_resample - LocalOutlierTransformer - begin", X.shape)
    detector = LocalOutlierFactor(contamination=0.025)
    labels = detector.fit_predict(X)
    keep = labels != -1  # fit_predict marks outliers with -1
    X_kept, y_kept = X[keep], y[keep]
    print("fit_resample - LocalOutlierTransformer - end",X_kept.shape)
    return X_kept, y_kept

In [9]:
def _replace_zscore_outliers(X, column, row_mask, threshold):
    """Replace z-score outliers of ``X[column]`` (rows in ``row_mask``) in place.

    The z-score is computed only over the rows selected by ``row_mask``; values
    with ``|z| > threshold`` are overwritten with the mean of the remaining
    selected values.
    """
    values = X.loc[row_mask, column]
    if values.empty:
        return  # nothing to score
    # np.array(...) makes an independent float copy that is safe to edit.
    cleaned = np.array(values, dtype=float)
    outliers_mask = np.abs(stats.zscore(cleaned)) > threshold
    if not outliers_mask.any():
        return  # column already clean; leave it untouched
    cleaned[outliers_mask] = values[~outliers_mask].mean()
    # Positional assignment: `cleaned` has one entry per True in row_mask.
    X.loc[row_mask, column] = cleaned


def handleOutliersZIndex(X, threshold = 3, columns=None):
    """Replace z-score outliers by the mean of the non-outlier values.

    For every column in ``columns`` the z-score is computed over the non-missing
    values. For 'Precipitacao Total' it is computed over the values that are
    both non-missing and non-zero, so zero readings are neither scored nor
    replaced.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame to clean. It is modified in place and also returned.
    threshold : float, default 3
        Absolute z-score above which a value counts as an outlier.
    columns : list of str, optional
        Columns cleaned with the "non-missing" rule. Defaults to the weather
        measurement columns listed in the function body.

    Returns
    -------
    pandas.DataFrame
        The same object as ``X``.
    """
    print("handleOutliersZIndex - begin", X.shape)
    if columns is None:
        columns = ['Pressao Maxima','Pressao Minima','Temperatura Maxima','Temperatura Minima','Temperatura Orvalho Maxima','Temperatura Orvalho Minima',
                   'Umidade Minima', 'Umidade Maxima', 'Pressao Media', 'Temperatura Media', 'Temperatura Orvalho Media', 'Umidade Media',
                   'Direcao Vento', 'Rajada Maxima de Vento', 'Vento Velocidade Media', 'Radiacao Global']

    for column in columns:
        _replace_zscore_outliers(X, column, X[column].notna(), threshold)

    # Rainfall: zero readings are excluded from the z-score computation.
    column = 'Precipitacao Total'
    non_zero_mask = (X[column] != 0) & (X[column].notna())
    _replace_zscore_outliers(X, column, non_zero_mask, threshold)

    print("handleOutliersZIndex - end", X.shape)
    return X

In [10]:
def get_mlp(data_x, data_y):
    """Fit and return a small multi-layer perceptron classifier.

    The network has hidden layers of 10, 20, 20 and 10 units with ReLU
    activation and is limited to 15 training iterations.
    """
    # A larger (100,200,200,200,100) / max_iter=150 configuration was tried
    # previously and left disabled.
    classifier = MLPClassifier(
        hidden_layer_sizes=(10, 20, 20, 10),
        activation='relu',
        max_iter=15,
        random_state=42,
    )
    return classifier.fit(data_x, data_y)

In [11]:
def get_dt(data_x, data_y):
    """Fit and return an unpruned, class-balanced decision tree.

    Parameters
    ----------
    data_x : array-like
        Training features.
    data_y : array-like
        Training labels.

    Returns
    -------
    DecisionTreeClassifier
        The fitted tree.
    """
    # random_state is fixed like in the other model factories so repeated
    # runs of the notebook build the same tree.
    dt = DecisionTreeClassifier(
        class_weight='balanced',
        splitter='best',
        max_depth=None,
        criterion='gini',
        random_state=42,
    )
    dt.fit(data_x, data_y)
    return dt

In [12]:
def get_rf(data_x, data_y):
    """Fit and return a class-balanced random forest (20 trees, depth <= 15)."""
    forest_params = dict(
        class_weight='balanced',
        max_depth=15,
        n_estimators=20,
        random_state=42,
    )
    return RandomForestClassifier(**forest_params).fit(data_x, data_y)

In [13]:
def get_sgd(data_x, data_y):
    """Fit and return a class-balanced linear classifier trained with SGD."""
    classifier = SGDClassifier(
        max_iter=3000,
        tol=1e-3,
        class_weight='balanced',
        random_state=42,
    )
    return classifier.fit(data_x, data_y)

In [14]:
def get_logistic_regression(data_x, data_y):
    """Fit and return a class-balanced logistic regression (max 2000 iterations)."""
    settings = {'class_weight': 'balanced', 'max_iter': 2000, 'random_state': 42}
    model = LogisticRegression(**settings)
    return model.fit(data_x, data_y)

In [15]:
def get_bagging(data_x, data_y):
    """Fit and return a bagging ensemble of 20 class-balanced decision trees.

    Parameters
    ----------
    data_x : array-like
        Training features.
    data_y : array-like
        Training labels.

    Returns
    -------
    BaggingClassifier
        The fitted ensemble.
    """
    base_classifier = DecisionTreeClassifier(class_weight='balanced', splitter='best', max_depth=15, criterion='gini', random_state=42)
    # scikit-learn renamed `base_estimator` to `estimator` (1.2) and later
    # removed the old keyword. Try the current name first and fall back for
    # older installations, where the new keyword raises TypeError.
    try:
        bagging = BaggingClassifier(estimator=base_classifier, n_estimators=20, random_state=42)
    except TypeError:
        bagging = BaggingClassifier(base_estimator=base_classifier, n_estimators=20, random_state=42)
    bagging.fit(data_x, data_y)
    return bagging

In [16]:
def get_linear_svc(data_x, data_y):
    """Fit and return a class-balanced linear support vector classifier."""
    classifier = LinearSVC(random_state=42, class_weight='balanced')
    return classifier.fit(data_x, data_y)

In [17]:
def get_knn(data_x, data_y):
    """Fit and return a 20-nearest-neighbours classifier.

    Parameters
    ----------
    data_x : array-like
        Training features.
    data_y : array-like
        Training labels.

    Returns
    -------
    KNeighborsClassifier
        The fitted classifier.
    """
    # KNeighborsClassifier has no `class_weight` parameter; passing it made the
    # constructor raise TypeError before any training happened. Class imbalance
    # has to be handled outside the model (e.g. by the over-sampling step).
    knn = KNeighborsClassifier(n_neighbors=20)
    knn.fit(data_x, data_y)
    return knn

In [18]:
def get_nb(data_x, data_y):
    """Fit and return a Bernoulli naive Bayes classifier with default settings."""
    return BernoulliNB().fit(data_x, data_y)

In [19]:
def get_ada(data_x, data_y):
    """Fit and return an AdaBoost classifier (100 estimators, SAMME algorithm)."""
    booster = AdaBoostClassifier(
        n_estimators=100,
        algorithm="SAMME",
        random_state=0,
    )
    return booster.fit(data_x, data_y)

In [20]:
def get_ensemble(data_x, data_y):
    """Fit and return a hard-voting ensemble of an MLP, a random forest and bagged trees.

    The three members use the same hyper-parameters as ``get_mlp``, ``get_rf``
    and ``get_bagging``.

    Parameters
    ----------
    data_x : array-like
        Training features.
    data_y : array-like
        Training labels.

    Returns
    -------
    VotingClassifier
        The fitted ensemble (majority vote over the three members).
    """
    mlp = MLPClassifier(hidden_layer_sizes=(10,20,20,10), activation='relu', max_iter=15, random_state=42)
    rf = RandomForestClassifier(class_weight='balanced', max_depth=15, n_estimators=20, random_state=42)
    base_classifier = DecisionTreeClassifier(class_weight='balanced', splitter='best', max_depth=15, criterion='gini', random_state=42)
    # `base_estimator` was renamed to `estimator` in scikit-learn 1.2 and the
    # old keyword was removed later; fall back only when the new name is
    # rejected by an older installation.
    try:
        bagging = BaggingClassifier(estimator=base_classifier, n_estimators=20, random_state=42)
    except TypeError:
        bagging = BaggingClassifier(base_estimator=base_classifier, n_estimators=20, random_state=42)
    ensemble = VotingClassifier(estimators=[('mlp', mlp), ('rf', rf), ('bagging', bagging)], voting='hard')
    ensemble.fit(data_x, data_y)
    return ensemble

In [21]:
def plot_result_scikit(model, data_x, data_y):
    """Print classification metrics of ``model`` on ``(data_x, data_y)``.

    Labels are the strings 'Sim' (positive class) and 'Nao' (negative class).
    The printed line contains accuracy, precision, recall, F1 and the four
    confusion-matrix counts: FP/FN (false positives/negatives) and VP/VN
    (true positives/negatives).

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    data_x : array-like
        Features to predict on.
    data_y : array-like of str
        True labels ('Sim' / 'Nao').

    Returns
    -------
    None
    """
    # Plain arrays make the element-wise comparisons below purely positional,
    # independent of any pandas index carried by `data_y`.
    predicted = np.asarray(model.predict(data_x))
    actual = np.asarray(data_y)

    # The original line ended with a stray comma, which turned `accuracy` into
    # a 1-tuple and printed it as "(0.70...,)".
    accuracy = accuracy_score(actual, predicted)
    FP = np.sum((predicted == 'Sim') & (actual == 'Nao'))
    FN = np.sum((predicted == 'Nao') & (actual == 'Sim'))
    VP = np.sum((predicted == 'Sim') & (actual == 'Sim'))
    VN = np.sum((predicted == 'Nao') & (actual == 'Nao'))

    # Binary 0/1 encoding with 'Sim' as the positive class for the metrics.
    y_true = (actual == 'Sim').astype(int)
    y_pred = (predicted == 'Sim').astype(int)

    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    print(f"Accuracy: {accuracy} | Precision:{precision} | Recall:{recall} | F1-score:{f1} | FP:{FP} | FN:{FN} | VP:{VP} | VN:{VN}")

In [22]:
# Load the data set and discard rows without a target label.
data = pd.read_csv('data.csv', delimiter=';')

print(data.shape[0])
data = data.dropna(subset=['Vai Chover Amanha']).reset_index(drop=True)
print(data.shape[0])

X = data.drop('Vai Chover Amanha', axis=1)
y = data['Vai Chover Amanha']

raw_X_train, X_test, raw_y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=60)

# Preprocessing pipeline. Steps tried during experimentation (one-hot encoding,
# LOF outlier removal, PCA, in-pipeline over-sampling / model) are left out.
pipeline = Pipeline(steps=[
    ('manual',ManualFeatureSelectorTransformer()),
    ('imputer',ImputerTransformer()),
    ('date', DateTransformer()),
    ('scaler', RobustScalerTransformer()),
])

# The pipeline is fitted on the outlier-treated training data only.
X_train = pipeline.fit_transform(handleOutliersZIndex(raw_X_train.copy(),4), raw_y_train.copy())
y_train = raw_y_train.copy()

X_test = pipeline.transform(X_test.copy())

# Training set as the fitted pipeline sees it WITHOUT outlier treatment or
# over-sampling. It is the same for every model, so it is computed once here
# instead of once per model (the pipeline's transform log therefore appears
# a single time in the output).
X_train_eval = pipeline.transform(raw_X_train.copy())

# Balance the classes of the training set by random over-sampling.
over_sampler = RandomOverSampler(random_state=42)
X_train, y_train = over_sampler.fit_resample(X_train.copy(), y_train.copy())

# (label printed in the report, factory that fits and returns the model)
model_builders = [
    ("bagging", get_bagging),
    ("rf", get_rf),
    ("ada", get_ada),
    ("nb", get_nb),
    ("mlp", get_mlp),
    ("ensemble", get_ensemble),
]

for model_name, build_model in model_builders:
    scikit_model = build_model(X_train.copy(), y_train.copy())
    print(model_name)
    print("training:")
    plot_result_scikit(scikit_model, X_train_eval.copy(), raw_y_train.copy())
    print("over sampled training:")
    plot_result_scikit(scikit_model, X_train.copy(), y_train.copy())
    print("test:")
    plot_result_scikit(scikit_model, X_test.copy(), y_test.copy())

256311
240452
handleOutliersZIndex - begin (192361, 22)
handleOutliersZIndex - end (192361, 22)
fit - ManualFeatureSelectorTransformer - begin (192361, 22)
fit - ManualFeatureSelectorTransformer - end (192361, 22)
transform - ManualFeatureSelectorTransformer (192361, 22)
transform - ManualFeatureSelectorTransformer - end (192361, 20)
fit - ImputerTransformer - begin (192361, 20)
fit - ImputerTransformer - end (192361, 20)
transform - ImputerTransformer - begin (192361, 20)
transform - ImputerTransformer - end (192361, 20)
fit - DateTransformer - begin (192361, 20)
fit - DateTransformer - end (192361, 20)
transform - DateTransformer (192361, 20)
transform - DateTransformer - end (192361, 22)
fit - RobustScalerTransformer - begin (192361, 22)
fit - RobustScalerTransformer - end (192361, 22)
transform - RobustScalerTransformer - begin (192361, 22)
transform - RobustScalerTransformer - end (192361, 22)
transform - ManualFeatureSelectorTransformer (48091, 22)
transform - ManualFeatureSelect



mlp
training:
transform - ManualFeatureSelectorTransformer (192361, 22)
transform - ManualFeatureSelectorTransformer - end (192361, 20)
transform - ImputerTransformer - begin (192361, 20)
transform - ImputerTransformer - end (192361, 20)
transform - DateTransformer (192361, 20)
transform - DateTransformer - end (192361, 22)
transform - RobustScalerTransformer - begin (192361, 22)
transform - RobustScalerTransformer - end (192361, 22)
Accuracy: (0.7041292153814962,) | Precision:0.47997471272916586 | Recall:0.7212897937894678 | F1-score:0.5763940575783739 | FP:41952 | FN:14962 | VP:38721 | VN:96726
over sampled training:
Accuracy: (0.7090994966757524,) | Precision:0.7045375993679949 | Recall:0.7202512294668224 | F1-score:0.7123077636219064 | FP:41888 | FN:38795 | VP:99883 | VN:96790
test:
Accuracy: (0.7044769291551434,) | Precision:0.47986743173723784 | Recall:0.7240632930288102 | F1-score:0.5771999762003927 | FP:10515 | FN:3697 | VP:9701 | VN:24178




ensemble
training:
transform - ManualFeatureSelectorTransformer (192361, 22)
transform - ManualFeatureSelectorTransformer - end (192361, 20)
transform - ImputerTransformer - begin (192361, 20)
transform - ImputerTransformer - end (192361, 20)
transform - DateTransformer (192361, 20)
transform - DateTransformer - end (192361, 22)
transform - RobustScalerTransformer - begin (192361, 22)
transform - RobustScalerTransformer - end (192361, 22)
Accuracy: (0.8381168740025265,) | Precision:0.6597366891997223 | Recall:0.8671832796229719 | F1-score:0.7493681889155386 | FP:24010 | FN:7130 | VP:46553 | VN:114668
over sampled training:
Accuracy: (0.8573169500569665,) | Precision:0.8369096669794259 | Recall:0.8876029362984756 | F1-score:0.8615112193619731 | FP:23987 | FN:15587 | VP:123091 | VN:114691
test:
Accuracy: (0.7289097752178162,) | Precision:0.5100933847788403 | Recall:0.6808478877444395 | F1-score:0.5832294363990922 | FP:8761 | FN:4276 | VP:9122 | VN:25932


In [23]:
# Cumulative explained variance of the preprocessed features as a function of
# the number of principal components.
data = pd.read_csv('data.csv', delimiter=';')

data = data.dropna(subset=['Vai Chover Amanha']).reset_index(drop=True)

X = data.drop('Vai Chover Amanha', axis=1)
y = data['Vai Chover Amanha']

pipeline = Pipeline(steps=[
    ('manual',ManualFeatureSelectorTransformer()),
    ('imputer',ImputerTransformer()),
    ('date', DateTransformer()),
    ('scaler', RobustScalerTransformer()),
])

# handleOutliersZIndex edits the frame it receives, so it is given a copy and
# the raw feature frame `X` stays intact.
X_train = pipeline.fit_transform(handleOutliersZIndex(X.copy(),4), y)
#print(X_train.describe())
print(X_train.shape)

# One full decomposition is enough: the variance explained by the first k
# components is the cumulative sum of the per-component ratios. This replaces
# refitting PCA for every k and removes the hard-coded component count, which
# would raise as soon as the number of features changed.
pca = PCA(svd_solver='full')
pca.fit(X_train)

# Entry k of `var_ratio` is the variance explained by the first k components
# (k = 0 explains nothing).
var_ratio = np.concatenate(([0.0], np.cumsum(pca.explained_variance_ratio_))).tolist()
nums = np.arange(len(var_ratio))

print(nums)
print(var_ratio)

handleOutliersZIndex - begin (240452, 22)
handleOutliersZIndex - end (240452, 22)
fit - ManualFeatureSelectorTransformer - begin (240452, 22)
fit - ManualFeatureSelectorTransformer - end (240452, 22)
transform - ManualFeatureSelectorTransformer (240452, 22)
transform - ManualFeatureSelectorTransformer - end (240452, 20)
fit - ImputerTransformer - begin (240452, 20)
fit - ImputerTransformer - end (240452, 20)
transform - ImputerTransformer - begin (240452, 20)
transform - ImputerTransformer - end (240452, 20)
fit - DateTransformer - begin (240452, 20)
fit - DateTransformer - end (240452, 20)
transform - DateTransformer (240452, 20)
transform - DateTransformer - end (240452, 22)
fit - RobustScalerTransformer - begin (240452, 22)
fit - RobustScalerTransformer - end (240452, 22)
transform - RobustScalerTransformer - begin (240452, 22)
transform - RobustScalerTransformer - end (240452, 22)
(240452, 22)
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22]
[0.0, 0.2534805945

In [24]:
# Missing-value report per year: rows with any missing value, rows that are
# entirely missing, and rows with a missing target.
data = pd.read_csv('data.csv', delimiter=';')

# Columns excluded from the missing-value analysis.
data = data.drop(columns=['Radiacao Global', 'Cidade', 'Codigo', 'Latitude', 'Longitude'])

data['Data'] = pd.to_datetime(data['Data'])
data['Ano'] = data['Data'].dt.year
data = data.drop(columns='Data')


def _print_yearly_counts(label, count_missing, years=range(2001, 2024)):
    """Print ``count_missing(rows_of_year)`` for every year, then the total.

    Parameters
    ----------
    label : str
        Prefix printed in front of the grand total.
    count_missing : callable
        Receives the rows of ``data`` for one year and returns a count.
    years : iterable of int
        Years to report; a year without rows is reported with its count
        for an empty selection.
    """
    counter = 0
    for i in years:
        # Evaluate the count once and reuse it for both the total and the log.
        missing = count_missing(data.loc[data['Ano'] == i])
        counter += missing
        print('ano-',i,':', missing)
    print(label,counter)


# Rows with at least one missing value (the helper column 'Ano' is ignored).
_print_yearly_counts('any:', lambda rows: rows.drop('Ano', axis=1).isna().any(axis=1).sum())

# Rows where every remaining column is missing.
_print_yearly_counts('all:', lambda rows: rows.drop('Ano', axis=1).isna().all(axis=1).sum())

# Rows without a target label.
_print_yearly_counts('y:', lambda rows: rows['Vai Chover Amanha'].isna().sum())

ano- 2001 : 47
ano- 2002 : 458
ano- 2003 : 1073
ano- 2004 : 519
ano- 2005 : 337
ano- 2006 : 116
ano- 2007 : 439
ano- 2008 : 444
ano- 2009 : 311
ano- 2010 : 461
ano- 2011 : 464
ano- 2012 : 702
ano- 2013 : 659
ano- 2014 : 1958
ano- 2015 : 2179
ano- 2016 : 1730
ano- 2017 : 1309
ano- 2018 : 1599
ano- 2019 : 1280
ano- 2020 : 2340
ano- 2021 : 3996
ano- 2022 : 3181
ano- 2023 : 2590
any: 28192
ano- 2001 : 22
ano- 2002 : 73
ano- 2003 : 221
ano- 2004 : 501
ano- 2005 : 303
ano- 2006 : 34
ano- 2007 : 182
ano- 2008 : 149
ano- 2009 : 83
ano- 2010 : 34
ano- 2011 : 260
ano- 2012 : 500
ano- 2013 : 109
ano- 2014 : 735
ano- 2015 : 957
ano- 2016 : 1160
ano- 2017 : 722
ano- 2018 : 251
ano- 2019 : 345
ano- 2020 : 686
ano- 2021 : 1416
ano- 2022 : 1165
ano- 2023 : 549
all: 10457
ano- 2001 : 32
ano- 2002 : 92
ano- 2003 : 245
ano- 2004 : 511
ano- 2005 : 319
ano- 2006 : 62
ano- 2007 : 218
ano- 2008 : 171
ano- 2009 : 94
ano- 2010 : 47
ano- 2011 : 283
ano- 2012 : 612
ano- 2013 : 169
ano- 2014 : 1185
ano- 2015 : 18

In [25]:
# Compare two outlier criteria (1.5*IQR fences vs. |z| > 4) on complete rows.
data = pd.read_csv('data.csv', delimiter=';')

data = data.dropna().reset_index(drop=True)

columns = ['Pressao Maxima','Pressao Minima','Temperatura Maxima','Temperatura Minima','Temperatura Orvalho Maxima','Temperatura Orvalho Minima',
                'Umidade Minima', 'Umidade Maxima','Precipitacao Total', 'Pressao Media', 'Temperatura Media', 'Temperatura Orvalho Media', 'Umidade Media', 
                'Direcao Vento', 'Rajada Maxima de Vento', 'Vento Velocidade Media', 'Radiacao Global']

# Absolute z-score above which a value is counted as an outlier.
threshold = 4


def count_iqr_outliers(values):
    """Return how many entries of ``values`` lie outside the 1.5*IQR fences."""
    q25, q75 = np.percentile(values, [25, 75])
    cut_off = (q75 - q25) * 1.5
    lower_bound, upper_bound = q25 - cut_off, q75 + cut_off
    # Vectorised comparison instead of a Python-level loop over every row.
    return int(((values < lower_bound) | (values > upper_bound)).sum())


def count_zscore_outliers(values, threshold):
    """Return how many entries of ``values`` have ``|z-score| > threshold``."""
    z_scores = np.abs(stats.zscore(values))
    return int((z_scores > threshold).sum())


# IQR criterion, per column and in total.
counter = 0
for column in columns:
    num_outliers = count_iqr_outliers(data[column])
    print(column,num_outliers)
    counter += num_outliers
print(counter)

# z-score criterion, per column and in total.
counter = 0
for column in columns:
    num_outliers = count_zscore_outliers(data[column], threshold)
    print(column,num_outliers)
    counter += num_outliers
print(counter)

# Rainfall again, this time ignoring zero readings. The selection gets its own
# name so `data` keeps referring to the full, complete-row frame.
column = 'Precipitacao Total'
rain_values = data.loc[data[column] != 0, column]

num_outliers = count_iqr_outliers(rain_values)
print(column,num_outliers)

num_outliers = count_zscore_outliers(rain_values, threshold)
print(column,num_outliers)

Pressao Maxima 1394
Pressao Minima 1282
Temperatura Maxima 251
Temperatura Minima 1346
Temperatura Orvalho Maxima 3154
Temperatura Orvalho Minima 1601
Umidade Minima 0
Umidade Maxima 12679
Precipitacao Total 44144
Pressao Media 1504
Temperatura Media 707
Temperatura Orvalho Media 2276
Umidade Media 1847
Direcao Vento 2053
Rajada Maxima de Vento 7409
Vento Velocidade Media 5893
Radiacao Global 186
87726
Pressao Maxima 0
Pressao Minima 0
Temperatura Maxima 0
Temperatura Minima 2
Temperatura Orvalho Maxima 24
Temperatura Orvalho Minima 23
Umidade Minima 0
Umidade Maxima 1339
Precipitacao Total 3134
Pressao Media 0
Temperatura Media 0
Temperatura Orvalho Media 7
Umidade Media 44
Direcao Vento 0
Rajada Maxima de Vento 671
Vento Velocidade Media 644
Radiacao Global 69
5957
Precipitacao Total 8802
Precipitacao Total 992


In [26]:
# Class balance of the target column in the raw file.
data = pd.read_csv('data.csv', sep=';')

target_counts = data['Vai Chover Amanha'].value_counts()
print(target_counts)

Nao    173371
Sim     67081
Name: Vai Chover Amanha, dtype: int64


In [27]:
# For every station code found in complete rows, report the most frequent
# latitude/longitude and how many of that station's rows differ from it.
station_codes = data.dropna()['Codigo'].unique()
for value in station_codes:
    station_rows = data[data['Codigo'] == value]
    latitudes = station_rows['Latitude']
    longitudes = station_rows['Longitude']
    lat_mode = latitudes.mode()[0]
    long_mode = longitudes.mode()[0]
    lat_outliers = (latitudes != lat_mode).sum()
    long_outliers = (longitudes != long_mode).sum()
    print("cod - ", value, " lat: ", lat_mode, " outliers: ", lat_outliers, " - long: ", long_mode," outliers: ", long_outliers)


cod -  A801  lat:  -30.05361111  outliers:  0  - long:  -51.17472221  outliers:  0
cod -  A802  lat:  -32.07888888  outliers:  0  - long:  -52.16777777  outliers:  0
cod -  A803  lat:  -29.72499999  outliers:  0  - long:  -53.72055554  outliers:  0
cod -  A804  lat:  -30.75055555  outliers:  0  - long:  -55.40138888  outliers:  0
cod -  A805  lat:  -27.85444444  outliers:  0  - long:  -53.7911111  outliers:  0
cod -  A808  lat:  -29.35027777  outliers:  0  - long:  -49.73333333  outliers:  0
cod -  A809  lat:  -29.83999999  outliers:  0  - long:  -57.08194443  outliers:  0
cod -  A810  lat:  -27.89055555  outliers:  0  - long:  -54.47999999  outliers:  0
cod -  A812  lat:  -30.54527777  outliers:  0  - long:  -53.46694443  outliers:  0
cod -  A813  lat:  -29.87222221  outliers:  0  - long:  -52.38194443  outliers:  0
cod -  A826  lat:  -29.70916666  outliers:  0  - long:  -55.52555554  outliers:  0
cod -  A828  lat:  -27.65777777  outliers:  0  - long:  -52.30583333  outliers:  0
cod -

In [None]:
# Scratch cell: alternative imblearn over-samplers.
# Every statement rebinds the same name `sm`, so after running the whole cell
# only the last one (KMeansSMOTE) remains bound. None of them is used by the
# cells visible above, which over-sample with RandomOverSampler.
sm = SMOTE(random_state=42,k_neighbors=15)

sm = BorderlineSMOTE(random_state=42)

sm = SVMSMOTE(random_state=42)

sm = KMeansSMOTE(random_state=42)

In [None]:
# Scratch cell: alternative feature-selection strategies.
# `selector` is rebound three times; after running the whole cell it refers to
# the SelectFromModel instance. No selector is fitted in this cell.

# Univariate selection keeping 20 features.
selector = SelectKBest(k=20)

# Sequential selection of 20 features driven by a decision tree.
dt = DecisionTreeClassifier(splitter='best', criterion='gini')
selector = SequentialFeatureSelector(dt, n_features_to_select=20)

# Selection driven by a class-balanced random forest.
rf = RandomForestClassifier(class_weight='balanced', random_state=42)
selector = SelectFromModel(rf)