In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s3e23/sample_submission.csv
/kaggle/input/playground-series-s3e23/train.csv
/kaggle/input/playground-series-s3e23/test.csv


In [2]:
database = pd.read_csv('/kaggle/input/playground-series-s3e23/train.csv')
database = database.drop(['id'], axis = 1)

In [3]:
import numpy as np
import pandas as pd

def capOutliers(data: pd.Series):
    """
    Cap values of a numeric series to upper and lower limits based on IQR.

    Args:
        data (pd.Series): A numeric pandas Series.

    Returns:
        pd.Series: The capped pandas Series.
    """
    # Calcula os quartis e o intervalo interquartil (IIQ)
    q1 = np.percentile(data, 25)
    q3 = np.percentile(data, 75)
    iqr = q3 - q1

    # Define os limites inferior e superior
    lower_limit = q1 - 1.5 * iqr
    upper_limit = q3 + 1.5 * iqr

    # Aplica os limites para capear os valores
    data_capped = data.copy()
    data_capped[data_capped > upper_limit] = upper_limit
    data_capped[data_capped < lower_limit] = lower_limit

    print(f"Coluna ajustada: ")
    print(f"Q1: {q1:.2f}, Q3: {q3:.2f}, IIQ: {iqr:.2f}")
    print(f"Limite Inferior: {lower_limit:.2f}, Limite Superior: {upper_limit:.2f}")
    print("-" * 50)

    return data_capped

def processOutliers(database: pd.DataFrame):
    """
    Processa e ajusta os outliers em colunas numéricas de um DataFrame.

    Args:
        database (pd.DataFrame): DataFrame contendo os dados.

    Returns:
        pd.DataFrame: DataFrame com outliers ajustados.
    """
    numeric_columns = database.select_dtypes(include=['number']).columns
    for column in numeric_columns:
        print(f"Ajustando outliers na coluna: {column}")
        database[column] = capOutliers(database[column])
    print("\nOutliers foram ajustados com sucesso.")
    return database

database = processOutliers(database)


Ajustando outliers na coluna: loc
Coluna ajustada: 
Q1: 13.00, Q3: 42.00, IIQ: 29.00
Limite Inferior: -30.50, Limite Superior: 85.50
--------------------------------------------------
Ajustando outliers na coluna: v(g)
Coluna ajustada: 
Q1: 2.00, Q3: 6.00, IIQ: 4.00
Limite Inferior: -4.00, Limite Superior: 12.00
--------------------------------------------------
Ajustando outliers na coluna: ev(g)
Coluna ajustada: 
Q1: 1.00, Q3: 3.00, IIQ: 2.00
Limite Inferior: -2.00, Limite Superior: 6.00
--------------------------------------------------
Ajustando outliers na coluna: iv(g)
Coluna ajustada: 
Q1: 1.00, Q3: 4.00, IIQ: 3.00
Limite Inferior: -3.50, Limite Superior: 8.50
--------------------------------------------------
Ajustando outliers na coluna: n
Coluna ajustada: 
Q1: 25.00, Q3: 111.00, IIQ: 86.00
Limite Inferior: -104.00, Limite Superior: 240.00
--------------------------------------------------
Ajustando outliers na coluna: v
Coluna ajustada: 
Q1: 97.67, Q3: 560.25, IIQ: 462.58
Lim

  data_capped[data_capped > upper_limit] = upper_limit
  data_capped[data_capped > upper_limit] = upper_limit


In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tqdm import tqdm

# Função para processar os outliers (ajustar conforme sua necessidade)
def processOutliers(database):
    # Aqui você pode aplicar a remoção de outliers com base nos limites superiores e inferiores
    Q3 = np.percentile(database, 75)
    Q1 = np.percentile(database, 25)
    IIQ = Q3 - Q1
    LS = Q3 + (1.5 * IIQ)
    LI = Q1 - (1.5 * IIQ)
    
    # Substituindo os outliers pelos limites superiores/inferiores
    database = np.where(database > LS, LS, database)
    database = np.where(database < LI, LI, database)
    
    return database

# Carregar e processar os dados
def loadAndPreprocessData(file_path):
    # Carregar os dados
    database = pd.read_csv(file_path)
    
    # Processar outliers (ajustado para o dataset completo, incluindo features)
    X = database.drop(columns=['defects'])
    y = database['defects']
    
    # Processar os dados de entrada
    X = processOutliers(X)
    
    # Dividir o dataset em treino e teste
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Escalonamento dos dados
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    return X_train, X_test, y_train, y_test

# Modelo 1: Regressão Logística com tqdm
def trainLogisticRegression(X_train, y_train):
    print("Treinando Regressão Logística...")
    model = LogisticRegression(max_iter=1000)
    
    # Simulando barra de progresso para treinar em etapas
    for _ in tqdm(range(1), desc="Treinando"):
        model.fit(X_train, y_train)
    return model

# Modelo 3: Random Forest com tqdm
def trainRandomForest(X_train, y_train):
    print("Treinando Random Forest...")
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    for _ in tqdm(range(1), desc="Treinando"):
        model.fit(X_train, y_train)
    return model

# Modelo 4: MLP com 256 camadas e callback de progresso
def trainMLP(X_train, y_train, input_dim):
    print("Treinando MLP com 256 camadas...")
    
    # Ajuste para usar a camada de entrada corretamente
    model = Sequential()
    model.add(Input(shape=(input_dim,)))  # Definindo a camada de entrada
    
    for _ in range(3):
        model.add(Dense(16, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    
    # Barra de progresso para o treinamento
    model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=1)
    
    return model

# Avaliação de modelos
def evaluateModel(model, X_test, y_test, isMLP=False):
    if isMLP:
        y_pred = (model.predict(X_test) > 0.5).astype("int32")
    else:
        y_pred = model.predict(X_test)
    print("Acurácia:", accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))

# Função principal
def main():
    # Caminho para o arquivo CSV
    file_path = '/kaggle/input/playground-series-s3e23/train.csv'  # Ajuste conforme necessário
    X_train, X_test, y_train, y_test = loadAndPreprocessData(file_path)
    
    # Modelos Clássicos
    print("\nTreinando o modelo de Regressão Logística...")
    lr_model = trainLogisticRegression(X_train, y_train)
    evaluateModel(lr_model, X_test, y_test)
    
    print("\nTreinando o modelo Random Forest...")
    rf_model = trainRandomForest(X_train, y_train)
    evaluateModel(rf_model, X_test, y_test)
    
    print("\nTreinando o modelo MLP com 256 camadas...")
    mlp_model = trainMLP(X_train, y_train, X_train.shape[1])
    evaluateModel(mlp_model, X_test, y_test, isMLP=True)

if __name__ == "__main__":
    main()



Treinando o modelo de Regressão Logística...
Treinando Regressão Logística...


Treinando: 100%|██████████| 1/1 [00:00<00:00,  1.86it/s]


Acurácia: 0.8141797278042548
              precision    recall  f1-score   support

       False       0.84      0.94      0.89     15825
        True       0.64      0.37      0.47      4528

    accuracy                           0.81     20353
   macro avg       0.74      0.66      0.68     20353
weighted avg       0.80      0.81      0.79     20353


Treinando o modelo Random Forest...
Treinando Random Forest...


Treinando: 100%|██████████| 1/1 [00:17<00:00, 17.58s/it]


Acurácia: 0.8096595096545963
              precision    recall  f1-score   support

       False       0.84      0.93      0.88     15825
        True       0.62      0.38      0.47      4528

    accuracy                           0.81     20353
   macro avg       0.73      0.65      0.68     20353
weighted avg       0.79      0.81      0.79     20353


Treinando o modelo MLP com 256 camadas...
Treinando MLP com 256 camadas...
Epoch 1/10
[1m2545/2545[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.7874 - loss: 0.4662
Epoch 2/10
[1m2545/2545[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.8133 - loss: 0.4324
Epoch 3/10
[1m2545/2545[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.8165 - loss: 0.4274
Epoch 4/10
[1m2545/2545[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step - accuracy: 0.8158 - loss: 0.4292
Epoch 5/10
[1m2545/2545[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1