<a href="https://colab.research.google.com/github/CllsPy/ML-Competition-Kaggle/blob/main/Playground%20Series%20-%20Season%204%2C%20Episode%203.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read() call while streaming a download.
CHUNK_SIZE = 40960
# Comma-separated list of "<directory>:<url-encoded download URL>" entries;
# the download loop below splits on ',' and ':' and URL-decodes the second part.
# NOTE(review): the embedded query string carries X-Goog-Date=20240312T070702Z
# and X-Goog-Expires=259200, so this looks like a time-limited signed URL that
# has presumably expired -- regenerate it before re-running.
DATA_SOURCE_MAPPING = 'playground-series-s4e3:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F68699%2F7659021%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240312%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240312T070702Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D88bbc7553f033f31912d705e0f7f1bdf877571763698975fe7aa6da596144704516f0c06a42ab6984ee4a9d5acf056c938408ed8d154d347fe63278ec710d647d4a46dd2e5ad7cb5bbd01a6353ab233cd8524672c794bb8633083c68581004d73930f9733d04bea053de5acb6db2ea575f271d8337e3dd7d7e50682ca86a77dcd5b976a8527561c917353cb6a2ee1443dce794638f0d9f735171b0cedf93b7a860b781222b280438dd99cf4e3761fa6ff5c4739f4340e00683a4d6430bd26b823804d86db2518057e190d6232556cd45ee0a53ab8b5ea7e8bf0790478fd7e22a5822f843dbfd2f14c30e89a272fad01d27164d8f0d87de7d0bffd3fa42c8d80f'

# Directory the downloaded archives are extracted into (one sub-directory per data source).
KAGGLE_INPUT_PATH='/kaggle/input'
# Directory created alongside the input directory for notebook outputs.
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): not referenced anywhere in the visible code.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<url-encoded URL>" entry of DATA_SOURCE_MAPPING
# into a temporary file and unpack it under KAGGLE_INPUT_PATH/<directory>.
# A failed download is reported and skipped so the remaining sources still load.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The Content-Length header can be absent; in that case the byte
            # counter is still printed but the progress bar stays empty
            # instead of crashing on int(None).
            total_bytes = int(total_length) if total_length else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_bytes)) if total_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Make sure everything is on disk before the archive is reopened
            # by path (tarfile.open(tfile.name) does not share our buffer).
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name: reusing `tarfile` here
                # would overwrite the module and break the next iteration.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is a subclass of OSError, so it must be handled first.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


In [None]:
# basics
import pandas as pd
import numpy as np
import scipy.stats as stats

# scikit-learn
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

# gradient boosting libraries
from xgboost.sklearn import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.multioutput import MultiOutputClassifier

from sklearn.model_selection import train_test_split

from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler

from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score


In [None]:
# Seed NumPy's global random generator before the split below.
np.random.seed(101)

# Location of the training CSV.
URL = '/kaggle/input/playground-series-s4e3/train.csv'

# Label columns; everything else in the training frame is treated as a feature.
# NOTE(review): 'K_Scatch' is used as a column key -- presumably it matches the
# CSV header, so do not "correct" the spelling without checking the data.
TARGET_FEATURES = [
    'Pastry',
    'Z_Scratch',
    'K_Scatch',
    'Stains',
    'Dirtiness',
    'Bumps',
    'Other_Faults',
]

# Load the training data, indexed by the 'id' column.
train = pd.read_csv(URL)
train = train.set_index('id')

# Separate features from labels.
X = train.drop(columns=TARGET_FEATURES)
y = train[TARGET_FEATURES]

# Hold out 35% of the rows for validation.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.35)

# Candidate models, keyed by a short name.
# Currently disabled candidates: 'lgr' LogisticRegression(max_iter=10000),
# 'svm' SVC(), 'gnb' GaussianNB(), 'sgd' SGDClassifier().
models = dict(
    knn=KNeighborsClassifier(),
    rfc=RandomForestClassifier(),
    gbc=GradientBoostingClassifier(),
    lgbm=LGBMClassifier(),
    xgb=XGBClassifier(),
)


# func for eval
def train_eval(models, X_train, X_val, y_train, y_val):
    """Fit every candidate model and score it on the validation split.

    Each model is placed behind a ``RobustScaler`` applied to the non-object
    columns of ``X_train``, wrapped in a ``MultiOutputClassifier`` so it can
    handle the multi-column target, fitted on the training split and scored
    on the validation split.

    Args:
        models: mapping of model name -> unfitted classifier.
        X_train: training features (a DataFrame; ``select_dtypes`` is called on it).
        X_val: validation features.
        y_train: training labels.
        y_val: validation labels.

    Returns:
        dict mapping each model name to the value returned by
        ``MultiOutputClassifier.score(X_val, y_val)``.
    """
    # Take the column list from the data that was passed in rather than from
    # the notebook-level ``X``, so the function works on any feature frame.
    numeric_features = X_train.select_dtypes(exclude=['object']).columns
    numeric_transformer = Pipeline(steps=[("scaler", RobustScaler())])

    preprocessor = ColumnTransformer(
        transformers=[("num", numeric_transformer, numeric_features)]
    )

    models_score = {}
    for name, model in models.items():
        # Scaling + classifier as a single estimator ...
        clf = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", model)])
        # ... wrapped so that it is fitted against every target column.
        fitted = MultiOutputClassifier(clf).fit(X_train, y_train)
        models_score[name] = fitted.score(X_val, y_val)

    return models_score

# call func.
#train_eval(models, X_train, X_val, y_train, y_val)


# Initialise the individual base estimators
knn_ens = KNeighborsClassifier()
rfc_ens = RandomForestClassifier()
gbc_ens = GradientBoostingClassifier()
xgb_ens = XGBClassifier()
lgbm_ens = LGBMClassifier()

# Combine them in a hard-voting ensemble
voting_clf = VotingClassifier(
    estimators=[
        ('knn', knn_ens),
        ('rfc', rfc_ens),
        ('gbc', gbc_ens),
        ('xgb', xgb_ens),
        ('lgbm_ens', lgbm_ens),
    ],
    voting='hard',
)

# Scale every non-object column with RobustScaler before it reaches the ensemble.
# NOTE(review): ColumnTransformer is left at its defaults here, so columns not
# listed in numeric_features are not forwarded -- confirm that is intended.
numeric_features = X.select_dtypes(exclude=['object']).columns
numeric_transformer = Pipeline(steps=[("scaler", RobustScaler())])

preprocessor = ColumnTransformer(
    transformers=[("num", numeric_transformer, numeric_features)]
)

clf = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", voting_clf),
])

# Train. The whole pipeline (scaling + ensemble) is fitted: previously the bare
# voting_clf was fitted here, which left the preprocessing pipeline above unused.
clf_vcl = MultiOutputClassifier(clf).fit(X_train, y_train)

# Predict on the validation split
y_pred = clf_vcl.predict(X_val)

# Compute the accuracy on the validation split
accuracy = accuracy_score(y_val, y_pred)
print("Acurácia do Voting Classifier:", accuracy)

[LightGBM] [Info] Number of positive: 934, number of negative: 11558
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003577 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5158
[LightGBM] [Info] Number of data points in the train set: 12492, number of used features: 27
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.074768 -> initscore=-2.515657
[LightGBM] [Info] Start training from score -2.515657
[LightGBM] [Info] Number of positive: 771, number of negative: 11721
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003737 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5158
[LightGBM] [Info] Number of data points in the train set: 12492, number of used features: 27
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.061720 -> initscore=-2.721449
[LightGBM] [Info] Start training from score -2.721449
[LightGBM] [Info

In [None]:
# Re-seed NumPy's global random generator for this cell.
np.random.seed(101)

# Test features (indexed by 'id') and the submission template.
test = pd.read_csv('/kaggle/input/playground-series-s4e3/test.csv')
test = test.set_index('id')
sub = pd.read_csv('/kaggle/input/playground-series-s4e3/sample_submission.csv')

# Fill the target columns of the template with the ensemble's predictions.
# The assignment is positional, so the template rows are assumed to be in the
# same order as the test rows -- TODO confirm.
test_predictions = clf_vcl.predict(test)
sub[TARGET_FEATURES] = test_predictions

# Write the submission file without the DataFrame index.
submission_file = '16__ensemble__fe.csv'
sub.to_csv(submission_file, index=False)