In [None]:
%%time
import subprocess
import sys

def install_and_import(package, pip_name=None):
    """Import ``package``, installing it with pip first if it is missing.

    Parameters
    ----------
    package : str
        Importable module path, e.g. ``"numpy"`` or ``"matplotlib.pyplot"``.
    pip_name : str, optional
        Distribution name passed to ``pip install``. When omitted, the
        top-level package of ``package`` is used (``"matplotlib.pyplot"``
        -> ``"matplotlib"``), translated through a small table of modules
        whose import name differs from their PyPI name.

    Raises
    ------
    subprocess.CalledProcessError
        If ``pip install`` exits with a non-zero status.
    ImportError
        If the module still cannot be imported after the installation.
    """
    import importlib

    # Import names that differ from the distribution name on PyPI.
    known_pip_names = {"sklearn": "scikit-learn"}

    try:
        importlib.import_module(package)
        return
    except ImportError:
        pass

    if pip_name is None:
        # pip installs distributions, not dotted submodule paths.
        top_level = package.split(".")[0]
        pip_name = known_pip_names.get(top_level, top_level)

    subprocess.check_call([sys.executable, "-m", "pip", "install", pip_name])
    # Let the import system notice packages installed while the kernel is running.
    importlib.invalidate_caches()
    importlib.import_module(package)

# (module to import, note on how it is imported/aliased later in this cell).
# Only the first element is consumed by the loop below; the second one is
# purely informational.
packages = [
    ("sklearn", "*"),
    ("tqdm", "tqdm"),
    ("re", "re"),
    ("numpy", "np"),
    ("pandas", "pd"),
    ("matplotlib.pyplot", "plt"),
    ("seaborn", "sns"),
    ("os", "os"),
    ("warnings", "warnings"),
    ("xgboost", "xgboost"),
    ("catboost", "catboost"),
    ("lightgbm", "lightgbm"),
    ("tabulate", "tabulate"),
    ("statsmodels", "ARIMA"),
    ("colorama", "Fore, Style, init"),
    ("category_encoders", "*"),
    ("mlxtend", "*"),
    ("optuna", "optuna"),
]

# Make sure every listed module is importable, installing it on demand.
for package, import_as in packages:
    install_and_import(package)

# Specific imports
from sklearn.pipeline import *
from tqdm import tqdm
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
from sklearn.model_selection import *
from sklearn.preprocessing import *
from sklearn.metrics import *
from sklearn.tree import *
from sklearn.ensemble import *
from xgboost import *
from catboost import *
from sklearn.linear_model import *
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.isotonic import *
from sklearn.compose import *
from sklearn.neural_network import *
from lightgbm import *
from tabulate import tabulate
from statsmodels.tsa.arima.model import ARIMA
from colorama import Fore, Style, init
from category_encoders import *
from mlxtend.evaluate import *
from sklearn.model_selection._split import _BaseKFold, indexable, _num_samples
from sklearn.utils.validation import _deprecate_positional_args
from sklearn.base import *
from sklearn.inspection import *
import optuna
from optuna.visualization import *
from optuna.pruners import *
import logging
import sys
from sklearn.decomposition import *

# Attach a stdout handler to the "optuna" logger so its messages show up in
# the cell output and it is visible that Optuna is working as expected.
optuna.logging.get_logger("optuna").addHandler(
    logging.StreamHandler(sys.stdout)
)

# Silence every warning category. Narrower alternatives, kept for reference:
#   warnings.filterwarnings("ignore", category=DeprecationWarning)
#   warnings.filterwarnings("ignore", category=UserWarning)
#   warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore")

# Show up to 200 rows / columns when a DataFrame is displayed.
pd.set_option("display.max_rows", 200)
pd.set_option("display.max_columns", 200)

In [None]:
# Random seed shared by the cross-validation splitters defined further down.
seed = 42

In [None]:
class FeatureEngineering(BaseEstimator, TransformerMixin):
    """Stateless transformer adding arithmetic combinations of the numeric columns.

    The new columns are sums, products, one ratio and pairwise differences of
    ``capdiameter``, ``stemheight`` and ``stemwidth``.

    Author's note: these features helped the LGBM model quite a bit, but they
    were not used for the final blend.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the engineered columns appended."""
        out = X.copy()

        cap = out['capdiameter']
        height = out['stemheight']
        width = out['stemwidth']

        # Sum and products.
        out['sum'] = cap + height + width
        out['capheightproduct'] = cap * height
        out['capwidthproduct'] = cap * width
        out['heightwidthproduct'] = height * width
        out['product'] = cap * height * width

        # Ratio: zero widths are turned into NaN first, then a small epsilon
        # is added to the denominator. (Cap-to-stem ratios were tried by the
        # author and left disabled.)
        safe_width = width.replace(0, np.nan) + 1e-6
        out['stemheighttowidth'] = height / safe_width

        # Pairwise differences.
        out['capheightdifference'] = cap - height
        out['capwidthdifference'] = cap - width
        out['stemheightwidthdifference'] = height - width

        # Infinite values anywhere in the frame are replaced by 0.
        out = out.replace([np.inf, -np.inf], 0)

        return out

In [None]:
%%time

import polars as pl
import pandas as pd


def _read_without_id(csv_path):
    """Read a CSV with polars, drop its ``id`` column and convert to pandas."""
    return pl.read_csv(csv_path).drop("id").to_pandas()


train = _read_without_id("/kaggle/input/playground-series-s4e8/train.csv")
print("Train Dataset =", train.shape)
display(train)

test = _read_without_id("/kaggle/input/playground-series-s4e8/test.csv")
print("Test Dataset =", test.shape)
display(test)

submission = pd.read_csv("/kaggle/input/playground-series-s4e8/sample_submission.csv")

# `origin` is currently just a copy of the training frame; the separate read
# of an original dataset is disabled.
origin = train.copy()
print("Original Dataset =", origin.shape)
display(origin)

target = "class"

# Strip hyphens from the column names of all three frames
# (the displays above therefore still show the hyphenated names).
for frame in (train, test, origin):
    frame.columns = frame.columns.str.replace("-", "")

In [None]:
# Credits to Ambros' notebooks for this dtype-harmonisation recipe.

# `df` is the working training frame used by every later cell. It was never
# defined in this notebook, so a fresh "Restart & Run All" failed here with a
# NameError. Working on a copy also keeps `train` untouched on re-runs.
# NOTE(review): assumes the working frame is exactly the competition training
# data loaded above -- confirm no additional data was meant to be appended.
df = train.copy()

float_features = ['capdiameter', 'stemheight', 'stemwidth']
initial_features = test.columns
cat_features = [f for f in initial_features if f not in float_features]

for feature in initial_features:
    if feature in cat_features:
        # Build one shared category set from train and test so both frames
        # end up with an identical categorical dtype.
        categories = sorted(set(df[feature].dropna()) | set(test[feature].dropna()))
        dtype = pd.CategoricalDtype(categories=categories, ordered=False)
        print(f"{feature:30} {len(dtype.categories)}")

        df[feature] = df[feature].astype(dtype)
        test[feature] = test[feature].astype(dtype)
    else:
        # Coerce non-numeric entries to NaN, then store as float32.
        dtype = np.float32
        df[feature] = pd.to_numeric(df[feature], errors='coerce').astype(dtype)
        test[feature] = pd.to_numeric(test[feature], errors='coerce').astype(dtype)

display(df)
display(test)

In [None]:
# Encode the label as 0 ('e') / 1 ('p').
# Guarded so that re-running the cell is a no-op: mapping an already encoded
# column would turn every label into NaN and make astype(int) raise.
if not pd.api.types.is_integer_dtype(df[target]):
    df[target] = df[target].map({'e': 0, 'p': 1}).astype(int)

In [None]:
# Column names treated as categorical features (hyphen-free spelling).
cat_feats = [
    "capshape",
    "capsurface",
    "capcolor",
    "doesbruiseorbleed",
    "gillattachment",
    "gillspacing",
    "gillcolor",
    "stemroot",
    "stemsurface",
    "stemcolor",
    "veiltype",
    "veilcolor",
    "hasring",
    "ringtype",
    "sporeprintcolor",
    "habitat",
    "season",
]

In [None]:
# Split the training frame into the label vector and the feature matrix.
y = df[target]
X = df.drop(columns=target)

In [None]:
# XGBoost classifier with hard-coded hyper-parameters
# (presumably the outcome of an earlier tuning run -- not shown in this notebook).
xgb = XGBClassifier(
    enable_categorical=True,
    device='cuda',
    n_estimators=417,
    learning_rate=0.06743,
    max_depth=18,
    colsample_bytree=0.567,
    min_child_weight=4,
    reg_lambda=73,
    subsample=1,
    num_parallel_tree=5,
)

In [None]:
def CVBlender(model, df, df_test, target, cv="skf", seed=seed):
    """Cross-validate ``model`` and average its test-set probabilities over folds.

    For every fold the model is fitted on the training part, scored with the
    Matthews correlation coefficient (MCC) on the validation part using hard
    predictions, and asked for ``predict_proba(df_test)[:, 1]``. The per-fold
    test probabilities are averaged and returned.

    The same ``model`` object is refitted on each fold, so after the call it
    holds the fit of the last fold.

    Parameters
    ----------
    model : estimator exposing ``fit``, ``predict`` and ``predict_proba``.
    df : DataFrame containing the features and the ``target`` column.
    df_test : DataFrame of test features passed to ``predict_proba``.
    target : str
        Name of the label column in ``df``.
    cv : {"kf", "skf", "rskf"}, default "skf"
        ``"kf"``   -> shuffled 5-fold ``KFold``;
        ``"skf"``  -> shuffled 25-fold ``StratifiedKFold``;
        ``"rskf"`` -> ``RepeatedStratifiedKFold`` with 5 splits x 2 repeats.
    seed : int
        ``random_state`` handed to the splitter.

    Returns
    -------
    numpy.ndarray
        Mean over folds of the second ``predict_proba`` column for ``df_test``.

    Raises
    ------
    ValueError
        If ``cv`` is not one of the supported keys.
    """
    X = df.drop(target, axis=1)
    y = df[target]

    if cv == "kf":
        cv_method = KFold(n_splits=5, shuffle=True, random_state=seed)
    elif cv == "skf":
        cv_method = StratifiedKFold(n_splits=25, shuffle=True, random_state=seed)
    elif cv == "rskf":
        cv_method = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=seed)
    else:
        raise ValueError("Invalid cross-validation method. Please choose 'kf', 'skf', or 'rskf'.")

    fold_scores = []
    test_probas = []

    for fold, (train_idx, valid_idx) in enumerate(cv_method.split(X, y), start=1):
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[valid_idx], y.iloc[valid_idx]

        model.fit(X_train, y_train)

        fold_mcc = matthews_corrcoef(y_val, model.predict(X_val))
        print(f"Matthews Correlation Coefficient for fold {fold}: {fold_mcc}")
        fold_scores.append(fold_mcc)

        test_probas.append(model.predict_proba(df_test)[:, 1])

        # Running mean over the folds finished so far. Previously this line
        # just repeated the current fold's score under an "Average" label.
        print(f"Average Matthews Correlation Coefficient Across Fold {fold}: {np.mean(fold_scores)}")

    final_accuracy = np.mean(fold_scores)
    print(f"Final Total Average Matthews Correlation Coefficient: {final_accuracy}")

    return np.mean(test_probas, axis=0)

In [None]:
# Run the CV bagging with the XGBoost model configured above.
# (The call used to reference `xgb1`, a name this notebook never defines.)
preds = CVBlender(xgb, df, test, target)

In [None]:
# Store the fold-averaged XGBoost probabilities in submission layout and save them.
xgb_sub = submission.assign(**{target: preds})
xgb_sub.to_csv('XGB_tunned_25Folds.csv', index=False)

# AutoGluon

In [None]:
%%capture
!pip install autogluon.tabular

from autogluon.tabular import TabularDataset, TabularPredictor

In [None]:
# Wrap the training and test frames for AutoGluon.
train_data, test_data = (TabularDataset(frame) for frame in (df, test))

In [None]:
# Hyper-parameter search settings forwarded to TabularPredictor.fit.
hyperparameter_tune_kwargs = dict(
    num_trials=54,
    scheduler='local',
    searcher='auto',
)

# Model families left out of training.
# NOTE(review): AutoGluon's documented key for Extra Trees appears to be 'XT';
# confirm that 'XTModel' is actually honoured.
excluded_model_types = ['KNN', 'XTModel']

predictor = TabularPredictor(
    label=target,
    problem_type='binary',
    eval_metric='mcc',
)

predictor.fit(
    train_data,
    presets='best_quality',
    time_limit=8 * 60 * 60,  # 8 hours, in seconds
    hyperparameter_tune_kwargs=hyperparameter_tune_kwargs,
    excluded_model_types=excluded_model_types,
    ag_args_fit={'num_gpus': 1, 'num_cpus': 4},
    save_space=True,
    keep_only_best=True,
)

In [None]:
# Class probabilities for the test set; the second column is kept under the
# name 'autogluon' next to the submission ids, without the target column.
test_preds_proba = predictor.predict_proba(test_data)
sub = submission.drop(columns=target)
sub['autogluon'] = test_preds_proba.iloc[:, 1]
sub.to_csv('autogluon_probas.csv', index=False)

# Blending

In [None]:
# Weighted blend of the two probability vectors: 30% XGBoost CV bag, 70% AutoGluon.
# The AutoGluon probabilities live in sub['autogluon']; `sub` has no `target`
# column (it was dropped when `sub` was built), so sub[target] raised a KeyError.
blend_proba = 0.3 * xgb_sub[target] + 0.7 * sub['autogluon']

final_submission = submission.copy()
# Threshold at 0.5 and translate back to the original labels.
final_submission[target] = (blend_proba > 0.5).map({False: 'e', True: 'p'})
final_submission.to_csv('xgb_folds_tunned1.csv', index=False)
display(final_submission)

*Note: this notebook combines two separate notebooks into one for convenience.*