<span style= "color:DarkSeaGreen; font-size:16px"><strong>
Challenge: </strong></span>
- Predict  <strong><span style="color:DarkSeaGreen; text-transform: uppercase;">target</span></strong> given features
- Performance metric is <strong><span style= "color:DarkSeaGreen">RMSE</span></strong> between predicted and observed scores.

---
# 💾 Initialize and Load Data

In [None]:
# Update libraries - some of these are optional
# Silence every Python warning for the rest of the session.
import warnings
warnings.filterwarnings("ignore")
# Report the plotly version that is loaded *before* the upgrades below run.
import plotly
print(plotly.__version__)

# IPython shell escapes (`!`): upgrade clustering and ML packages in place.
!pip install --upgrade hdbscan
!pip install --upgrade scikit-learn

# Upgrade plotting packages; the trailing notes record the observed version changes.
# NOTE(review): plotly was already imported above, so the upgraded version
# presumably only takes effect after a kernel restart - confirm.
!pip install --upgrade plotly  ## 5.24.1 -> 6.3.1
!pip install --upgrade seaborn  ##  0.12.2 ->  0.12.3

!pip install --upgrade umap-learn

# Import umap after its upgrade so the freshly installed version is the one loaded.
import umap

In [None]:
# import common libraries and toolkits
import sys
import os
import joblib
from multiprocessing import Pool, cpu_count

# data manipulation
import pandas as pd
import xarray as xr
import numpy as np

# machine learning libraries
import sklearn as skl
import lightgbm as lgb
import xgboost as xgb
import catboost as catb

# deep learning libraries
import torch
import torch.nn as nn

# clustering libraries
import hdbscan

# hyperparameter optimization
import optuna
import hyperopt

# visualization libraries
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import plotly as go

# other useful libraries
import math 
import itertools
import random

# time management
from time import time
from tqdm import tqdm


#import pkg_resources
#print("hdbscan:", pkg_resources.get_distribution("hdbscan").version)
print("sklearn:", skl.__version__)

In [None]:
# Set globals

# processing settings: prefer the GPU when PyTorch can see one
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
CORES = min(4, cpu_count())  # never use more than 4 workers, to avoid memory issues

print(f"Using device: {DEVICE}")
print(f"Using {CORES} CPU cores when multiprocessing")

# reproducibility: seed the Python, NumPy and PyTorch generators with one value
SEED = 67
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# pandas display settings
pd.options.display.max_rows = 25
pd.options.display.max_columns = 10
pd.options.display.max_colwidth = 15
pd.options.display.width = 130
pd.options.display.precision = 4

# custom seaborn/matplotlib style built from named xkcd colours
MY_PALETTE = sns.xkcd_palette([
    'ocean blue',
    'gold',
    'dull green',
    'dusty rose',
    'dark lavender',
    'carolina blue',
    'sunflower',
    'lichen',
    'blush pink',
    'dusty lavender',
    'steel grey',
])
MY_CMAP = mpl.colors.ListedColormap(MY_PALETTE)
sns.set_theme(
    context='paper',
    style='ticks',
    palette=MY_PALETTE,
    rc={
        "figure.figsize": (9, 3),
        "axes.spines.right": False,
        "axes.spines.top": False,
    },
)

# preview the palette as a strip of swatches
sns.palplot(MY_PALETTE)

In [None]:
### load tabular data
def summarize_data(df, features):
    """Print a structural summary and descriptive statistics for selected columns.

    Sections are separated by ruler lines and appear in this order:
    ``DataFrame.info()`` output, the first five rows (transposed),
    ``describe()`` of the float/int columns, and ``describe()`` of the
    object/category/bool columns. A ``describe()`` section is skipped when
    no selected column has a matching dtype.

    Args:
        df: DataFrame to summarize.
        features: Column labels of ``df`` to include in the summary.

    Returns:
        None. Everything is written to standard output.
    """
    ruler = "=" * 69
    subset = df[features]

    print(ruler)
    # info() writes its own report and returns None; wrapping it in print()
    # would add a stray "None" line to the output.
    subset.info()
    print(ruler)
    print(subset.head(5).T)
    print(ruler)

    # Check for matching columns explicitly instead of swallowing every
    # exception: describe() raises on a frame without columns.
    numeric = subset.select_dtypes(include=['float', 'int'])
    if numeric.shape[1] > 0:
        print(numeric.describe().T)

    non_numeric = subset.select_dtypes(include=['object', 'category', 'bool'])
    if non_numeric.shape[1] > 0:
        print(non_numeric.describe().T)


def get_target_labels(df, target, targets, cuts = 10):
    """Add categorical label column(s) derived from a target, for plotting.

    If the target has fewer than 8 distinct values, a single ``label``
    column is added holding ``max(target) - target``. Otherwise the target
    values of the rows where ``target_mask`` is True are binned twice into
    ``cuts`` bins: ``qcut_label`` from ``pd.qcut`` and ``cut_label`` from
    ``pd.cut``. Bin indices are reversed (``cuts - index``), so the highest
    bin is labelled 1 and the lowest ``cuts``. Rows outside the mask get -1.

    Args:
        df: DataFrame containing ``target`` and a boolean ``target_mask`` column.
        target: Name of the target column.
        targets: List of target column names; the new column names are
            appended to this list in place.
        cuts: Number of bins used for the binned labels.

    Returns:
        Tuple ``(df, targets)``: the same DataFrame with the new column(s)
        and the updated list.
    """
    if df[target].nunique() < 8:
        # Use the DataFrame passed in, not a notebook-level global.
        df["label"] = df[target].max() - df[target]
        targets.append("label")
    else:
        train_values = df.loc[df.target_mask.eq(True), target]
        # Assignment aligns on the index, so rows outside the mask become NaN.
        df["qcut_label"] = cuts - pd.qcut(train_values, cuts, labels=False)
        df["cut_label"] = cuts - pd.cut(train_values, cuts, labels=False)
        binned = ["qcut_label", "cut_label"]
        df[binned] = df[binned].fillna(-1).astype('int16')
        targets.extend(binned)
    return df, targets


def get_transformed_target(df, target, targets=None, TargetTransformer=None, name="std"):
    """Add a scaled/transformed copy of the target column to ``df``.

    The transformer is fitted only on rows where ``target_mask`` is True and
    then applied to every row. The result is stored in ``f"{target}_{name}"``.

    Args:
        df: DataFrame containing ``target`` and a boolean ``target_mask`` column.
        target: Name of the column to transform.
        targets: Optional list of target column names. The new column name is
            appended to it in place; a fresh list is created when omitted.
        TargetTransformer: Object exposing ``fit`` and ``transform`` on a
            single-column 2-D array. Defaults to a new
            ``skl.preprocessing.StandardScaler()``.
        name: Suffix for the new column name.

    Returns:
        Tuple ``(df, TargetTransformer, targets)``: the DataFrame with the new
        column, the fitted transformer (for inverse-transforming predictions)
        and the updated list of targets.
    """
    # A list default would be shared between calls and keep growing.
    if targets is None:
        targets = []
    if TargetTransformer is None:
        TargetTransformer = skl.preprocessing.StandardScaler()

    # Fit on the masked rows only, then transform every row.
    fit_values = df.loc[df.target_mask.eq(True), target].values
    TargetTransformer.fit(fit_values.reshape(-1, 1))

    new_column = f"{target}_{name}"
    all_values = df[target].values
    df[new_column] = TargetTransformer.transform(all_values.reshape(-1, 1))
    targets.append(new_column)
    return df, TargetTransformer, targets


def clean_categoricals(df, features, string_length = 3):
    """Normalize the text columns among ``features`` and store them as categoricals.

    Each object/string column is converted to text, stripped, case-folded,
    has spaces, hyphens, dots and commas replaced by underscores and
    parentheses removed, is truncated to ``string_length`` characters and is
    finally cast to ``category``. Other columns are left untouched.

    NOTE(review): ``astype(str)`` may turn missing values into literal text,
    depending on the pandas version - confirm this is intended.

    Returns the same DataFrame, modified in place.
    """
    # (old, new) literal substitutions, applied in this order
    substitutions = (
        (" ", "_"),
        ("(", ""),
        (")", ""),
        ("-", "_"),
        (".", "_"),
        (",", "_"),
    )
    text_columns = df[features].select_dtypes(include=['object', 'string']).columns
    for column in text_columns:
        cleaned = df[column].astype(str).str.strip().str.casefold()
        for old, new in substitutions:
            cleaned = cleaned.str.replace(old, new, regex=False)
        df[column] = cleaned.str[:string_length].astype('category')
    return df

    
def load_tabular_data(path, extra_data = None, verbose = True, csv_sep=","):
    """Load train/test/sample-submission CSVs into one DataFrame.

    ``path`` must contain ``train.csv``, ``test.csv`` and
    ``sample_submission.csv``. The ID column is the single column shared by
    the test file and the sample submission. The other submission columns are
    the targets and the other test columns are the features. Test rows take
    their target values from the sample submission. A boolean ``target_mask``
    column marks training rows (True) versus test rows (False).

    Args:
        path: Directory holding the three CSV files.
        extra_data: Optional CSV with additional training rows. It must
            contain the ID, feature and target columns. Its IDs are replaced
            by fresh ones that do not collide with the loaded rows.
        verbose: Print a short load report when True.
        csv_sep: Field separator passed to ``pd.read_csv``.

    Returns:
        Tuple ``(df, features, targets, target)``:
        - df: merged DataFrame indexed by the ID column, for EDA and
          feature engineering;
        - features: list of cleaned feature column names;
        - targets: list of target names followed by ``"target_mask"``;
        - target: the first entry of ``targets``.

    Raises:
        ValueError: If there is not exactly one ID column, or ``extra_data``
            lacks required columns.
    """
    df_train = pd.read_csv(f"{path}/train.csv", sep=csv_sep)
    df_test = pd.read_csv(f"{path}/test.csv", sep=csv_sep)
    df_submission = pd.read_csv(f"{path}/sample_submission.csv", sep=csv_sep)

    submission_columns = list(df_submission.columns)
    test_columns = list(df_test.columns)
    id_feature = [col for col in test_columns if col in submission_columns]
    # Raise explicitly: assert statements are stripped under `python -O`.
    if len(id_feature) != 1:
        raise ValueError("Expected exactly one ID column")
    targets = [col for col in submission_columns if col not in id_feature]
    features = [col for col in test_columns if col not in id_feature]

    df_test = df_test.merge(df_submission, how = 'left', on = id_feature)
    df = pd.concat(
        [df_train.assign(target_mask = True), df_test.assign(target_mask = False)],
        ignore_index=True,
    )

    if extra_data is not None:
        df_extra_training = pd.read_csv(f"{extra_data}", sep=csv_sep)
        missing = set(targets + features + id_feature) - set(df_extra_training.columns)
        if missing:
            raise ValueError(f"Extra Data missing columns: {missing}")
        id_column = id_feature[0]
        # Start after both the row count and the largest existing numeric ID,
        # so the new IDs stay unique even when IDs do not start at 0.
        first_new_id = len(df)
        if pd.api.types.is_numeric_dtype(df[id_column]):
            largest_id = df[id_column].max()
            if pd.notna(largest_id):
                first_new_id = max(first_new_id, int(largest_id) + 1)
        df_extra_training[id_column] = range(first_new_id, first_new_id + len(df_extra_training))
        df = pd.concat([df, df_extra_training.assign(target_mask = True)])

    df.set_index(id_feature, inplace = True)

    ### clean feature names (only features are renamed; targets keep their names)
    clean_feature_names = {
        col: col.casefold().strip().replace(" ", "_").replace("(", "_").replace(")", "_").replace("-", "_")
        for col in features
    }
    features = [clean_feature_names[col] for col in features]
    df.rename(columns=clean_feature_names, inplace=True)

    if verbose:
        print("=" * 69)
        print(f"Loaded {df.target_mask.eq(True).sum()} training samples of {len(features)} predictive features and {len(targets)} target(s) in DataFrame.")
        print(f"Loaded {df.target_mask.eq(False).sum()} testing samples in DataFrame.")
        print(f"DataFrame shape: {df.shape}. Ready to explore, engineer, and predict!")
        print("=" * 69)
    targets.append('target_mask')

    return df, features, targets, targets[0]


def split_training_data(df, features, targets, validation_size = None):
    """Split ``df`` into train / (optional validation) / test frames.

    Rows where ``target_mask`` is True are training data and rows where it is
    False are test data.

    Args:
        df: DataFrame with a boolean ``target_mask`` column.
        features: Column labels used as model inputs (X).
        targets: Column labels used as model outputs (y).
        validation_size: How to carve a validation set out of the training rows:
            - ``None``: no validation split;
            - ``float``: fraction passed to ``train_test_split`` as
              ``test_size`` (seeded with ``SEED``);
            - ``pd.Index`` (any subclass): index labels of the rows to use
              for validation.

    Returns:
        ``(X, y, X_test, y_test)`` when ``validation_size`` is None, otherwise
        ``(X_train, y_train, X_val, y_val, X_test, y_test)``.

    Raises:
        TypeError: If ``validation_size`` is none of the supported types.
    """
    is_train = df.target_mask.eq(True)
    is_test = df.target_mask.eq(False)

    X, y = df.loc[is_train, features], df.loc[is_train, targets]
    X_test, y_test = df.loc[is_test, features], df.loc[is_test, targets]

    # Test for None first: comparing an Index with `==` is element-wise.
    if validation_size is None:
        return X, y, X_test, y_test

    if isinstance(validation_size, float):
        X_train, X_val, y_train, y_val = skl.model_selection.train_test_split(
            X, y, test_size = validation_size, random_state = SEED
        )
        return X_train, y_train, X_val, y_val, X_test, y_test

    # isinstance also accepts Index subclasses such as RangeIndex.
    if isinstance(validation_size, pd.Index):
        # X and y come from the same rows, so one mask serves both.
        in_validation = X.index.isin(validation_size)
        X_train, y_train = X[~in_validation], y[~in_validation]
        X_val, y_val = X[in_validation], y[in_validation]
        return X_train, y_train, X_val, y_val, X_test, y_test

    # Fail loudly: returning None would only break the caller's tuple unpacking.
    raise TypeError("Slice Type not recognized")


def calculate_score(actual, predicted, metric='rmse'):
    """Score predictions against observed values.

    ``metric`` names either a metric or a task:
    - 'rmse' / 'regression': root mean squared error;
    - 'accuracy' / 'classification': accuracy score;
    - 'auc' / 'classification_probability': ROC AUC score.

    Raises ValueError for any other ``metric``.
    """
    if metric in ('rmse', 'regression'):
        return skl.metrics.root_mean_squared_error(actual, predicted)
    if metric in ('accuracy', 'classification'):
        return skl.metrics.accuracy_score(actual, predicted)
    if metric in ('auc', 'classification_probability'):
        return skl.metrics.roc_auc_score(actual, predicted)
    raise ValueError("Unsupported metric")


In [None]:
# Directory passed to load_tabular_data, which reads train.csv, test.csv and
# sample_submission.csv from it.
PATH = "/kaggle/input/playground-series-s6e1/"
print(f"Home path is: {PATH}")

# Load data into single DataFrame for Feature Engineering
# XY holds training and test rows together; `targets` is a list of target
# column names and `target` is its first entry.
XY, features, targets, target = load_tabular_data(PATH)

# Add labels to targets to support plotting feature relationships
# (the new label column names are appended to `targets`).
XY, targets = get_target_labels(XY, target, targets)