In [5]:
import os
import zipfile

import numpy as np
import pandas as pd
import kaggle
from kaggle.api.kaggle_api_extended import KaggleApi 

from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
import mlflow
import mlflow.sklearn
from autogluon.tabular import TabularDataset, TabularPredictor

In [2]:
# Track runs in the local SQLite store and activate the experiment by name
experiment_name = "Kaggle_Competition_Example"
mlflow.set_tracking_uri(uri="sqlite:///mlflow.db")
mlflow.set_experiment(experiment_name=experiment_name)

<Experiment: artifact_location='/home/artur/mlops-zoomcamp/notebooks/project/mlruns/1', creation_time=1715770587504, experiment_id='1', last_update_time=1715770587504, lifecycle_stage='active', name='Kaggle_Competition_Example', tags={}>

In [3]:

def load_data(comp_name: str, data_dir: str = ".") -> "tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]":
    """
    Download a Kaggle competition archive, extract it and load its CSV files.
    Parameters:
        comp_name (str): Name of the Kaggle competition.
        data_dir (str): Directory used for the download, the extraction and
            the CSV reads. Defaults to the current working directory.
    Returns:
        tuple: A tuple containing three DataFrames: train, test, and submission.
    """
    api = KaggleApi()
    api.authenticate()

    # Make sure the target directory exists before anything is written to it
    os.makedirs(data_dir, exist_ok=True)

    # Download the competition files (force=True re-downloads an existing archive)
    api.competition_download_files(comp_name, path=data_dir, force=True)

    # Unzip the downloaded archive next to itself
    archive_path = os.path.join(data_dir, f"{comp_name}.zip")
    with zipfile.ZipFile(archive_path, 'r') as zip_ref:
        zip_ref.extractall(data_dir)

    # Load data into DataFrames from the same directory the archive was extracted to
    submission = pd.read_csv(os.path.join(data_dir, "sample_submission.csv"))
    test = pd.read_csv(os.path.join(data_dir, "test.csv"))
    train = pd.read_csv(os.path.join(data_dir, "train.csv"))

    return train, test, submission

def adjust_dtypes(df: pd.DataFrame) -> pd.DataFrame:
    """
    Downcast 64-bit numeric columns to the narrowest dtype that holds their values.

    The frame is modified in place and the same object is returned.
    Parameters:
        df (pd.DataFrame): Frame whose int64 / float64 columns are downcast.
    Returns:
        pd.DataFrame: The same frame, with its numeric columns downcast.
    """
    # Resolve both column sets up front, before any column is rewritten
    downcast_plan = [
        (df.select_dtypes(include=['int64']).columns, 'integer'),
        (df.select_dtypes(include=['float64']).columns, 'float'),
    ]

    # Integers are handled first, then floats
    for column_names, target_kind in downcast_plan:
        for name in column_names:
            df[name] = pd.to_numeric(df[name], downcast=target_kind)

    return df

# Fetch the competition frames and downcast the numeric dtypes of each one
train, test, submission = map(adjust_dtypes, load_data('playground-series-s4e5'))

# Separate the target column from the predictors for full-data training
y_train_full = train['FloodProbability']
x_train_full = train.drop(columns=['FloodProbability'])

In [6]:
# Hold out 20% of the training rows for validation (fixed seed)
train_df, val_df = train_test_split(train, random_state=42, test_size=0.2)

# Predictors / target for each split
x_train, y_train = train_df.drop(columns=['FloodProbability']), train_df['FloodProbability']
x_val, y_val = val_df.drop(columns=['FloodProbability']), val_df['FloodProbability']

# The test frame is used as-is as the prediction input
x_test = test

In [8]:
# AutoGluon: optimise R^2 on the training split with a 600-second budget
predictor = TabularPredictor(
    label='FloodProbability',
    eval_metric='r2',
)
predictor.fit(
    train_data=train_df,
    presets='best_quality',
    time_limit=600,
)


No path specified. Models will be saved in: "AutogluonModels/ag-20240518_122412"
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Dynamic stacking is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
Detecting stacked overfitting by sub-fitting AutoGluon on the input data. That is, copies of AutoGluon will be sub-fit on subset(s) of the data. Then, the holdout validation data is used to detect stacked overfitting.
Sub-fit(s) time limit is: 600 seconds.
Starting holdout-based sub-fit for dynamic stacking. Context path is: AutogluonModels/ag-20240518_122412/ds_sub_fit/sub_fit_ho.
2024-05-18 12:24:12,684	INFO util.py:154 -- Missing packages: ['ipywidgets']. R

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x7f69e5385f70>

In [10]:
# Size of the validation target (the predictions made for x_val have the same shape).
# Uses y_val because y_val_predict is not defined yet at this point on a top-to-bottom run.
y_val.shape

(223592,)

In [11]:

# Score the fitted predictor on the hold-out validation split
y_val_predict = predictor.predict(x_val)

# Keep the score under its own name: rebinding `r2_score` to the float result
# would shadow sklearn's function and break every later r2_score(...) call.
val_r2 = r2_score(y_val, y_val_predict)

# NOTE(review): the label says "Out-of-fold", but this is the hold-out validation
# score; the text is kept unchanged so it matches the recorded output.
print("Out-of-fold R^2 score:", val_r2)

Out-of-fold R^2 score: 0.85515524966557


In [None]:

# Get out-of-fold predictions for the rows AutoGluon was trained on and
# calculate the OOF R^2 score
y_oof_pred = predictor.predict_oof()

# Align the target to the prediction index before scoring, so rows are compared
# by label rather than by position
oof_r2 = r2_score(y_train.loc[y_oof_pred.index], y_oof_pred)
print("Out-of-fold R^2 score:", oof_r2)

In [None]:
# LightAutoML settings: regression task scored by R^2, 600-second timeout, 4 threads / CPU cores
torch.set_num_threads(4)
automl_params = dict(
    task=Task('reg', loss='mse', metric='r2'),
    timeout=600,
    reader_params=dict(n_jobs=4, cv=3, random_state=42),
    cpu_limit=4,
)
model = TabularAutoML(**automl_params)

# Fit on the training split; the fit_predict result is kept as `oof`
oof = model.fit_predict(train_df, roles=dict(target='FloodProbability'), verbose=1)


[11:42:23] Stdout logging level is INFO.


[11:42:23] Task: reg

[11:42:23] Start automl preset with listed constraints:
[11:42:23] - time: 600.00 seconds
[11:42:23] - CPU: 4 cores
[11:42:23] - memory: 16 GB

[11:42:23] [1mTrain data shape: (894365, 22)[0m

[11:42:32] Layer [1m1[0m train process start. Time left 591.60 secs
[11:42:33] Start fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m ...
[11:42:34] Fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m finished. score = [1m0.8448548330350316[0m
[11:42:34] [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m fitting and predicting completed
[11:42:34] Time left 588.68 secs

[11:45:12] [1mSelector_LightGBM[0m fitting and predicting completed
[11:45:12] Start fitting [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m ...
[11:47:54] Time limit exceeded after calculating fold 0

[11:47:54] Fitting [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m finished. score = [1m0.8396970290102965[0m
[11:47:54] [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m fitting and predicting completed
[11:47:54] Start fitting [1mLvl_0_Pipe_1_Mod_2_CatBoost[0m .

In [78]:
# Train models and record R2 scores
models = {
    "Linear_Regression": LinearRegression(),
    "Ridge": Ridge(),
    "LightGBM": lightgbm.LGBMRegressor(),
    "LightAutoML": TabularAutoML(**automl_params)
}

In [None]:
# Scratch timing notes (presumably run time per integer dtype — TODO confirm):
#   6 m
#   5 m 40  int16
#   5 m 32  int8

In [32]:
# R^2 of the first column of the LightAutoML fit_predict output against the training target
r2_score(y_true=y_train, y_pred=oof.data[:, 0])

0.8452595986219906

In [None]:
# Validation score of the model as currently fitted
y_pred_val = model.predict(x_val)
r2_val_before_tuning = r2_score(y_true=y_val, y_pred=y_pred_val.data[:, 0])

# Refit on every training row, then predict the test set
model.fit_predict(train, roles=dict(target='FloodProbability'))
y_pred_test = model.predict(x_test).data[:, 0]

In [99]:
def objective(trial, model_name):
    """
    Optuna objective: fit one candidate model and return its validation R^2.

    Hyperparameters are sampled from `trial` for "Ridge" and "LightGBM"; any
    other `model_name` falls back to a plain LinearRegression with no
    parameters. The model is fitted on the notebook-level x_train / y_train,
    scored on x_val / y_val, and the result is logged to a nested MLflow run.

    Parameters:
        trial: Optuna trial used to sample hyperparameters.
        model_name (str): Which model family to build.
    Returns:
        float: Unrounded R^2 on the validation split.
    """
    if model_name == "LightGBM":
        params = {
            "num_leaves": trial.suggest_int("num_leaves", 31, 128),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1, log=True),
            "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        }
        model = lightgbm.LGBMRegressor(**params)
    elif model_name == 'Ridge':
        params = {'alpha': trial.suggest_float("alpha", 0.01, 10.0, log=True)}
        model = Ridge(**params)
    else:
        params = {}
        model = LinearRegression()

    # Fit on the training split and score on the validation split
    model.fit(x_train, y_train)
    validation_r2 = r2_score(y_val, model.predict(x_val))

    # Record this trial in its own nested run (the logged metric is rounded to 4 decimals)
    with mlflow.start_run(run_name=f"trial_{trial.number}", nested=True):
        mlflow.log_params(params)
        mlflow.log_metric("R2 Score", round(validation_r2, 4))

    return validation_r2

In [67]:
def logging_callback(study, trial):
    """
    Callback function for logging Optuna trial results to MLflow.

    Parameter keys are prefixed with the trial number. MLflow does not allow a
    parameter that was already logged in a run to be logged again with a
    different value, so logging the raw keys from successive trials into the
    same active run would fail from the second trial on.

    Parameters:
        study (optuna.study.Study): The study object (unused).
        trial (optuna.trial.FrozenTrial): The finished trial.
    """
    trial_params = {
        f"trial_{trial.number}_{name}": value
        for name, value in trial.params.items()
    }
    if trial_params:
        mlflow.log_params(trial_params)

    # Failed or pruned trials carry no objective value; there is nothing to log for them
    if trial.value is not None:
        mlflow.log_metric(f"R2_Score_Trial_{trial.number}", trial.value)

In [110]:
torch.set_num_threads(4)

# Identical for every model, so it is defined once instead of on every iteration
competition_name = "playground-series-s4e5"

# One parent run for the whole comparison, with one nested child run per model
with mlflow.start_run(run_name="50 trials vs 10 min LightAutoML") as parent_run:
    for model_name, model in models.items():
        with mlflow.start_run(run_name=model_name, nested=True) as child_run:
            # Reset per model: untuned models have no tuned parameters, and values
            # left over from a previous iteration (or an earlier kernel session)
            # must never be logged against them.
            best_params = {}

            if model_name in ["Linear_Regression", "Ridge", "LightGBM"]:
                # Baseline: fit with default hyperparameters and score on the validation split
                model.fit(x_train, y_train)
                y_pred_val = model.predict(x_val)
                r2_val_before_tuning = r2_score(y_val, y_pred_val)
                r2_val_after_tuning = r2_val_before_tuning # for models that are not going through tuning

                # Hyperparameter tuning with Optuna
                if model_name in ["Ridge", "LightGBM"]:
                    study = optuna.create_study(direction="maximize", sampler=TPESampler())
                    study.optimize(lambda trial: objective(trial, model_name), n_trials=50)
                    best_params = study.best_params
                    r2_val_after_tuning = study.best_value

                # Train with the best hyperparameters
                if model_name == "Ridge":
                    best_model = Ridge(**best_params)
                elif model_name == "LightGBM":
                    best_model = lightgbm.LGBMRegressor(**best_params)
                else:
                    best_model = model

                # Final fit uses every training row before predicting the test set
                best_model.fit(x_train_full, y_train_full)
                y_pred_test = best_model.predict(x_test)

            elif model_name == 'LightAutoML':
                # Fit on the training split to obtain a validation score ...
                model.fit_predict(train_df, roles={'target': 'FloodProbability'})
                y_pred_val = model.predict(x_val)
                r2_val_before_tuning = r2_score(y_val, y_pred_val.data[:, 0])

                # ... then refit on the full training frame for the test predictions
                model.fit_predict(train, roles={'target': 'FloodProbability'})
                y_pred_test = model.predict(x_test).data[:, 0]

            else:
                # Fail loudly instead of logging scores left over from another model
                raise ValueError(f"Unsupported model: {model_name}")

            # Log R^2 score before tuning
            mlflow.log_metric("0.R2 Score Pre Tuning", round(r2_val_before_tuning, 4), step=0)
            if model_name != 'LightAutoML':
                mlflow.log_params(best_params)
                mlflow.log_metric("1.R2 Score Post Tuning", round(r2_val_after_tuning, 4), step=1)
            else:
                # LightAutoML is not tuned here, so the same score is reported for both steps
                mlflow.log_metric("1.R2 Score Post Tuning", round(r2_val_before_tuning, 4), step=1)

            # Update submission DataFrame with predictions
            submission['FloodProbability'] = y_pred_test

            # Save submission to a CSV file
            submission_file = f"submission_{model_name}.csv"
            submission.to_csv(submission_file, index=False)

            # Submit to Kaggle competition
            submission_message = f"Submission with {model_name} Initial"
            # kaggle.api.competition_submit(submission_file, submission_message, competition_name)

            # Look up the public score of the submission whose description matches;
            # submission_score stays None when no such submission exists.
            submissions = kaggle.api.competitions_submissions_list(competition_name)
            submission_score = None
            for subs in submissions:
                if subs['description'] == submission_message:
                    submission_score = round(np.float64(subs['publicScore']), 4)
                    break

            # Log parameters and artifacts
            mlflow.log_param("Model Name", model_name)
            mlflow.log_param("Competition Name", competition_name)
            mlflow.log_param("Submission File", submission_file)
            mlflow.log_param("Submission Message", submission_message)
            # mlflow.log_metric("Kaggle Score", submission_score, step=0)
            mlflow.log_artifact(submission_file)