In [None]:
# Packages

import os
import gc
import polars as pl
import numpy as np 
import pandas as pd 
import glob
import pyarrow as pa
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit
from xgboost import XGBRegressor
from IPython import display

In [None]:
# Loading and concatenating the whole dataset

class LoadData:
    """Build one lazy polars frame out of a collection of parquet files."""

    def __init__(self, file_paths):
        """Store the parquet paths to load.

        Parameters
        ----------
        file_paths : iterable of str
            Paths of the parquet files, in the order they should be stacked.
        """
        self.file_paths = file_paths

    def load_and_concat(self):
        """Scan every file lazily and concatenate the scans into one frame.

        Laziness comes from ``pl.scan_parquet``; ``rechunk=False`` only tells
        ``pl.concat`` not to rechunk the concatenated result.

        Returns
        -------
        The concatenation of one ``pl.scan_parquet`` frame per path, in the
        order of ``file_paths``.

        Raises
        ------
        ValueError
            If ``file_paths`` yields no paths at all.
        """
        partitioned_data = [pl.scan_parquet(file_path) for file_path in self.file_paths]

        # Fail with an actionable message instead of an opaque error from
        # pl.concat when there is nothing to concatenate.
        if not partitioned_data:
            raise ValueError(
                "LoadData.load_and_concat: file_paths is empty, nothing to load"
            )

        return pl.concat(partitioned_data, rechunk=False)
    
# Collect every partition file of the training set; sorting gives a
# deterministic concatenation order.
file_paths = sorted(glob.glob('Data/train.parquet/*/*.parquet'))

# Stop here with a clear message if the pattern matched nothing (wrong working
# directory, data not downloaded, ...) rather than failing later inside the
# loader.
if not file_paths:
    raise FileNotFoundError(
        "No parquet files matched 'Data/train.parquet/*/*.parquet' "
        "relative to the current working directory"
    )

# Initialize the loader and load the data as a lazy frame
loader = LoadData(file_paths)
df_full = loader.load_and_concat()  # df_full is now a lazy frame

In [None]:
# Feature engineering: for every responder column, add its per-day mean
# (grouped by date_id) to each row as `daily_<responder>_mean`.
#
# NOTE(review): each day's mean is computed over all rows of that day and then
# joined back onto those same rows, so every row's feature includes its own
# responder values — confirm this is acceptable for the modelling target.
# NOTE(review): df_full is reassigned below, so re-running this cell without
# re-running the load cell joins the means a second time.

# Only the column names are needed from the schema.
responder_columns = [
    name for name in df_full.schema.keys() if name.startswith("responder_")
]

# One mean expression per responder, each with its own output name.
aggregations = [
    pl.col(name).mean().alias(f"daily_{name}_mean")
    for name in responder_columns
]

# One row per date_id holding that day's responder means.
df_daily_means = (
    df_full
    .group_by("date_id")
    .agg(aggregations)
)

# Attach the daily means to every row of the matching day, then materialise.
df_full = df_full.join(df_daily_means, on="date_id")
df_full_collected = df_full.collect()

# Summary statistics of date_id (displayed as the cell output).
df_full_collected['date_id'].describe()

In [None]:
# Split the whole dataset into train and validation by date: rows whose
# date_id is below the 75th percentile (taken over all rows) go to train,
# the remaining rows go to validation.

# Compute the threshold once so both filters share exactly the same cut-off
# and the date_id column is converted to numpy a single time.
split_date_id = np.percentile(df_full_collected['date_id'].to_numpy(), 75)

train_df_collected = df_full_collected.filter(pl.col('date_id') < split_date_id)
val_df_collected = df_full_collected.filter(pl.col('date_id') >= split_date_id)

# Row counts are kept because the lazy frames below do not expose a length.
len_train = len(train_df_collected)
len_valid = len(val_df_collected)

# Lazy handles used by the rest of the notebook.
train_df = train_df_collected.lazy()
val_df = val_df_collected.lazy()

# Drop the eager intermediates' names and ask the collector to reclaim
# whatever is no longer referenced.
del train_df_collected
del val_df_collected
del df_full_collected
gc.collect()

In [None]:
# Disabled cell: the chunked XGBoost training code below is wrapped in a bare
# string literal, so none of it is executed when this cell runs.
# What the disabled code would do: walk train_df in slices of `chunk_size`
# rows, use every column except 'responder_6' as features and 'responder_6' as
# the target, and pass the previous booster back into fit() on each slice.
# NOTE(review): nothing in this code saves the model, yet the next cell loads
# 'xgb_model.json' — confirm where that file is produced.
'''
%%time

# Initialize the model with parameters for incremental learning
xgb_model = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    random_state=42
)

# Define chunk size
chunk_size = 50000

# Placeholder for the existing model
existing_model = None

# Train the model in chunks
for i in range(0, len_train, chunk_size):
    # Load a chunk of data
    chunk = train_df.slice(i, chunk_size).collect().to_pandas()

    # Separate features and target
    X_chunk = chunk.drop(columns=['responder_6'])
    y_chunk = chunk['responder_6']
    
    # Fit the model incrementally
    xgb_model.fit(X_chunk, y_chunk, xgb_model=existing_model)
    
    # Update the existing model
    existing_model = xgb_model.get_booster()
'''

In [None]:
# Restore the previously saved booster from disk.
model = xgb.Booster()
model.load_model('xgb_model.json')

# Per-feature importance scores of type 'weight', keyed by feature name.
feature_importance = model.get_score(importance_type='weight')

# Tabulate the scores, rescale them so they sum to 1 across all features,
# then keep the 20 largest.
feature_importance_df = (
    pd.DataFrame(feature_importance.items(), columns=['Feature', 'Importance'])
    .assign(Importance=lambda scores: scores['Importance'] / scores['Importance'].sum())
    .sort_values(by='Importance', ascending=False)
    .head(20)
)

# Horizontal bar chart of the top 20 features.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(feature_importance_df['Feature'], feature_importance_df['Importance'])
ax.set_xlabel('Normalized Importance')
ax.set_title('Top 20 Feature Importance (Normalized)')
ax.invert_yaxis()  # most important feature at the top
plt.show()