# **Feature encoding for activities**

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

def encode_activities(df, activities_file):
    """Label-encode every ``activity-*`` column of ``df`` as floats.

    The set of known activity names is read from ``activities_file`` (one
    name per line, surrounding whitespace stripped, blank lines ignored) and
    fitted with a ``LabelEncoder``. Empty strings and nulls in the activity
    columns are treated as missing and become NaN in the encoded output.

    Args:
        df: DataFrame holding zero or more columns whose names start with
            ``'activity-'``. It is modified in place.
        activities_file: Path to the text file listing the activity names.

    Returns:
        A ``(df, mapping)`` tuple: the same DataFrame object with its
        activity columns replaced by float codes, and a dict mapping each
        activity name to its integer code.

    Raises:
        ValueError: Propagated from ``LabelEncoder.transform`` when a column
            holds a value that is not listed in ``activities_file``.
    """
    # Skip blank lines: '' cells are treated as missing below, so an empty
    # class would never be used and would only shift every real code.
    with open(activities_file, 'r') as f:
        activities = [name for name in (line.strip() for line in f) if name]

    le = LabelEncoder()
    le.fit(activities)

    activity_cols = [col for col in df.columns if col.startswith('activity-')]

    for col in activity_cols:
        # mask() turns '' into NaN explicitly. replace('', None) is avoided
        # because older pandas versions read a None value as "not given" and
        # pad-fill from the previous row instead.
        column = df[col].mask(df[col] == '')
        present = column.notna().to_numpy()

        # Build the result as a fresh float column rather than writing
        # integer codes into the original (string-typed) column.
        encoded = pd.Series(float('nan'), index=column.index, dtype=float)
        if present.any():
            encoded.loc[present] = le.transform(column[present])

        # Assign positionally so the write does not depend on index alignment.
        df[col] = encoded.to_numpy()

    return df, dict(zip(le.classes_, le.transform(le.classes_)))


In [None]:
# Encode the activity columns of the notebook-level `df` (expected to be
# defined by an earlier cell that is not shown here) and keep the returned
# activity-name -> integer-code mapping so the codes can be interpreted later.
df, activity_mapping = encode_activities(df, '/content/drive/MyDrive/AML_Project/activities.txt')
# Print the mapping for a quick visual check of the assigned codes.
print("Activity mapping:", activity_mapping)

# **Feature aggregation**

In [None]:
import pandas as pd
import numpy as np

def process_time_series(df, is_test=False):
    """Collapse ``<category>-<h>:<mm>`` columns into one column per category and hour.

    For each category in the aggregation table below, all columns whose name
    starts with ``"<category>-"`` are grouped by the integer hour found in the
    name and reduced row-wise into a single ``"<category>_hour_<hour>"``
    column.

    Args:
        df: Input frame. Must contain ``'id'`` and ``'p_num'``, plus
            ``'bg+1:00'`` unless ``is_test`` is truthy.
        is_test: When truthy, the ``'bg+1:00'`` column is not carried over.

    Returns:
        A DataFrame made of the preserved columns followed by the aggregated
        columns, concatenated column-wise.
    """
    # Category prefix -> row-wise reduction applied within each hour.
    reducers = {
        'bg': 'mean',
        'insulin': 'sum',
        'carbs': 'sum',
        'hr': 'mean',
        'steps': 'sum',
        'cals': 'sum',
        'activity': 'mean'
    }

    def hour_of(column_name):
        """Return the hour in the segment after the first '-' (0 if no '-')."""
        if '-' not in column_name:
            return 0
        after_dash = column_name.split('-')[1]
        return int(after_dash.split(':')[0])

    def aggregate_by_category(frame, category, agg_func):
        """Reduce one category's columns to a frame with one column per hour."""
        prefix = f"{category}-"
        matching = [name for name in frame.columns if name.startswith(prefix)]
        if not matching:
            return pd.DataFrame()

        # Hours keep the order in which they first appear among the columns.
        by_hour = {}
        for name in matching:
            by_hour.setdefault(hour_of(name), []).append(name)

        aggregated = {}
        for hour, names in by_hour.items():
            block = frame[names]
            key = f"{category}_hour_{hour}"
            if category == 'activity':
                # Mark nulls with -1, then blank out every -1 again so the
                # row mean is taken over the remaining values only.
                sentinel_filled = block.fillna(-1)
                kept = sentinel_filled.where(sentinel_filled != -1)
                aggregated[key] = kept.mean(axis=1)
            elif agg_func == 'sum':
                aggregated[key] = block.sum(axis=1)
            elif agg_func == 'mean':
                aggregated[key] = block.mean(axis=1)

        return pd.DataFrame(aggregated)

    # The target column only exists (and is only kept) for non-test data.
    keep = ['id', 'p_num'] if is_test else ['id', 'p_num', 'bg+1:00']
    pieces = [df[keep]]

    for category, agg_func in reducers.items():
        part = aggregate_by_category(df, category, agg_func)
        if part.empty:
            continue
        pieces.append(part)

    return pd.concat(pieces, axis=1)



# **XGBoost**

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

class BGForecastModel:
    """XGBoost regressor on standardized features that predicts ``bg+1:00``.

    ``fit`` learns the scaler and the model from a training frame and
    remembers which columns were used as features; ``predict`` applies the
    same columns, in the same order, to a new frame.
    """

    def __init__(self):
        self.model = xgb.XGBRegressor(
            n_estimators=100,
            learning_rate=0.1,
            max_depth=5,
            objective='reg:squarederror'
        )
        self.scaler = StandardScaler()
        # Feature column names captured by fit(); None until fit() has run.
        self.feature_cols_ = None

    def prepare_features(self, df):
        """Return the columns of ``df`` that are model inputs.

        Every column except the identifiers (``'id'``, ``'p_num'``) and the
        label (``'bg+1:00'``) is a feature; the frame's column order is kept.
        """
        feature_cols = [col for col in df.columns if col not in ['id', 'p_num', 'bg+1:00']]
        return feature_cols

    def fit(self, train_df):
        """Fit the scaler and the regressor on ``train_df``.

        Args:
            train_df: Frame containing the feature columns and the
                ``'bg+1:00'`` target column.
        """
        feature_cols = self.prepare_features(train_df)
        X = train_df[feature_cols]
        y = train_df['bg+1:00']

        # Remember the exact feature set and order so predict() feeds the
        # scaler and the model the same layout they were trained on.
        self.feature_cols_ = list(feature_cols)

        X_scaled = self.scaler.fit_transform(X)
        self.model.fit(X_scaled, y)

    def predict(self, test_df):
        """Predict ``bg+1:00`` for every row of ``test_df``.

        Args:
            test_df: Frame containing ``'id'`` and the feature columns seen
                during ``fit``. Extra columns are ignored and column order
                does not matter.

        Returns:
            A DataFrame with the ``'id'`` column of ``test_df`` and the
            predictions in a ``'bg+1:00'`` column.

        Raises:
            KeyError: Raised by the column selection when ``test_df`` lacks
                a feature column that was used in ``fit``.
        """
        feature_cols = self.feature_cols_
        if feature_cols is None:
            # fit() has not run: fall back to the frame's own columns and
            # let the scaler/model report the unfitted state themselves.
            feature_cols = self.prepare_features(test_df)
        X = test_df[feature_cols]

        X_scaled = self.scaler.transform(X)
        predictions = self.model.predict(X_scaled)

        results = pd.DataFrame({
            'id': test_df['id'],
            'bg+1:00': predictions
        })
        return results