In [None]:
import pandas as pd

# Load the datasets (replace file paths with actual ones)
users_train = pd.read_csv("/content/users_train.csv")
user_features_train = pd.read_csv("/content/user_features_train.csv")
targets_train = pd.read_csv("/content/targets_train.csv")

users_test = pd.read_csv("/content/users_test.csv")
user_features_test = pd.read_csv("/content/user_features_test.csv")

In [None]:
!pip install shap

Collecting shap
  Downloading shap-0.46.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (24 kB)
Collecting slicer==0.0.8 (from shap)
  Downloading slicer-0.0.8-py3-none-any.whl.metadata (4.0 kB)
Downloading shap-0.46.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (540 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m540.1/540.1 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading slicer-0.0.8-py3-none-any.whl (15 kB)
Installing collected packages: slicer, shap
Successfully installed shap-0.46.0 slicer-0.0.8


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional, Input, Concatenate
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam

# 1. Feature Engineering: Aggregate features and interactions
def feature_engineering(df):
    time_series_columns = {
        'retention': [f'RetentionD{i}' for i in range(16)],
        'ad_revenue': [f'AdRevenueD{i}' for i in range(16)],
        'iap_revenue': [f'IAPRevenueD{i}' for i in range(16)],
        'level_advanced': [f'LevelAdvancedCountD{i}' for i in range(16)]
    }

    # Aggregate features
    for feature, columns in time_series_columns.items():
        df.loc[:, f'{feature}_total'] = df[columns].sum(axis=1)
        df.loc[:, f'{feature}_mean'] = df[columns].mean(axis=1)
        df.loc[:, f'{feature}_max'] = df[columns].max(axis=1)
        df.loc[:, f'{feature}_std'] = df[columns].std(axis=1)

    # Interaction features
    df.loc[:, 'retention_ad_revenue_interaction'] = df['retention_total'] * df['ad_revenue_total']
    df.loc[:, 'retention_iap_revenue_interaction'] = df['retention_total'] * df['iap_revenue_total']
    df.loc[:, 'retention_level_interaction'] = df['retention_total'] * df['level_advanced_total']

    # Additional interactions
    df.loc[:, 'ad_iap_interaction'] = df['ad_revenue_total'] * df['iap_revenue_total']
    df.loc[:, 'level_ad_interaction'] = df['level_advanced_total'] * df['ad_revenue_total']
    df.loc[:, 'level_iap_interaction'] = df['level_advanced_total'] * df['iap_revenue_total']

    return df

# 2. Load, merge, and process training data
train_data = pd.merge(users_train, user_features_train, on='ID')
train_data = pd.merge(train_data, targets_train, on='ID')
train_data = feature_engineering(train_data)

engineered_columns = [
    'retention_total', 'retention_mean', 'retention_max', 'retention_std',
    'ad_revenue_total', 'ad_revenue_mean', 'ad_revenue_max', 'ad_revenue_std',
    'iap_revenue_total', 'iap_revenue_mean', 'iap_revenue_max', 'iap_revenue_std',
    'level_advanced_total', 'level_advanced_mean', 'level_advanced_max', 'level_advanced_std',
    'retention_ad_revenue_interaction', 'retention_iap_revenue_interaction', 'retention_level_interaction',
    'ad_iap_interaction', 'level_ad_interaction', 'level_iap_interaction'
]

time_series_columns = [
    *[f'RetentionD{i}' for i in range(16)],
    *[f'AdRevenueD{i}' for i in range(16)],
    *[f'IAPRevenueD{i}' for i in range(16)],
    *[f'LevelAdvancedCountD{i}' for i in range(16)]
]

train_data[time_series_columns] = train_data[time_series_columns].astype(float)

# Apply different scalers to sequential and engineered features
scaler_seq = StandardScaler()
scaler_eng = StandardScaler()

# Sequential data scaling
X_sequential = train_data[time_series_columns].values.reshape(train_data.shape[0], -1)
X_sequential = scaler_seq.fit_transform(X_sequential).reshape(train_data.shape[0], 16, 4)

# Engineered features scaling
X_engineered = scaler_eng.fit_transform(train_data[engineered_columns].values)

y = train_data['TARGET'].values.astype('float32')

# 3. Train-test split
X_train_seq, X_val_seq, X_train_eng, X_val_eng, y_train_seq, y_val_seq = train_test_split(
    X_sequential, X_engineered, y, test_size=0.2, random_state=42)

# 4. Build the LSTM model
def build_lstm_nn(input_shape_seq, input_shape_eng):
    seq_inputs = Input(shape=input_shape_seq)
    x = Bidirectional(LSTM(400, return_sequences=False))(seq_inputs)  # Increased from 300 to 400
    x = Dropout(0.2)(x)  # Reduced dropout to 0.2

    eng_inputs = Input(shape=input_shape_eng)

    combined = Concatenate()([x, eng_inputs])

    x = Dense(256, activation='relu')(combined)  # Increased from 128 to 256
    x = Dropout(0.2)(x)  # Reduced dropout to 0.2
    x = Dense(128, activation='relu')(x)  # Increased from 64 to 128
    x = Dense(64, activation='relu')(x)  # Increased from 32 to 64
    x = Dense(16, activation='relu')(x)  # New Dense layer with 16 units

    outputs = Dense(1)(x)

    model = Model(inputs=[seq_inputs, eng_inputs], outputs=outputs)
    model.compile(optimizer=Adam(learning_rate=0.00005), loss='mean_squared_error')

    return model

# 5. Build and train the model
input_shape_seq = (X_train_seq.shape[1], X_train_seq.shape[2])
input_shape_eng = (X_train_eng.shape[1],)
model = build_lstm_nn(input_shape_seq, input_shape_eng)

early_stopping = EarlyStopping(monitor='val_loss', patience=8, restore_best_weights=True)
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=4)  # Reduced factor for more gradual reduction

# Train the model
history = model.fit(
    [X_train_seq, X_train_eng], y_train_seq,
    validation_data=([X_val_seq, X_val_eng], y_val_seq),
    epochs=100,
    batch_size=528,
    callbacks=[early_stopping, lr_scheduler],
    verbose=1
)

# Predict and calculate RMSE
y_val_pred = model.predict([X_val_seq, X_val_eng])
rmse = np.sqrt(mean_squared_error(y_val_seq, y_val_pred))
print(f'Validation RMSE: {rmse}')

# ---------------------------------------
# 6. Generate Predictions for Test Data
# ---------------------------------------

test_data = pd.merge(users_test, user_features_test, on='ID')
test_data = feature_engineering(test_data)

# Sequential data for the test set
X_test_seq = test_data[time_series_columns].values.reshape(test_data.shape[0], -1)
X_test_seq = scaler_seq.transform(X_test_seq).reshape(test_data.shape[0], 16, 4)

# Engineered features for the test set
X_test_eng = scaler_eng.transform(test_data[engineered_columns].values)

# Predict on test data
test_predictions = model.predict([X_test_seq, X_test_eng])

# Create the submission file
submission = pd.DataFrame({
    'ID': test_data['ID'],  # Replace 'ID' with actual column name in the test set
    'TARGET': test_predictions.flatten()
})

submission.to_csv('submission.csv', index=False)

Epoch 1/100
[1m1332/1332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 10ms/step - loss: 3.0907 - val_loss: 2.4565 - learning_rate: 5.0000e-05
Epoch 2/100
[1m1332/1332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 10ms/step - loss: 2.3105 - val_loss: 2.2609 - learning_rate: 5.0000e-05
Epoch 3/100
[1m1332/1332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 10ms/step - loss: 2.3306 - val_loss: 2.2313 - learning_rate: 5.0000e-05
Epoch 4/100
[1m1332/1332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 10ms/step - loss: 2.1762 - val_loss: 2.1715 - learning_rate: 5.0000e-05
Epoch 5/100
[1m1332/1332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 10ms/step - loss: 1.7050 - val_loss: 2.2796 - learning_rate: 5.0000e-05
Epoch 6/100
[1m1332/1332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 10ms/step - loss: 2.1057 - val_loss: 2.1905 - learning_rate: 5.0000e-05
Epoch 7/100
[1m1332/1332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 1

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional, Input, Concatenate
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam

# 1. Feature Engineering: Aggregate features and interactions
def feature_engineering(df):
    time_series_columns = {
        'retention': [f'RetentionD{i}' for i in range(16)],
        'ad_revenue': [f'AdRevenueD{i}' for i in range(16)],
        'iap_revenue': [f'IAPRevenueD{i}' for i in range(16)],
        'level_advanced': [f'LevelAdvancedCountD{i}' for i in range(16)]
    }

    # Aggregate features
    for feature, columns in time_series_columns.items():
        df.loc[:, f'{feature}_total'] = df[columns].sum(axis=1)
        df.loc[:, f'{feature}_mean'] = df[columns].mean(axis=1)
        df.loc[:, f'{feature}_max'] = df[columns].max(axis=1)
        df.loc[:, f'{feature}_std'] = df[columns].std(axis=1)

    # Interaction features
    df.loc[:, 'retention_ad_revenue_interaction'] = df['retention_total'] * df['ad_revenue_total']
    df.loc[:, 'retention_iap_revenue_interaction'] = df['retention_total'] * df['iap_revenue_total']
    df.loc[:, 'retention_level_interaction'] = df['retention_total'] * df['level_advanced_total']

    # Additional interactions
    df.loc[:, 'ad_iap_interaction'] = df['ad_revenue_total'] * df['iap_revenue_total']
    df.loc[:, 'level_ad_interaction'] = df['level_advanced_total'] * df['ad_revenue_total']
    df.loc[:, 'level_iap_interaction'] = df['level_advanced_total'] * df['iap_revenue_total']

    return df

# 2. Load, merge, and process training data
train_data = pd.merge(users_train, user_features_train, on='ID')
train_data = pd.merge(train_data, targets_train, on='ID')
train_data = feature_engineering(train_data)

engineered_columns = [
    'retention_total', 'retention_mean', 'retention_max', 'retention_std',
    'ad_revenue_total', 'ad_revenue_mean', 'ad_revenue_max', 'ad_revenue_std',
    'iap_revenue_total', 'iap_revenue_mean', 'iap_revenue_max', 'iap_revenue_std',
    'level_advanced_total', 'level_advanced_mean', 'level_advanced_max', 'level_advanced_std',
    'retention_ad_revenue_interaction', 'retention_iap_revenue_interaction', 'retention_level_interaction',
    'ad_iap_interaction', 'level_ad_interaction', 'level_iap_interaction'
]

time_series_columns = [
    *[f'RetentionD{i}' for i in range(16)],
    *[f'AdRevenueD{i}' for i in range(16)],
    *[f'IAPRevenueD{i}' for i in range(16)],
    *[f'LevelAdvancedCountD{i}' for i in range(16)]
]

train_data[time_series_columns] = train_data[time_series_columns].astype(float)

# Apply different scalers to sequential and engineered features
scaler_seq = StandardScaler()
scaler_eng = StandardScaler()

# Sequential data scaling
X_sequential = train_data[time_series_columns].values.reshape(train_data.shape[0], -1)
X_sequential = scaler_seq.fit_transform(X_sequential).reshape(train_data.shape[0], 16, 4)

# Engineered features scaling
X_engineered = scaler_eng.fit_transform(train_data[engineered_columns].values)

y = train_data['TARGET'].values.astype('float32')

# 3. Train-test split
X_train_seq, X_val_seq, X_train_eng, X_val_eng, y_train_seq, y_val_seq = train_test_split(
    X_sequential, X_engineered, y, test_size=0.2, random_state=42)

# 4. Build the LSTM model
from tensorflow.keras.layers import BatchNormalization

def build_lstm_nn(input_shape_seq, input_shape_eng):
    seq_inputs = Input(shape=input_shape_seq)
    x = Bidirectional(LSTM(400, return_sequences=False))(seq_inputs)
    x = Dropout(0.3)(x)

    eng_inputs = Input(shape=input_shape_eng)

    combined = Concatenate()([x, eng_inputs])

    x = Dense(256, activation='relu')(combined)
    x = BatchNormalization()(x)  # Add batch normalization
    x = Dropout(0.3)(x)
    x = Dense(128, activation='relu')(x)
    x = BatchNormalization()(x)  # Add batch normalization
    x = Dense(64, activation='relu')(x)
    x = Dense(16, activation='relu')(x)

    outputs = Dense(1)(x)

    model = Model(inputs=[seq_inputs, eng_inputs], outputs=outputs)
    model.compile(optimizer=Adam(learning_rate=0.00003), loss='mean_squared_error')

    return model


# 5. Build and train the model
input_shape_seq = (X_train_seq.shape[1], X_train_seq.shape[2])
input_shape_eng = (X_train_eng.shape[1],)
model = build_lstm_nn(input_shape_seq, input_shape_eng)

early_stopping = EarlyStopping(monitor='val_loss', patience=9, restore_best_weights=True)
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=4)  # Reduced factor for more gradual reduction

# Train the model
history = model.fit(
    [X_train_seq, X_train_eng], y_train_seq,
    validation_data=([X_val_seq, X_val_eng], y_val_seq),
    epochs=100,
    batch_size=512,
    callbacks=[early_stopping, lr_scheduler],
    verbose=1
)

# Predict and calculate RMSE
y_val_pred = model.predict([X_val_seq, X_val_eng])
rmse = np.sqrt(mean_squared_error(y_val_seq, y_val_pred))
print(f'Validation RMSE: {rmse}')

# ---------------------------------------
# 6. Generate Predictions for Test Data
# ---------------------------------------

test_data = pd.merge(users_test, user_features_test, on='ID')
test_data = feature_engineering(test_data)

# Sequential data for the test set
X_test_seq = test_data[time_series_columns].values.reshape(test_data.shape[0], -1)
X_test_seq = scaler_seq.transform(X_test_seq).reshape(test_data.shape[0], 16, 4)

# Engineered features for the test set
X_test_eng = scaler_eng.transform(test_data[engineered_columns].values)

# Predict on test data
test_predictions = model.predict([X_test_seq, X_test_eng])

# Create the submission file
submission = pd.DataFrame({
    'ID': test_data['ID'],  # Replace 'ID' with actual column name in the test set
    'TARGET': test_predictions.flatten()
})

submission.to_csv('submission.csv', index=False)

Epoch 1/100
[1m1373/1373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 11ms/step - loss: 3.9599 - val_loss: 3.7356 - learning_rate: 3.0000e-05
Epoch 2/100
[1m1373/1373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 11ms/step - loss: 2.7612 - val_loss: 7.8160 - learning_rate: 3.0000e-05
Epoch 3/100
[1m1373/1373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 11ms/step - loss: 2.6930 - val_loss: 6.2852 - learning_rate: 3.0000e-05
Epoch 4/100
[1m1373/1373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 11ms/step - loss: 2.7194 - val_loss: 10.6211 - learning_rate: 3.0000e-05
Epoch 5/100
[1m1373/1373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 11ms/step - loss: 2.4266 - val_loss: 8.3937 - learning_rate: 3.0000e-05
Epoch 6/100
[1m1373/1373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 11ms/step - loss: 2.6445 - val_loss: 12.2898 - learning_rate: 6.0000e-06
Epoch 7/100
[1m1373/1373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m

KeyboardInterrupt: 

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional, Input, Concatenate
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam

# 1. Feature Engineering: Aggregate features and interactions
def feature_engineering(df):
    time_series_columns = {
        'retention': [f'RetentionD{i}' for i in range(16)],
        'ad_revenue': [f'AdRevenueD{i}' for i in range(16)],
        'iap_revenue': [f'IAPRevenueD{i}' for i in range(16)],
        'level_advanced': [f'LevelAdvancedCountD{i}' for i in range(16)]
    }

    # Aggregate features
    for feature, columns in time_series_columns.items():
        df.loc[:, f'{feature}_total'] = df[columns].sum(axis=1)
        df.loc[:, f'{feature}_mean'] = df[columns].mean(axis=1)
        df.loc[:, f'{feature}_max'] = df[columns].max(axis=1)
        df.loc[:, f'{feature}_std'] = df[columns].std(axis=1)

    # Interaction features
    df.loc[:, 'retention_ad_revenue_interaction'] = df['retention_total'] * df['ad_revenue_total']
    df.loc[:, 'retention_iap_revenue_interaction'] = df['retention_total'] * df['iap_revenue_total']
    df.loc[:, 'retention_level_interaction'] = df['retention_total'] * df['level_advanced_total']

    # Additional interactions
    df.loc[:, 'ad_iap_interaction'] = df['ad_revenue_total'] * df['iap_revenue_total']
    df.loc[:, 'level_ad_interaction'] = df['level_advanced_total'] * df['ad_revenue_total']
    df.loc[:, 'level_iap_interaction'] = df['level_advanced_total'] * df['iap_revenue_total']

    return df

# 2. Load, merge, and process training data
train_data = pd.merge(users_train, user_features_train, on='ID')
train_data = pd.merge(train_data, targets_train, on='ID')
train_data = feature_engineering(train_data)

engineered_columns = [
    'retention_total', 'retention_mean', 'retention_max', 'retention_std',
    'ad_revenue_total', 'ad_revenue_mean', 'ad_revenue_max', 'ad_revenue_std',
    'iap_revenue_total', 'iap_revenue_mean', 'iap_revenue_max', 'iap_revenue_std',
    'level_advanced_total', 'level_advanced_mean', 'level_advanced_max', 'level_advanced_std',
    'retention_ad_revenue_interaction', 'retention_iap_revenue_interaction', 'retention_level_interaction',
    'ad_iap_interaction', 'level_ad_interaction', 'level_iap_interaction'
]

time_series_columns = [
    *[f'RetentionD{i}' for i in range(16)],
    *[f'AdRevenueD{i}' for i in range(16)],
    *[f'IAPRevenueD{i}' for i in range(16)],
    *[f'LevelAdvancedCountD{i}' for i in range(16)]
]

train_data[time_series_columns] = train_data[time_series_columns].astype(float)

# Apply different scalers to sequential and engineered features
scaler_seq = StandardScaler()
scaler_eng = StandardScaler()

# Sequential data scaling
X_sequential = train_data[time_series_columns].values.reshape(train_data.shape[0], -1)
X_sequential = scaler_seq.fit_transform(X_sequential).reshape(train_data.shape[0], 16, 4)

# Engineered features scaling
X_engineered = scaler_eng.fit_transform(train_data[engineered_columns].values)

y = train_data['TARGET'].values.astype('float32')

# 3. Train-test split
X_train_seq, X_val_seq, X_train_eng, X_val_eng, y_train_seq, y_val_seq = train_test_split(
    X_sequential, X_engineered, y, test_size=0.2, random_state=42)

# 4. Build the LSTM model
def build_lstm_nn(input_shape_seq, input_shape_eng):
    seq_inputs = Input(shape=input_shape_seq)
    x = Bidirectional(LSTM(400, return_sequences=False))(seq_inputs)  # Increased from 300 to 400
    x = Dropout(0.2)(x)  # Reduced dropout to 0.2

    eng_inputs = Input(shape=input_shape_eng)

    combined = Concatenate()([x, eng_inputs])

    x = Dense(256, activation='relu')(combined)  # Increased from 128 to 256
    x = Dropout(0.18)(x)  # Reduced dropout to 0.2
    x = Dense(128, activation='relu')(x)  # Increased from 64 to 128
    x = Dense(64, activation='relu')(x)  # Increased from 32 to 64
    x = Dense(16, activation='relu')(x)  # New Dense layer with 16 units

    outputs = Dense(1)(x)

    model = Model(inputs=[seq_inputs, eng_inputs], outputs=outputs)
    model.compile(optimizer=Adam(learning_rate=0.00005), loss='mean_squared_error')

    return model

# 5. Build and train the model
input_shape_seq = (X_train_seq.shape[1], X_train_seq.shape[2])
input_shape_eng = (X_train_eng.shape[1],)
model = build_lstm_nn(input_shape_seq, input_shape_eng)

early_stopping = EarlyStopping(monitor='val_loss', patience=7, restore_best_weights=True)
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=4)  # Reduced factor for more gradual reduction

# Train the model
history = model.fit(
    [X_train_seq, X_train_eng], y_train_seq,
    validation_data=([X_val_seq, X_val_eng], y_val_seq),
    epochs=100,
    batch_size=528,
    callbacks=[early_stopping, lr_scheduler],
    verbose=1
)

# Predict and calculate RMSE
y_val_pred = model.predict([X_val_seq, X_val_eng])
rmse = np.sqrt(mean_squared_error(y_val_seq, y_val_pred))
print(f'Validation RMSE: {rmse}')

# ---------------------------------------
# 6. Generate Predictions for Test Data
# ---------------------------------------

test_data = pd.merge(users_test, user_features_test, on='ID')
test_data = feature_engineering(test_data)

# Sequential data for the test set
X_test_seq = test_data[time_series_columns].values.reshape(test_data.shape[0], -1)
X_test_seq = scaler_seq.transform(X_test_seq).reshape(test_data.shape[0], 16, 4)

# Engineered features for the test set
X_test_eng = scaler_eng.transform(test_data[engineered_columns].values)

# Predict on test data
test_predictions = model.predict([X_test_seq, X_test_eng])

# Create the submission file
submission = pd.DataFrame({
    'ID': test_data['ID'],  # Replace 'ID' with actual column name in the test set
    'TARGET': test_predictions.flatten()
})

submission.to_csv('submission.csv', index=False)

Epoch 1/100
[1m1332/1332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 10ms/step - loss: 3.8481 - val_loss: 2.4237 - learning_rate: 5.0000e-05
Epoch 2/100
[1m 169/1332[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m10s[0m 9ms/step - loss: 1.4036

KeyboardInterrupt: 

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional, Input, Concatenate
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam

# 1. Feature Engineering: Aggregate features and interactions
def feature_engineering(df):
    time_series_columns = {
        'retention': [f'RetentionD{i}' for i in range(16)],
        'ad_revenue': [f'AdRevenueD{i}' for i in range(16)],
        'iap_revenue': [f'IAPRevenueD{i}' for i in range(16)],
        'level_advanced': [f'LevelAdvancedCountD{i}' for i in range(16)]
    }

    # Aggregate features
    for feature, columns in time_series_columns.items():
        df.loc[:, f'{feature}_total'] = df[columns].sum(axis=1)
        df.loc[:, f'{feature}_mean'] = df[columns].mean(axis=1)
        df.loc[:, f'{feature}_max'] = df[columns].max(axis=1)
        df.loc[:, f'{feature}_std'] = df[columns].std(axis=1)

    # Interaction features
    df.loc[:, 'retention_ad_revenue_interaction'] = df['retention_total'] * df['ad_revenue_total']
    df.loc[:, 'retention_iap_revenue_interaction'] = df['retention_total'] * df['iap_revenue_total']
    df.loc[:, 'retention_level_interaction'] = df['retention_total'] * df['level_advanced_total']

    # Additional interactions
    df.loc[:, 'ad_iap_interaction'] = df['ad_revenue_total'] * df['iap_revenue_total']
    df.loc[:, 'level_ad_interaction'] = df['level_advanced_total'] * df['ad_revenue_total']
    df.loc[:, 'level_iap_interaction'] = df['level_advanced_total'] * df['iap_revenue_total']

    return df

# 2. Load, merge, and process training data
train_data = pd.merge(users_train, user_features_train, on='ID')
train_data = pd.merge(train_data, targets_train, on='ID')
train_data = feature_engineering(train_data)

engineered_columns = [
    'retention_total', 'retention_mean', 'retention_max', 'retention_std',
    'ad_revenue_total', 'ad_revenue_mean', 'ad_revenue_max', 'ad_revenue_std',
    'iap_revenue_total', 'iap_revenue_mean', 'iap_revenue_max', 'iap_revenue_std',
    'level_advanced_total', 'level_advanced_mean', 'level_advanced_max', 'level_advanced_std',
    'retention_ad_revenue_interaction', 'retention_iap_revenue_interaction', 'retention_level_interaction',
    'ad_iap_interaction', 'level_ad_interaction', 'level_iap_interaction'
]

time_series_columns = [
    *[f'RetentionD{i}' for i in range(16)],
    *[f'AdRevenueD{i}' for i in range(16)],
    *[f'IAPRevenueD{i}' for i in range(16)],
    *[f'LevelAdvancedCountD{i}' for i in range(16)]
]

train_data[time_series_columns] = train_data[time_series_columns].astype(float)

# Apply different scalers to sequential and engineered features
scaler_seq = StandardScaler()
scaler_eng = StandardScaler()

# Sequential data scaling
X_sequential = train_data[time_series_columns].values.reshape(train_data.shape[0], -1)
X_sequential = scaler_seq.fit_transform(X_sequential).reshape(train_data.shape[0], 16, 4)

# Engineered features scaling
X_engineered = scaler_eng.fit_transform(train_data[engineered_columns].values)

y = train_data['TARGET'].values.astype('float32')

# 3. Train-test split
X_train_seq, X_val_seq, X_train_eng, X_val_eng, y_train_seq, y_val_seq = train_test_split(
    X_sequential, X_engineered, y, test_size=0.2, random_state=42)

# 4. Build the LSTM model
def build_lstm_nn(input_shape_seq, input_shape_eng):
    seq_inputs = Input(shape=input_shape_seq)
    x = Bidirectional(LSTM(400, return_sequences=False))(seq_inputs)  # Increased from 300 to 400
    x = Dropout(0.2)(x)  # Reduced dropout to 0.2

    eng_inputs = Input(shape=input_shape_eng)

    combined = Concatenate()([x, eng_inputs])

    x = Dense(256, activation='relu')(combined)  # Increased from 128 to 256
    x = Dropout(0.2)(x)  # Reduced dropout to 0.2
    x = Dense(128, activation='relu')(x)  # Increased from 64 to 128
    x = Dense(64, activation='relu')(x)  # Increased from 32 to 64
    x = Dense(32, activation='relu')(x)  # Increased from 32 to 64
    x = Dense(16, activation='relu')(x)  # New Dense layer with 16 units

    outputs = Dense(1)(x)

    model = Model(inputs=[seq_inputs, eng_inputs], outputs=outputs)
    model.compile(optimizer=Adam(learning_rate=0.00005), loss='mean_squared_error')

    return model

# 5. Build and train the model
input_shape_seq = (X_train_seq.shape[1], X_train_seq.shape[2])
input_shape_eng = (X_train_eng.shape[1],)
model = build_lstm_nn(input_shape_seq, input_shape_eng)

early_stopping = EarlyStopping(monitor='val_loss', patience=11, restore_best_weights=True)
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=4)  # Reduced factor for more gradual reduction

# Train the model
history = model.fit(
    [X_train_seq, X_train_eng], y_train_seq,
    validation_data=([X_val_seq, X_val_eng], y_val_seq),
    epochs=100,
    batch_size=528,
    callbacks=[early_stopping, lr_scheduler],
    verbose=1
)

# Predict and calculate RMSE
y_val_pred = model.predict([X_val_seq, X_val_eng])
rmse = np.sqrt(mean_squared_error(y_val_seq, y_val_pred))
print(f'Validation RMSE: {rmse}')

# ---------------------------------------
# 6. Generate Predictions for Test Data
# ---------------------------------------

test_data = pd.merge(users_test, user_features_test, on='ID')
test_data = feature_engineering(test_data)

# Sequential data for the test set
X_test_seq = test_data[time_series_columns].values.reshape(test_data.shape[0], -1)
X_test_seq = scaler_seq.transform(X_test_seq).reshape(test_data.shape[0], 16, 4)

# Engineered features for the test set
X_test_eng = scaler_eng.transform(test_data[engineered_columns].values)

# Predict on test data
test_predictions = model.predict([X_test_seq, X_test_eng])

# Create the submission file
submission = pd.DataFrame({
    'ID': test_data['ID'],  # Replace 'ID' with actual column name in the test set
    'TARGET': test_predictions.flatten()
})

submission.to_csv('submission.csv', index=False)

Epoch 1/100
[1m1332/1332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 11ms/step - loss: 3.4883 - val_loss: 2.4419 - learning_rate: 5.0000e-05
Epoch 2/100
[1m1332/1332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 10ms/step - loss: 2.2530 - val_loss: 2.2507 - learning_rate: 5.0000e-05
Epoch 3/100
[1m1332/1332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 10ms/step - loss: 2.1054 - val_loss: 2.2452 - learning_rate: 5.0000e-05
Epoch 4/100
[1m1332/1332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 10ms/step - loss: 1.8878 - val_loss: 2.2925 - learning_rate: 5.0000e-05
Epoch 5/100
[1m1332/1332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 10ms/step - loss: 2.3417 - val_loss: 2.2059 - learning_rate: 5.0000e-05
Epoch 6/100
[1m1332/1332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 10ms/step - loss: 1.9834 - val_loss: 2.1903 - learning_rate: 5.0000e-05
Epoch 7/100
[1m1332/1332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 1

KeyboardInterrupt: 

In [None]:
#1.4688169956207275
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional, Input, Concatenate
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam

# 1. Feature Engineering: Aggregate features and interactions
def feature_engineering(df):
    time_series_columns = {
        'retention': [f'RetentionD{i}' for i in range(16)],
        'ad_revenue': [f'AdRevenueD{i}' for i in range(16)],
        'iap_revenue': [f'IAPRevenueD{i}' for i in range(16)],
        'level_advanced': [f'LevelAdvancedCountD{i}' for i in range(16)]
    }

    # Aggregate features
    for feature, columns in time_series_columns.items():
        df.loc[:, f'{feature}_total'] = df[columns].sum(axis=1)
        df.loc[:, f'{feature}_mean'] = df[columns].mean(axis=1)
        df.loc[:, f'{feature}_max'] = df[columns].max(axis=1)
        df.loc[:, f'{feature}_std'] = df[columns].std(axis=1)

    # Interaction features
    df.loc[:, 'retention_ad_revenue_interaction'] = df['retention_total'] * df['ad_revenue_total']
    df.loc[:, 'retention_iap_revenue_interaction'] = df['retention_total'] * df['iap_revenue_total']
    df.loc[:, 'retention_level_interaction'] = df['retention_total'] * df['level_advanced_total']

    # Additional interactions
    df.loc[:, 'ad_iap_interaction'] = df['ad_revenue_total'] * df['iap_revenue_total']
    df.loc[:, 'level_ad_interaction'] = df['level_advanced_total'] * df['ad_revenue_total']
    df.loc[:, 'level_iap_interaction'] = df['level_advanced_total'] * df['iap_revenue_total']

    return df

# 2. Load, merge, and process training data
train_data = pd.merge(users_train, user_features_train, on='ID')
train_data = pd.merge(train_data, targets_train, on='ID')
train_data = feature_engineering(train_data)

engineered_columns = [
    'retention_total', 'retention_mean', 'retention_max', 'retention_std',
    'ad_revenue_total', 'ad_revenue_mean', 'ad_revenue_max', 'ad_revenue_std',
    'iap_revenue_total', 'iap_revenue_mean', 'iap_revenue_max', 'iap_revenue_std',
    'level_advanced_total', 'level_advanced_mean', 'level_advanced_max', 'level_advanced_std',
    'retention_ad_revenue_interaction', 'retention_iap_revenue_interaction', 'retention_level_interaction',
    'ad_iap_interaction', 'level_ad_interaction', 'level_iap_interaction'
]

time_series_columns = [
    *[f'RetentionD{i}' for i in range(16)],
    *[f'AdRevenueD{i}' for i in range(16)],
    *[f'IAPRevenueD{i}' for i in range(16)],
    *[f'LevelAdvancedCountD{i}' for i in range(16)]
]

train_data[time_series_columns] = train_data[time_series_columns].astype(float)

# Apply different scalers to sequential and engineered features
scaler_seq = StandardScaler()
scaler_eng = StandardScaler()

# Sequential data scaling
X_sequential = train_data[time_series_columns].values.reshape(train_data.shape[0], -1)
X_sequential = scaler_seq.fit_transform(X_sequential).reshape(train_data.shape[0], 16, 4)

# Engineered features scaling
X_engineered = scaler_eng.fit_transform(train_data[engineered_columns].values)

y = train_data['TARGET'].values.astype('float32')

# 3. Train-test split
X_train_seq, X_val_seq, X_train_eng, X_val_eng, y_train_seq, y_val_seq = train_test_split(
    X_sequential, X_engineered, y, test_size=0.2, random_state=42)

# 4. Build the LSTM model
def build_lstm_nn(input_shape_seq, input_shape_eng):
    seq_inputs = Input(shape=input_shape_seq)
    # Increase LSTM units back to 400 and remove L2 regularization from the LSTM
    x = Bidirectional(LSTM(400, return_sequences=False))(seq_inputs)
    x = Dropout(0.3)(x)

    eng_inputs = Input(shape=input_shape_eng)

    combined = Concatenate()([x, eng_inputs])

    # Keep L2 regularization in Dense layers only
    x = Dense(256, activation='relu', kernel_regularizer=l2(0.001))(combined)  # Increased back to 256 units
    x = Dropout(0.2)(x)
    x = Dense(128, activation='relu', kernel_regularizer=l2(0.001))(x)  # Increased to 128 units
    x = Dense(64, activation='relu', kernel_regularizer=l2(0.001))(x)

    outputs = Dense(1)(x)

    # Compile the model with Adam optimizer
    model = Model(inputs=[seq_inputs, eng_inputs], outputs=outputs)
    model.compile(optimizer=Adam(learning_rate=0.00002), loss='mean_squared_error')

    return model



# 5. Build and train the model
input_shape_seq = (X_train_seq.shape[1], X_train_seq.shape[2])
input_shape_eng = (X_train_eng.shape[1],)
model = build_lstm_nn(input_shape_seq, input_shape_eng)

early_stopping = EarlyStopping(monitor='val_loss', patience=9, restore_best_weights=True)
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=4)  # Reduced factor for more gradual reduction

# Train the model
history = model.fit(
    [X_train_seq, X_train_eng], y_train_seq,
    validation_data=([X_val_seq, X_val_eng], y_val_seq),
    epochs=100,
    batch_size=528,
    callbacks=[early_stopping, lr_scheduler],
    verbose=1
)

# Predict and calculate RMSE
y_val_pred = model.predict([X_val_seq, X_val_eng])
rmse = np.sqrt(mean_squared_error(y_val_seq, y_val_pred))
print(f'Validation RMSE: {rmse}')

# ---------------------------------------
# 6. Generate Predictions for Test Data
# ---------------------------------------

test_data = pd.merge(users_test, user_features_test, on='ID')
test_data = feature_engineering(test_data)

# Sequential data for the test set
X_test_seq = test_data[time_series_columns].values.reshape(test_data.shape[0], -1)
X_test_seq = scaler_seq.transform(X_test_seq).reshape(test_data.shape[0], 16, 4)

# Engineered features for the test set
X_test_eng = scaler_eng.transform(test_data[engineered_columns].values)

# Predict on test data
test_predictions = model.predict([X_test_seq, X_test_eng])

# Create the submission file
submission = pd.DataFrame({
    'ID': test_data['ID'],  # Replace 'ID' with actual column name in the test set
    'TARGET': test_predictions.flatten()
})

submission.to_csv('submission.csv', index=False)


Epoch 1/100
[1m1332/1332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 10ms/step - loss: 7.5453 - val_loss: 4.1500 - learning_rate: 2.0000e-05
Epoch 2/100
[1m1332/1332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 10ms/step - loss: 3.3120 - val_loss: 3.4330 - learning_rate: 2.0000e-05
Epoch 3/100
[1m1332/1332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 10ms/step - loss: 3.1997 - val_loss: 3.1686 - learning_rate: 2.0000e-05
Epoch 4/100
[1m1332/1332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 10ms/step - loss: 2.7527 - val_loss: 3.0114 - learning_rate: 2.0000e-05
Epoch 5/100
[1m1332/1332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 10ms/step - loss: 3.1133 - val_loss: 2.9395 - learning_rate: 2.0000e-05
Epoch 6/100
[1m1332/1332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 10ms/step - loss: 2.5316 - val_loss: 2.8155 - learning_rate: 2.0000e-05
Epoch 7/100
[1m1332/1332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 1

# Base Lstm

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional, Input, Concatenate
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam

# 1. Feature Engineering: Aggregate features and interactions
def feature_engineering(df):
    time_series_columns = {
        'retention': [f'RetentionD{i}' for i in range(16)],
        'ad_revenue': [f'AdRevenueD{i}' for i in range(16)],
        'iap_revenue': [f'IAPRevenueD{i}' for i in range(16)],
        'level_advanced': [f'LevelAdvancedCountD{i}' for i in range(16)]
    }

    # Aggregate features
    for feature, columns in time_series_columns.items():
        df.loc[:, f'{feature}_total'] = df[columns].sum(axis=1)
        df.loc[:, f'{feature}_mean'] = df[columns].mean(axis=1)
        df.loc[:, f'{feature}_max'] = df[columns].max(axis=1)
        df.loc[:, f'{feature}_std'] = df[columns].std(axis=1)

    # Interaction features
    df.loc[:, 'retention_ad_revenue_interaction'] = df['retention_total'] * df['ad_revenue_total']
    df.loc[:, 'retention_iap_revenue_interaction'] = df['retention_total'] * df['iap_revenue_total']
    df.loc[:, 'retention_level_interaction'] = df['retention_total'] * df['level_advanced_total']

    return df

# 2. Load and process training data (adjust with your file paths)
train_data = pd.merge(users_train, user_features_train, on='ID')
train_data = pd.merge(train_data, targets_train, on='ID')
train_data = feature_engineering(train_data)

# Define time-series and engineered columns
time_series_columns = [
    *[f'RetentionD{i}' for i in range(16)],
    *[f'AdRevenueD{i}' for i in range(16)],
    *[f'IAPRevenueD{i}' for i in range(16)],
    *[f'LevelAdvancedCountD{i}' for i in range(16)]
]

engineered_columns = [
    'retention_total', 'retention_mean', 'retention_max', 'retention_std',
    'ad_revenue_total', 'ad_revenue_mean', 'ad_revenue_max', 'ad_revenue_std',
    'iap_revenue_total', 'iap_revenue_mean', 'iap_revenue_max', 'iap_revenue_std',
    'level_advanced_total', 'level_advanced_mean', 'level_advanced_max', 'level_advanced_std',
    'retention_ad_revenue_interaction', 'retention_iap_revenue_interaction', 'retention_level_interaction'
]

# Apply different scalers to sequential and engineered features
scaler_seq = StandardScaler()
scaler_eng = StandardScaler()

X_sequential = train_data[time_series_columns].values.reshape(train_data.shape[0], -1)
X_sequential = scaler_seq.fit_transform(X_sequential).reshape(train_data.shape[0], 16, 4)

X_engineered = scaler_eng.fit_transform(train_data[engineered_columns].values)

y = train_data['TARGET'].values.astype('float32')

# 3. Train-test split
X_train_seq, X_val_seq, X_train_eng, X_val_eng, y_train, y_val = train_test_split(
    X_sequential, X_engineered, y, test_size=0.2, random_state=42)

# 4. Build the LSTM model
def build_lstm_nn(input_shape_seq, input_shape_eng):
    seq_inputs = Input(shape=input_shape_seq)
    x = Bidirectional(LSTM(300, return_sequences=False))(seq_inputs)
    x = Dropout(0.3)(x)

    eng_inputs = Input(shape=input_shape_eng)

    combined = Concatenate()([x, eng_inputs])

    x = Dense(128, activation='relu', kernel_regularizer=l2(0.001))(combined)
    x = Dropout(0.3)(x)
    x = Dense(64, activation='relu')(x)
    x = Dense(32, activation='relu')(x)

    outputs = Dense(1)(x)

    model = Model(inputs=[seq_inputs, eng_inputs], outputs=outputs)
    model.compile(optimizer=Adam(learning_rate=0.00005), loss='mean_squared_error')

    return model

# 5. Build and train the model
input_shape_seq = (X_train_seq.shape[1], X_train_seq.shape[2])
input_shape_eng = (X_train_eng.shape[1],)
model = build_lstm_nn(input_shape_seq, input_shape_eng)

early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=4)

# Train the model
history = model.fit(
    [X_train_seq, X_train_eng], y_train,
    validation_data=([X_val_seq, X_val_eng], y_val),
    epochs=100,
    batch_size=32,
    callbacks=[early_stopping, lr_scheduler],
    verbose=1
)

# Predict and calculate RMSE
y_val_pred = model.predict([X_val_seq, X_val_eng])
rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
print(f'Validation RMSE: {rmse}')


test_data = pd.merge(users_test, user_features_test, on='ID')
test_data = feature_engineering(test_data)

# Sequential data for the test set
X_test_seq = test_data[time_series_columns].values.reshape(test_data.shape[0], -1)
X_test_seq = scaler_seq.transform(X_test_seq).reshape(test_data.shape[0], 16, 4)

# Engineered features for the test set
X_test_eng = scaler_eng.transform(test_data[engineered_columns].values)

# Predict on test data
test_predictions = model.predict([X_test_seq, X_test_eng])

# Create the submission file
submission = pd.DataFrame({
    'ID': test_data['ID'],  # Replace 'ID' with actual column name in the test set
    'TARGET': test_predictions.flatten()
})

submission.to_csv('submission.csv', index=False)


Epoch 1/100
[1m21965/21965[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m158s[0m 7ms/step - loss: 3.1538 - val_loss: 2.3952 - learning_rate: 5.0000e-05
Epoch 2/100
[1m21965/21965[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m155s[0m 7ms/step - loss: 2.3776 - val_loss: 2.3898 - learning_rate: 5.0000e-05
Epoch 3/100
[1m21965/21965[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m156s[0m 7ms/step - loss: 2.1626 - val_loss: 2.3598 - learning_rate: 5.0000e-05
Epoch 4/100
[1m21965/21965[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m156s[0m 7ms/step - loss: 2.4342 - val_loss: 2.4163 - learning_rate: 5.0000e-05
Epoch 5/100
[1m21965/21965[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m155s[0m 7ms/step - loss: 2.0585 - val_loss: 2.3966 - learning_rate: 5.0000e-05
Epoch 6/100
[1m21965/21965[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m155s[0m 7ms/step - loss: 1.8956 - val_loss: 2.4505 - learning_rate: 5.0000e-05
Epoch 7/100
[1m21965/21965[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, GRU, Dense, Dropout, Bidirectional, Input, Concatenate, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l1_l2
from tensorflow.keras.optimizers import Adam

# 1. Feature Engineering: Aggregate features and interactions
def feature_engineering(df):
    time_series_columns = {
        'retention': [f'RetentionD{i}' for i in range(16)],
        'ad_revenue': [f'AdRevenueD{i}' for i in range(16)],
        'iap_revenue': [f'IAPRevenueD{i}' for i in range(16)],
        'level_advanced': [f'LevelAdvancedCountD{i}' for i in range(16)]
    }

    # Aggregate features
    for feature, columns in time_series_columns.items():
        df.loc[:, f'{feature}_total'] = df[columns].sum(axis=1)
        df.loc[:, f'{feature}_mean'] = df[columns].mean(axis=1)
        df.loc[:, f'{feature}_max'] = df[columns].max(axis=1)
        df.loc[:, f'{feature}_std'] = df[columns].std(axis=1)

    # Interaction features
    df.loc[:, 'retention_ad_revenue_interaction'] = df['retention_total'] * df['ad_revenue_total']
    df.loc[:, 'retention_iap_revenue_interaction'] = df['retention_total'] * df['iap_revenue_total']
    df.loc[:, 'retention_level_interaction'] = df['retention_total'] * df['level_advanced_total']

    return df

# 2. Load, merge, and process training data
train_data = pd.merge(users_train, user_features_train, on='ID')
train_data = pd.merge(train_data, targets_train, on='ID')
train_data = feature_engineering(train_data)

engineered_columns = [
    'retention_total', 'retention_mean', 'retention_max', 'retention_std',
    'ad_revenue_total', 'ad_revenue_mean', 'ad_revenue_max', 'ad_revenue_std',
    'iap_revenue_total', 'iap_revenue_mean', 'iap_revenue_max', 'iap_revenue_std',
    'level_advanced_total', 'level_advanced_mean', 'level_advanced_max', 'level_advanced_std',
    'retention_ad_revenue_interaction', 'retention_iap_revenue_interaction', 'retention_level_interaction'
]

time_series_columns = [
    *[f'RetentionD{i}' for i in range(16)],
    *[f'AdRevenueD{i}' for i in range(16)],
    *[f'IAPRevenueD{i}' for i in range(16)],
    *[f'LevelAdvancedCountD{i}' for i in range(16)]
]

train_data[time_series_columns] = train_data[time_series_columns].astype(float)

# Apply different scalers to sequential and engineered features
scaler_seq = StandardScaler()
scaler_eng = StandardScaler()

# Sequential data scaling
X_sequential = train_data[time_series_columns].values.reshape(train_data.shape[0], -1)
X_sequential = scaler_seq.fit_transform(X_sequential).reshape(train_data.shape[0], 16, 4)

# Engineered features scaling
X_engineered = scaler_eng.fit_transform(train_data[engineered_columns].values)

y = train_data['TARGET'].values.astype('float32')

# 3. Train-test split
X_train_seq, X_val_seq, X_train_eng, X_val_eng, y_train_seq, y_val_seq = train_test_split(
    X_sequential, X_engineered, y, test_size=0.2, random_state=42)

# 4. Build the LSTM or GRU model (with stronger regularization and batch normalization)
def build_lstm_nn(input_shape_seq, input_shape_eng, use_gru=False):
    seq_inputs = Input(shape=input_shape_seq)

    if use_gru:
        x = Bidirectional(GRU(300, return_sequences=False, kernel_regularizer=l1_l2(l1=0.001, l2=0.001)))(seq_inputs)  # GRU with L1 & L2 regularization
    else:
        x = Bidirectional(LSTM(300, return_sequences=False, kernel_regularizer=l1_l2(l1=0.001, l2=0.001)))(seq_inputs)  # LSTM with L1 & L2 regularization

    x = BatchNormalization()(x)  # Batch normalization after LSTM/GRU
    x = Dropout(0.5)(x)  # Increase dropout to 0.5

    eng_inputs = Input(shape=input_shape_eng)

    combined = Concatenate()([x, eng_inputs])

    x = Dense(128, activation='relu', kernel_regularizer=l1_l2(l1=0.001, l2=0.001))(combined)
    x = BatchNormalization()(x)  # Batch normalization after Dense
    x = Dropout(0.5)(x)  # Increase dropout to 0.5
    x = Dense(64, activation='relu')(x)
    x = Dense(32, activation='relu')(x)

    outputs = Dense(1)(x)

    model = Model(inputs=[seq_inputs, eng_inputs], outputs=outputs)
    model.compile(optimizer=Adam(learning_rate=0.00001), loss='mean_squared_error')

    return model

# 5. Build and train the model (with GRU option for comparison)
input_shape_seq = (X_train_seq.shape[1], X_train_seq.shape[2])
input_shape_eng = (X_train_eng.shape[1],)
model = build_lstm_nn(input_shape_seq, input_shape_eng, use_gru=False)  # Switch to True if you want to use GRU

early_stopping = EarlyStopping(monitor='val_loss', patience=8, restore_best_weights=True)  # Further reduced patience
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5)

# Train the model
history = model.fit(
    [X_train_seq, X_train_eng], y_train_seq,
    validation_data=([X_val_seq, X_val_eng], y_val_seq),
    epochs=150,
    batch_size=64,  # Increased batch size
    callbacks=[early_stopping, lr_scheduler],
    verbose=1
)

# Predict and calculate RMSE
y_val_pred = model.predict([X_val_seq, X_val_eng])
rmse = np.sqrt(mean_squared_error(y_val_seq, y_val_pred))
print(f'Validation RMSE: {rmse}')

# ---------------------------------------
# 6. Generate Predictions for Test Data
# ---------------------------------------

test_data = pd.merge(users_test, user_features_test, on='ID')
test_data = feature_engineering(test_data)

# Sequential data for the test set
X_test_seq = test_data[time_series_columns].values.reshape(test_data.shape[0], -1)
X_test_seq = scaler_seq.transform(X_test_seq).reshape(test_data.shape[0], 16, 4)

# Engineered features for the test set
X_test_eng = scaler_eng.transform(test_data[engineered_columns].values)

# Predict on test data
test_predictions = model.predict([X_test_seq, X_test_eng])

# Create the submission file
submission = pd.DataFrame({
    'ID': test_data['ID'],  # Replace 'ID' with actual column name in the test set
    'TARGET': test_predictions.flatten()
})

submission.to_csv('submission.csv', index=False)


Epoch 1/150
[1m10983/10983[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 9ms/step - loss: 10.5097 - val_loss: 8.1850 - learning_rate: 1.0000e-05
Epoch 2/150
[1m10983/10983[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 9ms/step - loss: 7.8536 - val_loss: 8.5403 - learning_rate: 1.0000e-05
Epoch 3/150
[1m10983/10983[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 9ms/step - loss: 8.1881 - val_loss: 8.1344 - learning_rate: 1.0000e-05
Epoch 4/150
[1m10650/10983[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m2s[0m 8ms/step - loss: 6.6487

KeyboardInterrupt: 

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional, Input, Concatenate, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers import RMSprop

# 1. Feature Engineering: Aggregate features and interactions
def feature_engineering(df):
    time_series_columns = {
        'retention': [f'RetentionD{i}' for i in range(16)],
        'ad_revenue': [f'AdRevenueD{i}' for i in range(16)],
        'iap_revenue': [f'IAPRevenueD{i}' for i in range(16)],
        'level_advanced': [f'LevelAdvancedCountD{i}' for i in range(16)]
    }

    for feature, columns in time_series_columns.items():
        df.loc[:, f'{feature}_total'] = df[columns].sum(axis=1)
        df.loc[:, f'{feature}_mean'] = df[columns].mean(axis=1)
        df.loc[:, f'{feature}_max'] = df[columns].max(axis=1)
        df.loc[:, f'{feature}_std'] = df[columns].std(axis=1)

    df.loc[:, 'retention_ad_revenue_interaction'] = df['retention_total'] * df['ad_revenue_total']
    df.loc[:, 'retention_iap_revenue_interaction'] = df['retention_total'] * df['iap_revenue_total']
    df.loc[:, 'retention_level_interaction'] = df['retention_total'] * df['level_advanced_total']

    return df

# 2. Load, merge, and process training data
train_data = pd.merge(users_train, user_features_train, on='ID')
train_data = pd.merge(train_data, targets_train, on='ID')
train_data = feature_engineering(train_data)

engineered_columns = [
    'retention_total', 'retention_mean', 'retention_max', 'retention_std',
    'ad_revenue_total', 'ad_revenue_mean', 'ad_revenue_max', 'ad_revenue_std',
    'iap_revenue_total', 'iap_revenue_mean', 'iap_revenue_max', 'iap_revenue_std',
    'level_advanced_total', 'level_advanced_mean', 'level_advanced_max', 'level_advanced_std',
    'retention_ad_revenue_interaction', 'retention_iap_revenue_interaction', 'retention_level_interaction'
]

time_series_columns = [
    *[f'RetentionD{i}' for i in range(16)],
    *[f'AdRevenueD{i}' for i in range(16)],
    *[f'IAPRevenueD{i}' for i in range(16)],
    *[f'LevelAdvancedCountD{i}' for i in range(16)]
]

train_data[time_series_columns] = train_data[time_series_columns].astype(float)

# Apply different scalers to sequential and engineered features
scaler_seq = StandardScaler()
scaler_eng = StandardScaler()

X_sequential = train_data[time_series_columns].values.reshape(train_data.shape[0], -1)
X_sequential = scaler_seq.fit_transform(X_sequential).reshape(train_data.shape[0], 16, 4)

X_engineered = scaler_eng.fit_transform(train_data[engineered_columns].values)

y = train_data['TARGET'].values.astype('float32')

# 3. Train-test split
X_train_seq, X_val_seq, X_train_eng, X_val_eng, y_train_seq, y_val_seq = train_test_split(
    X_sequential, X_engineered, y, test_size=0.2, random_state=42)

# 4. Build the optimized LSTM model
def build_lstm_nn(input_shape_seq, input_shape_eng):
    seq_inputs = Input(shape=input_shape_seq)
    x = Bidirectional(LSTM(200, return_sequences=True))(seq_inputs)  # Stacked LSTM
    x = LSTM(100, return_sequences=False)(x)
    x = Dropout(0.3)(x)

    eng_inputs = Input(shape=input_shape_eng)

    combined = Concatenate()([x, eng_inputs])

    x = Dense(128, activation='relu', kernel_regularizer=l2(0.001))(combined)
    x = BatchNormalization()(x)
    x = Dropout(0.3)(x)
    x = Dense(64, activation='relu')(x)
    x = Dense(32, activation='relu')(x)

    outputs = Dense(1)(x)

    optimizer = RMSprop(learning_rate=0.00003, clipvalue=1.0)
    model = Model(inputs=[seq_inputs, eng_inputs], outputs=outputs)
    model.compile(optimizer=optimizer, loss='mean_squared_error')

    return model

# 5. Build and train the model
input_shape_seq = (X_train_seq.shape[1], X_train_seq.shape[2])
input_shape_eng = (X_train_eng.shape[1],)
model = build_lstm_nn(input_shape_seq, input_shape_eng)

early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=4)

history = model.fit(
    [X_train_seq, X_train_eng], y_train_seq,
    validation_data=([X_val_seq, X_val_eng], y_val_seq),
    epochs=150,
    batch_size=64,
    callbacks=[early_stopping, lr_scheduler],
    verbose=1
)

# Predict and calculate RMSE
y_val_pred = model.predict([X_val_seq, X_val_eng])
rmse = np.sqrt(mean_squared_error(y_val_seq, y_val_pred))
print(f'Validation RMSE: {rmse}')

# ---------------------------------------
# 6. Generate Predictions for Test Data
# ---------------------------------------
test_data = pd.merge(users_test, user_features_test, on='ID')
test_data = feature_engineering(test_data)

X_test_seq = test_data[time_series_columns].values.reshape(test_data.shape[0], -1)
X_test_seq = scaler_seq.transform(X_test_seq).reshape(test_data.shape[0], 16, 4)

X_test_eng = scaler_eng.transform(test_data[engineered_columns].values)

test_predictions = model.predict([X_test_seq, X_test_eng])

submission = pd.DataFrame({
    'ID': test_data['ID'],
    'TARGET': test_predictions.flatten()
})

submission.to_csv('submission.csv', index=False)


Epoch 1/150
[1m10983/10983[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 8ms/step - loss: 5.0182 - val_loss: 2.7338 - learning_rate: 3.0000e-05
Epoch 2/150
[1m10983/10983[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 8ms/step - loss: 4.2428 - val_loss: 2.5363 - learning_rate: 3.0000e-05
Epoch 3/150
[1m10983/10983[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m92s[0m 8ms/step - loss: 3.1556 - val_loss: 2.6596 - learning_rate: 3.0000e-05
Epoch 4/150
[1m10983/10983[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 8ms/step - loss: 3.8051 - val_loss: 2.6438 - learning_rate: 3.0000e-05
Epoch 5/150
[1m10983/10983[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m92s[0m 8ms/step - loss: 3.4768 - val_loss: 2.7255 - learning_rate: 3.0000e-05
Epoch 6/150
[1m10983/10983[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m92s[0m 8ms/step - loss: 3.4464 - val_loss: 2.4303 - learning_rate: 3.0000e-05
Epoch 7/150
[1m10983/10983[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional, Input, Concatenate
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Nadam
import shap

# 1. Feature Engineering: Aggregate features and interactions
def feature_engineering(df):
    time_series_columns = {
        'retention': [f'RetentionD{i}' for i in range(16)],
        'ad_revenue': [f'AdRevenueD{i}' for i in range(16)],
        'iap_revenue': [f'IAPRevenueD{i}' for i in range(16)],
        'level_advanced': [f'LevelAdvancedCountD{i}' for i in range(16)]
    }

    for feature, columns in time_series_columns.items():
        df.loc[:, f'{feature}_total'] = df[columns].sum(axis=1)
        df.loc[:, f'{feature}_mean'] = df[columns].mean(axis=1)
        df.loc[:, f'{feature}_max'] = df[columns].max(axis=1)
        df.loc[:, f'{feature}_std'] = df[columns].std(axis=1)

    df.loc[:, 'retention_ad_revenue_interaction'] = df['retention_total'] * df['ad_revenue_total']
    df.loc[:, 'retention_iap_revenue_interaction'] = df['retention_total'] * df['iap_revenue_total']
    df.loc[:, 'retention_level_interaction'] = df['retention_total'] * df['level_advanced_total']

    return df

# 2. Load, merge, and process training data
train_data = pd.merge(users_train, user_features_train, on='ID')
train_data = pd.merge(train_data, targets_train, on='ID')
train_data = feature_engineering(train_data)

engineered_columns = [
    'retention_total', 'retention_mean', 'retention_max', 'retention_std',
    'ad_revenue_total', 'ad_revenue_mean', 'ad_revenue_max', 'ad_revenue_std',
    'iap_revenue_total', 'iap_revenue_mean', 'iap_revenue_max', 'iap_revenue_std',
    'level_advanced_total', 'level_advanced_mean', 'level_advanced_max', 'level_advanced_std',
    'retention_ad_revenue_interaction', 'retention_iap_revenue_interaction', 'retention_level_interaction'
]

time_series_columns = [
    *[f'RetentionD{i}' for i in range(16)],
    *[f'AdRevenueD{i}' for i in range(16)],
    *[f'IAPRevenueD{i}' for i in range(16)],
    *[f'LevelAdvancedCountD{i}' for i in range(16)]
]

train_data[time_series_columns] = train_data[time_series_columns].astype(float)

# Apply different scalers to sequential and engineered features
scaler_seq = StandardScaler()
scaler_eng = StandardScaler()

# Sequential data scaling
X_sequential = train_data[time_series_columns].values.reshape(train_data.shape[0], -1)
X_sequential = scaler_seq.fit_transform(X_sequential).reshape(train_data.shape[0], 16, 4)

# Engineered features scaling
X_engineered = scaler_eng.fit_transform(train_data[engineered_columns].values)

y = train_data['TARGET'].values.astype('float32')

# 3. Train-test split
X_train_seq, X_val_seq, X_train_eng, X_val_eng, y_train_seq, y_val_seq = train_test_split(
    X_sequential, X_engineered, y, test_size=0.2, random_state=42)

# 4. SHAP Feature Importance (optional, can skip for training speed)
def get_shap_importance(model, X_val_seq, X_val_eng):
    X_val_combined = np.concatenate([X_val_seq.reshape(X_val_seq.shape[0], -1), X_val_eng], axis=1)
    explainer = shap.KernelExplainer(lambda x: model.predict([x[:, :64].reshape(-1, 16, 4), x[:, 64:]]), X_val_combined)
    shap_values = explainer.shap_values(X_val_combined, nsamples=100)
    shap_mean_importance = np.abs(shap_values).mean(axis=0)
    top_features = np.argsort(shap_mean_importance)[::-1][:20]  # Select top 20 features
    return top_features

# 5. Build the optimized LSTM model
def build_lstm_nn(input_shape_seq, input_shape_eng):
    seq_inputs = Input(shape=input_shape_seq)
    x = Bidirectional(LSTM(350, return_sequences=True))(seq_inputs)  # Increased to 350 units
    x = LSTM(150, return_sequences=False)(x)  # Stacked LSTM layer with 150 units
    x = Dropout(0.2)(x)  # Reduced Dropout to 0.2

    eng_inputs = Input(shape=input_shape_eng)

    combined = Concatenate()([x, eng_inputs])

    x = Dense(256, activation='relu', kernel_regularizer=l2(0.001))(combined)  # Increased dense layer
    x = Dropout(0.2)(x)
    x = Dense(64, activation='relu')(x)
    x = Dense(32, activation='relu')(x)

    outputs = Dense(1)(x)

    optimizer = Nadam(learning_rate=0.0001)  # Changed to Nadam optimizer
    model = Model(inputs=[seq_inputs, eng_inputs], outputs=outputs)
    model.compile(optimizer=optimizer, loss='mean_squared_error')

    return model

# 6. Build and train the model
input_shape_seq = (X_train_seq.shape[1], X_train_seq.shape[2])
input_shape_eng = (X_train_eng.shape[1],)
model = build_lstm_nn(input_shape_seq, input_shape_eng)

early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5)

history = model.fit(
    [X_train_seq, X_train_eng], y_train_seq,
    validation_data=([X_val_seq, X_val_eng], y_val_seq),
    epochs=150,
    batch_size=64,  # Increased batch size for smoother gradients
    callbacks=[early_stopping, lr_scheduler],
    verbose=1
)

# Predict and calculate RMSE
y_val_pred = model.predict([X_val_seq, X_val_eng])
rmse = np.sqrt(mean_squared_error(y_val_seq, y_val_pred))
print(f'Validation RMSE: {rmse}')

# ---------------------------------------
# 7. Generate Predictions for Test Data
# ---------------------------------------

test_data = pd.merge(users_test, user_features_test, on='ID')
test_data = feature_engineering(test_data)

# Sequential data for the test set
X_test_seq = test_data[time_series_columns].values.reshape(test_data.shape[0], -1)
X_test_seq = scaler_seq.transform(X_test_seq).reshape(test_data.shape[0], 16, 4)

# Engineered features for the test set
X_test_eng = scaler_eng.transform(test_data[engineered_columns].values)

# Predict on test data
test_predictions = model.predict([X_test_seq, X_test_eng])

# Create the submission file
submission = pd.DataFrame({
    'ID': test_data['ID'],  # Replace 'ID' with actual column name in the test set
    'TARGET': test_predictions.flatten()
})

submission.to_csv('submission.csv', index=False)


Epoch 1/150
[1m10983/10983[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m119s[0m 10ms/step - loss: 3.3191 - val_loss: 2.4092 - learning_rate: 1.0000e-04
Epoch 2/150
[1m10983/10983[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m114s[0m 10ms/step - loss: 2.4060 - val_loss: 2.2914 - learning_rate: 1.0000e-04
Epoch 3/150
[1m10983/10983[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m115s[0m 10ms/step - loss: 2.6685 - val_loss: 2.3246 - learning_rate: 1.0000e-04
Epoch 4/150
[1m10983/10983[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m115s[0m 10ms/step - loss: 2.5586 - val_loss: 2.2975 - learning_rate: 1.0000e-04
Epoch 5/150
[1m10983/10983[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m115s[0m 10ms/step - loss: 2.3241 - val_loss: 2.4082 - learning_rate: 1.0000e-04
Epoch 6/150
[1m10983/10983[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m115s[0m 10ms/step - loss: 2.0009 - val_loss: 2.2703 - learning_rate: 1.0000e-04
Epoch 7/150
[1m10983/10983[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional, Input, Concatenate, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Nadam
import shap

# 1. Feature Engineering: Aggregate features and interactions
def feature_engineering(df):
    time_series_columns = {
        'retention': [f'RetentionD{i}' for i in range(16)],
        'ad_revenue': [f'AdRevenueD{i}' for i in range(16)],
        'iap_revenue': [f'IAPRevenueD{i}' for i in range(16)],
        'level_advanced': [f'LevelAdvancedCountD{i}' for i in range(16)]
    }

    for feature, columns in time_series_columns.items():
        df.loc[:, f'{feature}_total'] = df[columns].sum(axis=1)
        df.loc[:, f'{feature}_mean'] = df[columns].mean(axis=1)
        df.loc[:, f'{feature}_max'] = df[columns].max(axis=1)
        df.loc[:, f'{feature}_std'] = df[columns].std(axis=1)

    df.loc[:, 'retention_ad_revenue_interaction'] = df['retention_total'] * df['ad_revenue_total']
    df.loc[:, 'retention_iap_revenue_interaction'] = df['retention_total'] * df['iap_revenue_total']
    df.loc[:, 'retention_level_interaction'] = df['retention_total'] * df['level_advanced_total']

    return df

# 2. Load, merge, and process training data
train_data = pd.merge(users_train, user_features_train, on='ID')
train_data = pd.merge(train_data, targets_train, on='ID')
train_data = feature_engineering(train_data)

engineered_columns = [
    'retention_total', 'retention_mean', 'retention_max', 'retention_std',
    'ad_revenue_total', 'ad_revenue_mean', 'ad_revenue_max', 'ad_revenue_std',
    'iap_revenue_total', 'iap_revenue_mean', 'iap_revenue_max', 'iap_revenue_std',
    'level_advanced_total', 'level_advanced_mean', 'level_advanced_max', 'level_advanced_std',
    'retention_ad_revenue_interaction', 'retention_iap_revenue_interaction', 'retention_level_interaction'
]

time_series_columns = [
    *[f'RetentionD{i}' for i in range(16)],
    *[f'AdRevenueD{i}' for i in range(16)],
    *[f'IAPRevenueD{i}' for i in range(16)],
    *[f'LevelAdvancedCountD{i}' for i in range(16)]
]

train_data[time_series_columns] = train_data[time_series_columns].astype(float)

# Apply different scalers to sequential and engineered features
scaler_seq = StandardScaler()
scaler_eng = StandardScaler()

# Sequential data scaling
X_sequential = train_data[time_series_columns].values.reshape(train_data.shape[0], -1)
X_sequential = scaler_seq.fit_transform(X_sequential).reshape(train_data.shape[0], 16, 4)

# Engineered features scaling
X_engineered = scaler_eng.fit_transform(train_data[engineered_columns].values)

y = train_data['TARGET'].values.astype('float32')

# 3. Train-test split
X_train_seq, X_val_seq, X_train_eng, X_val_eng, y_train_seq, y_val_seq = train_test_split(
    X_sequential, X_engineered, y, test_size=0.2, random_state=42)

# 4. Build the optimized LSTM model
def build_lstm_nn(input_shape_seq, input_shape_eng):
    seq_inputs = Input(shape=input_shape_seq)
    x = Bidirectional(LSTM(400, return_sequences=True, kernel_initializer='he_normal'))(seq_inputs)  # Increased LSTM units
    x = BatchNormalization()(x)  # Added BatchNormalization
    x = LSTM(200, return_sequences=False)(x)  # Stacked LSTM layer with 200 units
    x = Dropout(0.25)(x)  # Adjusted Dropout

    eng_inputs = Input(shape=input_shape_eng)

    combined = Concatenate()([x, eng_inputs])

    x = Dense(256, activation='relu', kernel_regularizer=l2(0.001))(combined)  # Increased dense layer
    x = BatchNormalization()(x)  # BatchNormalization to stabilize learning
    x = Dropout(0.2)(x)
    x = Dense(128, activation='relu')(x)
    x = Dense(64, activation='relu')(x)

    outputs = Dense(1)(x)

    optimizer = Nadam(learning_rate=0.00005)  # Slightly reduced learning rate
    model = Model(inputs=[seq_inputs, eng_inputs], outputs=outputs)
    model.compile(optimizer=optimizer, loss='mean_squared_error')

    return model

# 5. Build and train the model
input_shape_seq = (X_train_seq.shape[1], X_train_seq.shape[2])
input_shape_eng = (X_train_eng.shape[1],)
model = build_lstm_nn(input_shape_seq, input_shape_eng)

early_stopping = EarlyStopping(monitor='val_loss', patience=12, restore_best_weights=True)
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=4)

history = model.fit(
    [X_train_seq, X_train_eng], y_train_seq,
    validation_data=([X_val_seq, X_val_eng], y_val_seq),
    epochs=200,
    batch_size=128,  # Increased batch size to smooth gradients
    callbacks=[early_stopping, lr_scheduler],
    verbose=1
)

# Predict and calculate RMSE
y_val_pred = model.predict([X_val_seq, X_val_eng])
rmse = np.sqrt(mean_squared_error(y_val_seq, y_val_pred))
print(f'Validation RMSE: {rmse}')

# ---------------------------------------
# 6. Generate Predictions for Test Data
# ---------------------------------------

test_data = pd.merge(users_test, user_features_test, on='ID')
test_data = feature_engineering(test_data)

# Sequential data for the test set
X_test_seq = test_data[time_series_columns].values.reshape(test_data.shape[0], -1)
X_test_seq = scaler_seq.transform(X_test_seq).reshape(test_data.shape[0], 16, 4)

# Engineered features for the test set
X_test_eng = scaler_eng.transform(test_data[engineered_columns].values)

# Predict on test data
test_predictions = model.predict([X_test_seq, X_test_eng])

# Create the submission file
submission = pd.DataFrame({
    'ID': test_data['ID'],  # Replace 'ID' with actual column name in the test set
    'TARGET': test_predictions.flatten()
})

submission.to_csv('submission.csv', index=False)


Epoch 1/200
[1m5492/5492[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 12ms/step - loss: 4.6247 - val_loss: 4.4100 - learning_rate: 5.0000e-05
Epoch 2/200
[1m5492/5492[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 12ms/step - loss: 2.5980 - val_loss: 5.7143 - learning_rate: 5.0000e-05
Epoch 3/200
[1m5492/5492[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 12ms/step - loss: 2.3513 - val_loss: 5.1023 - learning_rate: 5.0000e-05
Epoch 4/200
[1m5492/5492[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 12ms/step - loss: 2.4631 - val_loss: 7.0263 - learning_rate: 5.0000e-05
Epoch 5/200
[1m5492/5492[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 12ms/step - loss: 2.3026 - val_loss: 5.5399 - learning_rate: 5.0000e-05
Epoch 6/200
[1m5492/5492[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 12ms/step - loss: 2.5715 - val_loss: 6.7586 - learning_rate: 2.5000e-05
Epoch 7/200
[1m5492/5492[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 1

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import RobustScaler
from tensorflow.keras.models import Model
from tensorflow.keras.layers import GRU, Dense, Dropout, Input, Concatenate, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.layers import LeakyReLU

# 1. Optimized Feature Engineering
def feature_engineering(df):
    time_series_columns = {
        'retention': [f'RetentionD{i}' for i in range(16)],
        'ad_revenue': [f'AdRevenueD{i}' for i in range(16)],
        'iap_revenue': [f'IAPRevenueD{i}' for i in range(16)],
        'level_advanced': [f'LevelAdvancedCountD{i}' for i in range(16)]
    }

    # Convert boolean columns to numeric (if any boolean columns exist)
    for feature, columns in time_series_columns.items():
        for column in columns:
            if df[column].dtype == 'bool':
                df[column] = df[column].astype(int)

    # Aggregated and new features
    for feature, columns in time_series_columns.items():
        df[f'{feature}_total'] = df[columns].sum(axis=1)
        df[f'{feature}_mean'] = df[columns].mean(axis=1)
        df[f'{feature}_max'] = df[columns].max(axis=1)
        df[f'{feature}_std'] = df[columns].std(axis=1)
        df[f'{feature}_slope'] = (df[columns].iloc[:, -1] - df[columns].iloc[:, 0]) / 15
        df[f'{feature}_acceleration'] = df[columns].diff().mean(axis=1)
        df[f'{feature}_last_3_mean'] = df[columns].iloc[:, -3:].mean(axis=1)

    # Interaction features
    df['retention_ad_revenue_interaction'] = df['retention_total'] * df['ad_revenue_total']
    df['retention_iap_revenue_interaction'] = df['retention_total'] * df['iap_revenue_total']
    df['retention_level_interaction'] = df['retention_total'] * df['level_advanced_total']
    df['ad_iap_revenue_ratio'] = df['ad_revenue_total'] / (df['iap_revenue_total'] + 1)

    return df

# 2. Load, merge, and process training data
train_data = pd.merge(users_train, user_features_train, on='ID')
train_data = pd.merge(train_data, targets_train, on='ID')

# Apply feature engineering
train_data = feature_engineering(train_data)

# Ensure no NaN values in numeric columns are filled with their mean
numeric_columns = train_data.select_dtypes(include=[np.number]).columns
train_data[numeric_columns] = train_data[numeric_columns].fillna(train_data[numeric_columns].mean())

# For non-numeric columns, fill with empty strings
non_numeric_columns = train_data.select_dtypes(exclude=[np.number]).columns
train_data[non_numeric_columns] = train_data[non_numeric_columns].fillna('')

# 3. Define engineered features and time series columns
engineered_columns = [
    'retention_total', 'retention_mean', 'retention_max', 'retention_std', 'retention_slope', 'retention_acceleration', 'retention_last_3_mean',
    'ad_revenue_total', 'ad_revenue_mean', 'ad_revenue_max', 'ad_revenue_std', 'ad_revenue_slope', 'ad_revenue_acceleration', 'ad_revenue_last_3_mean',
    'iap_revenue_total', 'iap_revenue_mean', 'iap_revenue_max', 'iap_revenue_std', 'iap_revenue_slope', 'iap_revenue_acceleration', 'iap_revenue_last_3_mean',
    'level_advanced_total', 'level_advanced_mean', 'level_advanced_max', 'level_advanced_std', 'level_advanced_slope', 'level_advanced_acceleration', 'level_advanced_last_3_mean',
    'retention_ad_revenue_interaction', 'retention_iap_revenue_interaction', 'retention_level_interaction', 'ad_iap_revenue_ratio'
]

time_series_columns = [
    *[f'RetentionD{i}' for i in range(16)],
    *[f'AdRevenueD{i}' for i in range(16)],
    *[f'IAPRevenueD{i}' for i in range(16)],
    *[f'LevelAdvancedCountD{i}' for i in range(16)]
]

# Convert relevant columns to float
train_data[time_series_columns] = train_data[time_series_columns].astype(float)

# 4. Apply scalers
scaler_seq = RobustScaler()
scaler_eng = RobustScaler()

# Sequential data scaling
X_sequential = train_data[time_series_columns].values.reshape(train_data.shape[0], -1)
X_sequential = scaler_seq.fit_transform(X_sequential).reshape(train_data.shape[0], 16, 4)

# Engineered features scaling
X_engineered = scaler_eng.fit_transform(train_data[engineered_columns].values)

y = train_data['TARGET'].values.astype('float32')

# 5. Train-test split
X_train_seq, X_val_seq, X_train_eng, X_val_eng, y_train, y_val = train_test_split(
    X_sequential, X_engineered, y, test_size=0.2, random_state=42)

# 6. Optimized Model with Lower Complexity and Regularization
def build_optimized_lstm(input_shape_seq, input_shape_eng):
    seq_inputs = Input(shape=input_shape_seq)

    # Reduce LSTM units and switch to GRU (more efficient)
    x = GRU(128, return_sequences=True)(seq_inputs)
    x = GRU(64)(x)
    x = BatchNormalization()(x)
    x = Dropout(0.5)(x)  # Increase dropout

    eng_inputs = Input(shape=input_shape_eng)
    y = Dense(32, activation='relu', kernel_regularizer=l2(0.001))(eng_inputs)  # Add L2 regularization
    y = BatchNormalization()(y)
    y = Dropout(0.4)(y)  # Increase dropout

    combined = Concatenate()([x, y])

    z = Dense(64, kernel_regularizer=l2(0.001))(combined)  # Add L2 regularization
    z = LeakyReLU()(z)
    z = BatchNormalization()(z)
    z = Dropout(0.5)(z)  # Increase dropout

    z = Dense(32, kernel_regularizer=l2(0.001))(z)  # Add L2 regularization
    z = LeakyReLU()(z)
    z = BatchNormalization()(z)
    z = Dropout(0.4)(z)

    outputs = Dense(1)(z)

    model = Model(inputs=[seq_inputs, eng_inputs], outputs=outputs)
    model.compile(optimizer=Adam(learning_rate=0.0001), loss='mean_squared_error')

    return model

# 7. Build and train the model with early stopping and learning rate scheduler
input_shape_seq = (X_train_seq.shape[1], X_train_seq.shape[2])
input_shape_eng = (X_train_eng.shape[1],)
model = build_optimized_lstm(input_shape_seq, input_shape_eng)

early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)  # Reduce patience
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6)

# Train the model with a larger batch size
history = model.fit(
    [X_train_seq, X_train_eng], y_train,
    validation_data=([X_val_seq, X_val_eng], y_val),
    epochs=200,
    batch_size=256,  # Increase batch size to speed up training
    callbacks=[early_stopping, lr_scheduler],
    verbose=1
)

# 8. Predict and calculate RMSE
y_val_pred = model.predict([X_val_seq, X_val_eng])
rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
print(f'Validation RMSE: {rmse}')

# 9. Generate Predictions for Test Data
test_data = pd.merge(users_test, user_features_test, on='ID')
test_data = feature_engineering(test_data)

# Sequential data for the test set
X_test_seq = test_data[time_series_columns].values.reshape(test_data.shape[0], -1)
X_test_seq = scaler_seq.transform(X_test_seq).reshape(test_data.shape[0], 16, 4)

# Engineered features for the test set
X_test_eng = scaler_eng.transform(test_data[engineered_columns].values)

# Predict on test data
test_predictions = model.predict([X_test_seq, X_test_eng])

# Create the submission file
submission = pd.DataFrame({
    'ID': test_data['ID'],
    'TARGET': test_predictions.flatten()
})

submission.to_csv('submission.csv', index=False)


Epoch 1/200
[1m2746/2746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 10ms/step - loss: 7.5254 - val_loss: 2.9538 - learning_rate: 1.0000e-04
Epoch 2/200
[1m2746/2746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 10ms/step - loss: 5.6580 - val_loss: 3.0674 - learning_rate: 1.0000e-04
Epoch 3/200
[1m2746/2746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 10ms/step - loss: 4.7978 - val_loss: 2.8781 - learning_rate: 1.0000e-04
Epoch 4/200
[1m2746/2746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 10ms/step - loss: 4.7640 - val_loss: 2.5894 - learning_rate: 1.0000e-04
Epoch 5/200
[1m2746/2746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 10ms/step - loss: 3.9376 - val_loss: 2.8889 - learning_rate: 1.0000e-04
Epoch 6/200
[1m2746/2746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 10ms/step - loss: 4.0088 - val_loss: 2.7340 - learning_rate: 1.0000e-04
Epoch 7/200
[1m2746/2746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 1

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional, Input, Concatenate
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam

# 1. Feature Engineering: Aggregate features and interactions
def feature_engineering(df):
    time_series_columns = {
        'retention': [f'RetentionD{i}' for i in range(16)],
        'ad_revenue': [f'AdRevenueD{i}' for i in range(16)],
        'iap_revenue': [f'IAPRevenueD{i}' for i in range(16)],
        'level_advanced': [f'LevelAdvancedCountD{i}' for i in range(16)]
    }

    # Aggregate features
    for feature, columns in time_series_columns.items():
        df[f'{feature}_total'] = df[columns].sum(axis=1)
        df[f'{feature}_mean'] = df[columns].mean(axis=1)
        df[f'{feature}_max'] = df[columns].max(axis=1)
        df[f'{feature}_std'] = df[columns].std(axis=1)

    # Interaction features
    df['retention_ad_revenue_interaction'] = df['retention_total'] * df['ad_revenue_total']
    df['retention_iap_revenue_interaction'] = df['retention_total'] * df['iap_revenue_total']
    df['retention_level_interaction'] = df['retention_total'] * df['level_advanced_total']

    return df

# 2. Load, merge, and process training data
train_data = pd.merge(users_train, user_features_train, on='ID')
train_data = pd.merge(train_data, targets_train, on='ID')
train_data = feature_engineering(train_data)

engineered_columns = [
    'retention_total', 'retention_mean', 'retention_max', 'retention_std',
    'ad_revenue_total', 'ad_revenue_mean', 'ad_revenue_max', 'ad_revenue_std',
    'iap_revenue_total', 'iap_revenue_mean', 'iap_revenue_max', 'iap_revenue_std',
    'level_advanced_total', 'level_advanced_mean', 'level_advanced_max', 'level_advanced_std',
    'retention_ad_revenue_interaction', 'retention_iap_revenue_interaction', 'retention_level_interaction'
]

time_series_columns = [
    *[f'RetentionD{i}' for i in range(16)],
    *[f'AdRevenueD{i}' for i in range(16)],
    *[f'IAPRevenueD{i}' for i in range(16)],
    *[f'LevelAdvancedCountD{i}' for i in range(16)]
]

train_data[time_series_columns] = train_data[time_series_columns].astype(float)

# Apply different scalers to sequential and engineered features
scaler_seq = StandardScaler()
scaler_eng = StandardScaler()

# Sequential data scaling
X_sequential = train_data[time_series_columns].values.reshape(train_data.shape[0], -1)
X_sequential = scaler_seq.fit_transform(X_sequential).reshape(train_data.shape[0], 16, 4)

# Engineered features scaling
X_engineered = scaler_eng.fit_transform(train_data[engineered_columns].values)

y = train_data['TARGET'].values.astype('float32')

# 3. Train-test split
X_train_seq, X_val_seq, X_train_eng, X_val_eng, y_train_seq, y_val_seq = train_test_split(
    X_sequential, X_engineered, y, test_size=0.2, random_state=42)

# 4. Build the LSTM model (300 units in Bidirectional LSTM)
def build_lstm_nn(input_shape_seq, input_shape_eng):
    seq_inputs = Input(shape=input_shape_seq)
    x = Bidirectional(LSTM(300, return_sequences=False))(seq_inputs)
    x = Dropout(0.3)(x)

    eng_inputs = Input(shape=input_shape_eng)

    combined = Concatenate()([x, eng_inputs])

    x = Dense(128, activation='relu', kernel_regularizer=l2(0.001))(combined)
    x = Dropout(0.3)(x)
    x = Dense(64, activation='relu')(x)
    x = Dense(32, activation='relu')(x)

    outputs = Dense(1)(x)

    model = Model(inputs=[seq_inputs, eng_inputs], outputs=outputs)
    model.compile(optimizer=Adam(learning_rate=0.00005), loss='mean_squared_error')

    return model

# 5. Build and train the model
input_shape_seq = (X_train_seq.shape[1], X_train_seq.shape[2])
input_shape_eng = (X_train_eng.shape[1],)
model = build_lstm_nn(input_shape_seq, input_shape_eng)

early_stopping = EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True)
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=4)

# Train the model
history = model.fit(
    [X_train_seq, X_train_eng], y_train_seq,
    validation_data=([X_val_seq, X_val_eng], y_val_seq),
    epochs=150,
    batch_size=32,
    callbacks=[early_stopping, lr_scheduler],
    verbose=1
)

# 6. Predict and calculate RMSE
y_val_pred = model.predict([X_val_seq, X_val_eng])
rmse = np.sqrt(mean_squared_error(y_val_seq, y_val_pred))
print(f'Validation RMSE: {rmse}')

# 7. Generate Predictions for Test Data
test_data = pd.merge(users_test, user_features_test, on='ID')
test_data = feature_engineering(test_data)

# Sequential data for the test set
X_test_seq = test_data[time_series_columns].values.reshape(test_data.shape[0], -1)
X_test_seq = scaler_seq.transform(X_test_seq).reshape(test_data.shape[0], 16, 4)

# Engineered features for the test set
X_test_eng = scaler_eng.transform(test_data[engineered_columns].values)

# Predict on test data
test_predictions = model.predict([X_test_seq, X_test_eng])

# Ensure predictions are numeric and replace any invalid values
test_predictions = np.nan_to_num(test_predictions).flatten()  # Replace NaN with 0 or a small value if needed

# Create the submission file with proper formatting
submission = pd.DataFrame({
    'ID': test_data['ID'],
    'TARGET': test_predictions.astype(float)  # Ensure target values are floats
})

# Save the submission file
submission.to_csv('submission.csv', index=False)


Epoch 1/150
[1m21965/21965[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m162s[0m 7ms/step - loss: 3.5191 - val_loss: 2.5590 - learning_rate: 5.0000e-05
Epoch 2/150
[1m21965/21965[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m159s[0m 7ms/step - loss: 2.6463 - val_loss: 2.4640 - learning_rate: 5.0000e-05
Epoch 3/150
[1m21965/21965[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m159s[0m 7ms/step - loss: 2.3498 - val_loss: 2.3848 - learning_rate: 5.0000e-05
Epoch 4/150
[1m21965/21965[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m160s[0m 7ms/step - loss: 2.1368 - val_loss: 2.3248 - learning_rate: 5.0000e-05
Epoch 5/150
[1m21965/21965[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m160s[0m 7ms/step - loss: 2.1145 - val_loss: 2.2588 - learning_rate: 5.0000e-05
Epoch 6/150
[1m21965/21965[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m160s[0m 7ms/step - loss: 2.4595 - val_loss: 2.3130 - learning_rate: 5.0000e-05
Epoch 7/150
[1m21965/21965[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m

# Quantile transform and feature selecting

## Hyper parameters with optuna for lstm



In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.0.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.2-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.5-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.0.0-py3-none-any.whl (362 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m362.8/362.8 kB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.13.2-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.0/233.0 kB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Downloading Mako-1.3.5-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: M

In [None]:
import optuna
from sklearn.metrics import mean_squared_error
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional, Input, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import RMSprop, Adam, SGD
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import QuantileTransformer

# Objective function for Optuna optimization
def objective(trial):
    # Hyperparameters to be optimized
    lstm_units = trial.suggest_int('lstm_units', 64, 256)  # LSTM units
    dense_units = trial.suggest_int('dense_units', 32, 128)  # Dense units
    dropout_rate = trial.suggest_float('dropout_rate', 0.2, 0.5)  # Dropout rate
    batch_size = trial.suggest_categorical('batch_size', [16, 32, 64, 128])  # Batch size

    # Optimizer: Let Optuna choose from RMSprop, Adam, and SGD
    optimizer_name = trial.suggest_categorical('optimizer', ['RMSprop', 'Adam', 'SGD'])

    # Learning rate: Optimized learning rate for selected optimizer
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-2)  # Log scale for learning rate

    # Choose the optimizer based on the Optuna trial
    if optimizer_name == 'RMSprop':
        optimizer = RMSprop(learning_rate=learning_rate)
    elif optimizer_name == 'Adam':
        optimizer = Adam(learning_rate=learning_rate)
    else:
        optimizer = SGD(learning_rate=learning_rate, momentum=0.9)  # Using momentum with SGD

    # Build the LSTM model
    def build_lstm_nn(input_shape_seq, input_shape_eng):
        seq_inputs = Input(shape=input_shape_seq)
        x = Bidirectional(LSTM(lstm_units, return_sequences=False))(seq_inputs)
        x = Dropout(dropout_rate)(x)

        # Feature-engineered part
        eng_inputs = Input(shape=input_shape_eng)

        # Combine sequential and engineered parts
        combined = Concatenate()([x, eng_inputs])

        # Dense layers with regularization
        x = Dense(dense_units, activation='relu')(combined)
        x = Dropout(dropout_rate)(x)
        x = Dense(dense_units // 2, activation='relu')(x)
        x = Dropout(dropout_rate)(x)

        # Output layer for regression
        outputs = Dense(1)(x)

        model = Model(inputs=[seq_inputs, eng_inputs], outputs=outputs)
        model.compile(optimizer=optimizer, loss='mean_squared_error')

        return model


    def feature_engineering(df):
    # Sequential features: keeping only relevant features from RFE
      time_series_columns = {
          'retention': [f'RetentionD{i}' for i in range(16)],
          'ad_revenue': [f'AdRevenueD{i}' for i in range(16)],
          'level_advanced': [f'LevelAdvancedCountD{i}' for i in range(16)]
      }

      # Aggregate features: sum, mean, and std
      for feature, columns in time_series_columns.items():
          # Convert boolean columns to integers if needed
          df[columns] = df[columns].astype(float)  # Ensure all columns are numeric

          df[f'{feature}_total'] = df[columns].sum(axis=1)
          df[f'{feature}_mean'] = df[columns].mean(axis=1)
          df[f'{feature}_std'] = df[columns].std(axis=1)

          # Lag features: Difference between consecutive days
          for i in range(1, 16):
              df.loc[:, f'{feature}_lag_{i}'] = df[f'{columns[i]}'] - df[f'{columns[i - 1]}']


          # Rolling statistics
          df[f'{feature}_rolling_mean_5'] = df[columns].rolling(window=5, axis=1).mean().mean(axis=1)
          df[f'{feature}_rolling_std_5'] = df[columns].rolling(window=5, axis=1).std().mean(axis=1)

      # Create the new columns separately
      new_columns = pd.DataFrame({
          'retention_ad_revenue_interaction': df['retention_total'] * df['ad_revenue_total'],
          'retention_level_interaction': df['retention_total'] * df['level_advanced_total']
      })

      # Concatenate the new columns with the original DataFrame
      df = pd.concat([df, new_columns], axis=1)

      return df
# 2. Merge and process training data
    train_data = pd.merge(users_train, user_features_train, on='ID')
    train_data = pd.merge(train_data, targets_train, on='ID')
    train_data = feature_engineering(train_data)

# List of the feature-engineered columns including new lag and rolling statistics
    engineered_columns = [
        'retention_total', 'retention_mean', 'retention_std',
        'ad_revenue_total', 'ad_revenue_mean', 'ad_revenue_std',
        'level_advanced_total', 'level_advanced_std',
        'retention_ad_revenue_interaction', 'retention_level_interaction',
        # Adding lag and rolling stats for each feature
        'retention_lag_1', 'retention_lag_2', 'retention_lag_3', 'retention_lag_4', 'retention_lag_5',
        'ad_revenue_lag_1', 'ad_revenue_lag_2', 'ad_revenue_lag_3', 'ad_revenue_lag_4', 'ad_revenue_lag_5',
        'level_advanced_lag_1', 'level_advanced_lag_2', 'level_advanced_lag_3', 'level_advanced_lag_4', 'level_advanced_lag_5',
        'retention_rolling_mean_5', 'retention_rolling_std_5',
        'ad_revenue_rolling_mean_5', 'ad_revenue_rolling_std_5',
        'level_advanced_rolling_mean_5', 'level_advanced_rolling_std_5'
    ]

    # Sequential columns for LSTM
    time_series_columns = [
        *[f'RetentionD{i}' for i in range(16)],
        *[f'AdRevenueD{i}' for i in range(16)],
        *[f'LevelAdvancedCountD{i}' for i in range(16)]
    ]

    # Convert boolean columns to integers
    boolean_columns = train_data[time_series_columns].select_dtypes(include=['bool']).columns
    train_data[boolean_columns] = train_data[boolean_columns].astype(int)

    # Convert all inputs to float32
    X_sequential = train_data[time_series_columns].values.reshape(train_data.shape[0], 16, 3).astype('float32')
    X_engineered = train_data[engineered_columns].values.astype('float32')

    # Apply quantile transformation to the selected engineered features
    qt = QuantileTransformer(output_distribution='normal', random_state=42)
    X_engineered_transformed = qt.fit_transform(X_engineered)

    y = train_data['TARGET'].values.astype('float32')


    # Split data
    X_train_seq, X_val_seq, X_train_eng, X_val_eng, y_train_seq, y_val_seq = train_test_split(
        X_sequential, X_engineered_transformed, y, test_size=0.2, random_state=42
    )

    # Build the model
    input_shape_seq = (X_train_seq.shape[1], X_train_seq.shape[2])
    input_shape_eng = (X_train_eng.shape[1],)
    model = build_lstm_nn(input_shape_seq, input_shape_eng)

    # Early stopping and learning rate scheduler
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3)

    # Train the model
    history = model.fit(
        [X_train_seq, X_train_eng], y_train_seq,
        validation_data=([X_val_seq, X_val_eng], y_val_seq),
        epochs=100,
        batch_size=batch_size,  # Use the batch size selected by Optuna
        callbacks=[early_stopping, lr_scheduler],
        verbose=0  # Set verbose=0 for cleaner Optuna output
    )

    # Predict and calculate RMSE for validation set
    y_val_pred = model.predict([X_val_seq, X_val_eng])
    rmse = np.sqrt(mean_squared_error(y_val_seq, y_val_pred))

    return rmse

# Create an Optuna study and optimize it
study = optuna.create_study(direction='minimize')  # We want to minimize RMSE
study.optimize(objective, n_trials=8)  # Run 30 trials of hyperparameter optimization

# Output the best hyperparameters
print(f"Best trial: {study.best_trial.params}")


# Hyper parametered lstm try

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor, Pool
import xgboost as xgb

# Function to reduce memory usage
def reduce_memory_usage(df):
    for col in df.columns:
        col_type = df[col].dtype
        if col_type != object and not isinstance(col_type, pd.CategoricalDtype):
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')
    return df

# Merge user metadata, features, and targets
train_data = pd.merge(users_train, user_features_train, on='ID')
train_data = pd.merge(train_data, targets_train, on='ID')

# Label encode categorical variables
label_encoders = {}
for column in train_data.select_dtypes(include=['category', 'object']).columns:
    le = LabelEncoder()
    train_data[column] = le.fit_transform(train_data[column].astype(str))
    label_encoders[column] = le

# Feature engineering - Date-based features
train_data['first_open_datetime'] = pd.to_datetime(train_data['first_open_timestamp'], unit='us')
train_data['first_open_day'] = train_data['first_open_datetime'].dt.day
train_data['first_open_month'] = train_data['first_open_datetime'].dt.month
train_data['first_open_hour'] = train_data['first_open_datetime'].dt.hour

# Aggregated behavioral features
train_data['total_retention'] = train_data[[f'RetentionD{i}' for i in range(1, 16)]].sum(axis=1)
train_data['total_levels_completed'] = train_data[[f'LevelAdvancedCountD{i}' for i in range(1, 16)]].sum(axis=1)
train_data['total_ad_revenue'] = train_data[[f'AdRevenueD{i}' for i in range(1, 16)]].sum(axis=1)
train_data['total_iap_revenue'] = train_data[[f'IAPRevenueD{i}' for i in range(1, 16)]].sum(axis=1)

# Drop unnecessary columns (including datetime columns)
useless_columns = ['ID', 'first_open_date', 'first_open_timestamp', 'local_first_open_timestamp', 'first_open_datetime']
train_data.drop(columns=useless_columns, inplace=True)

# Separate features and target
X = train_data.drop(columns=['TARGET'])
y = train_data['TARGET']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify categorical features
cat_features = ['country', 'platform', 'device_category', 'device_brand', 'device_model', 'ad_network']

# Scale the numerical features only
num_features = [col for col in X_train.columns if col not in cat_features]
scaler = StandardScaler()
X_train_num_scaled = scaler.fit_transform(X_train[num_features])
X_val_num_scaled = scaler.transform(X_val[num_features])

# Combine scaled numerical features with original (unscaled) categorical features
X_train_combined = np.hstack([X_train_num_scaled, X_train[cat_features].values])
X_val_combined = np.hstack([X_val_num_scaled, X_val[cat_features].values])

# Convert the data into CatBoost Pool format
train_pool = Pool(X_train, y_train, cat_features=[X.columns.get_loc(col) for col in cat_features])
val_pool = Pool(X_val, y_val, cat_features=[X.columns.get_loc(col) for col in cat_features])

# Train CatBoost
catboost_model = CatBoostRegressor(
    iterations=1346,
    learning_rate=0.042,
    depth=8,
    loss_function='RMSE',
    silent=True
)
catboost_model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=100)

# XGBoost Best Parameters
xgboost_model = xgb.XGBRegressor(
    tree_method='hist',
    subsample=0.8,
    reg_lambda=1.0,
    reg_alpha=0.1,
    objective='reg:squarederror',
    n_estimators=1000,
    min_child_weight=1,
    max_depth=8,
    learning_rate=0.01,
    gamma=0.1,
    colsample_bytree=0.8,
    device='cuda',
    early_stopping_rounds=50
)

# Train XGBoost
xgboost_model.fit(X_train, y_train, eval_set=[(X_val, y_val)],  verbose=False)

# Blend predictions
catboost_preds = catboost_model.predict(X_val)
xgboost_preds = xgboost_model.predict(X_val)
final_preds = 0.5 * catboost_preds + 0.5 * xgboost_preds  # Blend with equal weights

# Evaluate
final_rmse = np.sqrt(mean_squared_error(y_val, final_preds))
print(f'Final blended RMSE: {final_rmse}')

# Test set predictions
test_data = pd.merge(users_test, user_features_test, on='ID')
for column in test_data.select_dtypes(include=['category', 'object']).columns:
    le = label_encoders.get(column)
    if le is not None:
        test_data[column] = test_data[column].map(lambda s: le.transform([s])[0] if s in le.classes_ else -1)

# Align test set columns with training set (handle any potential missing columns)
test_data = test_data.reindex(columns=X_train.columns, fill_value=0)

# Make predictions on the test set using both models and blend them
catboost_test_preds = catboost_model.predict(test_data)
xgboost_test_preds = xgboost_model.predict(test_data)
final_test_preds = 0.5 * catboost_test_preds + 0.5 * xgboost_test_preds

# Create a submission file
submission = pd.DataFrame({'ID': users_test['ID'], 'TARGET': final_test_preds})
submission.to_csv('submission.csv', index=False)


Final blended RMSE: 1.805513445119748


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import optuna
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder
from category_encoders import TargetEncoder

# Function to reduce memory usage
def reduce_memory_usage(df):
    for col in df.columns:
        col_type = df[col].dtype
        if col_type != object and not isinstance(col_type, pd.CategoricalDtype):
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            if not isinstance(col_type, pd.CategoricalDtype):
                df[col] = df[col].astype('category')
    return df

# Load datasets
users_train = reduce_memory_usage(users_train)
user_features_train = reduce_memory_usage(user_features_train)
targets_train = reduce_memory_usage(targets_train)

# Merge user metadata, features, and targets
train_data = pd.merge(users_train, user_features_train, on='ID')
train_data = pd.merge(train_data, targets_train, on='ID')

# Drop unnecessary columns
useless_columns = ['first_open_date', 'first_open_timestamp', 'local_first_open_timestamp']
train_data.drop(columns=useless_columns, inplace=True)

# Target encode categorical variables
encoder = TargetEncoder(cols=train_data.select_dtypes(include=['category']).columns)
train_data = encoder.fit_transform(train_data, train_data['TARGET'])

# Separate features and target
X = train_data.drop(columns=['TARGET'])
y = train_data['TARGET']

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Define the cross-validation strategy
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Optuna optimization objective
def objective(trial):
    # Define hyperparameters to optimize
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 500, 1500),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.05),
        'max_depth': trial.suggest_int('max_depth', 6, 20),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 10.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 10.0),
        'random_state': 42,
        'tree_method': 'hist',
        "device" :"cuda",# Use GPU if available
        'eval_metric': 'rmse'
    }

    rmse_scores = []

    # Cross-validation loop
    for train_index, val_index in kf.split(X_scaled):
        X_train, X_val = X_scaled[train_index], X_scaled[val_index]
        y_train, y_val = y[train_index], y[val_index]

        # Model initialization
        model = XGBRegressor(**params, early_stopping_rounds=50)

        # Train the model with early stopping
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=False
        )

        # Predict on validation set
        y_pred = model.predict(X_val)

        # Calculate RMSE for this fold
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        rmse_scores.append(rmse)

    # Average RMSE over all folds
    avg_rmse = np.mean(rmse_scores)
    return avg_rmse

# Run the optimization
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

# Print best parameters
best_params = study.best_params
print(f'Best parameters: {best_params}')

# Train a final model with best parameters on the entire training set
model = XGBRegressor(**best_params, early_stopping_rounds=50)
model.fit(X_scaled, y)

# Test set processing
users_test = reduce_memory_usage(users_test)
user_features_test = reduce_memory_usage(user_features_test)

# Merge test data
test_data = pd.merge(users_test, user_features_test, on='ID')

# Drop unnecessary columns
test_data.drop(columns=useless_columns, inplace=True)

# Target encode test set
test_data = encoder.transform(test_data)

# Align test set columns with training set
test_data = test_data.reindex(columns=X.columns, fill_value=0)

# Scale the test set
test_data_scaled = scaler.transform(test_data)

# Make predictions on test set using the tuned model
test_preds = model.predict(test_data_scaled)

# Create a submission file
submission = pd.DataFrame({'ID': users_test['ID'], 'TARGET': test_preds})
submission.to_csv('submission.csv', index=False)


  return dtype.type(n)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  the_mean = the_sum / count if count > 0 else np.nan
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
[I 2024-09-11 12:58:20,291] A new study created in memory with name: no-name-229b56d4-d5b6-4bd4-ab14-65d291e3c123
[I 2024-09-11 12:59:07,198] Trial 0 finished with value: 1.6274292469024658 and parameters: {'n_estimators': 1094, 'learning_rate': 0.028416279861585833, 'max_depth': 10, 'subsample': 0.6752126557279976, 'colsample_bytree': 0.9307937929561753, 'reg_lambda': 6.2970212395971865, 'reg_alpha': 0.7968988460271819}. Best is trial 0 with value: 1.6274292469024658.
[I 2024-09-11 12:59:34,192] Trial 1 finished with value: 1.617990255355835 and parameters: {'n_estimators': 887, 'learning_rate': 0.0339148668401466, 'max_depth': 6, 'subsample': 0.6801617613652958, 'colsample_bytree': 0.552020

KeyboardInterrupt: 

In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m23.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import optuna
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import RidgeCV
from sklearn.ensemble import StackingRegressor
from sklearn.preprocessing import LabelEncoder
from category_encoders import TargetEncoder
from sklearn.base import BaseEstimator, RegressorMixin

# Function to reduce memory usage
def reduce_memory_usage(df):
    for col in df.columns:
        col_type = df[col].dtype
        if col_type != object and not isinstance(col_type, pd.CategoricalDtype):
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            if not isinstance(col_type, pd.CategoricalDtype):
                df[col] = df[col].astype('category')
    return df

# Custom Model for Stacking
class StackedModel(BaseEstimator, RegressorMixin):
    def __init__(self, models, meta_model):
        self.models = models
        self.meta_model = meta_model

    def fit(self, X, y):
        self.fitted_models = [model.fit(X, y) for model in self.models]
        meta_X = np.column_stack([model.predict(X) for model in self.fitted_models])
        self.meta_model.fit(meta_X, y)
        return self

    def predict(self, X):
        meta_X = np.column_stack([model.predict(X) for model in self.fitted_models])
        return self.meta_model.predict(meta_X)

# Load datasets
users_train = reduce_memory_usage(users_train)
user_features_train = reduce_memory_usage(user_features_train)
targets_train = reduce_memory_usage(targets_train)

# Merge user metadata, features, and targets
train_data = pd.merge(users_train, user_features_train, on='ID')
train_data = pd.merge(train_data, targets_train, on='ID')

train_data=train_data.head(50)
# Drop unnecessary columns
useless_columns = ['ID', 'first_open_date', 'first_open_timestamp', 'local_first_open_timestamp']
train_data.drop(columns=useless_columns, inplace=True)

# Target encode categorical variables
encoder = TargetEncoder(cols=train_data.select_dtypes(include=['category']).columns)
train_data = encoder.fit_transform(train_data, train_data['TARGET'])

# Separate features and target
X = train_data.drop(columns=['TARGET'])
y = train_data['TARGET']

# Apply log transformation to the target
y_log = np.log1p(y)

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Define the cross-validation strategy
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Define the models for the ensemble
def objective(trial):
    xgb_params = {
        'n_estimators': trial.suggest_int('n_estimators', 500, 1500),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.05),
        'max_depth': trial.suggest_int('max_depth', 6, 20),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 10.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 10.0),
        'tree_method': 'hist',
        "device":"cuda"
    }

    lgb_params = {
        'n_estimators': trial.suggest_int('n_estimators', 500, 1500),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.05),
        'max_depth': trial.suggest_int('max_depth', 6, 20),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 10.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 10.0),
        "verbose" : -1
    }

    catboost_params = {
        'iterations': trial.suggest_int('iterations', 500, 1500),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.05),
        'depth': trial.suggest_int('depth', 4, 16),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 0.01, 10.0),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'random_strength': trial.suggest_float('random_strength', 0.5, 2.0)
    }

    models = [
        XGBRegressor(**xgb_params),
        LGBMRegressor(**lgb_params),
        CatBoostRegressor(**catboost_params, silent=True)
    ]

    meta_model = RidgeCV()

    stacked_model = StackedModel(models=models, meta_model=meta_model)

    rmse_scores = []
    for train_index, val_index in kf.split(X_scaled):
        X_train, X_val = X_scaled[train_index], X_scaled[val_index]
        y_train, y_val = y_log[train_index], y_log[val_index]

        stacked_model.fit(X_train, y_train)
        y_pred_log = stacked_model.predict(X_val)
        y_pred = np.expm1(y_pred_log)  # Reverse the log transformation

        rmse = np.sqrt(mean_squared_error(np.expm1(y_val), y_pred))
        rmse_scores.append(rmse)

    return np.mean(rmse_scores)

# Run the optimization
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

# Print best parameters
best_params = study.best_params
print(f'Best parameters: {best_params}')

# Train a final stacked model with best parameters
models = [
    XGBRegressor(**best_params),
    LGBMRegressor(**best_params),
    CatBoostRegressor(**best_params, silent=True)
]
meta_model = RidgeCV()
stacked_model = StackedModel(models=models, meta_model=meta_model)
stacked_model.fit(X_scaled, y_log)

# Test set processing
users_test = reduce_memory_usage(users_test)
user_features_test = reduce_memory_usage(user_features_test)

# Merge test data
test_data = pd.merge(users_test, user_features_test, on='ID')

test_data=test_data.head(50)
# Drop unnecessary columns
test_data.drop(columns=useless_columns, inplace=True)

# Target encode test set
test_data = encoder.transform(test_data)

# Align test set columns with training set
test_data = test_data.reindex(columns=X.columns, fill_value=0)

# Scale the test set
test_data_scaled = scaler.transform(test_data)

# Make predictions on test set using the stacked model
test_preds_log = stacked_model.predict(test_data_scaled)
test_preds = np.expm1(test_preds_log)


[I 2024-09-11 13:56:59,919] A new study created in memory with name: no-name-828baf87-6c71-4f05-a80c-1138587cada0
[I 2024-09-11 13:57:38,877] Trial 0 finished with value: 0.40481151459575726 and parameters: {'n_estimators': 1495, 'learning_rate': 0.0068778966526878114, 'max_depth': 14, 'subsample': 0.9287179017599254, 'colsample_bytree': 0.6867226360612574, 'reg_lambda': 2.049985303257367, 'reg_alpha': 2.615738141383645, 'iterations': 1474, 'depth': 11, 'l2_leaf_reg': 4.252153887916706, 'random_strength': 0.9986113707482451}. Best is trial 0 with value: 0.40481151459575726.
[I 2024-09-11 13:59:01,219] Trial 1 finished with value: 0.3834178857259302 and parameters: {'n_estimators': 1294, 'learning_rate': 0.039921662184369516, 'max_depth': 20, 'subsample': 0.6377323608851418, 'colsample_bytree': 0.9280951400314306, 'reg_lambda': 3.8486062323410826, 'reg_alpha': 0.23432951549530692, 'iterations': 1162, 'depth': 15, 'l2_leaf_reg': 2.664826393781268, 'random_strength': 1.7777301891616974}. 

Best parameters: {'n_estimators': 1126, 'learning_rate': 0.019130839284656524, 'max_depth': 20, 'subsample': 0.5404667206302288, 'colsample_bytree': 0.9960968919338987, 'reg_lambda': 5.059996358253325, 'reg_alpha': 0.3258110190103913, 'iterations': 1299, 'depth': 5, 'l2_leaf_reg': 6.0718598045826555, 'random_strength': 0.9851585200596467}


TypeError: CatBoostRegressor.__init__() got an unexpected keyword argument 'colsample_bytree'

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor

# Function to reduce memory usage
def reduce_memory_usage(df):
    for col in df.columns:
        col_type = df[col].dtype
        if col_type != object and not isinstance(col_type, pd.CategoricalDtype):
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            if not isinstance(col_type, pd.CategoricalDtype):
                df[col] = df[col].astype('category')
    return df

# Load datasets
users_train = reduce_memory_usage(pd.read_csv('users_train.csv'))
user_features_train = reduce_memory_usage(pd.read_csv('user_features_train.csv'))
targets_train = reduce_memory_usage(pd.read_csv('targets_train.csv'))

# Merge user metadata, features, and targets
train_data = pd.merge(users_train, user_features_train, on='ID')
train_data = pd.merge(train_data, targets_train, on='ID')

# Drop unnecessary columns
useless_columns = ['ID', 'first_open_date', 'first_open_timestamp', 'local_first_open_timestamp']
train_data.drop(columns=useless_columns, inplace=True)

# Label encode categorical variables
label_encoders = {}
for column in train_data.select_dtypes(include=['category', 'object']).columns:
    le = LabelEncoder()
    train_data[column] = le.fit_transform(train_data[column].astype(str))
    label_encoders[column] = le

# Separate features and target
X = train_data.drop(columns=['TARGET'])
y = train_data['TARGET']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply log transformation to the target
y_train_log = np.log1p(y_train)
y_val_log = np.log1p(y_val)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Best parameters from Optuna tuning
best_params = {
    'bootstrap_type': 'Bayesian',
    'iterations': 1346,
    'learning_rate': 0.04412581031554557,
    'depth': 4,
    'l2_leaf_reg': 4.814450377788459,
    'random_strength': 0.9047553220374154,
    'silent': True  # To suppress training logs
}

# Train final CatBoost model with the best parameters
best_catboost_model = CatBoostRegressor(**best_params)
best_catboost_model.fit(X_train_scaled, y_train_log)

# Calculate RMSE on the training set
train_preds_log = best_catboost_model.predict(X_train_scaled)
train_preds = np.expm1(train_preds_log)
train_rmse = np.sqrt(mean_squared_error(np.expm1(y_train_log), train_preds))
print(f'Final RMSE on the training set: {train_rmse}')

# Calculate RMSE on the validation set
val_preds_log = best_catboost_model.predict(X_val_scaled)
val_preds = np.expm1(val_preds_log)
val_rmse = np.sqrt(mean_squared_error(np.expm1(y_val_log), val_preds))
print(f'Final RMSE on the validation set: {val_rmse}')

# Test set processing
users_test = reduce_memory_usage(pd.read_csv('users_test.csv'))
user_features_test = reduce_memory_usage(pd.read_csv('user_features_test.csv'))

# Merge test data
test_data = pd.merge(users_test, user_features_test, on='ID')

# Drop unnecessary columns
test_data.drop(columns=useless_columns, inplace=True)

# Apply label encoding to the test set using the same encoders as in the training set
for column in test_data.select_dtypes(include=['category', 'object']).columns:
    le = label_encoders.get(column)
    if le is not None:
        # Handle unseen labels by mapping them to a default value (-1)
        test_data[column] = test_data[column].map(lambda s: le.transform([s])[0] if s in le.classes_ else -1)

# Align test set columns with training set (handle any potential missing columns)
test_data = test_data.reindex(columns=X_train.columns, fill_value=0)

# Scale the test set
test_data_scaled = scaler.transform(test_data)

# Make predictions on the test set using the final CatBoost model
test_preds_log = best_catboost_model.predict(test_data_scaled)
test_preds = np.expm1(test_preds_log)

# Create a submission file
submission = pd.DataFrame({'ID': users_test['ID'], 'TARGET': test_preds})
submission.to_csv('submission.csv', index=False)


Final RMSE on the training set: 1.4228414773605031
Final RMSE on the validation set: 1.8227595465578872


In [None]:
import optuna
from sklearn.model_selection import train_test_split

# Function to optimize CatBoost parameters using Optuna
def objective(trial):
    # Define hyperparameter search space
    catboost_params = {
        'iterations': trial.suggest_int('iterations', 500, 2000),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1),
        'depth': trial.suggest_int('depth', 3, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-2, 10),
        'random_strength': trial.suggest_float('random_strength', 0.5, 2.0),
        'bootstrap_type': trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Bernoulli', 'MVS']),
        'grow_policy': trial.suggest_categorical('grow_policy', ['SymmetricTree', 'Depthwise', 'Lossguide']),
        'silent': True
    }

    # Only use subsample if bootstrap type is not 'Bayesian'
    if catboost_params['bootstrap_type'] != 'Bayesian':
        catboost_params['subsample'] = trial.suggest_float('subsample', 0.5, 1.0)

    # Initialize the CatBoostRegressor with trial parameters
    model = CatBoostRegressor(**catboost_params)

    # Train and validation split
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    y_train_log = np.log1p(y_train)
    y_val_log = np.log1p(y_val)

    # Scale the features
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    # Fit the model
    model.fit(X_train_scaled, y_train_log)

    # Predict on validation set
    val_preds_log = model.predict(X_val_scaled)
    val_preds = np.expm1(val_preds_log)

    # Calculate RMSE
    val_rmse = np.sqrt(mean_squared_error(np.expm1(y_val_log), val_preds))

    return val_rmse

# Run Optuna optimization
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

# Get the best parameters
best_params = study.best_params
print(f'Best parameters: {best_params}')


[I 2024-09-11 16:27:46,662] A new study created in memory with name: no-name-c4d259fe-b5f2-4376-a96b-cb46ea31d9db
[I 2024-09-11 16:28:20,166] Trial 0 finished with value: 1.8133966572437532 and parameters: {'iterations': 1076, 'learning_rate': 0.06279732410030457, 'depth': 5, 'l2_leaf_reg': 9.278194532222534, 'random_strength': 1.4302327096762606, 'bootstrap_type': 'MVS', 'grow_policy': 'SymmetricTree', 'subsample': 0.8359788010248591}. Best is trial 0 with value: 1.8133966572437532.
[I 2024-09-11 16:28:43,456] Trial 1 finished with value: 1.8968406723916555 and parameters: {'iterations': 1069, 'learning_rate': 0.014503924999202427, 'depth': 3, 'l2_leaf_reg': 1.1095666603169931, 'random_strength': 1.4270950861816796, 'bootstrap_type': 'MVS', 'grow_policy': 'SymmetricTree', 'subsample': 0.5647331311665895}. Best is trial 0 with value: 1.8133966572437532.
[I 2024-09-11 16:31:51,968] Trial 2 finished with value: 1.742345530997049 and parameters: {'iterations': 875, 'learning_rate': 0.0905

Best parameters: {'iterations': 1346, 'learning_rate': 0.042390909851230944, 'depth': 8, 'l2_leaf_reg': 1.5023079004958326, 'random_strength': 1.918922766073629, 'bootstrap_type': 'Bernoulli', 'grow_policy': 'Lossguide', 'subsample': 0.7191772414063006}


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor, Pool

# Function to reduce memory usage
def reduce_memory_usage(df):
    for col in df.columns:
        col_type = df[col].dtype
        if col_type != object and not isinstance(col_type, pd.CategoricalDtype):
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')
    return df

# Load datasets
train_data = pd.merge(users_train, user_features_train, on='ID')
train_data = pd.merge(train_data, targets_train, on='ID')

# Label encode categorical variables
label_encoders = {}
for column in train_data.select_dtypes(include=['category', 'object']).columns:
    le = LabelEncoder()
    train_data[column] = le.fit_transform(train_data[column].astype(str))
    label_encoders[column] = le

# Feature engineering - Date-based features
train_data['first_open_datetime'] = pd.to_datetime(train_data['first_open_timestamp'], unit='us')
train_data['first_open_day'] = train_data['first_open_datetime'].dt.day
train_data['first_open_month'] = train_data['first_open_datetime'].dt.month
train_data['first_open_hour'] = train_data['first_open_datetime'].dt.hour

# Aggregated behavioral features
train_data['total_retention'] = train_data[[f'RetentionD{i}' for i in range(1, 16)]].sum(axis=1)
train_data['total_levels_completed'] = train_data[[f'LevelAdvancedCountD{i}' for i in range(1, 16)]].sum(axis=1)
train_data['total_ad_revenue'] = train_data[[f'AdRevenueD{i}' for i in range(1, 16)]].sum(axis=1)
train_data['total_iap_revenue'] = train_data[[f'IAPRevenueD{i}' for i in range(1, 16)]].sum(axis=1)

# Drop unnecessary columns (including datetime columns)
useless_columns = ['ID', 'first_open_date', 'first_open_timestamp', 'local_first_open_timestamp', 'first_open_datetime']
train_data.drop(columns=useless_columns, inplace=True)

# Separate features and target
X = train_data.drop(columns=['TARGET'])
y = train_data['TARGET']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify categorical features
cat_features = ['country', 'platform', 'device_category', 'device_brand', 'device_model', 'ad_network']

# Separate numerical features for scaling
num_features = [col for col in X_train.columns if col not in cat_features]

# Scale the numerical features only
scaler = StandardScaler()
X_train_num_scaled = scaler.fit_transform(X_train[num_features])
X_val_num_scaled = scaler.transform(X_val[num_features])

# Combine scaled numerical features with original categorical features (categorical features are not scaled)
X_train_combined = pd.DataFrame(X_train_num_scaled, columns=num_features)
X_train_combined[cat_features] = X_train[cat_features].reset_index(drop=True)

X_val_combined = pd.DataFrame(X_val_num_scaled, columns=num_features)
X_val_combined[cat_features] = X_val[cat_features].reset_index(drop=True)

# Apply log transformation to the target
y_train_log = np.log1p(y_train)
y_val_log = np.log1p(y_val)

# Convert the data into CatBoost Pool format, making sure categorical features are passed correctly
train_pool = Pool(X_train_combined, y_train_log, cat_features=cat_features)
val_pool = Pool(X_val_combined, y_val_log, cat_features=cat_features)

# Best parameters from Optuna tuning
best_params = {
    'iterations': 1346,
    'learning_rate': 0.042390909851230944,
    'depth': 8,
    'l2_leaf_reg': 1.5023079004958326,
    'random_strength': 1.918922766073629,
    'bootstrap_type': 'Bernoulli',
    'grow_policy': 'Lossguide',
    'subsample': 0.7191772414063006,
    'silent': True,
    'early_stopping_rounds': 100,  # Early stopping
    'loss_function': 'RMSE'
}

# Train final CatBoost model with the best parameters
best_catboost_model = CatBoostRegressor(**best_params)
best_catboost_model.fit(train_pool, eval_set=val_pool)

# Test set processing
test_data = pd.merge(users_test, user_features_test, on='ID')

# Apply label encoding to the test set using the same encoders as in the training set
for column in test_data.select_dtypes(include=['category', 'object']).columns:
    le = label_encoders.get(column)
    if le is not None:
        test_data[column] = test_data[column].map(lambda s: le.transform([s])[0] if s in le.classes_ else -1)

# Align test set columns with training set (handle any potential missing columns)
test_data = test_data.reindex(columns=X_train_combined.columns, fill_value=0)

# Separate numerical features for scaling
test_data_num_scaled = scaler.transform(test_data[num_features])

# Combine scaled numerical features with original (unscaled) categorical features for test data
test_data_combined = pd.DataFrame(test_data_num_scaled, columns=num_features)
test_data_combined[cat_features] = test_data[cat_features].reset_index(drop=True)

# Make predictions on the test set using the final CatBoost model
test_pool = Pool(test_data_combined, cat_features=cat_features)
test_preds_log = best_catboost_model.predict(test_pool)
test_preds = np.expm1(test_preds_log)

# Create a submission file
submission = pd.DataFrame({'ID': users_test['ID'], 'TARGET': test_preds})
submission.to_csv('submission.csv', index=False)

# Assuming 'TARGET' is available in the test data for RMSE calculation
if 'TARGET' in test_data.columns:
    test_rmse = np.sqrt(mean_squared_error(test_data['TARGET'], test_preds))
    print(f'Final RMSE on the test set: {test_rmse}')


KeyboardInterrupt: 