# Part 1: Setup
## Imports

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import scipy
from tqdm.auto import tqdm

In [2]:
!ls /kaggle/input

media-campaign-cost-prediction	playground-series-s3e11


In [3]:
import warnings
# Silence FutureWarning and UserWarning for the rest of the session; other categories
# (e.g. RuntimeWarning) are still shown.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

## Data ingest

In [4]:
# The two competition files are indexed by their `id` column; the original dataset is
# read with pandas' default index.
train_df, test_df = [
    pd.read_csv(csv_path, index_col='id')
    for csv_path in (
        '/kaggle/input/playground-series-s3e11/train.csv',
        '/kaggle/input/playground-series-s3e11/test.csv',
    )
]
original_df = pd.read_csv('/kaggle/input/media-campaign-cost-prediction/train_dataset.csv')

In [5]:
# Sanity check: both labelled frames expose the same column labels in the same order
# (element-wise comparison, so it assumes the two frames have the same number of columns).
(train_df.columns == original_df.columns).all()

True

Train has a "cost" column that is our y-values, test does not (we have to predict the cost). "Original" is much smaller than train, which was generated from it, but is the "real" ground truth.

In [6]:
# Store the log1p-transformed target next to the raw `cost` on both labelled frames.
for df in (train_df, original_df):
    df['log_cost'] = np.log1p(df['cost'])

We fit `log_cost` rather than `cost` because the Kaggle competition metric is RMSLE, i.e. the RMSE computed on `log1p(cost)`.

In [7]:
# Sum of the two food-amenity columns, added to every frame (test included) so the
# feature exists at prediction time.
for df in (train_df, original_df, test_df):
    df['salad_bar & prepared_food'] = df['salad_bar'] + df['prepared_food']

## Part 3: Feature selection

In [8]:
# Features shared by every model: everything except the unit-sales column.
selected_features2 = [
    'total_children',
    'num_children_at_home',
    'avg_cars_at home(approx).1',
    'store_sqft',
    'coffee_bar',
    'video_store',
    'salad_bar & prepared_food',
    'florist',
]
# Full feature list: unit sales first, followed by the shared features above.
selected_features = ['unit_sales(in millions)'] + selected_features2

## Part 2: Set up CV function

In [9]:
def cross_validate(model, train_df, original_df=None, selected_features=selected_features, fit_params=None, refit=True):
    """Score `model` with shuffled K-fold CV on `train_df`, then fit it on all the data.

    Before every fit, rows sharing the same `selected_features` values are collapsed
    into one row whose target is the mean `log_cost` and whose `sample_weight` is the
    number of collapsed rows (skipped when the frame already has a `sample_weight`
    column). Validation folds are scored on the raw, un-collapsed rows.

    Parameters
    ----------
    model : estimator
        Must accept `final__sample_weight` in `fit`, i.e. a Pipeline whose last step
        is named 'final'. It is fitted in place.
    train_df : pandas.DataFrame
        Rows that are split into folds; needs `selected_features` and 'log_cost'.
    original_df : pandas.DataFrame, optional
        Extra rows appended to every training fold and to the final fit. They are
        never part of a validation fold.
    selected_features : list of str
        Feature columns used for grouping, fitting and prediction.
    fit_params : dict, optional
        Extra keyword arguments forwarded to `model.fit`.
    refit : bool, default True
        Despite the name, False skips the CV loop. The final fit on all rows
        always runs.

    Returns
    -------
    numpy.float64
        Mean RMSE on `log_cost` over the folds, or NaN when the CV loop is skipped.
    """
    # A shared mutable default would leak state between calls.
    fit_params = {} if fit_params is None else fit_params

    def _collapse_duplicates(frame):
        """Return one weighted row per distinct feature combination."""
        if 'sample_weight' in frame.columns:
            return frame
        return (
            frame
            .groupby(selected_features)['log_cost']
            .agg(['mean', 'count'])
            .rename(columns={'mean': 'log_cost', 'count': 'sample_weight'})
            .reset_index()
        )

    def _fit(frame):
        """Fit `model` on the collapsed version of `frame`."""
        frame = _collapse_duplicates(frame)
        model.fit(frame[selected_features],
                  frame['log_cost'],
                  final__sample_weight=frame['sample_weight'],
                  **fit_params)

    collected_scores = []
    if refit:
        kf = KFold(shuffle=True, random_state=42)
        for fold_train_idx, fold_test_idx in tqdm(list(kf.split(train_df))):
            # KFold yields positional indices, so select with .iloc; .loc only worked
            # when the index labels happened to be 0..n-1.
            fold_train = train_df.iloc[fold_train_idx]
            if original_df is not None:
                fold_train = pd.concat((fold_train, original_df))
            fold_test = train_df.iloc[fold_test_idx]

            _fit(fold_train)

            fold_test_y_pred = model.predict(fold_test[selected_features])
            # sqrt(MSE) instead of the deprecated `squared=False` keyword.
            score = np.sqrt(mean_squared_error(fold_test['log_cost'], fold_test_y_pred))
            collected_scores.append(score)
    else:
        print("skip CV as refit=False")

    # Final fit on everything, so `model` is ready for test-set prediction.
    full_train = train_df if original_df is None else pd.concat((train_df, original_df))
    _fit(full_train)

    # With no folds scored, report NaN explicitly rather than averaging an empty array.
    overall_score = np.mean(collected_scores) if collected_scores else np.float64(np.nan)
    print(overall_score)
    return overall_score

In [10]:
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, KFold
from sklearn.metrics import mean_squared_error

## Part 4: Can we replicate the ridge regression RMSLE?

In [11]:
from sklearn.linear_model import Ridge
%%time
model = Pipeline(
    steps=[
        ('ct', ColumnTransformer([('ohe', OneHotEncoder(drop='first'), 
                                   ['total_children',
                                    'num_children_at_home',
                                    'avg_cars_at home(approx).1',
                                    'store_sqft'])],
                                 remainder='passthrough')),
        ('pf', PolynomialFeatures(3, interaction_only=True, include_bias=False)),
        ('final', Ridge(random_state=1))
    ])
cross_validate(model, train_df, original_df, selected_features=selected_features2).mean()

UsageError: Line magic function `%%time` not found.


In [None]:
# Predict log-cost with the ridge pipeline, undo the log1p transform and write the submission.
test_df['log_cost'] = model.predict(test_df[selected_features2])
test_df['cost'] = np.expm1(test_df['log_cost'])
test_df['cost'].to_csv('/kaggle/working/predict_ridge_poly_oh_some.csv')

In [None]:
# Keep a handle on the ridge pipeline: `model` is reassigned by later cells.
model_rr = model

## Part 5: Can we try manually tuning the hyperparameters w/ an LGBM?

In [None]:
from lightgbm import LGBMRegressor

# Hand-picked LightGBM settings. `store_sqft` is flagged as categorical by its position
# in `selected_features`, so these params only suit models trained on that column order.
# NOTE(review): 'tree_method' and 'eval_metric' look like XGBoost-style names -- confirm
# that LightGBM honours them when they are passed to the constructor.
lgbm_params = {
    'learning_rate': 0.1,
    'tree_method': 'hist',
    'random_state': 1,
    'eval_metric': 'rmse',
    'categorical_feature': [selected_features.index('store_sqft')],
    'verbose': -1,
    'n_estimators': 450,
    'num_leaves': 100,
    'min_child_samples': 1,
    'min_child_weight': 1e1,
}

# Randomised-search sketch, disabled: it needs a `train_df_dedup` frame that the cells
# shown here never build. Kept as comments rather than a bare string so the cell does
# not echo it as output.
# lgbm_tuned_params = {
#     'n_estimators': np.linspace(20, 1000, 100).astype(int),
#     'num_leaves': np.linspace(20, 1000, 100).astype(int),
#     'min_child_weight': np.linspace(0, 5, 100).astype(int),
# }
# lgbmr = LGBMRegressor(**lgbm_params)
# cv_lgbm = RandomizedSearchCV(lgbmr,
#                              lgbm_tuned_params,
#                              n_jobs=8,
#                              n_iter=20,
#                              verbose=3,
#                              scoring='neg_root_mean_squared_error')
# cv_lgbm.fit(X=train_df_dedup[selected_features],
#             y=train_df_dedup['log_cost'],
#             sample_weight=train_df_dedup['sample_weight'])

In [None]:
# Single-step pipeline so cross_validate can route `final__sample_weight` to the regressor.
lgbm_params.update(verbose=-1)
model = Pipeline(steps=[('final', LGBMRegressor(**lgbm_params))])
score = cross_validate(model, train_df, original_df, selected_features=selected_features).mean()

In [None]:
# Predict log-cost on the full feature list, undo the log1p transform and write the submission.
test_df['log_cost'] = model.predict(test_df[selected_features])
test_df['cost'] = np.expm1(test_df['log_cost'])
test_df['cost'].to_csv('predict_lgbm_selected_features.csv')

In [None]:
# Keep a handle on the LightGBM pipeline trained on `selected_features`; `model` is reassigned below.
model_lgb1 = model

## Part 6: What about with only the categorical features?

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from lightgbm import LGBMRegressor

# Same hand-picked settings as before, but for models trained on `selected_features2`.
# The categorical index must be looked up in the list the model is trained on:
# `selected_features.index('store_sqft')` is off by one for `selected_features2`
# and would flag `coffee_bar` instead of `store_sqft`.
# NOTE(review): 'tree_method' and 'eval_metric' look like XGBoost-style names -- confirm
# that LightGBM honours them when they are passed to the constructor.
lgbm_params = {
    'learning_rate': 0.1,
    'tree_method': 'hist',
    'random_state': 1,
    'eval_metric': 'rmse',
    'categorical_feature': [selected_features2.index('store_sqft')],
    'verbose': -1,
    'n_estimators': 450,
    'num_leaves': 100,
    'min_child_samples': 1,
    'min_child_weight': 1e1,
}

# Randomised-search sketch, disabled: it needs a `train_df_dedup` frame that the cells
# shown here never build. Kept as comments rather than a bare string so the cell does
# not echo it as output.
# lgbm_tuned_params = {
#     'n_estimators': np.linspace(20, 1000, 100).astype(int),
#     'num_leaves': np.linspace(20, 1000, 100).astype(int),
#     'min_child_weight': np.linspace(0, 5, 100).astype(int),
# }
# lgbmr = LGBMRegressor(**lgbm_params)
# cv_lgbm = RandomizedSearchCV(lgbmr,
#                              lgbm_tuned_params,
#                              n_jobs=8,
#                              n_iter=20,
#                              verbose=3,
#                              scoring='neg_root_mean_squared_error')
# cv_lgbm.fit(X=train_df_dedup[selected_features2],
#             y=train_df_dedup['log_cost'],
#             sample_weight=train_df_dedup['sample_weight'])

In [None]:
# Single-step pipeline so cross_validate can route `final__sample_weight` to the regressor;
# this run uses the feature list without unit sales.
lgbm_params.update(verbose=-1)
model = Pipeline(steps=[('final', LGBMRegressor(**lgbm_params))])
score = cross_validate(model, train_df, original_df, selected_features=selected_features2).mean()

In [None]:
# Predict log-cost on `selected_features2`, undo the log1p transform and write the submission.
test_df['log_cost'] = model.predict(test_df[selected_features2])
test_df['cost'] = np.expm1(test_df['log_cost'])
test_df['cost'].to_csv('predict_lgbm_selected_features2.csv')

In [None]:
# Keep a handle on the LightGBM pipeline trained on `selected_features2`; `model` is reassigned below.
model_lgb2 = model 

## Try ensembling?

In [None]:
from sklearn.linear_model import Ridge
from sklearn.ensemble import StackingRegressor
from sklearn.compose import ColumnTransformer

# Meta-learner: heavily regularised ridge restricted to non-negative blend weights.
final_estimator = Ridge(
    alpha=100,
    fit_intercept=True,
    positive=True,
    tol=1e-6,
    random_state=1,
)

# Each entry pairs a (name, pipeline) tuple with the feature columns that pipeline expects.
estimators = [
    (('ridge_regression', model_rr), selected_features2),
    (('lgbm1', model_lgb1), selected_features),
    (('lgbm2', model_lgb2), selected_features2),
]

In [None]:
# Stack the competition rows and the original rows; pd.concat returns a new frame,
# so the columns added to it later do not touch `train_df` or `original_df`.
train_df_merged = pd.concat((train_df, original_df))

In [None]:
# Add one column per base model holding its log-cost prediction for every merged training row.
estimator_names = [name for (name, _), _ in estimators]
for (name, base_model), columns in tqdm(estimators):
    train_df_merged[name] = base_model.predict(train_df_merged[columns])

In [None]:
# Fit the meta-learner on the base-model prediction columns against the true log-cost.
# NOTE(review): these look like in-sample predictions (cross_validate ends by fitting
# each base model on train + original rows) -- confirm this is intended.
final_estimator.fit(train_df_merged[estimator_names], y=train_df_merged['log_cost'])

In [None]:
final_estimator.coef_

In [None]:
# Add one column per base model holding its log-cost prediction for every test row.
estimator_names = [name for (name, _), _ in estimators]
for (name, base_model), columns in tqdm(estimators):
    test_df[name] = base_model.predict(test_df[columns])

In [None]:
# Blend the base-model columns with the meta-learner, undo the log1p transform and write the submission.
test_df['log_cost'] = final_estimator.predict(test_df[estimator_names])
test_df['cost'] = np.expm1(test_df['log_cost'])
test_df['cost'].to_csv('/kaggle/working/predict_ensemble1.csv')

Hmmm... It overfits. Try training the base models on 75% of the data, then fitting the final estimator on the remaining 25%?

## Try refitting?

In [None]:
# Meta-learner for the hold-out experiment: same non-negative ridge, but with alpha=0
# (no regularisation penalty).
final_estimator = Ridge(
    alpha=0,
    fit_intercept=True,
    positive=True,
    tol=1e-6,
    random_state=1,
)

In [None]:
from sklearn.model_selection import train_test_split
# Reserve 25% of the competition rows; the resulting frames keep their original `id`
# labels, so their index is no longer a contiguous 0..n-1 range.
train_df_train, train_df_test = train_test_split(train_df, test_size=0.25, random_state=1)

In [None]:
# Fit every base pipeline on the 75% split only. refit=False makes cross_validate skip
# its CV loop and go straight to the final fit; the original dataset is not added.
for (_, base_model), columns in tqdm(estimators):
    cross_validate(base_model, train_df_train, None, selected_features=columns, refit=False)

In [None]:
# Work on a copy of the 25% hold-out: assigning columns through an alias of the
# train_test_split result mutates `train_df_test` and can trigger pandas'
# SettingWithCopyWarning.
train_df_merged = train_df_test.copy()
estimator_names = [name for (name, _), _ in estimators]
for (name, base_model), columns in tqdm(estimators):
    # One out-of-sample prediction column per base model.
    train_df_merged[name] = base_model.predict(train_df_merged[columns])

In [None]:
# Fit the meta-learner on the hold-out prediction columns, then display the weight it
# assigned to each base model.
final_estimator.fit(train_df_merged[estimator_names], y=train_df_merged['log_cost'])
final_estimator.coef_

In [None]:
# Refresh the per-model prediction columns on the test frame using the refitted base models.
estimator_names = [name for (name, _), _ in estimators]
for (name, base_model), columns in tqdm(estimators):
    test_df[name] = base_model.predict(test_df[columns])

In [None]:
# Blend the base-model columns with the meta-learner, undo the log1p transform and write the submission.
test_df['log_cost'] = final_estimator.predict(test_df[estimator_names])
test_df['cost'] = np.expm1(test_df['log_cost'])
test_df['cost'].to_csv('/kaggle/working/predict_ensemble2.csv')

Hmmm... Actually consistently worse results...

# Replicate the Zoo of Models notebook

In [None]:
# Start a fresh list of ensemble members, discarding the earlier one.
# NOTE(review): nothing is appended to this list in the cells shown here.
estimators = []

## Model 1: Dart

In [None]:
# DART-boosted LightGBM on `selected_features2`; no categorical_feature is declared here.
lgbm_params = dict(
    boosting_type='dart',
    learning_rate=0.3,
    n_estimators=400,
    num_leaves=200,
    min_child_samples=1,
    min_child_weight=1e2,
    random_state=1,
    eval_metric='rmse',
    verbose=-1,
)
# Single-step pipeline so cross_validate can route `final__sample_weight` to the regressor.
model = Pipeline(steps=[('final', LGBMRegressor(**lgbm_params))])
score = cross_validate(model, train_df, original_df, selected_features=selected_features2).mean()

In [None]:
# Predict log-cost with the DART model, undo the log1p transform and write the submission.
test_df['log_cost'] = model.predict(test_df[selected_features2])
test_df['cost'] = np.expm1(test_df['log_cost'])
test_df['cost'].to_csv('predict_dart.csv')

In [None]:
# Keep a handle on the DART pipeline; `model` is reassigned by the next section.
model_dart = model

## Model 2: LightGBM

In [None]:
from lightgbm import LGBMRegressor

# This model is trained on `selected_features2`, so the categorical index must be
# looked up in that list: `selected_features.index('store_sqft')` is off by one here
# and would flag `coffee_bar` instead of `store_sqft`.
# NOTE(review): 'tree_method' and 'eval_metric' look like XGBoost-style names -- confirm
# that LightGBM honours them when they are passed to the constructor.
lgbm_params = {
    'learning_rate': 0.1,
    'tree_method': 'hist',
    'random_state': 1,
    'eval_metric': 'rmse',
    'categorical_feature': [selected_features2.index('store_sqft')],
    'verbose': -1,
    'n_estimators': 450,
    'num_leaves': 100,
    'min_child_samples': 1,
    'min_child_weight': 1e1,
}
# Single-step pipeline so cross_validate can route `final__sample_weight` to the regressor.
model = Pipeline(steps=[('final', LGBMRegressor(**lgbm_params))])
score = cross_validate(model, train_df, original_df, selected_features=selected_features2).mean()

In [None]:
# Predict log-cost with the LightGBM model, undo the log1p transform and write the submission.
test_df['log_cost'] = model.predict(test_df[selected_features2])
test_df['cost'] = np.expm1(test_df['log_cost'])
test_df['cost'].to_csv('predict_lgbm.csv')

In [None]:
# Keep a handle on this LightGBM pipeline; `model` is reused for the neural network below.
model_lgbm = model

## Model 3: Neural network

In [None]:
import numpy as np
from sklearn.datasets import make_classification
import torch
from torch import nn
import pytorch_lightning as L

In [None]:
!pip install skorch

In [None]:
from skorch import NeuralNetRegressor

In [None]:
class MyModule(nn.Module):
    """Fully connected regressor: in_features -> 256 -> 128 -> 64 -> 64 -> 1.

    All parameters are converted to float64, so inputs must be double-precision tensors.

    Parameters
    ----------
    num_units : int, default 10
        Not used by the network; kept so existing calls keep working.
    nonlin : callable, default nn.ReLU
        Zero-argument factory for the activation placed after each hidden layer.
    in_features : int, default 38
        Width of the input rows. The default preserves the previously hard-coded
        value; pass the actual number of encoded columns if it differs.
    """

    def __init__(self, num_units=10, nonlin=nn.ReLU, in_features=38):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(in_features, 256),
            nonlin(),
            nn.Linear(256, 128),
            nonlin(),
            nn.Linear(128, 64),
            nonlin(),
            nn.Linear(64, 64),
            nonlin(),
            nn.Linear(64, 1),
        )
        # Switch every parameter to float64.
        self.double()

    def forward(self, X, **kwargs):
        """Return one prediction per input row; extra keyword arguments are ignored."""
        return self.layers(X)

# Use the GPU when one is attached, otherwise fall back to the CPU instead of failing
# on a hard-coded 'cuda' device. The variable keeps its original name `cuda`.
cuda = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = MyModule().to(cuda)
net = NeuralNetRegressor(
    model,
    max_epochs=10,
    lr=0.1,
    # Shuffle training data on each epoch
    iterator_train__shuffle=True,
    device=cuda.type,
)

In [None]:
from sklearn.model_selection import train_test_split
# Cast every column to float64 before splitting (the network above is converted to
# double precision), then hold out 25% of the rows.
train_df_train, train_df_test = train_test_split(train_df.astype('float64'), test_size=0.25, random_state=1)

In [None]:
# One-hot encode every column of `selected_features2` (first level dropped), discard all
# other columns, then feed the encoded matrix to the skorch regressor.
# NOTE(review): the network's input width must equal the number of encoded columns -- confirm.
model = Pipeline(steps=[
    ('ct', ColumnTransformer(
        [('ohe', OneHotEncoder(drop='first'), selected_features2)],
        remainder='drop',
    )),
    ('final', net),
])

In [None]:
# skorch's NeuralNetRegressor rejects a 1-dimensional target, so pass log_cost as an
# (n_samples, 1) array instead of a Series.
model.fit(train_df_train[selected_features2],
          train_df_train['log_cost'].to_numpy().reshape(-1, 1))

# What about the additional feature engineering?

In [None]:
# store_score: mean of the five amenity columns; store_score_ratio: floor area per unit of score.
# NOTE(review): a row whose five amenity values are all 0 gets store_score == 0 and hence
# an infinite ratio -- confirm whether such rows exist.
for df in (train_df, original_df, test_df):
    df['store_score'] = df[['coffee_bar', 'video_store', 'salad_bar', 'prepared_food', 'florist']].mean(axis=1)
    df['store_score_ratio'] = df['store_sqft'].div(df['store_score'])

In [None]:
# `selected_features2` plus the two engineered store columns (builds a new list; the original is untouched).
selected_features3 = selected_features2 + ['store_score', 'store_score_ratio']

In [None]:
from lightgbm import LGBMRegressor

# LightGBM on `selected_features3`, with store_sqft and both engineered columns declared
# categorical by their position in that list.
# NOTE(review): store_score and store_score_ratio are non-integer floats -- confirm how
# LightGBM treats such values in a categorical column.
lgbm_params = dict(
    learning_rate=0.1,
    tree_method='hist',
    random_state=1,
    eval_metric='rmse',
    categorical_feature=list(map(selected_features3.index,
                                 ['store_sqft', 'store_score', 'store_score_ratio'])),
    verbose=-1,
    n_estimators=450,
    num_leaves=100,
    min_child_samples=1,
    min_child_weight=1e1,
)
# Single-step pipeline so cross_validate can route `final__sample_weight` to the regressor.
model = Pipeline(steps=[('final', LGBMRegressor(**lgbm_params))])
score = cross_validate(model, train_df, original_df, selected_features=selected_features3).mean()

In [None]:
# Predict log-cost on `selected_features3`, undo the log1p transform and write the submission.
test_df['log_cost'] = model.predict(test_df[selected_features3])
test_df['cost'] = np.expm1(test_df['log_cost'])
test_df['cost'].to_csv('predict_lgbm_2.csv')

In [None]:
# Keep a handle on the pipeline trained with both engineered columns as categorical.
model_lgbm2 = model

## What if we leave the store_score_ratio numerical?

In [None]:
from lightgbm import LGBMRegressor

# Same model on `selected_features3`, but only store_sqft and store_score are declared
# categorical; store_score_ratio is left as a numeric feature.
lgbm_params = dict(
    learning_rate=0.1,
    tree_method='hist',
    random_state=1,
    eval_metric='rmse',
    categorical_feature=list(map(selected_features3.index, ['store_sqft', 'store_score'])),
    verbose=-1,
    n_estimators=450,
    num_leaves=100,
    min_child_samples=1,
    min_child_weight=1e1,
)
# Single-step pipeline so cross_validate can route `final__sample_weight` to the regressor.
model = Pipeline(steps=[('final', LGBMRegressor(**lgbm_params))])
score = cross_validate(model, train_df, original_df, selected_features=selected_features3).mean()

In [None]:
# Predict log-cost on `selected_features3`, undo the log1p transform and write the submission.
test_df['log_cost'] = model.predict(test_df[selected_features3])
test_df['cost'] = np.expm1(test_df['log_cost'])
test_df['cost'].to_csv('predict_lgbm_3.csv')

In [None]:
# Keep a handle on the pipeline trained with store_score_ratio left numeric.
model_lgbm3 = model