## Riiid! LGBM Single Model Ensembling - Scoring

This notebook is used as a demonstration for my thread on [Single Model Ensembling Guide | LightGBM Example](https://www.kaggle.com/c/riiid-test-answer-prediction/discussion/202344)

**Main Idea**: Score the test data with the same model truncated at several different numbers of trees, then take the weighted average of those outputs.

This is a scoring only notebook. The Training Notebook is [available here](https://www.kaggle.com/manikanthr5/riiid-lgbm-single-model-ensembling-training/).

![](https://i.imgur.com/qlQTh0b.png)

**Acknowledgement:** I am using [this notebook](https://www.kaggle.com/its7171/lgbm-with-loop-feature-engineering/) as the starting point to show my idea. If you like this kernel, please upvote [the actual kernel](https://www.kaggle.com/its7171/lgbm-with-loop-feature-engineering/execution/). I have removed some code that is not required for scoring purposes.

In [1]:
import ast
import gc

import joblib
import lightgbm as lgb
import numpy as np
import pandas as pd

## feature engineering

In [2]:
# funcs for user stats with loop
def add_user_feats_without_update(df, answered_correctly_sum_u_dict, count_u_dict):
    """Append per-user running statistics to ``df`` without modifying the counters' values.

    Args:
        df: DataFrame with a ``user_id`` column. Any index is accepted.
        answered_correctly_sum_u_dict: mapping indexed by user id whose values
            are stored in the ``answered_correctly_sum_u`` column.
        count_u_dict: mapping indexed by user id whose values are stored in the
            ``count_u`` column.

    Returns:
        A new DataFrame: ``df`` plus the columns ``answered_correctly_sum_u``,
        ``count_u`` (both int32) and ``answered_correctly_avg_u`` (sum / count;
        NaN when both are zero).

    Note:
        Both mappings are indexed with ``[]``, so an unknown user raises
        ``KeyError`` unless the mapping supplies a default on lookup
        (presumably these are ``defaultdict(int)`` - verify against the
        training notebook).
    """
    user_ids = df['user_id'].values
    acsu = np.array([answered_correctly_sum_u_dict[uid] for uid in user_ids], dtype=np.int32)
    cu = np.array([count_u_dict[uid] for uid in user_ids], dtype=np.int32)
    # Build the feature frame on df's own index: concat(axis=1) aligns on index
    # labels, so a default RangeIndex here would misalign (and NaN-pad) the
    # result whenever df is not indexed 0..n-1.
    user_feats_df = pd.DataFrame(
        {'answered_correctly_sum_u': acsu, 'count_u': cu},
        index=df.index,
    )
    user_feats_df['answered_correctly_avg_u'] = (
        user_feats_df['answered_correctly_sum_u'] / user_feats_df['count_u']
    )
    return pd.concat([df, user_feats_df], axis=1)

def update_user_feats(df, answered_correctly_sum_u_dict, count_u_dict):
    """Fold the answers in ``df`` into the per-user running counters, in place.

    Only rows whose ``content_type_id`` equals 0 are counted. For each such row
    the user's entry in ``answered_correctly_sum_u_dict`` grows by the row's
    ``answered_correctly`` value and the entry in ``count_u_dict`` grows by 1.
    Both mappings are mutated; nothing is returned.
    """
    rows = df[['user_id', 'answered_correctly', 'content_type_id']].values
    for user_id, was_correct, content_type in rows:
        if content_type != 0:
            # Rows with a non-zero content type do not contribute to the stats.
            continue
        answered_correctly_sum_u_dict[user_id] += was_correct
        count_u_dict[user_id] += 1

### Load Variables

In [3]:
# Lookup tables that get merged onto every test batch by content id.
content_df = pd.read_feather(
    "../input/lgbm-with-loop-feature-engineering-dataset/content_df.feather"
)
questions_df = pd.read_feather(
    "../input/lgbm-with-loop-feature-engineering-dataset/questions_df.feather"
)

# Per-user running counters; they keep being updated during inference.
count_u_dict = joblib.load(
    "../input/lgbm-with-loop-feature-engineering-dataset/count_u_dict.pkl.zip"
)
answered_correctly_sum_u_dict = joblib.load(
    "../input/lgbm-with-loop-feature-engineering-dataset/answered_correctly_sum_u_dict.pkl.zip"
)

# Fill value for missing `prior_question_elapsed_time` entries.
prior_question_elapsed_time_mean = joblib.load(
    "../input/lgbm-with-loop-feature-engineering-dataset/prior_question_elapsed_time_mean.pkl.zip"
)

## modeling

In [4]:
# Name of the label / prediction column.
TARGET = 'answered_correctly'

# Columns handed to the model, in this order.
FEATS = [
    'answered_correctly_avg_u',
    'answered_correctly_sum_u',
    'count_u',
    'answered_correctly_avg_c',
    'part',
    'prior_question_had_explanation',
    'prior_question_elapsed_time',
]

In [5]:
# The best iteration is stored in a separate pickle next to the model text file,
# so read it and re-attach it to the Booster after loading.
fold0_best_iteration = joblib.load("../input/lgbm-with-loop-feature-engineering-dataset/fold0_lgb_model_best_iteration.pkl.zip")
model = lgb.Booster(model_file="../input/lgbm-with-loop-feature-engineering-dataset/fold0_lgb_model.txt")
model.best_iteration = fold0_best_iteration

In [6]:
# Blend weights: entries 0, 1 and 2 scale the three truncated-model predictions
# that are summed in the inference loop.
OPTIMIZED_WEIGHTS_PATH = "../input/lgbm-with-loop-feature-engineering-dataset/optimized_weights.pkl.zip"
optimized_weights = joblib.load(OPTIMIZED_WEIGHTS_PATH)

## inference

In [7]:
class Iter_Valid(object):
    """Iterator that replays a labelled DataFrame in batches.

    Every ``next()`` call returns a pair ``(df, sample_df)``:

    * ``df`` - the next run of consecutive rows. The first row of the batch
      carries, in ``prior_group_responses`` / ``prior_group_answers_correct``,
      the ``user_answer`` / ``answered_correctly`` values of the previous batch
      formatted as a ``"[a,b,c]"`` string (``"[]"`` for the first batch).
    * ``sample_df`` - ``row_id`` plus a zero-filled ``answered_correctly``
      column for the rows of that batch whose ``content_type_id`` is 0.

    A batch is closed when a user already present in it shows up again (see
    ``__next__`` for the exact rule) or when ``max_user`` distinct users have
    been collected.
    """

    def __init__(self, df, max_user=1000):
        """Prepare the replay.

        Args:
            df: DataFrame with the columns ``row_id``, ``user_id``,
                ``task_container_id``, ``content_type_id``, ``user_answer`` and
                ``answered_correctly``. The caller's frame is not modified; a
                re-indexed copy is stored.
            max_user: maximum number of distinct users per batch.
        """
        df = df.reset_index(drop=True)
        self.df = df
        # String versions are kept so batches can be joined into "[..]" strings.
        self.user_answer = df['user_answer'].astype(str).values
        self.answered_correctly = df['answered_correctly'].astype(str).values
        df['prior_group_responses'] = "[]"
        df['prior_group_answers_correct'] = "[]"
        # Explicit copy: a column is added right below, which must not be a
        # chained assignment into a slice of df.
        self.sample_df = df.loc[df['content_type_id'] == 0, ['row_id']].copy()
        self.sample_df['answered_correctly'] = 0
        self.len = len(df)
        self.user_id = df.user_id.values
        self.task_container_id = df.task_container_id.values
        self.content_type_id = df.content_type_id.values
        self.max_user = max_user
        self.current = 0  # position of the next row that has not been emitted yet
        self.pre_user_answer_list = []
        self.pre_answered_correctly_list = []

    def __iter__(self):
        return self

    def fix_df(self, user_answer_list, answered_correctly_list, pre_start):
        """Build the ``(df, sample_df)`` pair for rows ``pre_start .. self.current - 1``.

        The previous batch's answers are written into the first row of the new
        batch, then ``user_answer_list`` / ``answered_correctly_list`` are
        stored so the following batch can expose them in turn.
        """
        df = self.df[pre_start:self.current].copy()
        # self.sample_df only holds the question rows, so its positions do not
        # line up with positions in self.df. It kept self.df's index labels
        # though, so select the batch by label (.loc slices include the end
        # label, hence the -1).
        sample_df = self.sample_df.loc[pre_start:self.current - 1].copy()
        df.loc[pre_start, 'prior_group_responses'] = '[' + ",".join(self.pre_user_answer_list) + ']'
        df.loc[pre_start, 'prior_group_answers_correct'] = '[' + ",".join(self.pre_answered_correctly_list) + ']'
        self.pre_user_answer_list = user_answer_list
        self.pre_answered_correctly_list = answered_correctly_list
        return df, sample_df

    def __next__(self):
        """Return the next batch, or raise ``StopIteration`` when all rows were emitted."""
        added_user = set()
        pre_start = self.current
        pre_added_user = -1
        pre_task_container_id = -1
        pre_content_type_id = -1
        user_answer_list = []
        answered_correctly_list = []
        while self.current < self.len:
            crr_user_id = self.user_id[self.current]
            crr_task_container_id = self.task_container_id[self.current]
            crr_content_type_id = self.content_type_id[self.current]
            if crr_user_id in added_user and (crr_user_id != pre_added_user or (crr_task_container_id != pre_task_container_id and crr_content_type_id == 0 and pre_content_type_id == 0)):
                # known user (not the previous user, or a different task container with both rows being questions)
                return self.fix_df(user_answer_list, answered_correctly_list, pre_start)
            if len(added_user) == self.max_user:
                # Batch is full: only keep extending it while the row continues
                # the last user's current container or is a content type 1 row.
                if  crr_user_id == pre_added_user and (crr_task_container_id == pre_task_container_id or crr_content_type_id == 1):
                    user_answer_list.append(self.user_answer[self.current])
                    answered_correctly_list.append(self.answered_correctly[self.current])
                    self.current += 1
                    continue
                else:
                    return self.fix_df(user_answer_list, answered_correctly_list, pre_start)
            added_user.add(crr_user_id)
            pre_added_user = crr_user_id
            pre_task_container_id = crr_task_container_id
            pre_content_type_id = crr_content_type_id
            user_answer_list.append(self.user_answer[self.current])
            answered_correctly_list.append(self.answered_correctly[self.current])
            self.current += 1
        if pre_start < self.current:
            # Rows left over after the loop ran off the end form the last batch.
            return self.fix_df(user_answer_list, answered_correctly_list, pre_start)
        else:
            raise StopIteration()

In [8]:
# Competition API module; presumably only importable inside the Kaggle
# competition runtime - verify before running elsewhere.
import riiideducation
# Create the environment object that serves the test data.
env = riiideducation.make_env()
# Iterator consumed by the inference loop; each item is unpacked there as a
# (test_df, sample_prediction_df) pair.
iter_test = env.iter_test()
# Callable that receives the predictions for the current batch.
set_predict = env.predict

In [9]:
# Tree counts of the ensemble members: the same booster is evaluated truncated
# at each of these iteration numbers and the outputs are blended with
# optimized_weights[0], [1] and [2] respectively.
ensemble_iterations = (400, 700, model.best_iteration)
if len(optimized_weights) < len(ensemble_iterations):
    raise ValueError("optimized_weights must provide one weight per ensemble member")

previous_test_df = None
for (test_df, sample_prediction_df) in iter_test:
    if previous_test_df is not None:
        # The first row of the current batch carries the previous batch's
        # labels as a list literal in string form. Parse it with
        # ast.literal_eval rather than eval so that the externally supplied
        # string can never execute code.
        previous_test_df[TARGET] = ast.literal_eval(test_df["prior_group_answers_correct"].iloc[0])
        update_user_feats(previous_test_df, answered_correctly_sum_u_dict, count_u_dict)
    # Keep the unfiltered batch so its labels can be attached on the next turn.
    previous_test_df = test_df.copy()
    # Only rows with content_type_id == 0 are scored.
    test_df = test_df[test_df['content_type_id'] == 0].reset_index(drop=True)
    test_df = add_user_feats_without_update(test_df, answered_correctly_sum_u_dict, count_u_dict)
    test_df = pd.merge(test_df, content_df, on='content_id',  how="left")
    test_df = pd.merge(test_df, questions_df, left_on='content_id', right_on='question_id', how='left')
    test_df['prior_question_had_explanation'] = test_df.prior_question_had_explanation.fillna(False).astype('int8')
    # NOTE(review): the filled values land in a *new* column that is not part of
    # FEATS, so the model still sees the unfilled `prior_question_elapsed_time`.
    # Left unchanged because the model was presumably trained the same way -
    # confirm against the training notebook before "fixing" it.
    test_df['prior_question_elapsed_time_mean'] = test_df.prior_question_elapsed_time.fillna(prior_question_elapsed_time_mean)
    # Slice the feature matrix once and reuse it for every ensemble member.
    features = test_df[FEATS]
    preds = np.zeros(len(features))
    for weight, num_iteration in zip(optimized_weights, ensemble_iterations):
        preds += weight * model.predict(features, num_iteration=num_iteration)
    test_df[TARGET] = preds
    set_predict(test_df[['row_id', TARGET]])