In [1]:
import numpy as np
import pandas as pd

### Two phase naive bayesian classifyer
The proposed model consists of two subodels (phases). The two submodels are trained separately. During the prediction stage submodel 2 will be applied only if submodel 1 fails. In short, submodel 2 can be said to be a 'safety bag'.

#### Submodel 1
A naive bayes classifyer by **_ticket_id_** and **_station_id_**. The idea is to perform classification of passengers by stations they depart from. The problem is that the prediction query can only be answered in case the corresponding _ticket_id_ was present in the training set. That's why the second phase is required.

#### Submodel 2
A naive bayesian classiyer by **_station_id_**, **_ticket_type_nm_**, **_weekday_** and **_times of day_** this model can not utilize the ticket info and thus is less precise and more sporadic.

In [2]:
from sklearn.naive_bayes import CategoricalNB
import warnings

# auxiliary analogue of scipy.stats.mode
def FindMostFrequentValue(array):
    counter = { key : 0 for key in np.unique(array) }
    for key in array:
        counter[key] += 1
    return max(counter, key=counter.get)


class TwoPhaseBayesian:
    def __init__(self):
        # phase 1 model
        self.ticket_id_model = []
        # a mapping from ticket_type_nm to id for NBC
        self.stringId2intId = []
        # phase 2 model
        self.NBC = []


    def __fit_ticket_id(self, data):
        print('Building ticket_id mapping ...')
        reduced = data[['ticket_id', 'station_id', 'time_to_under', 'label']]
        groups = reduced.groupby(by=['ticket_id', 'station_id'])
        # compute average travel time for each passenger departing from specific station
        # and most common (max likelihood) destination (label)
        self.ticket_id_model = groups.agg({
            'time_to_under' : 'mean',
            'label' : FindMostFrequentValue
        })


    def __prepare_nbc_input(self, data):
        reduced = data[['ticket_type_nm', 'station_id', 'pass_dttm']]
        # Replace ticket_id strings with integers
        reduced.loc[:, 'ticket_type_nm'] = reduced.loc[:, 'ticket_type_nm'].apply(lambda name : self.stringId2intId[name])
        reduced.rename(columns={'ticket_type_nm' : 'ticket_type_id'}, inplace=True)
        # extract week day from date
        reduced.loc[:, 'weekday'] = reduced.loc[:, 'pass_dttm'].apply(lambda date : date.weekday())
        # extract time from date
        reduced.loc[:, 'dayhour'] = reduced.loc[:, 'pass_dttm'].apply(lambda date : date.hour)
        X_data = reduced.loc[:, ['ticket_type_id', 'station_id', 'weekday', 'dayhour']]
        return X_data


    def __fit_nbc(self, data):
        print('Building Naive Bayesian classifyer ...')
        # prepare stringId2intId
        names = np.unique(data['ticket_type_nm'])
        self.stringId2intId = { names[idx] : idx for idx in range(names.size) }
        # fit NBC
        print('Preparing training data ...')
        X_train = self.__prepare_nbc_input(data)
        Y_train = data.loc[:, 'label'].to_numpy()
        print('Training Naive Bayesian Classifyer ...')
        self.NBC = CategoricalNB()
        self.NBC.fit(X_train, Y_train)


    def fit(self, data):
        warnings.filterwarnings('ignore')
        self.__fit_ticket_id(data)
        self.__fit_nbc(data)
        print('DONE')


    def predict(self, data):
        answers = pd.DataFrame({
            'time_to_under' : np.zeros(data.shape[0]),
            'label' : np.zeros(data.shape[0])
        })
        reduced = data[['ticket_id', 'ticket_type_nm', 'station_id', 'pass_dttm']]

        matching_ticket_id = \
            data[['ticket_id', 'station_id']] \
            .apply(lambda row : (row[0], row[1]) in self.ticket_id_model.index, axis=1)
        match_by_ticket_index = matching_ticket_id[matching_ticket_id].index
        match_by_nbc_index = matching_ticket_id[~matching_ticket_id].index

        # trying to match by ticket_id
        print(f'Matching {match_by_ticket_index.size} passengers by ticket_id predictor ...')
        match_by_ticket = data.loc[match_by_ticket_index, :]
        match_by_ticket.reset_index(inplace=True)
        X_data = [
            [match_by_ticket.loc[i, 'ticket_id'], match_by_ticket.loc[i, 'station_id']]
            for i in range(match_by_ticket.shape[0])]
        Y_prediction = self.ticket_id_model.loc[X_data]
        answers.loc[match_by_ticket_index] = Y_prediction.to_numpy()

        # in case ticket_id is not present, use a weaker Naive Bayes
        print(f'Matching {match_by_nbc_index.size} passengers via NBC ...')
        match_by_nbc = data.loc[match_by_nbc_index, :]
        match_by_nbc.reset_index(inplace=True)
        default_name = list(self.stringId2intId.keys())[0]
        match_by_nbc.loc[:, 'ticket_type_nm'] = \
            match_by_nbc.loc[:, 'ticket_type_nm'] \
            .apply(lambda name :
                    name if name in self.stringId2intId.keys() else default_name)
        X_data = self.__prepare_nbc_input(match_by_nbc)
        Y_labels = self.NBC.predict(X_data)
        answers.iloc[match_by_nbc_index, 1] = Y_labels
        answers.iloc[match_by_nbc_index, 0] = 500 # dummy

        return answers


In [3]:
data = pd.read_csv('train_dataset_train.csv', parse_dates=['pass_dttm'])

In [4]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(data, test_size=0.1)
train.reset_index(inplace=True)
test.reset_index(inplace=True)


In [5]:
model = TwoPhaseBayesian()
model.fit(train)

Building ticket_id mapping ...
Building Naive Bayesian classifyer ...
Preparing training data ...
Training Naive Bayesian Classifyer ...
DONE


In [6]:
prediction = model.predict(test)

Matching 90233 passengers by ticket_id predictor ...
Matching 18870 passengers via NBC ...


In [8]:
from sklearn.metrics import r2_score

label_prediction = prediction.loc[:, 'label'].to_numpy().astype(int)
label_expected = test.loc[:, 'label'].to_numpy()
tp_rate = np.sum(label_prediction == label_expected) / label_expected.size
print(f'Correct label predictions pct | {100 * tp_rate : .1f}%')

time_prediction = prediction.loc[:, 'time_to_under'].to_numpy()
time_expected = test.loc[:, 'time_to_under'].to_numpy()
r2 = r2_score(time_expected, time_prediction)
print(f'R2 score of time_to_under:    | {r2 : .3f}')

print(f'Total score:                  | {(r2 + tp_rate) / 2 : .2f}')

Correct label predictions pct |  57.8%
R2 score of time_to_under:    |  0.262
Total score:                  |  0.42
