# Final Project
Team 26

## Data Loader

In [1]:
import pandas as pd
import numpy as np

class Visitor:
    """One tracked device, identified by its hashed MAC address.

    Holds the device's class label together with two parallel lists: the
    sniffer location of every record and the timestamp of that record.
    """

    def __init__(self, mac_hash : str, label: int = -1) -> None:
        """Create a visitor with no records yet; ``label`` defaults to -1."""
        self.mac_hash, self.label = mac_hash, label
        # Parallel lists: entry k of each describes the k-th record added.
        self.sniffer_loc_list, self.created_time_list = [], []

    def add_loc_and_time(self, sniffer_loc: int, created_time: str) -> None:
        """Append one (location, timestamp) record to the parallel lists."""
        self.created_time_list.append(created_time)
        self.sniffer_loc_list.append(sniffer_loc)

    def __str__(self) -> str:
        """Return ``'<mac_hash> <label> <sniffer_loc_list>'``."""
        return f'{self.mac_hash} {self.label} {self.sniffer_loc_list}'

def parse_label_and_feature(label_filepath: str, feature_filepath: str) -> dict:
    """Load a label CSV and a feature CSV into a ``mac_hash -> Visitor`` dict.

    Args:
        label_filepath: CSV whose first column is the mac hash. When a row has
            exactly two columns the second is used as the integer label;
            otherwise the label is set to -1.
        feature_filepath: CSV whose rows unpack into exactly three values:
            ``mac_hash, sniffer_loc, created_time``.

    Returns:
        Dict keyed by mac hash, in label-file row order, with every feature
        row appended to the matching visitor.

    Raises:
        KeyError: if a feature row names a mac hash absent from the label file.
    """
    label_rows = pd.read_csv(label_filepath, sep=',').to_numpy()
    visitors = {
        row[0]: Visitor(row[0], int(row[1]) if len(row) == 2 else -1)
        for row in label_rows
    }

    for record in pd.read_csv(feature_filepath, sep=',').to_numpy():
        mac_hash, sniffer_loc, created_time = record
        visitors[mac_hash].add_loc_and_time(sniffer_loc, created_time)
    return visitors

# Build the mac_hash -> Visitor maps for the training set and the test set.
# Paths are relative to the current working directory.
train_mac_hash_to_visitor = parse_label_and_feature('training-label.csv', 'train.csv')
# Test visitors get label -1 unless 'submit_samples.csv' has exactly two columns
# (presumably it is the submission template — TODO confirm).
test_mac_hash_to_visitor = parse_label_and_feature('submit_samples.csv', 'test.csv')

## Define Evaluation Function

In [2]:
def log_loss(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Return the mean negative log-probability given to the true classes.

    Args:
        y_true: true class index for each sample (converted with ``int``).
        y_pred: per-sample class probabilities; row ``i`` is indexed by
            ``y_true[i]``.

    Returns:
        ``-sum(log p_i) / y_true.shape[0]`` as a Python float, where ``p_i``
        is the probability assigned to sample ``i``'s true class.

    Note:
        A probability of exactly 0 is replaced by 0.2 before taking the log,
        so a confident wrong prediction costs ``-log(0.2)`` rather than
        infinity.
    """
    total = 0
    for row_idx, class_probs in enumerate(y_pred):
        true_class_prob = class_probs[int(y_true[row_idx])]
        if true_class_prob == 0:
            # Substitute instead of evaluating log(0).
            true_class_prob = 0.2
        total += np.log(true_class_prob)
    return float(-total / y_true.shape[0])

## Data Preprocessing

In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

def to_second(time: str) -> int:
    """Convert an ``'H:M:S'`` string to the number of seconds it represents.

    Raises:
        ValueError: if the string does not split into exactly three
            colon-separated integer fields.
    """
    hours, minutes, seconds = time.split(':')
    # Horner form of h * 3600 + m * 60 + s.
    return (int(hours) * 60 + int(minutes)) * 60 + int(seconds)

def get_duration_list(sniffer_loc_list: list, created_time_list: list) -> np.ndarray:
    """Return a length-14 integer array of dwell times, indexed by ``loc - 1``.

    Records are paired up as (location, timestamp) and ordered by timestamp.
    For each consecutive pair, the slot of the earlier record's location is
    set to the number of seconds until the next record. It is set to 1
    instead when the two records fall on different dates or the gap is zero.
    The slot of the last record's location is always set to 1.

    Each write overwrites the slot rather than accumulating, so a location
    seen several times keeps only the value from its last write.

    Raises:
        IndexError: if the input lists are empty.
    """
    records = sorted(zip(sniffer_loc_list, created_time_list), key=lambda rec: rec[1])

    durations = np.zeros((14,), dtype=int)
    for (loc, stamp), (_, next_stamp) in zip(records, records[1:]):
        # Timestamps are expected to look like '<date> <H:M:S>'.
        date, clock = str(stamp).split(' ')
        next_date, next_clock = str(next_stamp).split(' ')

        elapsed = to_second(next_clock) - to_second(clock)
        if next_date != date or elapsed == 0:
            durations[int(loc) - 1] = 1
        else:
            durations[int(loc) - 1] = elapsed
    # The final record has no successor to measure against.
    durations[int(records[-1][0]) - 1] = 1
    return durations

def feature_preprocess(mac_hash_to_visitor: dict) -> np.ndarray:
    """Build the 22-column integer feature matrix, one row per visitor.

    Rows follow the iteration order of ``mac_hash_to_visitor``. Column layout:

        0-13   1 if the visitor has any record at sniffer location 1-14
        14     total number of records
        15-18  number of records in sniffer groups 0-3 (locations mapped to
               group -1 are not counted)
        19-21  one-hot of the day-of-month of the first stored record
               (days 6, 7 and 8)

    Args:
        mac_hash_to_visitor: dict whose values expose ``sniffer_loc_list`` and
            ``created_time_list``; timestamps look like ``'Y-M-D H:M:S'``.

    Returns:
        Array of shape ``(len(mac_hash_to_visitor), 22)`` with dtype ``int``.
        A visitor without any record gets an all-zero row.

    Raises:
        KeyError: if a sniffer location is outside 1-14.
        ValueError: if the day of the first record is outside 6-8. Without
            this check a smaller day would silently overwrite the count or
            group columns.
    """
    sniffer_loc_to_group = { 1: -1,  2: 0,  3: -1,  4:  0,  5:  0,  6:  0,  7:  1,
                             8: -1,  9: 2, 10:  2, 11:  2, 12:  3, 13:  1, 14:  3}
    count_col = 14
    group_col_start = 15
    day_col_start = 19
    first_day = 6
    n_columns = 22

    # NOTE: a duration-based variant of columns 0-18 (built on
    # get_duration_list) was sketched here before; this count-based encoding
    # is the one in use.
    features = np.zeros((len(mac_hash_to_visitor), n_columns), dtype=int)
    for i, visitor in enumerate(mac_hash_to_visitor.values()):
        row = features[i]
        for sniffer_loc in visitor.sniffer_loc_list:
            loc = int(sniffer_loc)
            # Look the group up first so an unknown location fails loudly
            # before any column is written.
            group = sniffer_loc_to_group[loc]
            row[loc - 1] = 1
            if group != -1:
                row[group_col_start + group] += 1
        row[count_col] = len(visitor.sniffer_loc_list)

        if not visitor.created_time_list:
            # No records, hence no date: leave the day one-hot empty.
            continue
        date, _ = str(visitor.created_time_list[0]).split(' ')
        day = int(date.split('-')[2])
        day_offset = day - first_day
        if not 0 <= day_offset < n_columns - day_col_start:
            raise ValueError(
                f'day {day} is outside the supported range '
                f'{first_day}-{first_day + n_columns - day_col_start - 1}'
            )
        row[day_col_start + day_offset] = 1
    return features

# Encode the training visitors into the feature matrix.
train_features = feature_preprocess(train_mac_hash_to_visitor)
# Collect the labels by iterating the same dict, so label i belongs to row i
# of train_features.
train_labels = np.zeros((len(train_mac_hash_to_visitor),), dtype=int)
for i, (_, visitor) in enumerate(train_mac_hash_to_visitor.items()):
    train_labels[i] = int(visitor.label)
test_features = feature_preprocess(test_mac_hash_to_visitor)

# Fit the scaler on the training features only; the test features are merely
# transformed with it.
# NOTE(review): fitting happens before the train/validation split below, so
# the validation rows also contribute to the scaling statistics.
scaler = StandardScaler()
scaler.fit(train_features)

train_features = scaler.transform(train_features)
test_features = scaler.transform(test_features)

# Hold out 20% of the labelled data for validation; random_state=1 fixes the split.
X_train, X_validate, y_train, y_validate = train_test_split(train_features, train_labels, test_size=0.2, random_state=1)
X_test = test_features

## Probability Postprocessing

In [4]:
def prob_postprocess(y_pred: np.ndarray, y_pred_prob: np.ndarray, threshold: float = 0.99) -> np.ndarray:
    """Snap confident predictions to a one-hot probability row.

    Args:
        y_pred: predicted class index per sample.
        y_pred_prob: per-sample class probabilities; left unmodified.
        threshold: minimum probability of the predicted class for its row to
            be replaced.

    Returns:
        A copy of ``y_pred_prob`` in which every row whose predicted-class
        probability is ``>= threshold`` becomes 1 for that class and 0
        elsewhere. All other rows are returned unchanged.
    """
    sharpened = np.copy(y_pred_prob)
    for row_idx, predicted in enumerate(y_pred):
        cls = int(predicted)
        row = sharpened[row_idx]  # a view: writing to it updates `sharpened`
        if row[cls] >= threshold:
            # np.eye(1, n, k) is a single row with its only 1 in column k.
            one_hot = np.eye(1, sharpened.shape[1], cls)[0]
            row[:] = one_hot
    return sharpened

## Multi-Layer Perceptron
* Reference:\
    [MLPClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html)

In [5]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

# Train a multi-layer perceptron, allowing up to 1000 iterations.
mlp = MLPClassifier(max_iter=1000)
mlp.fit(X_train, y_train)

# Accuracy of the hard class predictions on the validation split.
y_pred = mlp.predict(X_validate)
print(f'Accuracy: {accuracy_score(y_validate, y_pred):f}')

# Log-loss of the raw predicted probabilities.
y_pred_prob = mlp.predict_proba(X_validate)
print(f'Log-loss: {log_loss(y_validate, y_pred_prob):f}')

# Log-loss again after snapping confident rows to one-hot.
y_prob_post = prob_postprocess(y_pred, y_pred_prob)
print(f'Log-loss: {log_loss(y_validate, y_prob_post):f}')

Accuracy: 0.989224
Log-loss: 0.029678
Log-loss: 0.026509


## Decision Tree
* Reference:\
    [DecisionTreeClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html)

In [6]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Train a decision tree with its default settings.
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, y_train)

# Accuracy of the hard class predictions on the validation split.
y_pred = decision_tree.predict(X_validate)
print(f'Accuracy: {accuracy_score(y_validate, y_pred):f}')

# Log-loss of the raw predicted probabilities.
y_pred_prob = decision_tree.predict_proba(X_validate)
print(f'Log-loss: {log_loss(y_validate, y_pred_prob):f}')

# Log-loss again after snapping confident rows to one-hot.
y_prob_post = prob_postprocess(y_pred, y_pred_prob)
print(f'Log-loss: {log_loss(y_validate, y_prob_post):f}')

Accuracy: 0.979885
Log-loss: 0.036367
Log-loss: 0.036367


## Random Forest
* Reference:\
    [RandomForestClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html)

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Train a random forest with its default settings.
random_forest = RandomForestClassifier()
random_forest.fit(X_train, y_train)

# Accuracy of the hard class predictions on the validation split.
y_pred = random_forest.predict(X_validate)
print(f'Accuracy: {accuracy_score(y_validate, y_pred):f}')

# Log-loss of the raw predicted probabilities.
y_pred_prob = random_forest.predict_proba(X_validate)
print(f'Log-loss: {log_loss(y_validate, y_pred_prob):f}')

# Log-loss again after snapping confident rows to one-hot.
y_prob_post = prob_postprocess(y_pred, y_pred_prob)
print(f'Log-loss: {log_loss(y_validate, y_prob_post):f}')

Accuracy: 0.987787
Log-loss: 0.072180
Log-loss: 0.069622


## Extreme Gradient Boosting (XGBoost)
* Install in conda:
```
    $ conda install -c conda-forge py-xgboost
```
* Reference:\
    [XGBClassifier](https://xgboost.readthedocs.io/en/stable/python/python_api.html#xgboost.XGBClassifier)

In [8]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Train an XGBoost classifier; apart from use_label_encoder=False the
# defaults are used.
# NOTE(review): the variable name `xgboost` is the same as the package name;
# a later `import xgboost` in this session would rebind it.
xgboost = XGBClassifier(use_label_encoder=False)
xgboost.fit(X_train, y_train)

# Accuracy of the hard class predictions on the validation split.
y_pred = xgboost.predict(X_validate)
print(f'Accuracy: {accuracy_score(y_validate, y_pred):f}')

# Log-loss of the raw predicted probabilities.
y_pred_prob = xgboost.predict_proba(X_validate)
print(f'Log-loss: {log_loss(y_validate, y_pred_prob):f}')

# Log-loss again after snapping confident rows to one-hot.
y_prob_post = prob_postprocess(y_pred, y_pred_prob)
print(f'Log-loss: {log_loss(y_validate, y_prob_post):f}')



  from pandas import MultiIndex, Int64Index


Accuracy: 0.987787
Log-loss: 0.031896
Log-loss: 0.029086


## CatBoost
* Install in conda:
```
    $ conda config --add channels conda-forge
    $ conda install catboost
```
* Reference:\
    [CatBoostClassifier](https://catboost.ai/en/docs/concepts/python-reference_catboostclassifier)

In [9]:
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score

# Train a CatBoost classifier with verbose output turned off.
cat_boost = CatBoostClassifier(verbose=False)
cat_boost.fit(X_train, y_train)

# Accuracy of the hard class predictions on the validation split.
y_pred = cat_boost.predict(X_validate)
print(f'Accuracy: {accuracy_score(y_validate, y_pred):f}')

# Log-loss of the raw predicted probabilities.
y_pred_prob = cat_boost.predict_proba(X_validate)
print(f'Log-loss: {log_loss(y_validate, y_pred_prob):f}')

# Log-loss again after snapping confident rows to one-hot.
y_prob_post = prob_postprocess(y_pred, y_pred_prob)
print(f'Log-loss: {log_loss(y_validate, y_prob_post):f}')

Accuracy: 0.987069
Log-loss: 0.034429
Log-loss: 0.031379


## Voting
* Reference:\
    [VotingClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingClassifier.html)

In [10]:
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score

# Soft-voting ensemble over the five models defined in the cells above.
# The weights are given in estimator order: decision_tree and random_forest
# have weight 0, xgboost has 0.9, and mlp and cat_boost have 1.
# NOTE(review): per the scikit-learn docs, fit() trains fresh clones of the
# listed estimators rather than reusing the fitted ones — confirm.
voting = VotingClassifier(
    estimators=[('mlp', mlp), ('decision_tree', decision_tree), ('random_forest', random_forest), ('xgboost', xgboost), ('cat_boost', cat_boost)],
    voting='soft',
    weights=[1, 0, 0, 0.9, 1]
)
voting.fit(X_train, y_train)

# Accuracy of the hard class predictions on the validation split.
y_pred = voting.predict(X_validate)
print(f'Accuracy: {accuracy_score(y_validate, y_pred):f}')

# Log-loss of the raw ensemble probabilities.
y_pred_prob = voting.predict_proba(X_validate)
print(f'Log-loss: {log_loss(y_validate, y_pred_prob):f}')

# Log-loss again after snapping confident rows to one-hot.
y_prob_post = prob_postprocess(y_pred, y_pred_prob)
print(f'Log-loss: {log_loss(y_validate, y_prob_post):f}')

Accuracy: 0.990661
Log-loss: 0.028647
Log-loss: 0.028026


## Choose a Model to Predict Testing Data

In [11]:
import joblib
import pandas as pd

## choose model: the estimator used for the final test-set prediction
model = voting

## save model
# NOTE(review): nothing visible in this notebook creates the 'checkpoint/'
# directory — it presumably has to exist before this line runs.
joblib.dump(model, 'checkpoint/ckpt.tar.gz')

## load model (alternative to the two steps above: reuse a saved checkpoint)
# model = joblib.load('checkpoint/ckpt_0.0407222.tar.gz')

## evaluate model on the validation split: accuracy, then log-loss before and
## after probability post-processing
y_pred = model.predict(X_validate)
print(f'Accuracy: {accuracy_score(y_validate, y_pred):f}')

y_pred_prob = model.predict_proba(X_validate)
print(f'Log-loss: {log_loss(y_validate, y_pred_prob):f}')

y_prob_post = prob_postprocess(y_pred, y_pred_prob)
print(f'Log-loss: {log_loss(y_validate, y_prob_post):f}')

## predict testing data; the post-processed probabilities are what get written out
y_pred = model.predict(X_test)
y_pred_prob = model.predict_proba(X_test)
y_pred_prob = prob_postprocess(y_pred, y_pred_prob)

# Mac hashes are listed in dict order, the same order feature_preprocess used
# for the rows of X_test, so they line up with the probability rows.
test_mac_hash_list = [visitor.mac_hash for _, visitor in test_mac_hash_to_visitor.items()]
mac_hash_df = pd.DataFrame(test_mac_hash_list, columns=['mac_hash'])
# Five named columns: assumes predict_proba returns exactly five class columns.
pred_prob_df = pd.DataFrame(y_pred_prob, columns=['C0', 'C1', 'C2', 'C3', 'C4'])
pred_df = pd.concat([mac_hash_df, pred_prob_df], axis=1)
# NOTE(review): the 'result/' directory is likewise not created here.
pred_df.to_csv('result/submit.csv', index=False)

Accuracy: 0.990661
Log-loss: 0.028647
Log-loss: 0.028026
