In [1]:
%run data_loading.ipynb

In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import GridSearchCV

# This file contains models built only with the chisq data


## Data loading pipeline 

In [3]:
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.utils import shuffle
from tempfile import mkdtemp


class DataLoader(BaseEstimator, TransformerMixin):
    def __init__(self, type, content="top10"):
        self._type = type
        self._content = content
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return load_data_file(type=self._type, content=self._content)


# For Transforming Locations to Labels
class DataFrameLabelEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, label="Location", label_encoder=None):
        self._label = label
        if label_encoder is None:
            self._label_encoder = LabelEncoder()
        else:
            self._label_encoder = label_encoder
        
    def fit(self, X, y=None):
        self._label_encoder.fit(X[self._label])
        return self
    
    def transform(self, X, y=None):
        return X.assign(**{"EncodedLabel": self._label_encoder.transform(X[self._label])})


# For splitting the dataset to X and y, without shuffling
class ChisqDataAttributeLabel(BaseEstimator, TransformerMixin):
    def __init__(self, label="EncodedLabel", columns_to_remove=["Instance_ID", "Location"], shuffle=False):
        self._columns_to_remove = columns_to_remove
        self._label = label
        self._shuffle = shuffle
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        if self._shuffle:
            X = X.sample(frac=1)
        X_train = X.loc[:, [i for i in X.columns if i not in self._columns_to_remove + [self._label]]].to_numpy()
        y_train = X.loc[:, self._label].to_numpy()
        return X_train, y_train
    
    
def full_data_loading_pipeline(type="train", content="top10", label="Location", transformed_label="EncodedLabel", 
                               columns_to_remove=["Instance_ID", "Location"], 
                               label_encoder=None, shuffle=False):
    return Pipeline( 
        [
        ("DataLoader", DataLoader(type=type, content=content)),
        ("LabelEncoder", DataFrameLabelEncoder(label)),
        ("AttributeLabelProcessor", ChisqDataAttributeLabel(transformed_label, shuffle=shuffle)),
        ]
    )

In [4]:
all_data  = {
    content: {
        type: full_data_loading_pipeline(type, content, shuffle=False).fit_transform(None) for type in ["train", "dev", "test"] 
    } for content in ["top10", "top50", "top100"]
}

### MULTINOMINAL NB
From our prior observation, this might not perform well for the prior for this dataset is very uninformative.

In [5]:
from sklearn.naive_bayes import MultinomialNB

In [6]:
nb_accs = []
for content in all_data:
    mnb = MultinomialNB()
    X_train, y_train = all_data[content]["train"]
    X_dev, y_dev = all_data[content]["dev"]
    mnb.fit(X_train, y_train)
    acc = mnb.score(X_dev, y_dev)
    nb_accs.append((mnb, acc))
    print(f"{content}: {acc}")

top10: 0.29491370993675636
top50: 0.30134526744559975
top100: 0.3078304212670168


### SVM

In [6]:
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import SGDClassifier
from sklearn.kernel_approximation import Nystroem

In [8]:
sgd_svc_accs = []
for content in all_data:
    svc = SGDClassifier(max_iter=10000, random_state=42)
    nys = Nystroem(gamma=.2)
    X_train = nys.fit_transform(X_train)
    X_train, y_train = all_data[content]["train"]
    X_dev, y_dev = all_data[content]["dev"]
    svc.fit(X_train, y_train)
    acc = svc.score(X_dev, y_dev)
    sgd_svc_accs.append((svc, acc))
    print(f"{content}: {acc}")

top10: 0.2936541965912745
top50: 0.29553006753135386
top100: 0.3061421374209454


### Random Forest

In [9]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

In [11]:
rfc_accs = []
for content in all_data:
    rfc = RandomForestClassifier(n_estimators=20)
    X_train, y_train = all_data[content]["train"]
    X_dev, y_dev = all_data[content]["dev"]
    rfc.fit(X_train, y_train)
    acc = rfc.score(X_dev, y_dev)
    rfc_accs.append((rfc, acc))
    print(f"{content}: {acc}")

top10: 0.2954764712187801
top50: 0.2982366813163254
top100: 0.30381069782398973


In [12]:
etc_accs = []
for content in all_data:
    etc = ExtraTreesClassifier(n_estimators=20)
    X_train, y_train = all_data[content]["train"]
    X_dev, y_dev = all_data[content]["dev"]
    etc.fit(X_train, y_train)
    acc = etc.score(X_dev, y_dev)
    etc_accs.append((etc, acc))
    print(f"{content}: {acc}")

top10: 0.2954764712187801
top50: 0.3000321577875442
top100: 0.3071872655161325


## Compare Lots of others with a smaller dataset

In [27]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, log_loss

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from time import process_time

from collections import defaultdict as dd


def rank_models(classifiers, X, y, n_splits=10):
    log_cols_acc = ["Classifier", "Accuracy"]
    log_cols_prec = ["Classifier", "Precision"]
    log_cols_rec = ["Classifier", "Recall"]
    log_cols_f1 = ["Classifier", "F1"]
    log_acc = pd.DataFrame(columns=log_cols_acc)
    log_prec = pd.DataFrame(columns=log_cols_prec)
    log_rec = pd.DataFrame(columns=log_cols_rec)
    log_f1 = pd.DataFrame(columns=log_cols_f1)
    
    sss = StratifiedShuffleSplit(n_splits=n_splits, test_size=0.05, random_state=42)
    
    fitting_time_dict = dd(float)
    prec_dict = dd(float)
    rec_dict = dd(float)
    acc_dict = dd(float)
    f1_dict = dd(float)

    for i, (train_index, test_index) in enumerate(sss.split(X, y)):
        print(f"==================== Split{i} ====================")
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        for clf in classifiers:
            name = clf.__class__.__name__
            print(f"classifier {name}")
            start = process_time()
            clf.fit(X_train, y_train)
            duration = process_time() - start
            train_predictions = clf.predict(X_test)
            acc = accuracy_score(y_test, train_predictions)
            prec = precision_score(y_test, train_predictions, average="micro")
            rec = recall_score(y_test, train_predictions, average="micro")
            f1 = f1_score(y_test, train_predictions, average="micro")

            fitting_time_dict[name] += duration
            acc_dict[name] += acc
            prec_dict[name] += prec
            rec_dict[name] += rec
            f1_dict[name] += f1

    for clf in acc_dict:
        acc_dict[clf] = acc_dict[clf] / n_splits
        log_entry_acc = pd.DataFrame([[clf, acc_dict[clf]]], columns=log_cols_acc)
        log_acc = log_acc.append(log_entry_acc)

    for clf in prec_dict:
        prec_dict[clf] = prec_dict[clf] / n_splits
        log_entry_prec = pd.DataFrame([[clf, prec_dict[clf]]], columns=log_cols_prec)
        log_prec = log_prec.append(log_entry_prec)

    for clf in rec_dict:
        rec_dict[clf] = rec_dict[clf] / n_splits
        log_entry_rec = pd.DataFrame([[clf, rec_dict[clf]]], columns=log_cols_rec)
        log_rec = log_rec.append(log_entry_rec)

    for clf in f1_dict:
        f1_dict[clf] = f1_dict[clf] / n_splits
        log_entry_f1 = pd.DataFrame([[clf, f1_dict[clf]]], columns=log_cols_f1)
        log_f1 = log_f1.append(log_entry_f1)

    for clf in f1_dict:
        f1_dict[clf] = f1_dict[clf] / n_splits
        log_entry_f1 = pd.DataFrame([[clf, f1_dict[clf]]], columns=log_cols_f1)
        log_f1 = log_f1.append(log_entry_f1)

    return fitting_time_dict, acc_dict, log_acc, prec_dict, log_prec, rec_dict, log_rec, f1_dict, log_f1

In [None]:
X_train, y_train = all_data["top10"]["train"]
X_dev, y_dev = all_data["top10"]["dev"]
X = np.append(X_train, X_dev, axis=0)
y = np.append(y_train, y_dev)

classifiers = [
    SVC(gamma="scale"),
    SGDClassifier(),
    KNeighborsClassifier(3),
    DecisionTreeClassifier(),
    RandomForestClassifier(n_estimators=20),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    GaussianNB(),
    LogisticRegression(multi_class="auto", solver="lbfgs", max_iter=10000)
]


size=10000000
fitting_time_dict, acc_dict, log_acc, prec_dict, log_prec, rec_dict, log_rec, f1_dict, log_f1 = \
            rank_models(classifiers, X[:size], y[:size], n_splits=1)

classifier SVC


In [None]:
fitting_time_dict

In [None]:
plt.xlabel('Accuracy')
plt.title('Classifier Accuracy')

sns.set_color_codes("muted")
sns.barplot(x='Accuracy', y='Classifier', data=log_acc, color="b")

plt.show()

In [None]:
plt.xlabel('Precision')
plt.title('Classifier Precision')

sns.set_color_codes("muted")
sns.barplot(x='Precision', y='Classifier', data=log_prec, color="r")

plt.show()

In [None]:
plt.xlabel('Recall')
plt.title('Classifier Recall')

sns.set_color_codes("muted")
sns.barplot(x='Recall', y='Classifier', data=log_rec, color="g")

plt.show()

In [None]:
plt.xlabel('F1')
plt.title('Classifier F1 Score')

sns.set_color_codes("muted")
sns.barplot(x='F1', y='Classifier', data=log_f1, color="b")

plt.show()

In [24]:
sorted(acc_dict.items(), key=lambda x: x[1], reverse=True)

[('DecisionTreeClassifier', 0.292),
 ('GradientBoostingClassifier', 0.292),
 ('RandomForestClassifier', 0.288),
 ('LogisticRegression', 0.288),
 ('AdaBoostClassifier', 0.2866666666666667),
 ('SVC', 0.2853333333333333),
 ('SGDClassifier', 0.2853333333333333),
 ('KNeighborsClassifier', 0.2813333333333334),
 ('GaussianNB', 0.27466666666666667)]

In [25]:
sorted(f1_dict.items(), key=lambda x: x[1], reverse=True)

[('DecisionTreeClassifier', 0.09733333333333333),
 ('GradientBoostingClassifier', 0.09733333333333333),
 ('RandomForestClassifier', 0.09599999999999999),
 ('LogisticRegression', 0.09599999999999999),
 ('AdaBoostClassifier', 0.09555555555555556),
 ('SVC', 0.0951111111111111),
 ('SGDClassifier', 0.0951111111111111),
 ('KNeighborsClassifier', 0.0937777777777778),
 ('GaussianNB', 0.09155555555555556)]

### Try tune logistic regression, GradientBoosting and GaussianNB
(SVC will take too long to tune. We will leave it till the last)

In [38]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, KFold
import numpy as np

In [76]:
# TODO also need test elastic net!
lr_param_dict = {
#     "penalty": ["l1", "l2", "none"],
    "penalty": ["l2", "none"],
    "C": np.logspace(-3,3, 14),
    "fit_intercept": [True, False],
    "max_iter": [10000],
    "multi_class": ["ovr", "multinomial"],
#     "multi_class": ["ovr"],
    "solver": ["lbfgs"]
}

In [125]:
load_data_file("train", "top50").head(5)

Unnamed: 0,Instance_ID,abcbrisbane,abotlangit,advanceqld,afcuw,afdonnerwetter,afl,afleaglesfreo,aflfantasy,aflpieseagles,...,victraffic,voodoo,wa,waterpoloa,waterpoloaus,waterpolosa,western,wests,xx,Location
0,11,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Melbourne
1,12,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Melbourne
2,13,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Brisbane
3,14,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Perth
4,15,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Perth


In [77]:
lr_search_cv = GridSearchCV(estimator=LogisticRegression(), param_grid=lr_param_dict, 
                            return_train_score=True, n_jobs=-1, verbose=3, cv=5)

In [78]:
X = np.append(X_train, X_dev, axis=0)
y = np.append(y_train, y_dev)
lr_search_cv.fit(X, y)

Fitting 3 folds for each of 112 candidates, totalling 336 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done 104 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 264 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 336 out of 336 | elapsed:  5.2min finished
  "Setting penalty='none' will ignore the C and l1_ratio "


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'C': array([1.0000...
       7.01703829e-02, 2.03091762e-01, 5.87801607e-01, 1.70125428e+00,
       4.92388263e+00, 1.42510267e+01, 4.12462638e+01, 1.19377664e+02,
       3.45510729e+02, 1.00000000e+03]),
                         'fit_intercept': [True, False], 'max_iter': [10000],
                         'multi_

In [80]:
lr_search_cv.best_score_, lr_search_cv.best_estimator_

(0.29680023923444976,
 LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=False,
                    intercept_scaling=1, l1_ratio=None, max_iter=10000,
                    multi_class='ovr', n_jobs=None, penalty='none',
                    random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                    warm_start=False))

### Try Gaussian Mixture Model

In [None]:
from sklearn.mixture import GaussianMixture

In [None]:
class EstimatorSelectionHelper:

    def __init__(self, models, params):
        if not set(models.keys()).issubset(set(params.keys())):
            missing_params = list(set(models.keys()) - set(params.keys()))
            raise ValueError("Some estimators are missing parameters: %s" % missing_params)
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}

    def fit(self, X, y, cv=3, n_jobs=3, verbose=1, scoring=None, refit=False):
        for key in self.keys:
            print("Running GridSearchCV for %s." % key)
            model = self.models[key]
            params = self.params[key]
            gs = GridSearchCV(model, params, cv=cv, n_jobs=n_jobs,
                              verbose=verbose, scoring=scoring, refit=refit,
                              return_train_score=True)
            gs.fit(X,y)
            self.grid_searches[key] = gs    

    def score_summary(self, sort_by='mean_score'):
        def row(key, scores, params):
            d = {
                 'estimator': key,
                 'min_score': min(scores),
                 'max_score': max(scores),
                 'mean_score': np.mean(scores),
                 'std_score': np.std(scores),
            }
            return pd.Series({**params,**d})

        rows = []
        for k in self.grid_searches:
            print(k)
            params = self.grid_searches[k].cv_results_['params']
            scores = []
            for i in range(self.grid_searches[k].cv):
                key = "split{}_test_score".format(i)
                r = self.grid_searches[k].cv_results_[key]        
                scores.append(r.reshape(len(params),1))

            all_scores = np.hstack(scores)
            for p, s in zip(params,all_scores):
                rows.append((row(k, s, p)))

        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)

        columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]

        return df[columns]