<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><ul class="toc-item"><li><span><a href="#Load-Sparse-matrix" data-toc-modified-id="Load-Sparse-matrix-0.1"><span class="toc-item-num">0.1&nbsp;&nbsp;</span>Load Sparse matrix</a></span></li><li><span><a href="#Scale-data" data-toc-modified-id="Scale-data-0.2"><span class="toc-item-num">0.2&nbsp;&nbsp;</span>Scale data</a></span></li></ul></li><li><span><a href="#Models" data-toc-modified-id="Models-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Models</a></span><ul class="toc-item"><li><span><a href="#kMeans" data-toc-modified-id="kMeans-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>kMeans</a></span></li><li><span><a href="#PassiveAggressiveClassifier" data-toc-modified-id="PassiveAggressiveClassifier-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>PassiveAggressiveClassifier</a></span></li><li><span><a href="#Logistic-regression" data-toc-modified-id="Logistic-regression-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Logistic regression</a></span></li><li><span><a href="#Random-Forest" data-toc-modified-id="Random-Forest-1.4"><span class="toc-item-num">1.4&nbsp;&nbsp;</span>Random Forest</a></span></li><li><span><a href="#SGD" data-toc-modified-id="SGD-1.5"><span class="toc-item-num">1.5&nbsp;&nbsp;</span>SGD</a></span></li></ul></li><li><span><a href="#Analyse-models" data-toc-modified-id="Analyse-models-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Analyse models</a></span></li><li><span><a href="#CV-Grid-search" data-toc-modified-id="CV-Grid-search-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>CV Grid search</a></span></li><li><span><a href="#ToDo:-CV-not-stratified" data-toc-modified-id="ToDo:-CV-not-stratified-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>ToDo: CV not stratified</a></span><ul class="toc-item"><li><span><a href="#TruncatedSVD" data-toc-modified-id="TruncatedSVD-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>TruncatedSVD</a></span></li></ul></li><li><span><a 
href="#Visual-analysis-of-sample-data" data-toc-modified-id="Visual-analysis-of-sample-data-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Visual analysis of sample data</a></span></li></ul></div>

In [None]:
import feather

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from scipy import sparse

In [None]:
from scipy.stats import describe

from sklearn.cluster import KMeans

from sklearn.decomposition import TruncatedSVD

from sklearn.ensemble import RandomForestClassifier

from sklearn.linear_model import PassiveAggressiveClassifier, SGDClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

from sklearn.metrics import cohen_kappa_score, make_scorer, confusion_matrix, classification_report, SCORERS

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, GroupKFold

from sklearn.naive_bayes import ComplementNB, MultinomialNB

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, Normalizer, QuantileTransformer

from sklearn.utils import shuffle

In [None]:
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score

## Load Sparse matrix

In [None]:
# Load the per-row metadata frames and the matching sparse feature matrices.
path = '../data/final/info_train.feather'
info_train = feather.read_dataframe(path)

path = '../data/final/info_test.feather'
info_test = feather.read_dataframe(path)

path = '../data/final/sparse_matrix_train.npz'
sparse_matrix_train = sparse.load_npz(path)

path = '../data/final/sparse_matrix_test.npz'
sparse_matrix_test = sparse.load_npz(path)

# Everything below pairs info rows with matrix rows purely by position, so the
# two sources must have the same number of rows. Fail early with a clear message.
assert len(info_train) == sparse_matrix_train.shape[0], 'info_train and sparse_matrix_train row counts differ'
assert len(info_test) == sparse_matrix_test.shape[0], 'info_test and sparse_matrix_test row counts differ'

X_train = sparse_matrix_train
X_test = sparse_matrix_test

# Class labels, one per row.
y_train = list(info_train['s_crsp_obj_cd'])
y_test = list(info_test['s_crsp_obj_cd'])

# Row sums of the feature matrices as flat 1-D arrays.
data_train = np.array(X_train.sum(1)).flatten()
data_test = np.array(X_test.sum(1)).flatten()

# Rows whose absolute row sum reaches TOL are dropped from X, y and info alike.
TOL = 300

index_train = np.abs(data_train) < TOL
index_test = np.abs(data_test) < TOL

X_train = X_train[index_train]
X_test = X_test[index_test]

y_train = [label for label, keep in zip(y_train, index_train) if keep]
y_test = [label for label, keep in zip(y_test, index_test) if keep]

# .copy() makes the filtered frames independent objects, so adding columns to
# them later does not trigger pandas' SettingWithCopyWarning.
info_train = info_train[index_train].copy()
info_test = info_test[index_test].copy()

# Unscaled train + test stacked together.
X_total = sparse.vstack((X_train,X_test))
y_total = y_train + y_test

# Integer group id per training row (one id per distinct port_no).
groups = info_train.groupby(['port_no']).ngroup()

print('Rows training: \n{:,} rows info\n{:,} rows data'.format(len(y_train),X_train.shape[0]))
print('')
print('Rows testing: \n{:,} rows info\n{:,} rows data'.format(len(y_test),X_test.shape[0]))
print('')
print('Total rows: {:,}'.format(X_train.shape[0] + X_test.shape[0]))

## Scale data

In [None]:
# Learn the per-feature scale from the training rows only (with_mean=False, so
# no centering is applied), then rescale both matrices with those statistics.
scaler = StandardScaler(with_mean=False).fit(X_train)

X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Models

## kMeans

In [None]:
%%time
print('Start kMeans...')

# Five clusters, fitted on the scaled test matrix.
# NOTE(review): `n_jobs` is not accepted by KMeans in recent scikit-learn
# releases -- confirm against the version this notebook is pinned to.
kmeans_settings = dict(n_clusters=5, n_jobs=-1, random_state=0)
kmeans = KMeans(**kmeans_settings).fit(X_test)

In [None]:
# KMeans.score needs the data to evaluate; use the matrix the model was fitted on.
kmeans.score(X_test)

In [None]:
# NOTE(review): `PAC` is first assigned in the PassiveAggressiveClassifier
# section below, so on a fresh kernel (Restart & Run All) this cell raises
# NameError. The same three statements are repeated right after the PAC fit;
# this copy looks like a leftover and is a candidate for deletion.
PAC_pred = PAC.predict(X_test)
print('PAC kappa score',cohen_kappa_score(PAC_pred, y_test))
print('PAC score',PAC.score(X_test, y_test))

## PassiveAggressiveClassifier

In [None]:
%%time
print('Start PassiveAggressiveClassifier...')
# random_state pins the data shuffling (and the validation split used by
# early_stopping) so repeated runs report the same scores; the value matches
# the PassiveAggressiveClassifier used in the grid-search section.
PAC = PassiveAggressiveClassifier(C = 1e-7,
                                  tol = 1e-5,
                                  n_iter_no_change=100,
                                  n_jobs = -1,
                                  early_stopping = True,
                                  random_state = 1
                                  ).fit(X_train, y_train)

In [None]:
# Evaluate the fitted PassiveAggressiveClassifier on the test rows:
# Cohen's kappa between predictions and labels, then the estimator's own score.
PAC_pred = PAC.predict(X_test)
pac_kappa = cohen_kappa_score(PAC_pred, y_test)
pac_score = PAC.score(X_test, y_test)
print('PAC kappa score', pac_kappa)
print('PAC score', pac_score)

## Logistic regression

In [None]:
%%time
print('Start logistic regression...')
# Multinomial, L2-penalised logistic regression trained with the SAGA solver.
# SAGA is a stochastic solver, so random_state is fixed to make the fitted
# coefficients (and the scores below) repeatable between runs.
LR_1 = LogisticRegression(solver = 'saga',
                        multi_class = 'multinomial',
                        penalty = 'l2',
                        n_jobs = -1,
                        random_state = 1
                        ).fit(X_train, y_train)

In [None]:
# Test-set metrics for the logistic regression model.
LR_pred = LR_1.predict(X_test)
for metric_name, metric_value in (
        ('LR kappa score', cohen_kappa_score(LR_pred, y_test)),
        ('LR score', LR_1.score(X_test, y_test)),
):
    print(metric_name, metric_value)

## Random Forest

In [None]:
%%time
print('Start Random Forest...')

# Encode the class labels as integers. fit_transform learns the mapping from
# the training labels and applies it in one step (the original fitted the
# encoder twice); the test labels reuse that same mapping.
le = LabelEncoder()
y_train_rf = le.fit_transform(y_train)
y_test_rf = le.transform(y_test)

# y_train_rf / y_test_rf are reused by the SGD cells further down.
RF = RandomForestClassifier(n_estimators=1000, 
                            min_samples_leaf=100,
                            random_state=0,
                            max_features = "auto",
                            verbose = 1,
                            n_jobs = -1
                           ).fit(X_train, y_train_rf)

In [None]:
# Test-set metrics for the random forest, against the integer-encoded labels.
RF_pred = RF.predict(X_test)
rf_metrics = {
    'RF kappa score': cohen_kappa_score(RF_pred, y_test_rf),
    'RF score': RF.score(X_test, y_test_rf),
}
for metric_name, metric_value in rf_metrics.items():
    print(metric_name, metric_value)

## SGD

In [None]:
%%time


# The input must be scaled before it reaches SGD; X_train was rescaled with
# StandardScaler in the "Scale data" section.

print('Start SGDClassifier...')

# Logistic-loss linear model with an L1 penalty, trained on the integer-encoded
# labels produced in the Random Forest section.
# The deprecated `n_iter=None` argument was dropped: None was its default, and
# scikit-learn releases that removed the parameter reject it with a TypeError.
# NOTE(review): per the scikit-learn docs `l1_ratio` is only used with
# penalty='elasticnet' -- confirm whether 'l1' or 'elasticnet' was intended.
SGD = SGDClassifier(loss='log', 
                    penalty='l1', 
                    alpha=0.0001, 
                    l1_ratio=0.15,  
                    max_iter=1000, 
                    tol=None, 
                    verbose=0, 
                    n_jobs=-1, 
                    random_state=1, 
                    learning_rate='optimal', 
                    eta0=0.0, 
                    validation_fraction=0.1, 
                    n_iter_no_change=5, 
                    class_weight=None, 
                    warm_start=False, 
                    average=False).fit(X_train, y_train_rf)

In [None]:
# Test-set metrics for the SGD classifier, against the integer-encoded labels.
SGD_pred = SGD.predict(X_test)
sgd_kappa = cohen_kappa_score(SGD_pred, y_test_rf)
sgd_score = SGD.score(X_test, y_test_rf)
print('SGD kappa score', sgd_kappa)
print('SGD score', sgd_score)

In [None]:
%%time
# Side-by-side test metrics for the three classifiers fitted above.
# The logistic regression model is bound to `LR_1` (there is no `LR`), so the
# original `LR.predict` / `LR.score` calls raised NameError.
PAC_pred = PAC.predict(X_test)
LR_pred = LR_1.predict(X_test)
RF_pred = RF.predict(X_test)

# PAC and LR were trained on the raw labels, RF on the integer-encoded ones.
print('PAC kappa score',cohen_kappa_score(PAC_pred, y_test))
print('LR kappa score',cohen_kappa_score(LR_pred, y_test))
print('RF kappa score',cohen_kappa_score(RF_pred, y_test_rf))
print('')
print('PAC score',PAC.score(X_test, y_test))
print('LR score',LR_1.score(X_test, y_test))
print('RF score',RF.score(X_test, y_test_rf))
print('')
print('')

# Analyse models

In [None]:
# Model inspected by the cells in this section.
# The original `model = clf` could not work here: `clf` is only created in the
# grid-search section further down (NameError on Restart & Run All), it wraps a
# PassiveAggressiveClassifier, which has no predict_proba, and it is fitted on
# the SVD-reduced features rather than on X_test's feature space.
# LR_1 is already fitted at this point, was trained on X_train / y_train (the
# same features and label type as X_test / y_test used below) and supports
# predict, predict_proba and classes_.
model = LR_1

In [None]:
# Attach the model's test predictions as column 'new'. assign() returns a new
# frame instead of writing into info_test in place: info_test is a filtered
# selection of the loaded frame, so an in-place column write can trigger
# SettingWithCopyWarning, and rebinding keeps the cell safe to re-run.
info_test = info_test.assign(new=model.predict(X_test))

In [None]:
# Show the first ten rows of the test metadata.
info_test.iloc[:10]

In [None]:
# Per-class probability estimates for the test rows.
# NOTE(review): nothing in the visible cells reads `prob_pos` afterwards.
prob_pos = model.predict_proba(X_test)

In [None]:
# Predicted class label for every test row; feeds the confusion matrix and
# classification report cells below.
pred_label = model.predict(X_test)

In [None]:
# Class labels known to the model; the confusion matrix below uses this order
# for its rows and columns.
print(model.classes_)

In [None]:
# Raw confusion counts (rows: true class, columns: predicted class), ordered
# like model.classes_.
cm = confusion_matrix(y_true=y_test,
                      y_pred=pred_label,
                      labels=model.classes_)
print(cm)

In [None]:
# Row-normalised confusion matrix: each row is divided by its own total, so
# every entry is a fraction of that row's count, rounded to two decimals.
row_totals = cm.sum(axis=1, keepdims=True)
cm_fraction = cm.astype('float') / row_totals
print(np.round(cm_fraction, 2))

In [None]:
# Text summary of the per-class metrics for the selected model.
report_text = classification_report(y_true=y_test, y_pred=pred_label)
print(report_text)

# CV Grid search

# ToDo: CV not stratified

### Model

In [None]:
# Base estimator for the grid search.
# NOTE(review): the second assignment immediately replaces the first, so the
# LogisticRegression instance is discarded and only the
# PassiveAggressiveClassifier is searched (together with tuned_parameters_PAC
# in the next cell). Keep only the estimator that is actually intended.
model_grid_cv = LogisticRegression(random_state=1)

model_grid_cv = PassiveAggressiveClassifier(random_state=1)

In [None]:
# Group-aware 5-fold CV; `groups` holds one port_no-derived id per training row.
group_kfold = GroupKFold(n_splits=5)

# Materialise the (train_idx, test_idx) pairs once and hand that same list to
# GridSearchCV. The original built one generator into `cv`, never used it, and
# created a second generator inline. A list can also be inspected or reused.
cv = list(group_kfold.split(X_train, y_train, groups))

# Candidate grids. Only the PAC grid is passed to GridSearchCV below; the LR
# grid is kept for switching the estimator.
tuned_parameters_PAC = {'C': [1e-6], 'tol':[1e-5],'n_iter_no_change':[20]}

tuned_parameters_LR = {'solver': ['saga'],
                       'multi_class':['multinomial'],
                       'penalty':['l2']}

scoring = 'f1_micro'

# refit=True retrains the best candidate on all rows after the search;
# return_train_score=True is needed by the train-score cells below.
clf = GridSearchCV(model_grid_cv, 
                   tuned_parameters_PAC, 
                   cv = cv, 
                   n_jobs=-1, 
                   verbose=10, 
                   refit = True,
                   scoring=scoring, 
                   return_train_score=True)

In [None]:
%%time
# Run the grid search on the SVD-reduced training features.
# NOTE(review): `X_train_svd` is first assigned in the TruncatedSVD section
# further down, so on a fresh kernel (Restart & Run All) this cell raises
# NameError -- the SVD cells have to run before this one.
clf.fit(X_train_svd, y_train)

In [None]:
# Pull the per-candidate mean / std of the CV scores out of the search results.
results = clf.cv_results_
test_score_mean, test_std_mean = results['mean_test_score'], results['std_test_score']
train_score_mean, train_std_mean = results['mean_train_score'], results['std_train_score']

# One line per parameter candidate: mean test score and +/- two std.
for avg_score, score_std, candidate in zip(test_score_mean, test_std_mean, results['params']):
    print("%0.3f (+/-%0.03f) for %r" % (avg_score, score_std * 2, candidate))

In [None]:
# Same report as the previous cell, for the training-fold scores.
train_rows = zip(train_score_mean, train_std_mean, clf.cv_results_['params'])
for avg_score, score_std, candidate in train_rows:
    print("%0.3f (+/-%0.03f) for %r" % (avg_score, score_std * 2, candidate))

In [None]:
# Mean CV score per candidate, plotted against the candidate's C value.
X_axis = np.array(results['param_C'].data, dtype=float)

fig, ax = plt.subplots(figsize=(13, 13))

# The search uses a single scorer, so the title says what is actually plotted.
ax.set_title("GridSearchCV: mean train and test score by C", fontsize=16)
ax.set_xlabel("C")
ax.set_ylabel("Score")

# Markers keep the points visible when the grid holds a single C value (a bare
# line through one point draws nothing); labels tell the two series apart.
ax.plot(X_axis, test_score_mean, marker='o', label='test (CV mean)')
ax.plot(X_axis, train_score_mean, marker='o', label='train (CV mean)')

ax.legend(loc="best")
ax.grid(False)
plt.show()

## TruncatedSVD


In [None]:
%%time
# Fit a 200-component truncated SVD on the scaled training matrix.
svd = TruncatedSVD(
    n_components=200,
    n_iter=7,
    random_state=42,
)
svd.fit(X_train)

In [None]:
# Per-component explained-variance ratios, then their total as a percentage.
variance_ratios = svd.explained_variance_ratio_
print(variance_ratios)
print()
total_pct = variance_ratios.sum()*100
print('Total variance explained: {:.3f}%'.format(total_pct))

In [None]:
# Project both matrices onto the components learned from the training rows.
X_train_svd, X_test_svd = (svd.transform(matrix) for matrix in (X_train, X_test))

In [None]:
# Values of one SVD column (index 8) for every training row, in row order.
# NOTE(review): no reason for picking column 8 is visible -- confirm it is intentional.
plt.plot(X_train_svd[:,8],'o')

In [None]:
# SVD features as a frame (one integer-named column per component) with the
# training label of each row in a 'class' column.
df = pd.DataFrame(X_train_svd).assign(**{'class': y_train})

In [None]:
# Group the SVD feature rows by class label for the scatter plot below.
# NOTE(review): this rebinds `groups`, which the load cell set to the per-row
# port_no group ids passed to GroupKFold.split. Re-running the grid-search
# setup cell after this one would hand it a DataFrameGroupBy instead; a
# distinct name (e.g. class_groups) would avoid the clash.
groups = df.groupby('class')

In [None]:
# Scatter of the first two SVD columns, one colour/legend entry per class.
fig, ax = plt.subplots()

# Group inside the cell rather than relying on the global `groups`, which is
# bound to different objects at different points of the notebook.
for name, group in df.groupby('class'):
    ax.plot(group.iloc[:,0], group.iloc[:,1], marker='o', linestyle='', ms=10, label=name, alpha=0.5)

# Axis limits and legend do not depend on the loop variable; set them once.
ax.set_xlim(-0.0001, 0.001)
ax.set_ylim(-0.0001, 0.001)
ax.set_xlabel('SVD component 0')
ax.set_ylabel('SVD component 1')
ax.legend(numpoints=1)
plt.show()

# Visual analysis of sample data

In [None]:
# Row sums over the stacked (unscaled, already TOL-filtered) train + test matrix.
data = X_total.sum(1)

In [None]:
# Keep the row sums whose absolute value is below 3000.
# NOTE(review): X_total is built from rows the load cell already restricted to
# |row sum| < TOL (300), so this cut-off should not exclude anything -- confirm
# whether it is still needed.
index = np.abs(data) < 3000
# presumably `data` is an np.matrix column, so data[index] is a row vector and
# .T turns it back into a column -- verify.
y = data[index].T

In [None]:
# Number of rows rejected by the mask above. The built-in sum() walked the mask
# element by element in Python (and, if `index` is an np.matrix, returned a
# 1x1 matrix); count_nonzero does it vectorised and returns a plain integer.
np.count_nonzero(~np.asarray(index))

In [None]:
# Line plot of the retained row sums, in row order.
plt.plot(y)

In [None]:
# Explained-variance ratio per SVD component, in component order.
plt.plot(svd.explained_variance_ratio_)