In [60]:
import pandas as pd
import numpy as np

from sklearn.svm import SVC
from sklearn.semi_supervised import LabelSpreading
from sklearn.semi_supervised import SelfTrainingClassifier

from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from mlxtend.classifier import EnsembleVoteClassifier

In [2]:
df = pd.read_pickle('data/chkp3.pkl')
df['category_id'] = df.category_id.astype(np.int16)
mask = pd.read_pickle('data/known_unknown_mask.pkl')

In [3]:
unknown_df = df.copy()

unknown_df.loc[~mask, 'category_id'] = -1
unknown_df.category_id.value_counts(dropna=False)

-1     5768
 24     684
 10     608
 22     241
 23     220
 26     217
 17     183
 1      166
 25     152
 28     102
 20      95
 27      78
 15      49
 19      20
 2       16
 29       5
 43       2
Name: category_id, dtype: int64

In [4]:
X_df = unknown_df.drop('category_id', axis=1)
y_df = unknown_df['category_id']
y_true_df = df.category_id

In [5]:
rs = RobustScaler(quantile_range=(5.0, 95.0))
rs.fit(X_df)
X_df = rs.transform(X_df)

In [6]:
y_df.head(5), y_true_df.head(5)

(video_id
 9wRQljFNDW8   -1
 Om_zGhJLZ5U   -1
 goP4Z5wyOlM   -1
 8NHA23f7LvU   -1
 IE-xepGLVt8   -1
 Name: category_id, dtype: int16,
 video_id
 9wRQljFNDW8    17
 Om_zGhJLZ5U     1
 goP4Z5wyOlM    25
 8NHA23f7LvU    24
 IE-xepGLVt8    28
 Name: category_id, dtype: int16)

In [7]:
# Initial acc
print(f'Accuracy: {accuracy_score(y_true_df, y_df)}')
# print(f'Balanced accuracy: {balanced_accuracy_score(y_true_df, y_df)}') - nie ma sensu bo różne klasy

Accuracy: 0.32976992795723914


In [10]:
unknown_df['category_id_truth'] = df.category_id
temp_df = unknown_df[unknown_df.category_id == -1]
X_unkwn, y_unkwn = rs.transform(temp_df.drop(['category_id', 'category_id_truth'], axis=1)), temp_df.category_id_truth

print(f'Accuracy: {accuracy_score(temp_df.category_id, y_unkwn)}')

Accuracy: 0.0


## LR - baseline

In [56]:
df_known = unknown_df[unknown_df.category_id != -1]
X_kwn, y_kwn = rs.transform(df_known.drop(['category_id', 'category_id_truth'], axis=1)), df_known.category_id_truth

In [59]:
clf = LogisticRegression(
    max_iter=10000,
    random_state=42).fit(X_kwn, y_kwn)

y_pred = clf.predict(X_unkwn)
print(f'Accuracy: {accuracy_score(y_unkwn, y_pred)}')
print(f'Balanced accuracy: {balanced_accuracy_score(y_unkwn, y_pred)}')

Accuracy: 0.5979542302357836
Balanced accuracy: 0.5065406947337426


## Semi supervised learning

In [45]:
ls = LabelSpreading(
    kernel = 'rbf',
    gamma = .5,
    max_iter = 100,
    tol = 1e-3,
    n_jobs = -1)
ls.fit(X_df, y_df)

LabelSpreading(gamma=0.5, max_iter=100, n_jobs=-1, tol=0.0001)

In [46]:
# y_pred_ls = ls.transduction_ # gives same result
y_pred_ls = ls.predict(X_df)
print(f'Accuracy: {accuracy_score(y_true_df, y_pred_ls)}')
print(f'Balanced accuracy: {balanced_accuracy_score(y_true_df, y_pred_ls)}')

Accuracy: 0.6892865442714385
Balanced accuracy: 0.5980189154235116
  probabilities /= normalizer


In [47]:
y_pred_ls = ls.predict(X_unkwn)
print(f'Accuracy: {accuracy_score(y_unkwn, y_pred_ls)}')
print(f'Balanced accuracy: {balanced_accuracy_score(y_unkwn, y_pred_ls)}')

Accuracy: 0.5364077669902912
Balanced accuracy: 0.38338951343413236
  probabilities /= normalizer


## Self training clf

Pytanie czy nie powinniśmy udawać, że nie mamy ground truth przy wyborze algorytmu i dopiero na sam koniec sprawdzić na ground truth (train test split - część zaetykietowanych do testowego, część do treningowego). Wtedy teoretycznie wybór opieramy tylko na danych które mamy, a ostatecznie zweryfikujemy jak się sprawdza na ground truth wybrany algorytm.

In [48]:
svc = SVC(kernel='rbf', gamma="auto", probability=True)
stc = SelfTrainingClassifier(svc, max_iter=None)
stc.fit(X_df, y_df)

SelfTrainingClassifier(base_estimator=SVC(gamma='auto', probability=True),
                       max_iter=None)

In [49]:
y_pred_stc = stc.predict(X_df)
print(f'Accuracy: {accuracy_score(y_true_df, y_pred_stc)}')
print(f'Balanced accuracy: {balanced_accuracy_score(y_true_df, y_pred_stc)}')

Accuracy: 0.5845921450151057
Balanced accuracy: 0.3712053971571936


In [54]:
y_pred_stc = stc.predict(X_unkwn)
print(f'Accuracy: {accuracy_score(y_unkwn, y_pred_stc)}')
print(f'Balanced accuracy: {balanced_accuracy_score(y_unkwn, y_pred_stc)}')

Accuracy: 0.5646671289875174
Balanced accuracy: 0.38396386490393025


In [None]:
stc.transduction_

## Voting clf
Przeczytać jakie są defaulty: http://rasbt.github.io/mlxtend/user_guide/classifier/EnsembleVoteClassifier/#example-5-using-pre-fitted-classifiers

In [65]:
eclf = EnsembleVoteClassifier(clfs=[clf, ls, stc], weights=[1,1,1], fit_base_estimators=False)
eclf.fit(X_df, y_df)



EnsembleVoteClassifier(clfs=[LogisticRegression(max_iter=10000,
                                                random_state=42),
                             LabelSpreading(gamma=0.5, max_iter=100, n_jobs=-1,
                                            tol=0.0001),
                             SelfTrainingClassifier(base_estimator=SVC(gamma='auto',
                                                                       probability=True),
                                                    max_iter=None)],
                       fit_base_estimators=False, use_clones=False,
                       weights=[1, 1, 1])

In [66]:
y_pred = eclf.predict(X_df)
print(f'Accuracy: {accuracy_score(y_true_df, y_pred)}')
print(f'Balanced accuracy: {balanced_accuracy_score(y_true_df, y_pred)}')

  probabilities /= normalizer
Accuracy: 0.6967232163606786
Balanced accuracy: 0.6130449010323875


In [67]:
y_pred = eclf.predict(X_unkwn)
print(f'Accuracy: {accuracy_score(y_unkwn, y_pred)}')
print(f'Balanced accuracy: {balanced_accuracy_score(y_unkwn, y_pred)}')

  probabilities /= normalizer
Accuracy: 0.6031553398058253
Balanced accuracy: 0.4550214354921379
