In [31]:
import pandas as pd
import numpy as np

from sklearn.svm import SVC
from sklearn.semi_supervised import LabelSpreading
from sklearn.semi_supervised import SelfTrainingClassifier

from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.ensemble import VotingClassifier

In [2]:
df = pd.read_pickle('data/chkp3.pkl')
df['category_id'] = df.category_id.astype(np.int16)
mask = pd.read_pickle('data/known_unknown_mask.pkl')

In [3]:
unknown_df = df.copy()

unknown_df.loc[~mask, 'category_id'] = -1
unknown_df.category_id.value_counts(dropna=False)

-1     5768
 24     684
 10     608
 22     241
 23     220
 26     217
 17     183
 1      166
 25     152
 28     102
 20      95
 27      78
 15      49
 19      20
 2       16
 29       5
 43       2
Name: category_id, dtype: int64

In [4]:
X_df = unknown_df.drop('category_id', axis=1)
y_df = unknown_df['category_id']
y_true_df = df.category_id

In [5]:
rs = RobustScaler(quantile_range=(5.0, 95.0))
rs.fit(X_df)
X_df = rs.transform(X_df)

In [6]:
y_df.head(5), y_true_df.head(5)

(video_id
 9wRQljFNDW8   -1
 Om_zGhJLZ5U   -1
 goP4Z5wyOlM   -1
 8NHA23f7LvU   -1
 IE-xepGLVt8   -1
 Name: category_id, dtype: int16,
 video_id
 9wRQljFNDW8    17
 Om_zGhJLZ5U     1
 goP4Z5wyOlM    25
 8NHA23f7LvU    24
 IE-xepGLVt8    28
 Name: category_id, dtype: int16)

In [7]:
# Initial acc
print(f'Accuracy: {accuracy_score(y_true_df, y_df)}')
# print(f'Balanced accuracy: {balanced_accuracy_score(y_true_df, y_df)}') - nie ma sensu bo różne klasy

Accuracy: 0.32976992795723914


## Semi supervised learning

In [8]:
ls = LabelSpreading()
ls.fit(X_df, y_df)

LabelSpreading()

In [28]:
# y_pred_ls = ls.transduction_ # gives same result
y_pred_ls = ls.predict(X_df)
print(f'Accuracy: {accuracy_score(y_true_df, y_pred_ls)}')
print(f'Balanced accuracy: {balanced_accuracy_score(y_true_df, y_pred_ls)}')

Accuracy: 0.6429235417150825
Balanced accuracy: 0.5884771055367881
  probabilities /= normalizer


#### Only on unknown data

In [11]:
unknown_df['category_id_truth'] = df.category_id
temp_df = unknown_df[unknown_df.category_id == -1]
X, y = rs.transform(temp_df.drop(['category_id', 'category_id_truth'], axis=1)), temp_df.category_id_truth

In [12]:
y_pred = ls.predict(X)
print(f'Accuracy: {accuracy_score(y, y_pred)}')
print(f'Balanced accuracy: {balanced_accuracy_score(y, y_pred)}')

Accuracy: 0.4672330097087379
Balanced accuracy: 0.3678835128191864
  probabilities /= normalizer


## Self training clf

In [26]:
svc = SVC(kernel='rbf', gamma="auto", probability=True)
stc = SelfTrainingClassifier(svc, max_iter=None)
stc.fit(X_df, y_df)

SelfTrainingClassifier(base_estimator=SVC(gamma='auto', probability=True),
                       max_iter=None)

In [27]:
y_pred_stc = stc.predict(X_df)
print(f'Accuracy: {accuracy_score(y_true_df, y_pred_stc)}')
print(f'Balanced accuracy: {balanced_accuracy_score(y_true_df, y_pred_stc)}')

Accuracy: 0.5844759470137114
Balanced accuracy: 0.37009264788256313


In [29]:
stc.transduction_

array([17, -1, -1, ..., 10, 24, 10], dtype=int16)

## Voting clf

In [34]:
vclf = VotingClassifier(estimators=[
    ('ls', LabelSpreading()),
    ('stc_svc', SelfTrainingClassifier(SVC(kernel='rbf', gamma="auto", probability=True), max_iter=None))
], voting='hard', weights = [2,1])

vclf.fit(X_df, y_df)



VotingClassifier(estimators=[('ls', LabelSpreading()),
                             ('stc_svc',
                              SelfTrainingClassifier(base_estimator=SVC(gamma='auto',
                                                                        probability=True),
                                                     max_iter=None))],
                 weights=[2, 1])

In [35]:
y_pred = vclf.predict(X_df)
print(f'Accuracy: {accuracy_score(y_true_df, y_pred)}')
print(f'Balanced accuracy: {balanced_accuracy_score(y_true_df, y_pred)}')

Accuracy: 0.32976992795723914
Balanced accuracy: 0.35394505374197893
