In [2]:
import itertools as it
import os
import pickle
import re
from copy import deepcopy
from functools import reduce
from glob import glob
from operator import add, itemgetter
from pprint import pprint

import gensim
import gensim.downloader as api
import matplotlib
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from scipy.spatial.distance import pdist, squareform
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import nan_euclidean_distances
from sklearn.preprocessing import LabelEncoder, PowerTransformer, RobustScaler, Normalizer, StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.semi_supervised import LabelPropagation
import tqdm

from utils import video_id_without_categories



In [3]:
df = pd.read_pickle('data/chkp3.pkl')
df['category_id'] = df.category_id.astype(np.int16)
mask = pd.read_pickle('data/known_unknown_mask.pkl')

#### Wczytywanie danych

In [4]:
unknown_df = df.copy()

unknown_df.loc[~mask, 'category_id'] = -1
unknown_df.category_id.value_counts(dropna=False)

-1     5768
 24     684
 10     608
 22     241
 23     220
 26     217
 17     183
 1      166
 25     152
 28     102
 20      95
 27      78
 15      49
 19      20
 2       16
 29       5
 43       2
Name: category_id, dtype: int64

In [5]:
X_df = unknown_df.drop('category_id', axis=1)
y_df = unknown_df['category_id']
y_true_df = df.category_id

X_unknown_only = X_df.loc[~mask]
y_unknown_true = y_true_df.loc[~mask]

### Uczenie pół-nadzorowane

In [6]:
scalers = [RobustScaler(quantile_range=(5, 95)), StandardScaler(), Normalizer()]
classifiers = [LabelPropagation(kernel='rbf', max_iter=10000), LabelPropagation(kernel='knn', max_iter=10000)]

from itertools import product

pipelines = [make_pipeline(*steps) for steps in product(scalers, classifiers)]
for pipeline in pipelines:
    print(pipeline)

Pipeline(steps=[('robustscaler', RobustScaler(quantile_range=(5, 95))),
                ('labelpropagation', LabelPropagation(max_iter=10000))])
Pipeline(steps=[('robustscaler', RobustScaler(quantile_range=(5, 95))),
                ('labelpropagation',
                 LabelPropagation(kernel='knn', max_iter=10000))])
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('labelpropagation', LabelPropagation(max_iter=10000))])
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('labelpropagation',
                 LabelPropagation(kernel='knn', max_iter=10000))])
Pipeline(steps=[('normalizer', Normalizer()),
                ('labelpropagation', LabelPropagation(max_iter=10000))])
Pipeline(steps=[('normalizer', Normalizer()),
                ('labelpropagation',
                 LabelPropagation(kernel='knn', max_iter=10000))])


##### Trenowanie

In [7]:
from collections import defaultdict
import joblib
from pathlib import Path

clf_dir = Path('data/chkp3')
clf_dir.mkdir(exist_ok=True, parents=True)

results = defaultdict(list)

for pipeline in tqdm.tqdm(pipelines):
    name_hash = joblib.hash(str(pipeline)) + ".pkl"
    clf_path = clf_dir/ name_hash
    if clf_path.exists():
        print(f"Loading from {clf_path}")
        pipeline = joblib.load(str(clf_path))
    else:
        print(f"Saving to {clf_path}")
        pipeline.fit(X_df, y_df)
        joblib.dump(pipeline, str(clf_path))
    
    y_hat = pipeline.predict(X_df)
    y_unknown_only = pipeline.predict(X_unknown_only)
    results['name'].append(str(pipeline))
    results['accuracy'].append(accuracy_score(y_true_df, y_hat))
    results['balanced_accuracy'].append(balanced_accuracy_score(y_true_df, y_hat))
    
    results['unknown_accuracy'].append(accuracy_score(y_unknown_true, y_unknown_only))
    results['unknown_balanced_accuracy'].append(balanced_accuracy_score(y_unknown_true, y_unknown_only))
    
results = pd.DataFrame(results)

  0%|          | 0/6 [00:00<?, ?it/s]

Loading from data/chkp3/1007a4906619c10d7ab772fab547266c.pkl


 17%|█▋        | 1/6 [00:03<00:18,  3.68s/it]

Loading from data/chkp3/8ccd302550829a1d547381dbf4071dd6.pkl


 33%|███▎      | 2/6 [00:06<00:12,  3.13s/it]

Loading from data/chkp3/cca4953970b2f6d864b2de6ee03b3735.pkl


 50%|█████     | 3/6 [00:09<00:09,  3.11s/it]

Loading from data/chkp3/933388cebbc0d6fa2f772a49c0479333.pkl


 67%|██████▋   | 4/6 [00:12<00:06,  3.04s/it]

Loading from data/chkp3/2b60b5c8956383f47a852d3c8bb5240e.pkl


 83%|████████▎ | 5/6 [00:15<00:02,  2.97s/it]

Loading from data/chkp3/0e4db668d9ed12e2efe7d436f98d4e1e.pkl


100%|██████████| 6/6 [00:18<00:00,  3.01s/it]


In [8]:
results.sort_values('accuracy', ascending=False)

Unnamed: 0,name,accuracy,balanced_accuracy,unknown_accuracy,unknown_balanced_accuracy
4,"Pipeline(steps=[('normalizer', Normalizer()),\...",0.217058,0.066286,0.204577,0.06995
5,"Pipeline(steps=[('normalizer', Normalizer()),\...",0.096212,0.076124,0.066748,0.072164
3,"Pipeline(steps=[('standardscaler', StandardSca...",0.08436,0.078655,0.07611,0.081204
1,"Pipeline(steps=[('robustscaler', RobustScaler(...",0.078782,0.07516,0.066054,0.07423
0,"Pipeline(steps=[('robustscaler', RobustScaler(...",0.05287,0.0625,0.050104,0.066667
2,"Pipeline(steps=[('standardscaler', StandardSca...",0.05287,0.0625,0.050104,0.066667


## R

In [56]:
nr = Normalizer()
r_df = X_df.copy()
r_df[:] = nr.fit_transform(X_df)
r_df['Class'] = y_df
y_mask = y_df >= 24
# r_df.loc[y_mask, 'Class'] = 1
# r_df.loc[~y_mask, 'Class'] = 0
r_df.loc[~mask, 'Class'] = np.nan

r_df.to_csv('data/R_chkp4.csv', index=False)
r_df.head()

Unnamed: 0_level_0,views,likes,dislikes,comment_count,comments_disabled,ratings_disabled,video_error_or_removed,publish_time_day,publish_time_month,publish_time_year,...,objects_6,objects_7,objects_8,objects_9,objects_10,objects_11,objects_12,objects_13,objects_14,Class
video_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9wRQljFNDW8,2.346992e-06,1.889084e-08,7.210245e-10,5.104853e-09,0.0,0.0,0.0,3.749327e-10,3.172508e-10,5.817226e-08,...,1.746708e-14,-5.135288e-14,-1.284299e-14,-3.148085e-14,4.801021e-14,1.304634e-14,-2.329582e-14,-2.518598e-14,-9.487018e-15,
Om_zGhJLZ5U,1.351123e-05,3.514337e-07,3.703732e-08,9.871943e-08,0.0,0.0,0.0,5.611715e-10,5.144072e-10,9.432358e-08,...,2.757011e-14,-8.452427e-14,-2.176896e-14,-5.008287e-14,8.045961e-14,1.992135e-14,-3.910513e-14,-4.028449e-14,-1.628317e-14,
goP4Z5wyOlM,2.089504e-07,1.850128e-09,1.561797e-10,2.480854e-09,0.0,0.0,0.0,7.208292e-11,6.607601e-11,1.211594e-08,...,4.212454e-14,-2.768126e-14,-2.883515e-14,-1.806023e-14,-3.469699e-14,-5.76601e-15,-2.938299e-14,-3.168926e-14,-2.561524e-16,
8NHA23f7LvU,4.889397e-05,5.266422e-07,7.71206e-09,1.977032e-08,0.0,0.0,0.0,3.267822e-10,3.594604e-10,6.591197e-08,...,1.937792e-14,-5.894161e-14,-1.546653e-14,-3.510396e-14,5.599271e-14,1.388852e-14,-2.722585e-14,-2.806887e-14,-1.121339e-14,
IE-xepGLVt8,4.273401e-07,3.414298e-09,1.332585e-10,4.682054e-10,0.0,0.0,0.0,1.80079e-11,1.980869e-11,3.632193e-09,...,-3.22248e-13,-1.858249e-13,-5.547759e-14,-3.403553e-14,-9.862859e-15,2.622265e-14,7.152344e-14,-2.195817e-15,-5.685228e-15,


In [57]:
r_df['Class'].value_counts(dropna=False)

NaN     5768
24.0     684
10.0     608
22.0     241
23.0     220
26.0     217
17.0     183
1.0      166
25.0     152
28.0     102
20.0      95
27.0      78
15.0      49
19.0      20
2.0       16
29.0       5
43.0       2
Name: Class, dtype: int64

In [None]:
X_df = unknown_df.drop('category_id', axis=1)
y_df = unknown_df['category_id']
y_true_df = df.category_id

X_unknown_only = X_df.loc[~mask]
y_unknown_true = y_true_df.loc[~mask]

In [9]:
from rpy2.robjects.packages import importr, isinstalled
from rpy2.robjects.vectors import StrVector

In [10]:
utils = importr('utils')
utils.chooseCRANmirror(ind=1)

<rpy2.rinterface_lib.sexp.NULLType object at 0x7feadf9bb440> [RTYPES.NILSXP]

In [11]:
packnames = ['RSSL']

# Selectively install what needs to be install.
# We are fancy, just because we can.
names_to_install = [x for x in packnames if not isinstalled(x)]
print(names_to_install)
if len(names_to_install) > 0:
    utils.install_packages(StrVector(names_to_install))

R[write to console]: Installing package into ‘/home/hylomorph/R/x86_64-pc-linux-gnu-library/3.6’
(as ‘lib’ is unspecified)



['RSSL']


R[write to console]: also installing the dependencies ‘digest’, ‘ggplot2’


R[write to console]: trying URL 'https://cloud.r-project.org/src/contrib/digest_0.6.27.tar.gz'

R[write to console]: Content type 'application/x-gzip'
R[write to console]:  length 164373 bytes (160 KB)

R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write 

In [14]:
rssl = importr('RSSL')

In [47]:
ro.globalenv

<rpy2.robjects.environments.Environment object at 0x7feadf9afc40> [RTYPES.ENVSXP]
R classes: ('environment',)
n items: 0

In [55]:
# rssl.S4VM(X_df, y_df)
import rpy2.robjects as ro
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri, numpy2ri, FactorVector, StrVector

from rpy2.robjects.conversion import localconverter

X_known_df = X_df[mask]
X_unknown_df = X_df[~mask]
y_known_df = y_df[mask]
y_mask = y_known_df >= 24
y_known_df[~mask] = 1
y_known_df[mask] = 0


with localconverter(ro.default_converter + pandas2ri.converter + numpy2ri.converter):
    rX_df = ro.conversion.py2rpy(X_known_df.values)
    rX_unknow = ro.conversion.py2rpy(X_unknown_df.values)
    ry_df = ro.conversion.py2rpy(y_known_df.values)
    
    rssl.S4VM(X=rX_df, y=ro.r['as.factor'](ry_df), X_u=rX_unknow)

R[write to console]: Error in PreProcessing(X = X, y = y, X_u = X_u, scale = scale, intercept = FALSE,  : 
  No valid input for X, y and X_u.



RRuntimeError: Error in PreProcessing(X = X, y = y, X_u = X_u, scale = scale, intercept = FALSE,  : 
  No valid input for X, y and X_u.
