[Reference](https://towardsdatascience.com/does-semi-supervised-learning-help-to-train-better-models-338283d1f4e9)

# Assessment of semi-supervised learning

In [1]:
import pandas as pd
pd.set_option('display.max_rows', None)

from sklearn.metrics import f1_score
import xgboost as xgb
import sys

In [3]:
!git clone https://github.com/ReinhardSellmair/ssl.git

Cloning into 'ssl'...
remote: Enumerating objects: 17, done.[K
remote: Counting objects: 100% (17/17), done.[K
remote: Compressing objects: 100% (15/15), done.[K
remote: Total 17 (delta 0), reused 13 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (17/17), 36.51 KiB | 890.00 KiB/s, done.


In [4]:
# Enter the cloned repository's source directory. The path is relative, so this
# assumes the working directory is still the one the clone was made in.
%cd ssl/src

/content/ssl/src


In [5]:
# List the current directory to confirm the project modules are present.
!ls

config.py  defs.py  ml.py  pre_processing.py  tuning.py  visualizing.py


In [7]:
!pip install pympler

Collecting pympler
  Downloading Pympler-1.1-py3-none-any.whl.metadata (3.6 kB)
Downloading Pympler-1.1-py3-none-any.whl (165 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/165.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/165.8 kB[0m [31m3.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m165.8/165.8 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pympler
Successfully installed pympler-1.1


In [8]:
from pre_processing import assign_data_set, ohe_features
from ml import BaselineClf, SelfTrainingClf, LabelPropagationClf, LabelSpreadingClf
from visualizing import plot_scores
from defs import VAL_SCORE, TEST_SCORE
from tuning import tune_param, get_best_score

In [9]:
# URL of the diabetes prediction data set (CSV)
DATA_FILE = 'https://raw.githubusercontent.com/Branden-Kang/Python-practice/master/Data/diabetes_prediction_dataset.csv'

# columns to be one-hot encoded
ENCODE_COLS = ['gender', 'smoking_history']
# column to be predicted
TARGET_COL = 'diabetes'

# classifier evaluation metric
SCORE_FCN = f1_score

# classifier class passed to BaselineClf, SelfTrainingClf and LabelPropagationClf
CLF = xgb.XGBClassifier
# label sizes to evaluate; each value is passed to fit_score / tune_param
# NOTE(review): presumably the number of labeled training samples; 80000 matches
# the row count of the TRAIN split, so no unlabeled samples would remain - confirm.
LABEL_SIZES = [20, 30, 100, 200, 300, 500, 1000, 3000, 10000, 30000, 80000]

# Data

In [10]:
# read the data set from DATA_FILE and display its number of rows
df = pd.read_csv(DATA_FILE)
df.shape[0]

100000

In [11]:
# split into train, val, test sets; the returned frame carries the split label
# in its index
split_df = assign_data_set(df)
# show how many rows ended up in each split
split_df.index.value_counts()

Unnamed: 0_level_0,count
data_set,Unnamed: 1_level_1
TRAIN,80000
VAL,10000
TEST,10000


In [13]:
# one-hot encode the columns listed in ENCODE_COLS
feature_df = ohe_features(split_df, ENCODE_COLS)
# preview the first rows of the encoded feature frame
feature_df.head()

Unnamed: 0_level_0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes,gender_Female,gender_Male,gender_Other,smoking_history_No Info,smoking_history_current,smoking_history_ever,smoking_history_former,smoking_history_never,smoking_history_not current
data_set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
TRAIN,52.0,0,0,27.32,4.8,140,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
TRAIN,56.0,0,0,27.32,4.8,100,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
TRAIN,22.0,0,0,37.16,6.6,85,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
TRAIN,49.0,0,0,43.83,5.0,160,0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
TRAIN,10.0,0,0,14.18,4.0,155,0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


# Baseline

In [14]:
# Fit and score a fresh baseline classifier for every label size; each
# fit_score call contributes one record to the results frame.
baseline_scores = [
    BaselineClf(feature_df, CLF, SCORE_FCN, TARGET_COL).fit_score(label_size)
    for label_size in LABEL_SIZES
]

# collect the records into a frame and display it
baseline_scores_df = pd.DataFrame(baseline_scores)
baseline_scores_df

Unnamed: 0,model,label_size,train_score,val_score,test_score,fit_time,score_time,model_size
0,Baseline,20,0.0,0.0,0.0,0.243603,0.081586,3728
1,Baseline,30,0.5,0.280603,0.263405,0.04576,0.087283,3728
2,Baseline,100,0.923077,0.360544,0.393258,0.050009,0.102419,3728
3,Baseline,200,1.0,0.591156,0.599848,0.066882,0.090862,3728
4,Baseline,300,1.0,0.671598,0.691445,0.055005,0.101227,3728
5,Baseline,500,1.0,0.690858,0.710317,0.060452,0.112659,3728
6,Baseline,1000,1.0,0.793903,0.796262,0.068644,0.121505,3728
7,Baseline,3000,0.998088,0.78836,0.803069,0.093714,0.12852,3728
8,Baseline,10000,0.953077,0.777994,0.815424,0.180466,0.143907,3728
9,Baseline,30000,0.875854,0.801868,0.819462,0.305032,0.181546,3728


In [15]:
# plot the baseline validation score over label size
# (presumably one trace per value of the 'model' column - confirm in visualizing.py)
plot_scores(baseline_scores_df, 'label_size', VAL_SCORE, 'model').show()

# Self-Training

In [16]:
# Fit and score a self-training classifier, constructed without any extra
# parameters, for every label size.
st_scores = [
    SelfTrainingClf(feature_df, CLF, SCORE_FCN, TARGET_COL).fit_score(label_size)
    for label_size in LABEL_SIZES
]

# collect the records into a frame
st_base_scores_df = pd.DataFrame(st_scores)


y contains no unlabeled samples



# Tune Threshold

In [17]:
# grid of self-training threshold values to try
param_tune = dict(threshold=[0.5, 0.7, 0.9, 0.97, 0.99, 0.997, 0.999])

# run the tuning for every label size
# (n_workers=-1 presumably means "use all available workers" - confirm in tuning.py)
st_tune_scores_df = tune_param(
    SelfTrainingClf,
    CLF,
    feature_df,
    SCORE_FCN,
    TARGET_COL,
    param_tune,
    LABEL_SIZES,
    n_workers=-1,
)

Tuning threshold


In [18]:
# plot the validation score over the tuned threshold for the label sizes
# (the 'reverselog' x-axis type is defined by plot_scores - see visualizing.py)
plot_scores(st_tune_scores_df, 'threshold', VAL_SCORE, 'label_size', xaxis_type='reverselog').show()

In [19]:
# get best parameter of each training size
st_best_scores_df = get_best_score(st_tune_scores_df)
# display the selected rows
st_best_scores_df

Unnamed: 0,model,label_size,threshold,criterion,k_best,train_score,val_score,test_score,fit_time,score_time,model_size,param_tuned
0,SelfTraining,20,0.5,threshold,10,0.0,0.0,0.0,0.892158,0.124866,1286112,threshold
1,SelfTraining,30,0.5,threshold,10,0.5,0.28222,0.266667,1.392822,0.140951,1286112,threshold
2,SelfTraining,100,0.99,threshold,10,0.833333,0.446937,0.464696,11.623424,0.123557,1286016,threshold
3,SelfTraining,200,0.999,threshold,10,0.965517,0.695718,0.715623,8.581262,0.126757,1286016,threshold
4,SelfTraining,300,0.99,threshold,10,0.96,0.719234,0.737288,13.743758,0.178435,1286016,threshold
5,SelfTraining,500,0.99,threshold,10,0.948718,0.772054,0.796496,12.615437,0.254621,1286016,threshold
6,SelfTraining,1000,0.97,threshold,10,0.938547,0.805517,0.820886,14.967106,0.160078,1286016,threshold
7,SelfTraining,3000,0.9,threshold,10,0.873118,0.803347,0.823608,14.434843,0.347498,1286016,threshold
8,SelfTraining,10000,0.7,threshold,10,0.875082,0.798662,0.821244,15.817291,0.203469,1286016,threshold
9,SelfTraining,30000,0.5,threshold,10,0.842416,0.808136,0.829649,1.923643,0.298438,1286112,threshold


In [20]:
# Compare test scores of the baseline, self-training with default settings and
# self-training with the tuned threshold; the model column is relabelled so the
# two self-training variants can be told apart.
combine_df = pd.concat(
    [
        baseline_scores_df,
        st_base_scores_df.assign(model='ST Default'),
        st_best_scores_df.assign(model='ST Threshold Tuned'),
    ]
)

plot_scores(combine_df, 'label_size', TEST_SCORE, 'model', xaxis_type='log').show()

# Tune k_best

In [21]:
# switch the self-training criterion to k_best and try a grid of k_best values
param_tune = dict(criterion=['k_best'], k_best=[1, 3, 10, 30, 100, 300, 1000])

# run the tuning for every label size
st_tune_kb_scores_df = tune_param(
    SelfTrainingClf,
    CLF,
    feature_df,
    SCORE_FCN,
    TARGET_COL,
    param_tune,
    LABEL_SIZES,
    n_workers=-1,
)

Tuning k_best


In [23]:
# Plot the test score over k_best for the label sizes. The score column is
# referenced through the imported TEST_SCORE constant, as in the other
# test-score plots, instead of a hard-coded column name.
plot_scores(st_tune_kb_scores_df, 'k_best', TEST_SCORE, 'label_size').show()

In [24]:
# select the best k_best result from the tuning runs
# (selection rule is defined by get_best_score - see tuning.py)
st_kb_best_df = get_best_score(st_tune_kb_scores_df)

In [25]:
# Compare test scores of the baseline, self-training with default settings,
# self-training with the tuned threshold and self-training with tuned k_best.
combine_df = pd.concat(
    [
        baseline_scores_df,
        st_base_scores_df.assign(model='ST Default'),
        st_best_scores_df.assign(model='ST Thres Tuned'),
        st_kb_best_df.assign(model='ST KB Tuned'),
    ]
)

plot_scores(combine_df, 'label_size', TEST_SCORE, 'model', xaxis_type='log').show()

# Label Propagation

In [26]:
# Fit and score a label-propagation classifier, constructed without any extra
# parameters, for every label size.
lp_base_res = [
    LabelPropagationClf(feature_df, CLF, SCORE_FCN, TARGET_COL).fit_score(label_size)
    for label_size in LABEL_SIZES
]
# collect the records into a frame
lp_base_df = pd.DataFrame(lp_base_res)


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide



In [27]:
# stack baseline and label-propagation results and plot their test scores over label size
plot_scores(pd.concat([baseline_scores_df, lp_base_df]), 'label_size', TEST_SCORE, 'model').show()

In [None]:
# Keep the label size fixed and vary rbf_size to see how it affects the
# recorded fit results.
label_size = 30000
lp_time_res = [
    LabelPropagationClf(feature_df, CLF, SCORE_FCN, TARGET_COL, rbf_size=rbf_size).fit_score(label_size)
    for rbf_size in [100, 300, 1000, 3000, 10000, 30000]
]

# collect the records into a frame and display it
lp_time_df = pd.DataFrame(lp_time_res)
lp_time_df


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide



In [None]:
# plot the fit time over rbf_size
fig = plot_scores(lp_time_df, 'rbf_size', 'fit_time', 'model')
# set y-axis to log scale
fig.update_yaxes(type='log')
# render the adjusted figure
fig.show()

# Tune RBF kernel

In [None]:
# parameter grid for label propagation with the rbf kernel: gamma and rbf_size values to try
param_tune = dict(kernel=['rbf'], gamma=[0.01, 0.03, 0.1, 0.3, 1, 3], rbf_size=[100, 300, 1000, 3000, 10000])
# NOTE(review): this call uses a single worker, unlike the other tune_param
# calls (n_workers=-1) - presumably to limit resource use; confirm.
lp_rbf_tune_df = tune_param(LabelPropagationClf, CLF, feature_df, SCORE_FCN, TARGET_COL, param_tune,
                            LABEL_SIZES, n_workers=1)

In [None]:
# keep the rows whose param_tuned column equals "gamma" and plot their test score over gamma
plot_df = lp_rbf_tune_df.loc[lp_rbf_tune_df['param_tuned'] == 'gamma']
plot_scores(plot_df, 'gamma', TEST_SCORE, 'label_size').show()

In [None]:
# keep the rows whose param_tuned column equals "rbf_size" and plot their test score over rbf_size
plot_df = lp_rbf_tune_df.loc[lp_rbf_tune_df['param_tuned'] == 'rbf_size']
plot_scores(plot_df, 'rbf_size', TEST_SCORE, 'label_size').show()

In [None]:
# keep the rows whose param_tuned column equals "rbf_size" and plot the recorded model size over rbf_size
plot_df = lp_rbf_tune_df.loc[lp_rbf_tune_df['param_tuned'] == 'rbf_size']
plot_scores(plot_df, 'rbf_size', 'model_size', 'label_size').show()

In [None]:
# get best parameter of each training size
# (selection rule is defined by get_best_score - see tuning.py)
lp_rbf_best_df = get_best_score(lp_rbf_tune_df)

# Tune KNN kernel

In [None]:
# parameter grid for label propagation with the knn kernel: n_neighbors values to try
param_tune = dict(kernel=['knn'], n_neighbors=[1, 3, 10, 30, 100, 300])
# run the tuning for every label size
lp_knn_tune_df = tune_param(LabelPropagationClf, CLF, feature_df, SCORE_FCN, TARGET_COL, param_tune,
                            LABEL_SIZES, n_workers=-1)

In [None]:
# plot the test score over n_neighbors for the label sizes
plot_scores(lp_knn_tune_df, 'n_neighbors', TEST_SCORE, 'label_size').show()

In [None]:
# select the best n_neighbors result from the tuning runs
# (selection rule is defined by get_best_score - see tuning.py)
lp_knn_best_df = get_best_score(lp_knn_tune_df)

In [None]:
# Compare test scores of the baseline with label propagation: default settings,
# tuned rbf kernel and tuned knn kernel. The model column is relabelled so the
# three label-propagation variants can be told apart.
combine_df = pd.concat(
    [
        baseline_scores_df,
        lp_base_df.assign(model='LP Base'),
        lp_rbf_best_df.assign(model='LP RBF Tuned'),
        lp_knn_best_df.assign(model='LP KNN Tuned'),
    ]
)
plot_scores(combine_df, 'label_size', TEST_SCORE, 'model').show()