### KDD 2008 

This notebook runs the method described in:

C. Elkan and K. Noto, "Learning classifiers from only positive and unlabeled data." KDD 2008. Cited 637

**Method Discussion**: 
- X are the features

- y are the labels, Positive label: 1.0, Negative label: -1.0

- s is the indicator for labeled data: s=1 means labeled, s=0 means unlabeled

The method assumes that an untrained classifier 'estimator' is given; it is then trained using s=1 and s=0 as the labels.

Given 'estimator', the method predicts p(y=1|x).


**TODO**: 

[ ] Load the data using unified (aka CMS) framework. 




In [None]:
import sys
sys.path.append('../../')
from lightsaber import constants as C
from lightsaber.data_utils import utils as du
from lightsaber.data_utils import sk_dataloader as skd

from lightsaber.model_lib.pu_models.puAdapter import PUAdapter
from lightsaber import metrics

import numpy as np
import io
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline

## Load data 

In [None]:
# Data loading: the cohort configuration is kept inline as a YAML document.
_conf = """
tgt_file: /home/shared/expt_covid19/cohorts/severity_v1/SEVERITY_OUT_V1.csv
feat_file: /home/shared/expt_covid19/cohorts/severity_v1/SEVERITY_FEAT_V1_DEMO-COMORB.csv

idx_col: ["EID"]
tgt_col: ["SEVERITY"]
feat_cols: ["SEX", "BMI_LATEST", "AGE", 
    "has_MI", "has_STROKE" , "has_ISCH_STROKE", "has_ASTHMA", "has_RENAL",
    "has_COPD", "has_DEMENTIA", "has_NEURONE", "has_PARKINSON", "has_CANCER", "has_DM"]
"""

# Parse the YAML text with the loader class exposed by the data utilities.
conf = du.yaml.load(io.StringIO(_conf), Loader=du._Loader)
tgt_file, feat_file = conf['tgt_file'], conf['feat_file']

# Options handed to the data loader.
preprocessor = [MinMaxScaler()]
flatten = ['sum']
fill_value = 0.
category_map = {}

dataloader = skd.SKDataLoader(
    tgt_file,
    feat_file,
    idx_col=conf['idx_col'],
    tgt_col=conf['tgt_col'],
    feat_columns=conf['feat_cols'],
    category_map=category_map,
    fill_value=fill_value,
    flatten=flatten,
    preprocessor=preprocessor,
)

# refit=True is passed so that the preprocessors are fitted while reading
# (presumably on this data -- confirm against SKDataLoader.read_data).
X, y = dataloader.read_data(refit=True)


In [None]:
# Summary statistics of X, transposed so that each column of X becomes one row.
X.describe().T

In [None]:
# Relative frequency (normalize=True) of each value in the SEVERITY target column.
y.SEVERITY.value_counts(normalize=True)

In [None]:
# X and y come from the data-loading cell; replace them here once the data
# can be loaded through the CMS framework.

# The PU method expects negatives encoded as -1.0, so remap them in place
# when the project-wide negative label is 0.
if C.LABEL_NEGATIVE == 0:
    is_negative = (y == C.LABEL_NEGATIVE)
    y[is_negative] = -1.

# Base classifier to be wrapped by the PU adapter. Other scikit-learn
# estimators (e.g. SVC, RandomForest) can be substituted; Lasso(alpha=0.7)
# was an earlier candidate.
estimator = LogisticRegression(penalty='l2')


## Train the PU model

In [None]:
# Train the PU model.
#
# Input:
#     X: feature vectors
#     y: labels associated with each feature vector in X
#        (positive label: 1.0, negative label: -1.0)
#     estimator: any scikit-learn estimator that implements .fit and .predict_proba
#
# Output:
#     pu_estimator: estimator that complies with the scikit-learn estimator
#                   interface, i.e. it exposes .fit, .predict and .predict_proba

# hold_out_ratio: the hold-out set is used for estimating p(s=1|y=1).
pu_estimator = PUAdapter(estimator, hold_out_ratio=0.2)

# Use the same array for fitting and for the comparison below, so that the
# models are queried with exactly the representation they were trained on
# (predicting on the DataFrame after fitting on .values makes scikit-learn
# warn about feature names).
X_arr = X.values
pu_estimator.fit(X_arr, y.values, treshold=0.5)

print(pu_estimator)

# Predict once per model and reuse the results for both counts.
pu_pred = np.asarray(pu_estimator.predict(X_arr))
base_pred = np.asarray(estimator.predict(X_arr))

print("Comparison of estimator and PUAdapter(estimator):")
print("Number of disagreements: ", int(np.count_nonzero(pu_pred != base_pred)))
print("Number of agreements: ", int(np.count_nonzero(pu_pred == base_pred)))

## Prediction  

In [None]:
# Test data
# NOTE(review): X_test is the same object as X, the features the PU model was
# fitted on, so the predictions and metrics below are in-sample. Replace with
# held-out data when available.
X_test = X # None

In [None]:
# Predictions (y_hat) and probability estimates (y_prob) from the PU estimator
# on the test features; both are consumed by the evaluation cells.
y_hat = pu_estimator.predict(X_test)
y_prob = pu_estimator.predict_proba(X_test)

## Evaluate

In [None]:
# PU-specific F1 score of the predictions against y (project helper;
# presumably a variant of F1 adapted to positive/unlabeled data -- confirm in lightsaber.metrics).
metrics.f1_pu(y, y_hat)

In [None]:
# PU-specific accuracy of the predictions against y (project helper;
# exact definition lives in lightsaber.metrics -- TODO confirm).
metrics.accuracy_pu(y, y_hat)

In [None]:
# Metrics computed from the labels, the predictions and the probability
# estimates (project helper; the set of metrics returned is not visible here).
metrics.calculate_metrics(y, y_hat, y_prob)