In [3]:
import joblib

In [2]:
import numpy as np
import os
#from sklearn.externals import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator,ClassifierMixin
from sklearn.exceptions import NotFittedError


In [None]:
# Directories holding the persisted model and the PU dataset arrays.
# NOTE(review): 'mdoel' looks like a typo for 'model'; it is kept unchanged
# here because renaming would change where artifacts are read/written --
# confirm against the actual directory on disk.
model_path = 'mdoel'
data_set_path = 'dataset'

# Locations of the feature matrix, the label vector and the trained model.
# These are built from `data_set_path` (the name actually defined above);
# the previous code referenced an undefined `dataset_path` and raised
# NameError as soon as the cell was executed.
PU_DATA_TEXT_SAVE_PATH = os.path.join(data_set_path, "PU_text.npy")
PU_DATA_LABEL_SAVE_PATH = os.path.join(data_set_path, "PU_label.npy")
PU_MODEL_SAVE_PATH = os.path.join(model_path, "pu_model.bin")
class ElkanotoPuClassifier(BaseEstimator,ClassifierMixin):
    """Positive-unlabeled (PU) classifier built on top of a probabilistic estimator.

    The wrapped estimator is trained to separate labeled positives (label 1)
    from unlabeled samples (label 0). A held-out slice of the positives is
    then used to estimate the constant ``c`` (reported by ``__str__`` as
    ``p(s=1|y=1,x)``), and predicted probabilities are divided by ``c``.

    Attributes:
        estimator: The wrapped classifier; it must provide ``fit`` and
            ``predict_proba`` with the positive class in column 1.
        c: Mean positive-class probability on the positive hold-out set
            (1.0 until ``fit`` is called).
        hold_out_ratio: Fraction of each input set that is held out.
        estimator_fitted: Whether ``fit`` has completed.
    """

    def __init__(self,estimator,hold_out_ratio=0.1):
        self.estimator = estimator
        self.c = 1.0
        self.hold_out_ratio = hold_out_ratio
        self.estimator_fitted = False

    def __str__(self):
        return 'Estimator : {}\np(s=1|y=1,x)~={}\nFitted:{}'.format(
            self.estimator,
            self.c,
            self.estimator_fitted)

    def split_hold_out(self,data):
        """Shuffle ``data`` along its first axis and split off a hold-out part.

        Args:
            data: Array whose first axis indexes samples.

        Returns:
            ``(hold_out_part, rest_part)`` where the hold-out part contains
            ``ceil(n_samples * hold_out_ratio)`` rows.
        """
        # np.random.permutation returns a shuffled copy; the result must be
        # kept, otherwise the split is always the leading rows of the input.
        shuffled = np.random.permutation(data)
        hold_out_size = int(np.ceil(shuffled.shape[0] * self.hold_out_ratio))
        hold_out_part = shuffled[:hold_out_size]
        rest_part = shuffled[hold_out_size:]

        return hold_out_part,rest_part

    def fit(self,pos,unlabeled):
        """Fit the wrapped estimator and estimate the scaling constant ``c``.

        Args:
            pos: Array of labeled positive samples.
            unlabeled: Array of unlabeled samples.

        Returns:
            self

        Raises:
            ValueError: If the positive hold-out set is empty, in which case
                ``c`` cannot be estimated.
        """
        pos_hold_out,pos_rest = self.split_hold_out(pos)
        # The unlabeled hold-out slice is not used for estimating c; it is
        # simply left out of the training data, as before.
        _,unlabeled_rest = self.split_hold_out(unlabeled)

        if pos_hold_out.shape[0] == 0:
            raise ValueError(
                'Positive hold-out set is empty; provide positive samples '
                'and a hold_out_ratio greater than 0')

        all_rest = np.concatenate([pos_rest,unlabeled_rest],axis=0)
        # Positives are labeled 1 and unlabeled samples 0. The builtin int is
        # used because the np.int alias no longer exists in current NumPy.
        all_rest_label = np.concatenate([
            np.ones(pos_rest.shape[0],dtype=int),
            np.zeros(unlabeled_rest.shape[0],dtype=int),
        ])
        self.estimator.fit(all_rest,all_rest_label)

        # scikit-learn estimators expose predict_proba; column 1 holds the
        # probability of the positive (label 1) class.
        hold_out_predictions = self.estimator.predict_proba(pos_hold_out)[:,1]
        self.c = np.mean(hold_out_predictions)
        self.estimator_fitted = True
        return self

    def predict_prob(self,X):
        """Return the positive-class probability for each row of ``X``, scaled by ``1/c``.

        Raises:
            NotFittedError: If ``fit`` has not been called.
        """
        if not self.estimator_fitted:
            raise NotFittedError(
                'The estimator must fitted before calling predict_prob()')
        probabilistic_predictions = self.estimator.predict_proba(X)[:,1]
        return probabilistic_predictions / self.c

    def predict(self,X,threshold=0.5):
        """Return 1.0 where the scaled probability exceeds ``threshold``, else -1.0.

        Raises:
            NotFittedError: If ``fit`` has not been called.
        """
        if not self.estimator_fitted:
            raise NotFittedError(
                'The estimator must fitted before calling predict()')
        return np.where(self.predict_prob(X) > threshold,1.0,-1.0)
def train_pu_model():
    """Train the PU classifier on the saved dataset and persist it with joblib.

    Loads the feature matrix and label vector from ``PU_DATA_TEXT_SAVE_PATH``
    and ``PU_DATA_LABEL_SAVE_PATH``, treats rows with label 1 as positives and
    rows with label 0 as unlabeled, fits an ``ElkanotoPuClassifier`` wrapping a
    random forest, and dumps the fitted classifier to ``PU_MODEL_SAVE_PATH``.
    """
    estimator = RandomForestClassifier(
        n_estimators=100,
        criterion='gini',
        bootstrap=True,
        n_jobs=1,
    )
    pu_classifier = ElkanotoPuClassifier(estimator,hold_out_ratio=0.1)

    X = np.load(PU_DATA_TEXT_SAVE_PATH)
    y = np.load(PU_DATA_LABEL_SAVE_PATH)

    # Only the feature rows are passed to fit(); it builds its own 1/0
    # training labels from the two groups.
    X_positive = X[y == 1]
    X_unlabel = X[y == 0]
    pu_classifier.fit(X_positive,X_unlabel)

    # Make sure the target directory exists so a finished training run is
    # not lost to a missing folder when dumping the model.
    model_dir = os.path.dirname(PU_MODEL_SAVE_PATH)
    if model_dir:
        os.makedirs(model_dir,exist_ok=True)
    joblib.dump(pu_classifier,PU_MODEL_SAVE_PATH)
    