Skip to content

Commit

Permalink
Merge pull request #28 from maskani-moh/pep8-fix
Browse files Browse the repository at this point in the history
Pep8 fix
  • Loading branch information
AxeldeRomblay committed Aug 2, 2017
2 parents 5b25989 + e582590 commit d6a2357
Show file tree
Hide file tree
Showing 7 changed files with 464 additions and 350 deletions.
2 changes: 1 addition & 1 deletion python-package/mlbox/preprocessing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
from .reader import *

import time
time.sleep(30) #waiting for the engines to start
time.sleep(30) # Waiting for the engines to start
4 changes: 3 additions & 1 deletion python-package/mlbox/preprocessing/drift/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,6 @@
import os
os.system("ipcluster start --profile=home &")

warnings.warn("ipCluster is starting. Please wait 30 sec and check in terminal that 'the engines appear to have started successfully'.")
warnings.warn("ipCluster is starting. "
"Please wait 30 sec and check in terminal that "
"'the engines appear to have started successfully'.")
129 changes: 72 additions & 57 deletions python-package/mlbox/preprocessing/drift/drift_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, StratifiedKFold


def cross_val_predict_proba(estimator, X, y, cv):


"""Evaluates the target by cross-validation
Expand All @@ -35,21 +35,19 @@ def cross_val_predict_proba(estimator, X, y, cv):
"""


y_pred = np.zeros(len(y))
for train_index, test_index in cv.split(X,y):

for train_index, test_index in cv.split(X, y):

estimator.fit(X.iloc[train_index], y.iloc[train_index])

y_pred[test_index] = estimator.predict_proba(X.iloc[test_index])[:,1]

return y_pred

y_pred[test_index] = estimator.predict_proba(X.iloc[test_index])[:, 1]

return y_pred


class DriftEstimator():

"""Estimates the drift between two datasets
Expand All @@ -60,35 +58,42 @@ class DriftEstimator():
n_folds : int, default = 2
Number of folds used to estimate the drift
stratify : bool, default = True
Whether the cv is stratified (same proportion of train and test samples within each fold)
random_state : int, default = 1
Random state for cv
"""

def __init__(self, estimator = RandomForestClassifier(n_estimators = 50, n_jobs=-1, max_features=1., min_samples_leaf = 5, max_depth = 5), n_folds = 2, stratify = True, random_state = 1):

def __init__(self,
estimator=RandomForestClassifier(n_estimators=50,
n_jobs=-1,
max_features=1.,
min_samples_leaf=5,
max_depth=5),
n_folds=2,
stratify=True,
random_state=1):

self.estimator = estimator
self.n_folds = n_folds
self.stratify = stratify
self.random_state = random_state
self.__cv = None
self.__pred = None
# TODO: Rename 'cible' to its English equivalent, 'target'?
self.__cible = None
self.__fitOK = False


self.__fitOK = False

def get_params(self):

return {'estimator': self.estimator,
'n_folds' : self.n_folds,
'stratify' : self.stratify,
'random_state' : self.random_state}
return {'estimator': self.estimator,
'n_folds': self.n_folds,
'stratify': self.stratify,
'random_state': self.random_state}


def set_params(self,**params):
def set_params(self, **params):

if('estimator' in params.keys()):
self.estimator = params['estimator']
Expand All @@ -97,12 +102,12 @@ def set_params(self,**params):
if('stratify' in params.keys()):
self.stratify = params['stratify']
if('random_state' in params.keys()):
self.random_state = params['random_state']


self.random_state = params['random_state']

def fit(self, df_train, df_test):

"""Computes the drift between the two datasets

"""
Computes the drift between the two datasets
Parameters
----------
Expand All @@ -117,30 +122,37 @@ def fit(self, df_train, df_test):
self : object
Returns self.
"""

df_train["target"] = 0
df_test["target"] = 1

self.__cible = pd.concat((df_train.target, df_test.target),ignore_index=True)

if(self.stratify == True):
self.__cv = StratifiedKFold(n_splits = self.n_folds, shuffle = True, random_state = self.random_state)
self.__cible = pd.concat((df_train.target, df_test.target),
ignore_index=True)

if self.stratify:
self.__cv = StratifiedKFold(n_splits=self.n_folds,
shuffle=True,
random_state=self.random_state)
else:
self.__cv = KFold(n_splits = self.n_folds, shuffle = True, random_state = self.random_state)
self.__cv = KFold(n_splits=self.n_folds,
shuffle=True,
random_state=self.random_state)


self.__pred = cross_val_predict_proba(estimator = self.estimator,
X = pd.concat((df_train, df_test),ignore_index=True).drop(['target'], axis = 1),
y = self.__cible,
cv = self.__cv)
X_tmp = pd.concat((df_train, df_test),
ignore_index=True).drop(['target'], axis=1)

self.__pred = cross_val_predict_proba(estimator=self.estimator,
X=X_tmp,
y=self.__cible,
cv=self.__cv)

del df_train["target"]
del df_test["target"]

self.__fitOK = True

return self



def score(self):

"""Returns the global drift measure between two datasets.
Expand All @@ -152,35 +164,38 @@ def score(self):
float
The drift measure
"""

S = []

if self.__fitOK:

for train_index, test_index in self.__cv.split(X=np.zeros(len(self.__cible)), y=self.__cible):

S.append(roc_auc_score(self.__cible.iloc[test_index], self.__pred[test_index]))

return max(np.mean(S),1-np.mean(S)) # return the symmetrized mean AUC

X_zeros = np.zeros(len(self.__cible))

for train_index, test_index in self.__cv.split(X=X_zeros,
y=self.__cible):

S.append(roc_auc_score(self.__cible.iloc[test_index],
self.__pred[test_index]))

# Return the mean AUC, mirrored around 0.5 so the result is always >= 0.5
return max(np.mean(S), 1-np.mean(S))

else:
raise ValueError('Call the fit function before !')


def predict(self):

"""Returns the probabilities that the sample belongs to the test dataset
raise ValueError('Call the fit function before !')

def predict(self):

"""Returns the probabilities that the sample belongs to the test dataset
Returns
-------
Array of shape = (n_train+n_test,)
The probabilities
"""


if self.__fitOK:

return self.__pred

else:
raise ValueError('Call the fit function before !')

0 comments on commit d6a2357

Please sign in to comment.