Skip to content

Commit

Permalink
mlbox 0.5
Browse files Browse the repository at this point in the history
* improvement in verbose mode
* addition of new date features
* addition of a new strategy for missing categorical values
* new parallel computing
  • Loading branch information
AxeldeRomblay committed Aug 24, 2017
1 parent 27f441c commit f3f6e87
Show file tree
Hide file tree
Showing 43 changed files with 1,161 additions and 490 deletions.
Binary file added docker/mlbox-0.5.0-py2.py3-none-any.whl
Binary file not shown.
7 changes: 7 additions & 0 deletions docs/history.rst
Original file line number Diff line number Diff line change
Expand Up @@ -61,3 +61,10 @@ History
* pep8 style
* normalization of drift coefficients
* warning size of folder 'save'

0.5.0 (2017-08-24)
------------------
* improvement in verbose mode
* addition of new date features
* addition of a new strategy for missing categorical values
* new parallel computing
70 changes: 70 additions & 0 deletions docs/history.rst~
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
History
=======

0.1.0 (2017-02-09)
------------------
* First non-official release.

0.1.1 (2017-02-23)
------------------
* addition of several estimators: Random Forest, Extra Trees, Logistic Regression, ...
* improvement in verbose mode for reader.

0.1.2 (2017-03-02)
------------------
* addition of dropout for entity embeddings.
* improvement in optimiser.

0.2.0 (2017-03-22)
------------------
* addition of feature importances for base learners.
* addition of leak detection.
* addition of stacking meta-model.
* improvement in verbose mode for optimiser (folds variance).

0.2.1 (2017-04-26)
------------------
* addition of feature importances for bagging and boosting meta-models.

0.2.2 (first official release : 2017-06-13)
-------------------------------------------
* update of dependencies (Keras 2.0, ...).
* addition of LightGBM model.

0.3.0 (2017-07-11)
------------------
* Compatibility with Python 2.7 and Python 3.4-3.6

0.3.1 (2017-07-12)
------------------
* Availability on PyPI.

0.4.0 (2017-07-18)
------------------
* addition of pipeline memory.

0.4.1 (2017-07-21)
------------------
* improvement in verbose mode for reader (display missing values)

0.4.2 (2017-07-25)
------------------
* update of dependencies

0.4.3 (2017-07-26)
------------------
* improvement in verbose mode for predictor (display feature importances)
* wait until modules and engines are imported

0.4.4 (2017-08-04)
------------------
* pep8 style
* normalization of drift coefficients
* warning size of folder 'save'

0.5.0 (2017-08-24)
------------------
* improvement in verbose mode
* addition of new date features
* new parallel computing
* warning size of folder 'save'
335 changes: 165 additions & 170 deletions examples/classification/example.ipynb

Large diffs are not rendered by default.

327 changes: 165 additions & 162 deletions examples/regression/example.ipynb

Large diffs are not rendered by default.

Binary file not shown.
2 changes: 1 addition & 1 deletion python-package/mlbox/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

__author__ = """Axel ARONIO DE ROMBLAY"""
__email__ = 'axelderomblay@gmail.com'
__version__ = '0.4.4'
__version__ = '0.5.0'

from .preprocessing import *
from .encoding import *
Expand Down
Binary file added python-package/mlbox/encoding/__init__.pyc
Binary file not shown.
Binary file not shown.
79 changes: 62 additions & 17 deletions python-package/mlbox/encoding/na_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ class NA_encoder():
categorical_strategy : str, default = '<NULL>'
The strategy to encode NA for categorical features.
Available strategies = a string or np.NaN
Available strategies = a string or "most_frequent"
"""

def __init__(self,
Expand All @@ -37,6 +37,7 @@ def __init__(self,
self.__Lcat = []
self.__Lnum = []
self.__imp = None
self.__mode = dict()
self.__fitOK = False


Expand Down Expand Up @@ -81,6 +82,8 @@ def fit(self, df_train, y_train=None):
self.__Lcat = df_train.dtypes[df_train.dtypes == 'object'].index
self.__Lnum = df_train.dtypes[df_train.dtypes != 'object'].index

# Dealing with numerical features

if (self.numerical_strategy in ['mean', 'median', "most_frequent"]):

self.__imp = Imputer(strategy=self.numerical_strategy)
Expand All @@ -90,16 +93,33 @@ def fit(self, df_train, y_train=None):
else:
pass

self.__fitOK = True

elif ((type(self.numerical_strategy) == int) | (type(self.numerical_strategy) == float)):

self.__fitOK = True
pass

else:

raise ValueError("Numerical strategy for NA encoding is not valid")

# Dealing with categorical features

if (type(self.categorical_strategy) == str):

if (self.categorical_strategy == "most_frequent"):

na_count = df_train[self.__Lcat].isnull().sum()

for col in na_count[na_count>0].index:
self.__mode[col] = df_train[col].mode()[0]

else:
pass

else:
raise ValueError("Categorical strategy for NA encoding is not valid")

self.__fitOK = True

return self


Expand Down Expand Up @@ -145,7 +165,11 @@ def transform(self, df):

if(len(self.__Lnum) == 0):

return df[self.__Lcat].fillna(self.categorical_strategy)
if (self.categorical_strategy != "most_frequent"):
return df[self.__Lcat].fillna(self.categorical_strategy)

else:
return df[self.__Lcat].fillna(self.__mode)

else:

Expand All @@ -155,13 +179,25 @@ def transform(self, df):

if (len(self.__Lcat) != 0):

return pd.concat(
(pd.DataFrame(self.__imp.transform(df[self.__Lnum]),
columns=self.__Lnum,
index=df.index),
df[self.__Lcat].fillna(self.categorical_strategy)
),
axis=1)[df.columns]
if (self.categorical_strategy != "most_frequent"):

return pd.concat(
(pd.DataFrame(self.__imp.transform(df[self.__Lnum]),
columns=self.__Lnum,
index=df.index),
df[self.__Lcat].fillna(self.categorical_strategy)
),
axis=1)[df.columns]

else:

return pd.concat(
(pd.DataFrame(self.__imp.transform(df[self.__Lnum]),
columns=self.__Lnum,
index=df.index),
df[self.__Lcat].fillna(self.__mode)
),
axis=1)[df.columns]

else:

Expand All @@ -175,12 +211,21 @@ def transform(self, df):

if (len(self.__Lcat) != 0):

return pd.concat(
(df[self.__Lnum].fillna(self.numerical_strategy),
df[self.__Lcat].fillna(self.categorical_strategy)
),
axis=1)[df.columns]
if (self.categorical_strategy != "most_frequent"):

return pd.concat(
(df[self.__Lnum].fillna(self.numerical_strategy),
df[self.__Lcat].fillna(self.categorical_strategy)
),
axis=1)[df.columns]

else:

return pd.concat(
(df[self.__Lnum].fillna(self.numerical_strategy),
df[self.__Lcat].fillna(self.__mode)
),
axis=1)[df.columns]
else:

return df[self.__Lnum].fillna(self.numerical_strategy)
Expand Down
Binary file added python-package/mlbox/encoding/na_encoder.pyc
Binary file not shown.
Binary file added python-package/mlbox/model/__init__.pyc
Binary file not shown.
Binary file added python-package/mlbox/model/supervised/__init__.pyc
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added python-package/mlbox/optimisation/__init__.pyc
Binary file not shown.
Binary file added python-package/mlbox/optimisation/optimiser.pyc
Binary file not shown.
Binary file added python-package/mlbox/prediction/__init__.pyc
Binary file not shown.
19 changes: 9 additions & 10 deletions python-package/mlbox/prediction/predictor.py
Original file line number Diff line number Diff line change
Expand Up @@ -339,7 +339,7 @@ def fit_predict(self, params, df):

if(params is None):
print("")
print('No parameters set. Default configuration is tested')
print('> No parameters set. Default configuration is tested')
set_params = True

else:
Expand All @@ -354,7 +354,7 @@ def fit_predict(self, params, df):
try:
if(self.verbose):
print("")
print("fitting the pipeline...")
print("fitting the pipeline ...")

pp.fit(df['train'], df['target'])

Expand All @@ -369,9 +369,6 @@ def fit_predict(self, params, df):
# Feature importances

try:
if(self.verbose):
print("")
print("dumping feature importances into directory : " + self.to_path)

importance = est.feature_importances()
self.__save_feature_importances(importance,
Expand All @@ -382,9 +379,11 @@ def fit_predict(self, params, df):

if(self.verbose):
self.__plot_feature_importances(importance, 10)

print("")
print("> Feature importances dumped into directory : " + self.to_path)

except:
warnings.warn("Unable to get feature importances...")
warnings.warn("Unable to get feature importances !")

except:
raise ValueError("Pipeline cannot be fitted")
Expand Down Expand Up @@ -424,7 +423,7 @@ def fit_predict(self, params, df):
try:
if(self.verbose):
print("")
print("predicting...")
print("predicting ...")

pred = pd.DataFrame(pp.predict_proba(df['test']),
columns=enc.inverse_transform(range(len(enc.classes_))),
Expand Down Expand Up @@ -471,7 +470,7 @@ def fit_predict(self, params, df):

if(self.verbose):
print("")
print("top 10 predictions :")
print("> Overview on predictions : ")
print("")
print(pred.head(10))

Expand All @@ -481,7 +480,7 @@ def fit_predict(self, params, df):

if(self.verbose):
print("")
print("dumping predictions into directory : "+self.to_path)
print("dumping predictions into directory : "+self.to_path + " ...")

pred.to_csv(self.to_path
+ "/"
Expand Down
Binary file added python-package/mlbox/prediction/predictor.pyc
Binary file not shown.

0 comments on commit f3f6e87

Please sign in to comment.