mlbox 0.5

* improvement in verbose mode * add of new date features * add of a new strategy for missing categorical values * new parallel computing
AxeldeRomblay · Aug 24, 2017 · f3f6e87 · f3f6e87
1 parent 27f441c
commit f3f6e87
Show file tree

Hide file tree

Showing 43 changed files with 1,161 additions and 490 deletions.
diff --git a/docker/mlbox-0.5.0-py2.py3-none-any.whl b/docker/mlbox-0.5.0-py2.py3-none-any.whl
diff --git a/docs/history.rst b/docs/history.rst
@@ -61,3 +61,10 @@ History
 * pep8 style
 * normalization of drift coefficients
 * warning size of folder 'save' 
+
+0.5.0 (2017-08-24)
+------------------
+* improvement in verbose mode
+* add of new date features
+* add of a new strategy for missing categorical values
+* new parallel computing
diff --git a/docs/history.rst~ b/docs/history.rst~
@@ -0,0 +1,70 @@
+History
+=======
+
+0.1.0 (2017-02-09)
+------------------
+* First non-official release.
+
+0.1.1 (2017-02-23)
+------------------
+* add of several estimators : Random Forest, Extra Trees, Logistic Regression, ...
+* improvement in verbose mode for reader.
+
+0.1.2 (2017-03-02)
+------------------
+* add of dropout for entity embeddings.
+* improvement in optimiser.
+
+0.2.0 (2017-03-22)
+------------------
+* add of feature importances for base learners.
+* add of leak detection.
+* add of stacking meta-model.
+* improvement in verbose mode for optimiser (folds variance).
+
+0.2.1 (2017-04-26)
+------------------
+* add of feature importances for bagging and boosting meta-models.
+
+0.2.2 (first official release : 2017-06-13)
+-------------------------------------------
+* update of dependencies (Keras 2.0,...).
+* add of LightGBM model.
+
+0.3.0 (2017-07-11)
+------------------
+* Python 2.7 & Python 3.4-3.6 compatibilities
+
+0.3.1 (2017-07-12)
+------------------
+* Availability on PyPI.
+
+0.4.0 (2017-07-18)
+------------------
+* add of pipeline memory.
+
+0.4.1 (2017-07-21)
+------------------
+* improvement in verbose mode for reader (display missing values)
+
+0.4.2 (2017-07-25)
+------------------
+* update of dependencies
+
+0.4.3 (2017-07-26)
+------------------
+* improvement in verbose mode for predictor (display feature importances)
+* wait until modules and engines are imported
+
+0.4.4 (2017-08-04)
+------------------
+* pep8 style
+* normalization of drift coefficients
+* warning size of folder 'save' 
+
+0.5.0 (2017-08-24)
+------------------
+* improvement in verbose mode
+* add of new date features
+* add of a new strategy for missing categorical values
+* new parallel computing
diff --git a/examples/classification/example.ipynb b/examples/classification/example.ipynb
diff --git a/examples/regression/example.ipynb b/examples/regression/example.ipynb
diff --git a/python-package/dist/mlbox-0.5.0-py2.py3-none-any.whl b/python-package/dist/mlbox-0.5.0-py2.py3-none-any.whl
diff --git a/python-package/mlbox/__init__.py b/python-package/mlbox/__init__.py
@@ -2,7 +2,7 @@
 
 __author__ = """Axel ARONIO DE ROMBLAY"""
 __email__ = 'axelderomblay@gmail.com'
-__version__ = '0.4.4'
+__version__ = '0.5.0'
 
 from .preprocessing import *
 from .encoding import *

diff --git a/python-package/mlbox/encoding/__init__.pyc b/python-package/mlbox/encoding/__init__.pyc
diff --git a/python-package/mlbox/encoding/categorical_encoder.pyc b/python-package/mlbox/encoding/categorical_encoder.pyc
diff --git a/python-package/mlbox/encoding/na_encoder.py b/python-package/mlbox/encoding/na_encoder.py
@@ -25,7 +25,7 @@ class NA_encoder():
 
     categorical_strategy : str, default = '<NULL>'
         The strategy to encode NA for categorical features.
-        Available strategies = a string or np.NaN
+        Available strategies = "most_frequent" (fill with the column's most frequent value) or any other string (used as the fill value)
     """
 
     def __init__(self,
@@ -37,6 +37,7 @@ def __init__(self,
         self.__Lcat = []
         self.__Lnum = []
         self.__imp = None
+        self.__mode = dict()
         self.__fitOK = False
 
 
@@ -81,6 +82,8 @@ def fit(self, df_train, y_train=None):
         self.__Lcat = df_train.dtypes[df_train.dtypes == 'object'].index
         self.__Lnum = df_train.dtypes[df_train.dtypes != 'object'].index
 
+        # Dealing with numerical features
+
         if (self.numerical_strategy in ['mean', 'median', "most_frequent"]):
 
             self.__imp = Imputer(strategy=self.numerical_strategy)
@@ -90,16 +93,33 @@ def fit(self, df_train, y_train=None):
             else:
                 pass
 
-            self.__fitOK = True
-
         elif ((type(self.numerical_strategy) == int) | (type(self.numerical_strategy) == float)):
 
-            self.__fitOK = True
+            pass
 
         else:
 
             raise ValueError("Numerical strategy for NA encoding is not valid")
 
+        # Dealing with categorical features
+
+        if (type(self.categorical_strategy) == str):
+
+            if (self.categorical_strategy == "most_frequent"):
+
+                na_count = df_train[self.__Lcat].isnull().sum()
+
+                for col in na_count[na_count>0].index:
+                    self.__mode[col] = df_train[col].mode()[0]
+
+            else:
+                pass
+
+        else:
+            raise ValueError("Categorical strategy for NA encoding is not valid")
+
+        self.__fitOK = True
+
         return self
 
 
@@ -145,7 +165,11 @@ def transform(self, df):
 
             if(len(self.__Lnum) == 0):
 
-                return df[self.__Lcat].fillna(self.categorical_strategy)
+                if (self.categorical_strategy != "most_frequent"):
+                    return df[self.__Lcat].fillna(self.categorical_strategy)
+
+                else:
+                    return df[self.__Lcat].fillna(self.__mode)
 
             else:
 
@@ -155,13 +179,25 @@ def transform(self, df):
 
                     if (len(self.__Lcat) != 0):
 
-                        return pd.concat(
-                            (pd.DataFrame(self.__imp.transform(df[self.__Lnum]),
-                                          columns=self.__Lnum,
-                                          index=df.index),
-                             df[self.__Lcat].fillna(self.categorical_strategy)
-                             ),
-                            axis=1)[df.columns]
+                        if (self.categorical_strategy != "most_frequent"):
+
+                            return pd.concat(
+                                (pd.DataFrame(self.__imp.transform(df[self.__Lnum]),
+                                              columns=self.__Lnum,
+                                              index=df.index),
+                                 df[self.__Lcat].fillna(self.categorical_strategy)
+                                 ),
+                                axis=1)[df.columns]
+
+                        else:
+
+                            return pd.concat(
+                                (pd.DataFrame(self.__imp.transform(df[self.__Lnum]),
+                                              columns=self.__Lnum,
+                                              index=df.index),
+                                 df[self.__Lcat].fillna(self.__mode)
+                                 ),
+                                axis=1)[df.columns]
 
                     else:
 
@@ -175,12 +211,21 @@ def transform(self, df):
 
                     if (len(self.__Lcat) != 0):
 
-                        return pd.concat(
-                            (df[self.__Lnum].fillna(self.numerical_strategy),
-                             df[self.__Lcat].fillna(self.categorical_strategy)
-                             ),
-                            axis=1)[df.columns]
+                        if (self.categorical_strategy != "most_frequent"):
+
+                            return pd.concat(
+                                (df[self.__Lnum].fillna(self.numerical_strategy),
+                                 df[self.__Lcat].fillna(self.categorical_strategy)
+                                 ),
+                                axis=1)[df.columns]
+
+                        else:
 
+                            return pd.concat(
+                                (df[self.__Lnum].fillna(self.numerical_strategy),
+                                 df[self.__Lcat].fillna(self.__mode)
+                                 ),
+                                axis=1)[df.columns]
                     else:
 
                         return df[self.__Lnum].fillna(self.numerical_strategy)

diff --git a/python-package/mlbox/encoding/na_encoder.pyc b/python-package/mlbox/encoding/na_encoder.pyc
diff --git a/python-package/mlbox/model/__init__.pyc b/python-package/mlbox/model/__init__.pyc
diff --git a/python-package/mlbox/model/supervised/__init__.pyc b/python-package/mlbox/model/supervised/__init__.pyc
diff --git a/python-package/mlbox/model/supervised/classification/__init__.pyc b/python-package/mlbox/model/supervised/classification/__init__.pyc
diff --git a/python-package/mlbox/model/supervised/classification/classifier.pyc b/python-package/mlbox/model/supervised/classification/classifier.pyc
diff --git a/python-package/mlbox/model/supervised/classification/feature_selector.pyc b/python-package/mlbox/model/supervised/classification/feature_selector.pyc
diff --git a/python-package/mlbox/model/supervised/classification/stacking_classifier.pyc b/python-package/mlbox/model/supervised/classification/stacking_classifier.pyc
diff --git a/python-package/mlbox/model/supervised/regression/__init__.pyc b/python-package/mlbox/model/supervised/regression/__init__.pyc
diff --git a/python-package/mlbox/model/supervised/regression/feature_selector.pyc b/python-package/mlbox/model/supervised/regression/feature_selector.pyc
diff --git a/python-package/mlbox/model/supervised/regression/regressor.pyc b/python-package/mlbox/model/supervised/regression/regressor.pyc
diff --git a/python-package/mlbox/model/supervised/regression/stacking_regressor.pyc b/python-package/mlbox/model/supervised/regression/stacking_regressor.pyc
diff --git a/python-package/mlbox/optimisation/__init__.pyc b/python-package/mlbox/optimisation/__init__.pyc
diff --git a/python-package/mlbox/optimisation/optimiser.pyc b/python-package/mlbox/optimisation/optimiser.pyc
diff --git a/python-package/mlbox/prediction/__init__.pyc b/python-package/mlbox/prediction/__init__.pyc
diff --git a/python-package/mlbox/prediction/predictor.py b/python-package/mlbox/prediction/predictor.py
@@ -339,7 +339,7 @@ def fit_predict(self, params, df):
 
             if(params is None):
                 print("")
-                print('No parameters set. Default configuration is tested')
+                print('> No parameters set. Default configuration is tested')
                 set_params = True
 
             else:
@@ -354,7 +354,7 @@ def fit_predict(self, params, df):
                 try:
                     if(self.verbose):
                         print("")
-                        print("fitting the pipeline...")
+                        print("fitting the pipeline ...")
 
                     pp.fit(df['train'], df['target'])
 
@@ -369,9 +369,6 @@ def fit_predict(self, params, df):
                     # Feature importances
 
                     try:
-                        if(self.verbose):
-                            print("")
-                            print("dumping feature importances into directory : " + self.to_path)
 
                         importance = est.feature_importances()                       
                         self.__save_feature_importances(importance, 
@@ -382,9 +379,11 @@ def fit_predict(self, params, df):
 
                         if(self.verbose):
                             self.__plot_feature_importances(importance, 10)
-
+                            print("")
+                            print("> Feature importances dumped into directory : " + self.to_path)
+
                     except:
-                        warnings.warn("Unable to get feature importances...")
+                        warnings.warn("Unable to get feature importances !")
 
                 except:
                     raise ValueError("Pipeline cannot be fitted")
@@ -424,7 +423,7 @@ def fit_predict(self, params, df):
                     try:
                         if(self.verbose):
                             print("")
-                            print("predicting...")
+                            print("predicting ...")
 
                         pred = pd.DataFrame(pp.predict_proba(df['test']),
                                             columns=enc.inverse_transform(range(len(enc.classes_))),
@@ -471,7 +470,7 @@ def fit_predict(self, params, df):
 
                 if(self.verbose):
                     print("")
-                    print("top 10 predictions :")
+                    print("> Overview on predictions : ")
                     print("")
                     print(pred.head(10))
 
@@ -481,7 +480,7 @@ def fit_predict(self, params, df):
 
                 if(self.verbose):
                     print("")
-                    print("dumping predictions into directory : "+self.to_path)
+                    print("dumping predictions into directory : "+self.to_path + " ...")
 
                 pred.to_csv(self.to_path
                             + "/"

diff --git a/python-package/mlbox/prediction/predictor.pyc b/python-package/mlbox/prediction/predictor.pyc