Merge pull request #147 from teaearlgraycold/new_classifiers

New classifiers
EpistasisLab · May 25, 2016 · 5ebd6d1 · 5ebd6d1
2 parents a4e00b2 + ff3cb08
commit 5ebd6d1
Show file tree

Hide file tree

Showing 16 changed files with 884 additions and 95 deletions.
diff --git a/ci/.travis_install.sh b/ci/.travis_install.sh
@@ -46,10 +46,8 @@ source activate testenv
 
 if [[ "$LATEST" == "true" ]]; then
     pip install deap
-    pip install xgboost
 else
     pip install deap==$DEAP_VERSION
-    pip install xgboost==$XGBOOST_VERSION
 fi
 
 pip install update_checker
@@ -65,6 +63,5 @@ python -c "import scipy; print('scipy %s' % scipy.__version__)"
 python -c "import sklearn; print('sklearn %s' % sklearn.__version__)"
 python -c "import pandas; print('pandas %s' % pandas.__version__)"
 python -c "import deap; print('deap %s' % deap.__version__)"
-python -c "import xgboost; print('xgboost %s ' % xgboost.__version__)"
 python -c "import update_checker; print('update_checker %s ' % update_checker.__version__)"
 python setup.py build_ext --inplace
diff --git a/ci/.travis_test.sh b/ci/.travis_test.sh
@@ -15,7 +15,6 @@ python -c "import scipy; print('scipy %s' % scipy.__version__)"
 python -c "import sklearn; print('sklearn %s' % sklearn.__version__)"
 python -c "import pandas; print('pandas %s' % pandas.__version__)"
 python -c "import deap; print('deap %s' % deap.__version__)"
-python -c "import xgboost; print('xgboost %s ' % xgboost.__version__)"
 python -c "import update_checker; print('update_checker %s ' % update_checker.__version__)"
 
 if [[ "$COVERAGE" == "true" ]]; then

diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml
@@ -30,10 +30,17 @@ pages:
     - Models:
       - DecisionTreeClassifier: documentation/pipeline_operators/models/classifiers/tree/DecisionTreeClassifier.md
       - RandomForestClassifier: documentation/pipeline_operators/models/classifiers/ensemble/RandomForestClassifier.md
-      - XGBClassifier: documentation/pipeline_operators/models/classifiers/ensemble/XGBClassifier.md
       - SVC: documentation/pipeline_operators/models/classifiers/svm/SVC.md
       - kNeighborsClassifier: documentation/pipeline_operators/models/classifiers/nearest_neighbors/kNeighborsClassifier.md
       - LogisticRegression: documentation/pipeline_operators/models/classifiers/linear_model/LogisticRegression.md
+      - AdaBoost: documentation/pipeline_operators/models/classifiers/ensemble/AdaBoost.md
+      - ExtraTrees: documentation/pipeline_operators/models/classifiers/ensemble/ExtraTrees.md
+      - PassiveAggressive: documentation/pipeline_operators/models/classifiers/linear_model/PassiveAggressive.md
+      - BernoulliNB: documentation/pipeline_operators/models/classifiers/naive_bayes/BernoulliNB.md
+      - GaussianNB: documentation/pipeline_operators/models/classifiers/naive_bayes/GaussianNB.md
+      - MultinomialNB: documentation/pipeline_operators/models/classifiers/naive_bayes/MultinomialNB.md
+      - LinearSVC: documentation/pipeline_operators/models/classifiers/svm/LinearSVC.md
+      - GradientBoostingClassifier: documentation/pipeline_operators/models/classifiers/ensemble/GradientBoostingClassifier.md
     - Pre-Processing:
       - StandardScaler: documentation/pipeline_operators/preprocessing/scaling/StandardScaler.md
       - RobustScaler: documentation/pipeline_operators/preprocessing/scaling/RobustScaler.md

diff --git a/...els/classifiers/ensemble/XGBClassifier.md → ...s/models/classifiers/ensemble/AdaBoost.md b/...els/classifiers/ensemble/XGBClassifier.md → ...s/models/classifiers/ensemble/AdaBoost.md
@@ -1,20 +1,17 @@
-# XGBoost Classifier
+# AdaBoost Classifier
 * * *
 
-Fits the dmlc eXtreme gradient boosting classifier.
+Fits an AdaBoost classifier.
 
 ## Dependencies
-    xgboost.XGBClassifier
-
+    sklearn.ensemble.AdaBoostClassifier
 
 Parameters
 ----------
     input_df: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']}
-        Input DataFrame for fitting the XGBoost classifier
+        Input DataFrame for fitting the classifier
     learning_rate: float
-        Shrinks the contribution of each tree by learning_rate
-    max_depth: int
-        Maximum depth of the individual estimators; the maximum depth limits the number of nodes in the tree
+        Learning rate shrinks the contribution of each classifier by learning_rate.
 
 Returns
 -------
@@ -30,7 +27,7 @@ Example Exported Code
 import numpy as np
 import pandas as pd
 from sklearn.cross_validation import train_test_split
-from xgboost import XGBClassifier
+from sklearn.ensemble import AdaBoostClassifier
 
 # NOTE: Make sure that the class is labeled 'class' in the data file
 tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR')
@@ -39,10 +36,9 @@ training_indices, testing_indices = train_test_split(tpot_data.index, stratify=t
 
 result1 = tpot_data.copy()
 
-# Perform classification with a gradient boosting classifier
-xgbc1 = XGBClassifier(learning_rate=0.0001, n_estimators=500, max_depth=None)
-xgbc1.fit(result1.loc[training_indices].drop('class', axis=1).values, result1.loc[training_indices, 'class'].values)
+adab1 = AdaBoostClassifier(learning_rate=0.1, n_estimators=500, random_state=42)
+adab1.fit(result1.loc[training_indices].drop('class', axis=1).values, result1.loc[training_indices, 'class'].values)
 
-result1['xgbc1-classification'] = xgbc1.predict(result1.drop('class', axis=1).values)
+result1['adab1-classification'] = adab1.predict(result1.drop('class', axis=1).values)
 
 ```
diff --git a/...rces/documentation/pipeline_operators/models/classifiers/ensemble/ExtraTrees.md b/...rces/documentation/pipeline_operators/models/classifiers/ensemble/ExtraTrees.md
@@ -0,0 +1,47 @@
+# Extra Trees Classifier
+* * *
+
+Fits an extra-trees classifier.
+
+## Dependencies
+    sklearn.ensemble.ExtraTreesClassifier
+
+Parameters
+----------
+    input_df: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']}
+        Input DataFrame for fitting the classifier
+    criterion: int
+        Integer that is used to select from the list of valid criteria,
+        either 'gini', or 'entropy'
+    max_features: int
+        The number of features to consider when looking for the best split
+
+Returns
+-------
+    input_df: pandas.DataFrame {n_samples, n_features+['guess', 'group', 'class', 'SyntheticFeature']}
+        Returns a modified input DataFrame with the guess column updated according to the classifier's predictions.
+        Also adds the classifier's predictions as a 'SyntheticFeature' column.
+
+
+Example Exported Code
+---------------------
+
+```Python
+import numpy as np
+import pandas as pd
+from sklearn.cross_validation import train_test_split
+from sklearn.ensemble import ExtraTreesClassifier
+
+# NOTE: Make sure that the class is labeled 'class' in the data file
+tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR')
+training_indices, testing_indices = train_test_split(tpot_data.index, stratify=tpot_data['class'].values, train_size=0.75, test_size=0.25)
+
+
+result1 = tpot_data.copy()
+
+etc1 = ExtraTreesClassifier(criterion="entropy", max_features=5, n_estimators=500, random_state=42)
+etc1.fit(result1.loc[training_indices].drop('class', axis=1).values, result1.loc[training_indices, 'class'].values)
+
+result1['etc1-classification'] = etc1.predict(result1.drop('class', axis=1).values)
+
+```
diff --git a/...on/pipeline_operators/models/classifiers/ensemble/GradientBoostingClassifier.md b/...on/pipeline_operators/models/classifiers/ensemble/GradientBoostingClassifier.md
@@ -0,0 +1,45 @@
+# Gradient Boosting Classifier
+* * *
+
+Fits a Gradient Boosting classifier.
+
+## Dependencies
+    sklearn.ensemble.GradientBoostingClassifier
+
+Parameters
+----------
+    input_df: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']}
+        Input DataFrame for fitting the classifier
+    learning_rate: float
+        Learning rate shrinks the contribution of each tree by learning_rate
+    max_depth: int
+        Maximum depth of the individual regression estimators
+
+Returns
+-------
+    input_df: pandas.DataFrame {n_samples, n_features+['guess', 'group', 'class', 'SyntheticFeature']}
+        Returns a modified input DataFrame with the guess column updated according to the classifier's predictions.
+        Also adds the classifier's predictions as a 'SyntheticFeature' column.
+
+Example Exported Code
+---------------------
+
+```Python
+import numpy as np
+import pandas as pd
+from sklearn.cross_validation import train_test_split
+from sklearn.ensemble import GradientBoostingClassifier
+
+# NOTE: Make sure that the class is labeled 'class' in the data file
+tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR')
+training_indices, testing_indices = train_test_split(tpot_data.index, stratify=tpot_data['class'].values, train_size=0.75, test_size=0.25)
+
+result1 = tpot_data.copy()
+
+# Perform classification with a gradient boosting classifier
+gbc1 = GradientBoostingClassifier(learning_rate=1.0, max_depth=3, n_estimators=500, random_state=42)
+gbc1.fit(result1.loc[training_indices].drop('class', axis=1).values, result1.loc[training_indices, 'class'].values)
+
+result1['gbc1-classification'] = gbc1.predict(result1.drop('class', axis=1).values)
+
+```
diff --git a/...ntation/pipeline_operators/models/classifiers/linear_model/PassiveAggressive.md b/...ntation/pipeline_operators/models/classifiers/linear_model/PassiveAggressive.md
@@ -0,0 +1,47 @@
+# Passive Aggressive Classifier
+* * *
+
+Fits a Passive Aggressive classifier
+
+## Dependencies
+    sklearn.linear_model.PassiveAggressiveClassifier
+
+Parameters
+----------
+    input_df: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']}
+        Input DataFrame for fitting the classifier
+    C: float
+        Penalty parameter C of the error term (maximum step size for regularization)
+    loss: int
+        Integer that is used to select the loss function,
+        either 'hinge', or 'squared_hinge'
+
+Returns
+-------
+    input_df: pandas.DataFrame {n_samples, n_features+['guess', 'group', 'class', 'SyntheticFeature']}
+        Returns a modified input DataFrame with the guess column updated according to the classifier's predictions.
+        Also adds the classifier's predictions as a 'SyntheticFeature' column.
+
+
+Example Exported Code
+---------------------
+
+```Python
+import numpy as np
+import pandas as pd
+from sklearn.cross_validation import train_test_split
+from sklearn.linear_model import PassiveAggressiveClassifier
+
+# NOTE: Make sure that the class is labeled 'class' in the data file
+tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR')
+training_indices, testing_indices = train_test_split(tpot_data.index, stratify=tpot_data['class'].values, train_size=0.75, test_size=0.25)
+
+
+result1 = tpot_data.copy()
+
+pagr1 = PassiveAggressiveClassifier(C=0.81, loss="squared_hinge", fit_intercept=True, random_state=42)
+pagr1.fit(result1.loc[training_indices].drop('class', axis=1).values, result1.loc[training_indices, 'class'].values)
+
+result1['pagr1-classification'] = pagr1.predict(result1.drop('class', axis=1).values)
+
+```
diff --git a/.../documentation/pipeline_operators/models/classifiers/naive_bayes/BernoulliNB.md b/.../documentation/pipeline_operators/models/classifiers/naive_bayes/BernoulliNB.md
@@ -0,0 +1,49 @@
+# BernoulliNB Classifier
+* * *
+
+Fits a Naive Bayes classifier for multivariate Bernoulli models.
+
+## Dependencies
+    sklearn.naive_bayes.BernoulliNB
+
+Parameters
+----------
+    input_df: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']}
+        Input DataFrame for fitting the classifier
+    alpha: float
+        Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing).
+    binarize: float
+        Threshold for binarizing (mapping to booleans) of sample features.
+    fit_prior: int
+        Whether to learn class prior probabilities or not. If false, a uniform prior will be used.
+        Reduced to a boolean with modulus.
+
+Returns
+-------
+    input_df: pandas.DataFrame {n_samples, n_features+['guess', 'group', 'class', 'SyntheticFeature']}
+        Returns a modified input DataFrame with the guess column updated according to the classifier's predictions.
+        Also adds the classifier's predictions as a 'SyntheticFeature' column.
+
+
+Example Exported Code
+---------------------
+
+```Python
+import numpy as np
+import pandas as pd
+from sklearn.cross_validation import train_test_split
+from sklearn.naive_bayes import BernoulliNB
+
+# NOTE: Make sure that the class is labeled 'class' in the data file
+tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR')
+training_indices, testing_indices = train_test_split(tpot_data.index, stratify=tpot_data['class'].values, train_size=0.75, test_size=0.25)
+
+
+result1 = tpot_data.copy()
+
+bnb1 = BernoulliNB(alpha=0.01, binarize=1.0, fit_prior=False)
+bnb1.fit(result1.loc[training_indices].drop('class', axis=1).values, result1.loc[training_indices, 'class'].values)
+
+result1['bnb1-classification'] = bnb1.predict(result1.drop('class', axis=1).values)
+
+```
diff --git a/...s/documentation/pipeline_operators/models/classifiers/naive_bayes/GaussianNB.md b/...s/documentation/pipeline_operators/models/classifiers/naive_bayes/GaussianNB.md
@@ -0,0 +1,42 @@
+# GaussianNB Classifier
+* * *
+
+Fits a Gaussian Naive Bayes classifier
+
+## Dependencies
+    sklearn.naive_bayes.GaussianNB
+
+Parameters
+----------
+    input_df: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']}
+        Input DataFrame for fitting the classifier
+
+Returns
+-------
+    input_df: pandas.DataFrame {n_samples, n_features+['guess', 'group', 'class', 'SyntheticFeature']}
+        Returns a modified input DataFrame with the guess column updated according to the classifier's predictions.
+        Also adds the classifier's predictions as a 'SyntheticFeature' column.
+
+
+Example Exported Code
+---------------------
+
+```Python
+import numpy as np
+import pandas as pd
+from sklearn.cross_validation import train_test_split
+from sklearn.naive_bayes import GaussianNB
+
+# NOTE: Make sure that the class is labeled 'class' in the data file
+tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR')
+training_indices, testing_indices = train_test_split(tpot_data.index, stratify=tpot_data['class'].values, train_size=0.75, test_size=0.25)
+
+
+result1 = tpot_data.copy()
+
+gnb1 = GaussianNB()
+gnb1.fit(result1.loc[training_indices].drop('class', axis=1).values, result1.loc[training_indices, 'class'].values)
+
+result1['gnb1-classification'] = gnb1.predict(result1.drop('class', axis=1).values)
+
+```
diff --git a/...ocumentation/pipeline_operators/models/classifiers/naive_bayes/MultinomialNB.md b/...ocumentation/pipeline_operators/models/classifiers/naive_bayes/MultinomialNB.md
@@ -0,0 +1,47 @@
+# MultinomialNB Classifier
+* * *
+
+Fits a Naive Bayes classifier for multinomial models
+
+## Dependencies
+    sklearn.naive_bayes.MultinomialNB
+
+Parameters
+----------
+    input_df: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']}
+        Input DataFrame for fitting the classifier
+    alpha: float
+        Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing).
+    fit_prior: int
+        Whether to learn class prior probabilities or not. If false, a uniform prior will be used.
+        Reduced to a boolean with modulus.
+
+Returns
+-------
+    input_df: pandas.DataFrame {n_samples, n_features+['guess', 'group', 'class', 'SyntheticFeature']}
+        Returns a modified input DataFrame with the guess column updated according to the classifier's predictions.
+        Also adds the classifier's predictions as a 'SyntheticFeature' column.
+
+
+Example Exported Code
+---------------------
+
+```Python
+import numpy as np
+import pandas as pd
+from sklearn.cross_validation import train_test_split
+from sklearn.naive_bayes import MultinomialNB
+
+# NOTE: Make sure that the class is labeled 'class' in the data file
+tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR')
+training_indices, testing_indices = train_test_split(tpot_data.index, stratify=tpot_data['class'].values, train_size=0.75, test_size=0.25)
+
+
+result1 = tpot_data.copy()
+
+mnb1 = MultinomialNB(alpha=1.0, fit_prior=True)
+mnb1.fit(result1.loc[training_indices].drop('class', axis=1).values, result1.loc[training_indices, 'class'].values)
+
+result1['mnb1-classification'] = mnb1.predict(result1.drop('class', axis=1).values)
+
+```