# DSG: ENSEMBLE

In [1]:
import numpy as np
import pandas as pd
import os

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.grid_search import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler


# Working directory of the kernel; shown so the reader can see where paths resolve.
folder = os.getcwd()
print(folder)

/home/arda/Documents/DSG


In [2]:
# Alternative inputs (the '*_tagged' pickles), kept for reference:
#   pd_train = pd.io.pickle.read_pickle(folder + '/data_munged/pd_train_tagged')
#   pd_test = pd.io.pickle.read_pickle(folder + '/data_munged/pd_test_tagged')

# Load the '*_munged' train/test pickles stored under <folder>/data_munged.
# NOTE(review): unpickling executes arbitrary code; only load trusted files.
pd_train = pd.read_pickle(folder + '/data_munged/pd_train_munged')
pd_test = pd.read_pickle(folder + '/data_munged/pd_test_munged')

# Simple model

In [3]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that picks the entry (or entries) stored under ``key``.

    ``transform`` indexes its input with ``key`` and returns the ``.values``
    attribute of the result.
    """

    def __init__(self, key):
        # key: whatever is used to index the input in transform()
        # (this notebook passes lists of column labels).
        self.key = key

    def fit(self, x, y=None):
        """No-op: nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, data_dict):
        """Return ``data_dict[self.key].values``."""
        selected = data_dict[self.key]
        return selected.values
    
class DenseTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that turns a sparse matrix into a dense one.

    Inherits from ``BaseEstimator`` (like ``ColumnSelector``) so that the step
    exposes ``get_params`` / ``set_params``. Without them the step cannot be
    cloned or have its parameters enumerated when it is placed inside a
    ``Pipeline`` that is cross-validated or grid-searched.
    """

    def fit(self, X, y=None, **fit_params):
        """No-op: the conversion is stateless; returns ``self``."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return ``X.todense()``; input lacking ``todense`` is returned unchanged."""
        if hasattr(X, 'todense'):
            return X.todense()
        # Input has no todense() (presumably it is already dense): pass it through
        # instead of failing with AttributeError.
        return X

    def fit_transform(self, X, y=None, **fit_params):
        """Fit (a no-op) and then transform ``X`` in one call."""
        return self.fit(X, y, **fit_params).transform(X)

In [4]:
# Baseline estimators compared in this notebook.
# NOTE(review): every estimator below is created with random_state=None, so
# scores are expected to vary between runs -- consider fixing a seed.

# Extra-trees: 10 trees, no depth limit, no bootstrapping.
extra = ExtraTreesClassifier(
    n_estimators=10, criterion='gini',
    max_depth=None, max_leaf_nodes=None, max_features='auto',
    min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0,
    bootstrap=False, oob_score=False, warm_start=False,
    class_weight=None, random_state=None,
    n_jobs=1, verbose=0,
)

# Bagging with the library's default base estimator (base_estimator=None).
bag = BaggingClassifier(
    base_estimator=None, n_estimators=10,
    max_samples=1.0, max_features=1.0,
    bootstrap=True, bootstrap_features=False, oob_score=False,
    random_state=None, n_jobs=1, verbose=0,
)

# AdaBoost (SAMME.R): 200 rounds, learning rate 0.3.
ada = AdaBoostClassifier(
    base_estimator=None, algorithm='SAMME.R',
    n_estimators=200, learning_rate=0.3,
    random_state=None,
)

# Random forest: 200 trees, depth capped at 15.
rf = RandomForestClassifier(
    n_estimators=200, criterion='gini',
    max_depth=15, max_leaf_nodes=None, max_features='auto',
    min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0,
    bootstrap=True, oob_score=False, warm_start=False,
    class_weight=None, random_state=None,
    n_jobs=-1, verbose=0,
)

# Standardisation step (centering and scaling both enabled).
StdScl = StandardScaler(with_mean=True, with_std=True, copy=True)

In [9]:
# Labels handed to ColumnSelector; the selector indexes its input with this list.
Features = [
    'lenght', 'nb_caps_char', 'nb_caps_word',
    'nb_?', 'nb_,', 'nb_!', 'nb_.', 'nb_;',
    'nb_all', 'nb_,;', 'diversity',
]

# Feature-extraction pipeline: column selection only.
sentence_pipe = Pipeline(steps=[
    ('selector', ColumnSelector(key=Features)),
    # Scaling step, currently disabled:
    # ('StdScl', StdScl)
])

In [10]:
# Target: the 'Author' entry of the training data.
Y = pd_train['Author']

# Feature matrices: fit_transform on the training data, transform on the test data.
X = sentence_pipe.fit_transform(pd_train)
X_test = sentence_pipe.transform(pd_test)

In [11]:
# NOTE(review): `svm` is neither defined nor imported anywhere in the visible
# notebook, so this cell raises NameError on a fresh kernel. Presumably it was
# created in a cell that has since been removed -- restore that definition
# (or point `clf` at one of the estimators configured above) before re-running.
clf = svm
# 5-fold cross-validated accuracy of `clf` on (X, Y).
scores = cross_val_score(clf, X,Y,scoring='accuracy',cv=5, n_jobs=-1)
# Per-fold scores, followed by their mean and standard deviation.
print scores,scores.mean(),scores.std()


[ 0.22425191  0.40636965  0.38004875  0.41076093  0.40421456] 0.365129158882 0.0712456272582


# Grid search ensemble

In [179]:
# Labels handed to ColumnSelector (same list as used for the simple model).
Features = [
    'lenght', 'nb_caps_char', 'nb_caps_word',
    'nb_?', 'nb_,', 'nb_!', 'nb_.', 'nb_;',
    'nb_all', 'nb_,;', 'diversity',
]

# AdaBoost (SAMME.R): 200 rounds, learning rate 0.3.
ada = AdaBoostClassifier(
    base_estimator=None, algorithm='SAMME.R',
    n_estimators=200, learning_rate=0.3,
    random_state=None,
)

# Random forest: 200 trees, depth capped at 6. The grid search overrides
# n_estimators, max_depth, min_samples_leaf and min_samples_split.
rf = RandomForestClassifier(
    n_estimators=200, criterion='gini',
    max_depth=6, max_leaf_nodes=None, max_features='auto',
    min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0,
    bootstrap=True, oob_score=False, warm_start=False,
    class_weight=None, random_state=None,
    n_jobs=-1, verbose=0,
)

# Extra-trees: 10 trees, no depth limit, no bootstrapping.
extra = ExtraTreesClassifier(
    n_estimators=10, criterion='gini',
    max_depth=None, max_leaf_nodes=None, max_features='auto',
    min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0,
    bootstrap=False, oob_score=False, warm_start=False,
    class_weight=None, random_state=None,
    n_jobs=1, verbose=0,
)

# Estimator that is plugged into the grid-searched pipeline.
clf = rf

In [180]:
# Column selection followed by the classifier chosen in `clf`.
pipeline = Pipeline(steps=[
    ('column', ColumnSelector(key=Features)),
    # Scaling step, currently disabled:
    # ('StdScl', StdScl),
    ('clf', clf),
])

# Search space (rf config). The 'clf__' prefix routes each parameter to the
# 'clf' step of the pipeline; only n_estimators actually varies.
parameters = {
    'clf__n_estimators': (380, 390, 400, 410, 420),
    'clf__max_depth': (14,),
    'clf__min_samples_leaf': (3,),
    'clf__min_samples_split': (2,),
}

# Exhaustive search scored by accuracy with 5-fold CV; refit=True retrains the
# best configuration on the full data once the search is done.
grid = GridSearchCV(
    pipeline, parameters,
    scoring='accuracy', cv=5,
    fit_params=None, iid=True, refit=True,
    n_jobs=-1, verbose=0,
)

# The whole frame goes in; the pipeline's first step does the column selection.
grid.fit(pd_train, pd_train['Author'])

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('column', ColumnSelector(key=['lenght', 'nb_caps_char', 'nb_caps_word', 'nb_?', 'nb_,', 'nb_!', 'nb_.', 'nb_;', 'nb_all', 'nb_,;', 'diversity'])), ('clf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=6, max_features='auto', max_leaf_nodes=..._jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params={}, iid=True, loss_func=None, n_jobs=-1,
       param_grid={'clf__min_samples_split': (2,), 'clf__max_depth': (14,), 'clf__min_samples_leaf': (3,), 'clf__n_estimators': (380, 390, 400, 410, 420)},
       pre_dispatch='2*n_jobs', refit=True, score_func=None,
       scoring='accuracy', verbose=0)

In [182]:
# Best parameter combination found by the search and its cross-validated score.
print("%s %s" % (grid.best_params_, grid.best_score_))

{'clf__max_depth': 14, 'clf__min_samples_leaf': 3, 'clf__min_samples_split': 2, 'clf__n_estimators': 380} 0.458064965359


In [185]:
# AdaBoost (SAMME.R): 400 rounds, learning rate 0.4.
ada = AdaBoostClassifier(
    base_estimator=None, algorithm='SAMME.R',
    n_estimators=400, learning_rate=0.4,
    random_state=None,
)

# Random forest rebuilt with the grid-search winner recorded earlier:
# best rf {'clf__max_depth': 14, 'clf__min_samples_leaf': 3, 'clf__min_samples_split': 2, 'clf__n_estimators': 400} 0.458900532674
rf = RandomForestClassifier(
    n_estimators=400, criterion='gini',
    max_depth=14, max_leaf_nodes=None, max_features='auto',
    min_samples_split=2, min_samples_leaf=3, min_weight_fraction_leaf=0.0,
    bootstrap=True, oob_score=False, warm_start=False,
    class_weight=None, random_state=None,
    n_jobs=-1, verbose=0,
)

In [186]:
# Evaluate the random forest directly on the pre-extracted matrix X.
clf = rf

# 5-fold cross-validated accuracy; print per-fold scores, then mean and std.
scores = cross_val_score(clf, X, Y, scoring='accuracy', cv=5, n_jobs=-1)
print("%s %s %s" % (scores, scores.mean(), scores.std()))

[ 0.47077244  0.4709363   0.45316852  0.47118231  0.42337165] 0.457886245238 0.0185830985852


In [170]:
# Train the selected classifier on the full training matrix.
clf.fit(X, Y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=14, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=3, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=400, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [None]:

# Pipeline that selects only the 'lenght' column.
lenght_pipe = Pipeline(steps=[
    ('selector', ColumnSelector(key=['lenght'])),
])

# Union of both pipelines' outputs; each branch is given weight 1.
combined_features = FeatureUnion(
    transformer_list=[
        ("ngram", sentence_pipe),
        ("lenght", lenght_pipe),
    ],
    transformer_weights={"ngram": 1, "lenght": 1},
)

In [7]:
# Random forest: 5-fold cross-validated accuracy (per-fold scores, mean, std).
scores = cross_val_score(rf, X, Y, scoring='accuracy', cv=5, n_jobs=-1)
print("%s %s %s" % (scores, scores.mean(), scores.std()))

[ 0.62526096  0.63940132  0.5689415   0.58558245  0.55329154] 0.594495554284 0.0328428448777


In [41]:
# Extra-trees: 5-fold cross-validated accuracy (per-fold scores, mean, std).
scores = cross_val_score(extra, X, Y, scoring='accuracy', cv=5, n_jobs=2)
print("%s %s %s" % (scores, scores.mean(), scores.std()))

[ 0.64178845  0.63400627  0.58095404  0.62162633  0.56234761] 0.608144538831 0.0310457138965


In [14]:
# Bagging: 5-fold cross-validated accuracy (per-fold scores, mean, std).
scores = cross_val_score(bag, X, Y, scoring='accuracy', cv=5, n_jobs=-1)
print("%s %s %s" % (scores, scores.mean(), scores.std()))

[ 0.3578636   0.36268709  0.38353064  0.39004005  0.33803553] 0.366431381703 0.0186735368356
