 Project Statement
 ============
 As with Project 1, apply the ideas of ch. 1 - 3 as appropriate.
 Develop and demonstrate your capabilities with:
  * Regression (ch. 4)
  * Support Vector Machines (SVMs) (ch. 5)


  Starting Point
  --------------
  To start this project, I'm going to build on the Titanic dataset portion of the previous project, since I was a little disappointed that Derek Byrnes got a higher score than me. Friendly rivalry and whatnot. :) Now that we're working together, I'll see if I can pull in some of his techniques to produce a better score.
  
  TESTING THIS IS DEREK

In [1]:
import pandas

# Read the Titanic training data, test data, and example output file from disk.
raw_training = pandas.read_csv("titanic/train.csv")
raw_test = pandas.read_csv("titanic/test.csv")
example_output = pandas.read_csv("titanic/gender_submission.csv")

# Split the training frame into features (X) and the "Survived" target (y).
X = raw_training.drop(columns="Survived")
y = raw_training["Survived"].copy()
y.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [2]:
# Show the first rows of the feature frame (everything except "Survived").
X.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [44]:
from sklearn.base import BaseEstimator, TransformerMixin
import re

class RegexTransform(BaseEstimator, TransformerMixin):
    """Element-wise regex extraction for the string cells of a DataFrame.

    Every ``str`` cell is replaced by capture group ``groupNum`` of the first
    match of ``regex`` in that cell, or by ``""`` when the pattern does not
    match or the group did not take part in the match. Non-string cells
    (numbers, NaN, ...) pass through unchanged.

    Parameters
    ----------
    regex : str, default=".*"
        Pattern handed to ``re.search``.
    groupNum : int, default=0
        Match group to keep; 0 is the whole match.
    """

    def __init__(self, regex=".*", groupNum=0):
        self.regex = regex
        self.groupNum = groupNum

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` for chaining."""
        return self

    def transform(self, X, y=None):
        """Return a new object with ``applyRegex`` applied to every cell of ``X``."""
        # pandas 2.1 renamed DataFrame.applymap to DataFrame.map and deprecated
        # the old name, so prefer ``map`` when the installed pandas has it.
        # The lookup is done on the class, not the instance, so a column that
        # happens to be called "map" cannot shadow the method.
        frame_type = type(X)
        elementwise = getattr(frame_type, "map", None) or frame_type.applymap
        return elementwise(X, lambda cell: RegexTransform.applyRegex(cell, self))

    @staticmethod
    def applyRegex(val, selfObj):
        """Apply ``selfObj.regex`` to a single cell value.

        Parameters
        ----------
        val : object
            The cell value; only strings are processed.
        selfObj : RegexTransform
            Supplies ``regex`` and ``groupNum``.

        Returns
        -------
        The requested match group for string input (``""`` if there is no
        match or the group is unset); ``val`` itself for non-string input.
        """
        # isinstance (rather than an exact type check) also accepts str
        # subclasses such as numpy.str_.
        if not isinstance(val, str):
            return val
        match = re.search(selfObj.regex, val)
        if match is None:
            return ""
        captured = match.group(selfObj.groupNum)
        # A group that did not participate in the match comes back as None.
        return "" if captured is None else captured


# Try the transformer on the raw features: each string cell keeps only the
# text before its first comma; strings without a comma become "".
regexTest = RegexTransform(groupNum=1, regex="(.*?),")
results = regexTest.fit(X).transform(X)
results.head()


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,Braund,,22.0,1,0,,7.25,,
1,2,1,Cumings,,38.0,1,0,,71.2833,,
2,3,3,Heikkinen,,26.0,0,0,,7.925,,
3,4,1,Futrelle,,35.0,1,0,,53.1,,
4,5,3,Allen,,35.0,0,0,,8.05,,


In [52]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, KBinsDiscretizer

def CreateTitanicPipeline():
    """Build the unfitted ColumnTransformer that preprocesses the Titanic features.

    Column groups and their treatment:
      * "Age", "Fare": impute -> standardize -> discretize into bins.
      * "Sex", "Embarked", "Pclass": most-frequent imputation -> one-hot.
      * "SibSp", "Parch": passed through unchanged.
      * "Name": text before the first comma -> one-hot.

    ``sparse_threshold=0`` asks the ColumnTransformer for dense output.
    """
    age_fare_branch = Pipeline([
        ('imputer', SimpleImputer()),
        ('scaler', StandardScaler()),
        ('discretizer', KBinsDiscretizer()),
    ])

    category_branch = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ])

    surname_branch = Pipeline([
        ('lastName', RegexTransform(regex="(.*?),", groupNum=1)),
        ('oneHot', OneHotEncoder(handle_unknown='ignore')),
    ])

    # Order matters: it fixes the column order of the transformed output.
    branches = [
        ('numerical', age_fare_branch, ["Age", "Fare"]),
        ('categorical', category_branch, ["Sex", "Embarked", "Pclass"]),
        ('passthrough', "passthrough", ["SibSp", "Parch"]),
        ('names', surname_branch, ["Name"]),
    ]
    return ColumnTransformer(transformers=branches, sparse_threshold=0)

# Build the preprocessing transformer used by the model-tuning code, then
# print every parameter name it exposes so they can be referenced in a
# parameter grid.
titanic_pipeline = CreateTitanicPipeline()
for key in titanic_pipeline.get_params():
    print(key)


n_jobs
remainder
sparse_threshold
transformer_weights
transformers
numerical
categorical
passthrough
names
numerical__memory
numerical__steps
numerical__imputer
numerical__scaler
numerical__discretizer
numerical__imputer__copy
numerical__imputer__fill_value
numerical__imputer__missing_values
numerical__imputer__strategy
numerical__imputer__verbose
numerical__scaler__copy
numerical__scaler__with_mean
numerical__scaler__with_std
numerical__discretizer__encode
numerical__discretizer__n_bins
numerical__discretizer__strategy
categorical__memory
categorical__steps
categorical__imputer
categorical__onehot
categorical__imputer__copy
categorical__imputer__fill_value
categorical__imputer__missing_values
categorical__imputer__strategy
categorical__imputer__verbose
categorical__onehot__categorical_features
categorical__onehot__categories
categorical__onehot__dtype
categorical__onehot__handle_unknown
categorical__onehot__n_values
categorical__onehot__sparse
names__memory
names__steps
names__lastNam

All right, now to see if things have gotten better!

In [46]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

def tune_decisionTree(data, labels):
    """Grid-search the preprocessing + decision-tree pipeline; return the best one.

    The module-level ``titanic_pipeline`` is chained with a
    ``DecisionTreeClassifier`` and tuned with 3-fold cross-validation over
    both the discretizer settings and the tree hyper-parameters. The best
    parameters, best cross-validation score and refit time are printed.

    Parameters
    ----------
    data :
        Feature table containing the columns ``titanic_pipeline`` selects.
    labels :
        Target values aligned row-for-row with ``data``.

    Returns
    -------
    ``search.best_estimator_``: the pipeline refit with the best parameters.
    """
    # Function-scope import so this cell does not depend on another cell's imports.
    import inspect

    param_grid = {
        "transform__numerical__discretizer__n_bins": [8, 20, 40, 80],
        "transform__numerical__discretizer__strategy": ["quantile", "uniform", "kmeans"],
        "transform__numerical__discretizer__encode": ["ordinal", "onehot"],
        "classifier__splitter": ["best", "random"],
        "classifier__criterion": ["gini", "entropy"],
        "classifier__min_samples_split": [5, 10, 20],
        "classifier__min_samples_leaf": [1, 3, 9],
        "classifier__min_weight_fraction_leaf": [0],
        "classifier__max_leaf_nodes": [None],
        "classifier__min_impurity_decrease": [0, 0.2, 0.4],
        # Fixed seed so the "random" splitter gives repeatable results.
        "classifier__random_state": [42]
    }

    pipeline = Pipeline([
        ("transform", titanic_pipeline),
        ("classifier", DecisionTreeClassifier())
    ])

    search_kwargs = {"n_jobs": -1, "cv": 3}
    # `iid` was deprecated in scikit-learn 0.22 and removed in 0.24; passing it
    # to a newer GridSearchCV raises TypeError. Only pass iid=False where the
    # installed version still accepts it, which keeps the original behaviour
    # on old releases and matches the later releases' only behaviour.
    if "iid" in inspect.signature(GridSearchCV).parameters:
        search_kwargs["iid"] = False

    search = GridSearchCV(pipeline, param_grid, **search_kwargs)
    search.fit(data, labels)
    print("Best Params: ", search.best_params_)
    print("Best Score: ", search.best_score_)
    print("Refit Time: ", search.refit_time_)
    return search.best_estimator_

# Tune on the training features/labels; best_dt is the estimator the search returns.
best_dt = tune_decisionTree(X, y)

Best Params:  {'classifier__criterion': 'gini', 'classifier__max_leaf_nodes': None, 'classifier__min_impurity_decrease': 0, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 10, 'classifier__min_weight_fraction_leaf': 0, 'classifier__random_state': 42, 'classifier__splitter': 'random', 'transform__numerical__discretizer__encode': 'onehot', 'transform__numerical__discretizer__n_bins': 8, 'transform__numerical__discretizer__strategy': 'kmeans'}
Best Score:  0.830527497194164
Refit Time:  0.07807326316833496
