 Project Statement
 ============
 As with Project 1, apply the ideas of ch. 1 - 3 as appropriate.
 Develop and demonstrate your capabilities with:
  * Regression (ch. 4)
  * Support Vector Machines (SVMs) (ch. 5)


  Starting Point
  --------------
  To start this project, I'm going to reuse the Titanic dataset portion of the previous project, since I was a little disappointed that Derek Byrnes got a higher score than me. Friendly rivalry and whatnot. :) Now that we're working together, I'll see if I can pull in some of his techniques to produce a better score.

In [0]:
import pandas

# Load the Titanic CSV files; the paths are relative to the current working directory.
raw_training = pandas.read_csv("titanic/train.csv")
raw_test = pandas.read_csv("titanic/test.csv")
example_output = pandas.read_csv("titanic/gender_submission.csv")

# Separate the training frame into the "Survived" label column (y, copied so it
# is independent of raw_training) and all remaining columns (X).
y = raw_training["Survived"].copy()
X = raw_training.drop("Survived", axis=1)
# Preview the first rows of the labels.
y.head()

In [0]:
# Preview the first rows of the feature frame.
X.head()

In [0]:
from sklearn.base import BaseEstimator, TransformerMixin
import re

class RegexTransform(BaseEstimator, TransformerMixin):
    """Element-wise regex extraction over a DataFrame.

    Every string cell is replaced by one capture group of the first regex
    match found in it. String cells with no match (or whose requested group
    did not participate in the match) become ``""``. Non-string cells are
    returned untouched.

    Args:
        regex: Pattern handed to ``re.compile``. Note that the *compiled*
            pattern, not the original argument, is what is stored in
            ``self.regex``.
        groupNum: Index of the group to extract; ``0`` is the whole match.
    """

    def __init__(self, regex=".*", groupNum=0):
        self.regex = re.compile(regex)
        self.groupNum = groupNum

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` for chaining."""
        return self

    def transform(self, X, y=None):
        """Return a new frame with ``applyRegex`` applied to every cell of ``X``.

        Args:
            X: pandas DataFrame to process cell by cell.
            y: Ignored; accepted for scikit-learn API compatibility.
        """
        # pandas 2.1 renamed DataFrame.applymap to DataFrame.map and deprecated
        # the old name. The lookup goes through the class (not the instance) so
        # a column that happens to be called "map" cannot shadow the method.
        frame_type = type(X)
        elementwise = getattr(frame_type, "map", None) or frame_type.applymap
        return elementwise(X, lambda cell: RegexTransform.applyRegex(cell, self))

    @staticmethod
    def applyRegex(val, transObj):
        """Extract the configured group from ``val`` if it is a string.

        Args:
            val: A single cell value.
            transObj: Object providing ``regex`` (a compiled pattern) and
                ``groupNum`` (the group index to return).

        Returns:
            The captured text, ``""`` when a string has no match or the group
            is empty/unmatched, or ``val`` itself when it is not a string.
        """
        # isinstance (rather than an exact type check) also accepts str
        # subclasses such as numpy.str_.
        if not isinstance(val, str):
            return val
        match = transObj.regex.search(val)
        if match is None:
            return ""
        captured = match.group(transObj.groupNum)
        # An optional group that did not take part in the match yields None.
        return "" if captured is None else captured


# Try the transformer on the full feature frame: each string cell is replaced by
# the text captured before its first comma (empty string when there is no comma);
# non-string cells pass through unchanged.
regexTest = RegexTransform(regex="(.*?),", groupNum=1)
results = regexTest.fit_transform(X)


In [0]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, KBinsDiscretizer

def getOneHotEncoderColumns(encoder, colNames):
    """Build flat output-column labels for a fitted one-hot encoder.

    For each entry of ``encoder.categories_`` (taken in order), the name at
    the same position in ``colNames`` is combined with every category value
    as ``<column>_<category>``.

    Args:
        encoder: Object exposing ``categories_``, an iterable holding one
            collection of category values per encoded column.
        colNames: Indexable sequence of string column names, positionally
            aligned with ``encoder.categories_``.

    Returns:
        list of str: Labels in column order, then category order.
    """
    labels = []
    for position, categories in enumerate(encoder.categories_):
        column = colNames[position]
        labels.extend(column + "_" + str(category) for category in categories)
    return labels

class TitanicDataTransformer(BaseEstimator, TransformerMixin):
    """Turn raw Titanic passenger rows into an all-numeric feature DataFrame.

    Column handling (wired together through a single ``ColumnTransformer``):

    * ``Age``, ``Fare``: imputed with ``SimpleImputer``'s default strategy,
      optionally standardized, then binned with ``KBinsDiscretizer``.
    * ``Sex``, ``Embarked``, ``Pclass``: most-frequent imputation followed by
      dense one-hot encoding.
    * ``SibSp``, ``Parch``: passed through unchanged.
    * ``Name``: reduced by ``RegexTransform`` to the text before the first
      comma, then one-hot encoded.

    Args:
        numeric_scale: When true, a ``StandardScaler`` step is placed between
            imputation and binning of the numeric columns.
        numeric_nBins: ``n_bins`` forwarded to ``KBinsDiscretizer``.
        numeric_encode: ``encode`` forwarded to ``KBinsDiscretizer``.
        numeric_strategy: ``strategy`` forwarded to ``KBinsDiscretizer``.

    Attributes set by ``fit``:
        features: Output column names, in output order.
    """

    # Source columns handled by each branch of the ColumnTransformer.
    NUMERIC_COLUMNS = ("Age", "Fare")
    CATEGORICAL_COLUMNS = ("Sex", "Embarked", "Pclass")
    PASSTHROUGH_COLUMNS = ("SibSp", "Parch")
    NAME_COLUMNS = ("Name",)

    def __init__(self, numeric_scale=True, numeric_nBins=5, numeric_encode="ordinal", numeric_strategy="quantile"):
        # Store every constructor argument under its own name: BaseEstimator's
        # get_params() (and therefore repr() and clone()) reads them back via
        # getattr and fails if they are missing.
        self.numeric_scale = numeric_scale
        self.numeric_nBins = numeric_nBins
        self.numeric_encode = numeric_encode
        self.numeric_strategy = numeric_strategy
        # Built eagerly so the pipeline attributes exist right after
        # construction, as callers of the previous version may expect.
        self._build_transformers()

    @staticmethod
    def _dense_one_hot():
        """Create a dense, unknown-tolerant ``OneHotEncoder`` on any sklearn version."""
        try:
            return OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        except TypeError:
            # Older scikit-learn releases only know the ``sparse`` keyword.
            return OneHotEncoder(sparse=False, handle_unknown='ignore')

    def _build_transformers(self):
        """(Re)create the unfitted pipelines from the current parameter values."""
        numeric_steps = [('imputer', SimpleImputer())]
        if self.numeric_scale:
            numeric_steps.append(('scaler', StandardScaler()))
        numeric_steps.append(
            ('discretizer', KBinsDiscretizer(
                n_bins=self.numeric_nBins,
                encode=self.numeric_encode,
                strategy=self.numeric_strategy,
            ))
        )
        self.numeric_pipeline = Pipeline(numeric_steps)

        self.categorical_pipeline = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', self._dense_one_hot()),
        ])

        self.namesPipeline = Pipeline(steps=[
            ('lastName', RegexTransform(regex="(.*?),", groupNum=1)),
            ('oneHot', self._dense_one_hot()),
        ])

        # sparse_threshold=0 forces a dense combined output.
        self.column_transform = ColumnTransformer(sparse_threshold=0, transformers=[
            ('numerical', self.numeric_pipeline, list(self.NUMERIC_COLUMNS)),
            ('categorical', self.categorical_pipeline, list(self.CATEGORICAL_COLUMNS)),
            ('passthrough', "passthrough", list(self.PASSTHROUGH_COLUMNS)),
            ('names', self.namesPipeline, list(self.NAME_COLUMNS)),
        ])

    def _numeric_feature_names(self, fitted_numeric_pipeline):
        """Name the columns produced by the numeric branch."""
        if self.numeric_encode == "ordinal":
            # One bin-index column per source column.
            return list(self.NUMERIC_COLUMNS)
        # One-hot style encodings emit one column per bin of each source column.
        discretizer = fitted_numeric_pipeline.steps[-1][1]
        return [
            column + "_bin" + str(bin_index)
            for column, bin_count in zip(self.NUMERIC_COLUMNS, discretizer.n_bins_)
            for bin_index in range(int(bin_count))
        ]

    def _feature_names(self):
        """Assemble output column names in the ColumnTransformer's branch order."""
        fitted = self.column_transform.named_transformers_
        names = self._numeric_feature_names(fitted["numerical"])
        names += getOneHotEncoderColumns(
            fitted["categorical"].steps[-1][1], self.CATEGORICAL_COLUMNS)
        names += list(self.PASSTHROUGH_COLUMNS)
        names += getOneHotEncoderColumns(
            fitted["names"].steps[-1][1], self.NAME_COLUMNS)
        return names

    def fit(self, data, labels=None):
        """Fit all sub-pipelines on ``data`` and record the output column names.

        Args:
            data: DataFrame containing the columns listed in the class constants.
            labels: Optional targets, forwarded to ``ColumnTransformer.fit``.

        Returns:
            self
        """
        # Rebuild first so parameter changes made after construction
        # (e.g. through set_params) take effect.
        self._build_transformers()
        self.column_transform.fit(data, labels)
        self.features = self._feature_names()
        return self

    def transform(self, data):
        """Apply the fitted transformers and return a labelled DataFrame.

        Args:
            data: DataFrame with the same source columns used in ``fit``.

        Returns:
            pandas.DataFrame whose columns are ``self.features``.
        """
        transformed = self.column_transform.transform(data)
        return pandas.DataFrame(transformed, columns=self.features)

# Fit the transformer on the training features and transform them in one step.
titanicTransformer = TitanicDataTransformer()
clean_training = titanicTransformer.fit_transform(X)

# Show the first rows of the transformed feature frame.
print(clean_training.head())


