In [1]:
# Load the Titanic data
import os

# Default directory (relative to the working directory) holding the Titanic CSV files.
TITANIC_PATH = os.path.join("datasets", "titanic")
import pandas as pd


def load_titanic_data(filename, titanic_path=TITANIC_PATH):
    """Read one CSV file from the Titanic data directory.

    Parameters
    ----------
    filename : str
        Name of the CSV file inside ``titanic_path``.
    titanic_path : str, optional
        Directory containing the file; defaults to ``TITANIC_PATH``.

    Returns
    -------
    pandas.DataFrame
        Whatever ``pd.read_csv`` parses from the joined path.
    """
    return pd.read_csv(os.path.join(titanic_path, filename))

# Load the training split first, then the test split, from the default directory.
train_data, test_data = (
    load_titanic_data(csv_name) for csv_name in ("train.csv", "test.csv")
)

# Reused from earlier work: selects DataFrame columns so each pipeline can preprocess its own subset
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that keeps only the requested columns of its input.

    Parameters
    ----------
    attribute_names : list
        Column labels used to index the input in ``transform``.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer unchanged."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by the configured column labels."""
        wanted = self.attribute_names
        return X[wanted]
    
# Fill the pipeline with the Numerical Data    
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
# Numeric branch: keep Age, SibSp, Parch and Fare, then fill missing
# values with the median strategy.
numericPipeline = Pipeline(
    steps=[
        ("select_numeric", DataFrameSelector(["Age", "SibSp", "Parch", "Fare"])),
        ("imputer", SimpleImputer(strategy="median")),
    ]
)
# Trial run on the training frame; the returned array is not stored.
numericPipeline.fit_transform(train_data)

# Categorical preprocessing
class MostFrequentImputer(BaseEstimator, TransformerMixin):
    """Fill missing entries of each column with that column's most common value.

    After ``fit``, ``most_frequent_`` is a Series indexed by column label
    holding the fill value learned for every column.
    """

    def fit(self, X, y=None):
        """Record the top ``value_counts()`` entry of every column of ``X``.

        NOTE(review): a column with no non-null values yields an empty
        ``value_counts()`` and ``index[0]`` raises ``IndexError``.
        """
        top_values = [X[column].value_counts().index[0] for column in X]
        self.most_frequent_ = pd.Series(top_values, index=X.columns)
        return self

    def transform(self, X, y=None):
        """Return ``X`` with missing entries replaced by the learned values."""
        fill_values = self.most_frequent_
        return X.fillna(fill_values)

from sklearn.preprocessing import OneHotEncoder

# Categorical branch: select Pclass, Sex and Embarked, fill missing entries
# with each column's most frequent value, then one-hot encode to a dense array.
#
# The OneHotEncoder keyword `sparse` was renamed to `sparse_output` in
# scikit-learn 1.2 and removed in 1.4. Try the current spelling first and
# fall back to the old one so the cell runs on both old and new releases.
try:
    cat_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    # Older scikit-learn: `sparse_output` is not a known keyword yet.
    cat_encoder = OneHotEncoder(sparse=False)

categoricPipeline = Pipeline([
        ("select_cat", DataFrameSelector(["Pclass", "Sex", "Embarked"])),
        ("imputer", MostFrequentImputer()),
        ("cat_encoder", cat_encoder),
    ])

# Trial run on the training frame; the returned array is not stored.
categoricPipeline.fit_transform(train_data)

from sklearn.pipeline import FeatureUnion

# Combine both branches: FeatureUnion concatenates the outputs of the numeric
# and categorical pipelines.
preprocess_pipeline = FeatureUnion(
    transformer_list=[
        ("numericalPipeline", numericPipeline),
        ("categoricPipeline", categoricPipeline),
    ]
)

# Labels come straight from the raw frame; features come from the fitted union.
y_train = train_data["Survived"]
X_train = preprocess_pipeline.fit_transform(train_data)

from sklearn.svm import SVC

# Fit a support vector classifier on the full training set (fit returns the estimator).
sVCMachine = SVC(gamma="auto").fit(X_train, y_train)

# Reuse the already-fitted preprocessing on the test frame, then predict.
X_test = preprocess_pipeline.transform(test_data)
y_pred = sVCMachine.predict(X_test)

from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the SVC; the cell displays the mean fold score.
svm_scores = cross_val_score(
    estimator=sVCMachine,
    X=X_train,
    y=y_train,
    cv=10,
)
svm_scores.mean()


0.7329588014981274

In [2]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with a fixed seed, scored with the same 10-fold
# cross-validation as the SVC; the cell displays the mean fold score.
forest_clf = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
)
forest_scores = cross_val_score(
    estimator=forest_clf,
    X=X_train,
    y=y_train,
    cv=10,
)
forest_scores.mean()

0.8126466916354558

In [3]:
# Bucket ages into 15-year-wide bins labelled by their lower edge
# (floor-divide by 15, then scale back), and show the mean of Survived per bin.
train_data["AgeBucket"] = (train_data["Age"] // 15) * 15
train_data.groupby(["AgeBucket"])[["Survived"]].mean()

Unnamed: 0_level_0,Survived
AgeBucket,Unnamed: 1_level_1
0.0,0.576923
15.0,0.362745
30.0,0.423256
45.0,0.404494
60.0,0.24
75.0,1.0


In [4]:
# Total relatives aboard = SibSp + Parch; show the mean of Survived per count.
train_data["RelativesOnboard"] = train_data["SibSp"].add(train_data["Parch"])
train_data.groupby(["RelativesOnboard"])[["Survived"]].mean()

Unnamed: 0_level_0,Survived
RelativesOnboard,Unnamed: 1_level_1
0,0.303538
1,0.552795
2,0.578431
3,0.724138
4,0.2
5,0.136364
6,0.333333
7,0.0
10,0.0


###### 