In [115]:
import os
import urllib.request
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedShuffleSplit

TITANIC_PATH = os.path.join("datasets", "titanic")
DOWNLOAD_URL = "https://raw.githubusercontent.com/rickiepark/handson-ml2/master/datasets/titanic/"

def fetch_titanic_data(url=DOWNLOAD_URL, path=TITANIC_PATH):
    if not os.path.isdir(path):
        os.makedirs(path)
    for filename in ("train.csv", "test.csv"):
        filepath = os.path.join(path, filename)
        if not os.path.isfile(filepath):
            print("Downloading", filename)
            urllib.request.urlretrieve(url + filename, filepath)

fetch_titanic_data()   

In [116]:
def load_titanic_data(filename, titanic_path=TITANIC_PATH):
    csv_path = os.path.join(titanic_path, filename)
    return pd.read_csv(csv_path)

In [117]:
train_data = load_titanic_data("train.csv")
test_data = load_titanic_data("test.csv")

In [118]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [119]:
"""from sklearn.base import BaseEstimator, TransformerMixin

name_ix = 3

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_second_name = True):
        self.add_second_name = add_second_name
    def fit(self,X, y= None):
        return self
    def transform(self, X):
        X_name_1 = X[:,name_ix].str.split(",").str[0].value_counts().reset_index(drop=False)
        X_name_1.columns = ["First_name","first_name_count"]
        X_train_name_1 = X.join(X_name_1.set_index("First_name")['first_name_count'], on='First_name')
        first_name = X_train_name_1["first_name_count"]
        
        if self.add_second_name:
            X_name_2 = X[:,name_ix].str.split(".").str[1].value_counts().reset_index(drop=False)
            X_name_2.columns = ["Second_name","second_name_count"]
            X_train_name_2 = X.join(X_name_2.set_index("Second_name")['second_name_count'], on='Second_name')
            second_name = X_train_name_1["first_name_count"]
            return np.c_[X, first_name, second_name]
        else:
            return np.c_[X, first_name]"""

'from sklearn.base import BaseEstimator, TransformerMixin\n\nname_ix = 3\n\nclass CombinedAttributesAdder(BaseEstimator, TransformerMixin):\n    def __init__(self, add_second_name = True):\n        self.add_second_name = add_second_name\n    def fit(self,X, y= None):\n        return self\n    def transform(self, X):\n        X_name_1 = X[:,name_ix].str.split(",").str[0].value_counts().reset_index(drop=False)\n        X_name_1.columns = ["First_name","first_name_count"]\n        X_train_name_1 = X.join(X_name_1.set_index("First_name")[\'first_name_count\'], on=\'First_name\')\n        first_name = X_train_name_1["first_name_count"]\n        \n        if self.add_second_name:\n            X_name_2 = X[:,name_ix].str.split(".").str[1].value_counts().reset_index(drop=False)\n            X_name_2.columns = ["Second_name","second_name_count"]\n            X_train_name_2 = X.join(X_name_2.set_index("Second_name")[\'second_name_count\'], on=\'Second_name\')\n            second_name = X_train_n

In [120]:
def name_count(X_train):
    X_train["First_name"] = X_train.Name.str.split(",").str[0]
    X_train["Second_name"] = X_train.Name.str.split(".").str[1]
    
    X_name_1 = X_train.Name.str.split(",").str[0].value_counts().reset_index(drop=False)
    X_name_2 = X_train.Name.str.split(".").str[1].value_counts().reset_index(drop=False)
    X_name_1.columns = ["First_name","first_name_count"]
    X_name_2.columns = ["Second_name","second_name_count"]
    X_train_name = X_train.join(X_name_1.set_index("First_name")['first_name_count'], on='First_name')
    X_train_name = X_train_name.join(X_name_2.set_index("Second_name")['second_name_count'], on='Second_name')
    X_train_name = X_train_name.drop(["Name", "First_name", "Second_name", "Ticket", "Cabin"], axis=1).copy()
    return X_train_name

temp = train_data.copy()
train_data = name_count(temp).copy()
train_data

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,first_name_count,second_name_count
0,1,0,3,male,22.0,1,0,7.2500,S,2,1
1,2,1,1,female,38.0,1,0,71.2833,C,1,1
2,3,1,3,female,26.0,0,0,7.9250,S,1,1
3,4,1,1,female,35.0,1,0,53.1000,S,2,1
4,5,0,3,male,35.0,0,0,8.0500,S,2,4
...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,male,27.0,0,0,13.0000,S,1,1
887,888,1,1,female,19.0,0,0,30.0000,S,3,1
888,889,0,3,female,,1,2,23.4500,S,2,1
889,890,1,1,male,26.0,0,0,30.0000,C,1,1


In [121]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler())
])

from sklearn.preprocessing import OneHotEncoder

cat_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        #('attribs_adder', CombinedAttributesAdder()),
        ("cat_encoder", OneHotEncoder(sparse=False)),
    ])

from sklearn.compose import ColumnTransformer

num_attribs = ["Age", "SibSp", "Parch", "Fare"]
cat_attribs = ["Pclass", "Sex", "Embarked", "first_name_count", "second_name_count"]

preprocess_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", cat_pipeline, cat_attribs),
    ])

In [122]:
X_train = preprocess_pipeline.fit_transform(train_data[num_attribs + cat_attribs])
X_train



array([[-0.56573646,  0.43279337, -0.47367361, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.66386103,  0.43279337, -0.47367361, ...,  0.        ,
         0.        ,  0.        ],
       [-0.25833709, -0.4745452 , -0.47367361, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.1046374 ,  0.43279337,  2.00893337, ...,  0.        ,
         0.        ,  0.        ],
       [-0.25833709, -0.4745452 , -0.47367361, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.20276197, -0.4745452 , -0.47367361, ...,  0.        ,
         0.        ,  0.        ]])

In [123]:
y_train = train_data["Survived"]
y_train

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [124]:
from sklearn.ensemble import RandomForestClassifier

forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
forest_clf.fit(X_train, y_train)

In [125]:
temp = test_data.copy()
test_data = name_count(temp).copy()
X_test = preprocess_pipeline.transform(test_data[num_attribs + cat_attribs])
y_pred = forest_clf.predict(X_test)

ValueError: Found unknown categories [5] in column 4 during transform

In [None]:
y_pred

In [126]:
from sklearn.model_selection import cross_val_score

forest_scores = cross_val_score(forest_clf, X_train, y_train, cv=10)
forest_scores.mean()

0.8092259675405742

In [127]:
from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train, y_train)

In [128]:
sgd_scores = cross_val_score(forest_clf, X_train, y_train, cv=10)
sgd_scores.mean()

0.8092259675405742

In [129]:
from sklearn.svm import SVC

svm_clf = SVC(gamma="auto")
svm_scores = cross_val_score(svm_clf, X_train, y_train, cv=10)
svm_scores.mean()

0.8103245942571784

In [130]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'kernel' : ['linear', 'poly', 'rbf', 'sigmoid'], 'degree' : [3,8,12], 'gamma' : ['auto', 'scale']},
]
svc_reg = SVC()
grid_search = GridSearchCV(svc_reg, param_grid, cv=5,
                          scoring='neg_mean_squared_error',
                          return_train_score=True)
grid_search.fit(X_train, y_train)

In [131]:
grid_search.best_params_

{'degree': 3, 'gamma': 'scale', 'kernel': 'rbf'}

In [132]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

0.4593650591968736 {'degree': 3, 'gamma': 'auto', 'kernel': 'linear'}
0.5926918767683125 {'degree': 3, 'gamma': 'auto', 'kernel': 'poly'}
0.43936887303372246 {'degree': 3, 'gamma': 'auto', 'kernel': 'rbf'}
0.46542634325885496 {'degree': 3, 'gamma': 'auto', 'kernel': 'sigmoid'}
0.4593650591968736 {'degree': 3, 'gamma': 'scale', 'kernel': 'linear'}
0.4197661588865654 {'degree': 3, 'gamma': 'scale', 'kernel': 'poly'}
0.41574679285504407 {'degree': 3, 'gamma': 'scale', 'kernel': 'rbf'}
0.5734429866640974 {'degree': 3, 'gamma': 'scale', 'kernel': 'sigmoid'}
0.4593650591968736 {'degree': 8, 'gamma': 'auto', 'kernel': 'linear'}
0.6159075929701427 {'degree': 8, 'gamma': 'auto', 'kernel': 'poly'}
0.43936887303372246 {'degree': 8, 'gamma': 'auto', 'kernel': 'rbf'}
0.46542634325885496 {'degree': 8, 'gamma': 'auto', 'kernel': 'sigmoid'}
0.4593650591968736 {'degree': 8, 'gamma': 'scale', 'kernel': 'linear'}
0.5254074404399411 {'degree': 8, 'gamma': 'scale', 'kernel': 'poly'}
0.41574679285504407 {'d