In [1]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sn

# Read the three Titanic CSV files in one pass.
# NOTE(review): the paths are absolute and machine-specific; the notebook only
# runs where the files exist at exactly this location.
train_orig, test_orig, gender_orig = [
    pd.read_csv(csv_path)
    for csv_path in (
        "C:\\Users\\basti\\learnpy\\datasets\\titanic\\train.csv",
        "C:\\Users\\basti\\learnpy\\datasets\\titanic\\test.csv",
        "C:\\Users\\basti\\learnpy\\datasets\\titanic\\gender_submission.csv",
    )
]

# Work on copies so the frames as loaded from disk stay untouched.
train, test = train_orig.copy(), test_orig.copy()
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


<h1>Preprocessing (Veraltete Überlegungen)</h1>

<h2>#1 Remove Passenger ID, Name, Ticket via Transformer</h2>

Über den ColumnTransformer können Spalten in der Pipeline standardmäßig entfernt werden.
Vgl.:

clmn_trnsfrmr = ColumnTransformer([
        ('clmn_drpr', 'drop', ['pet']),
        ('scale', StandardScaler(), ['salary'])],
    remainder='passthrough')

Dies soll für die folgenden Spalten geschehen:
- PassengerId
- Name
- Ticket

<h2>#2 Kategorisieren</h2>

Auffüllen mit SimpleImputer
Kategorisierung über den OneHotEncoder
Dies soll die folgenden Spalten betreffen:
- Pclass
- Sex
- Embarked

<h2>#3 Numerische Werte → fehlende auffüllen und skalieren</h2>

Fehlende Werte werden über einen Imputer-Transformer mit dem Mittelwert aufgefüllt für:
- Age
- SibSp
- Parch
- Fare

<h2>#4 Cabin</h2>

Hier sollen die Kabinennummern zunächst in einfache Buchstaben umgewandelt werden.
1) Dafür wird der Transformer 'Replace_Letters()' geschrieben und verwendet.
2) Ein Imputer soll fehlende Werte durch ein 'D' ersetzen, weil diese Kategorie in der Mitte des Schiffs liegt und daher vermutlich die ausgeglichensten Überlebenschancen bietet.
3) Anschließend: OneHotEncoder

<h2>ColumnTransformer</h2>

In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import Normalizer
from sklearn.base import TransformerMixin,BaseEstimator
from sklearn.impute import SimpleImputer


#definition of Room_Replacer

class Room_Replacer(BaseEstimator,TransformerMixin):
    """Stateless transformer that shortens every string value to its first character.

    Used on the 'Cabin' column so that a cabin code such as 'C85' becomes 'C'.
    Non-string values (for example missing values) are left untouched.
    """

    def __init__(self):
        # No hyper-parameters: the transformer carries no state.
        pass

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    @staticmethod
    def _first_char(value):
        """Return the first character of a string; any other value unchanged."""
        # Slicing (instead of value[0]) keeps an empty string from raising IndexError.
        return value[:1] if isinstance(value, str) else value

    def transform(self, X, y=None):
        """Shorten all string entries of ``X`` to their first character.

        Parameters
        ----------
        X : pandas.DataFrame, pandas.Series or any other object
            Data to convert.
        y : ignored
            Present only for pipeline compatibility.

        Returns
        -------
        pandas.DataFrame
            For DataFrame input: a converted copy; the caller's frame is not
            modified.
        list
            For Series input: the converted values as a plain list.
        object
            Any other input is returned unchanged.
        """
        if isinstance(X, pd.DataFrame):
            # Work on a copy so the frame handed in by the caller is not overwritten.
            shortened = X.copy()
            for column in shortened.columns:
                shortened[column] = [self._first_char(value) for value in shortened[column]]
            return shortened
        if isinstance(X, pd.Series):
            # A single pass is enough; the values are returned as a list.
            return [self._first_char(value) for value in X]
        return X


# Column groups for the preprocessing step.
# (The variables are named "*_rows", but each one holds column names.)
drop_rows = [                                   # columns removed entirely
    "PassengerId",
    "Name",
    "Ticket",
    "Embarked",
]
cat_rows = ["Pclass", "Sex"]                    # categorical columns
num_rows = ["Age", "SibSp", "Parch", "Fare"]    # numeric columns

#Pipelines

# Pipeline for categorical values: encode as ordinal codes, fill gaps, one-hot encode.
pip_cat = Pipeline([
    ('o_encoder_cat', OrdinalEncoder()),
    # Fill gaps with the most frequent code. A mean of ordinal codes can be
    # fractional, and the one-hot encoder would then treat that value as an
    # additional category of its own.
    ('imputer_cat', SimpleImputer(strategy='most_frequent')),
    ('o_h_encoder_cat', OneHotEncoder())
])

# Pipeline for numeric values: fill gaps with the column mean, then apply Normalizer.
# NOTE(review): the step is named 'std_scaler' but uses Normalizer, which rescales
# each row to unit norm rather than standardising each column - confirm that
# StandardScaler was not intended.
pipe_num = Pipeline(steps=[
    ('imputer_num', SimpleImputer(strategy='mean')),
    ('std_scaler', Normalizer()),
])

# Pipeline for 'Cabin': shorten each value with Room_Replacer, fill missing
# entries with the constant 'D', then one-hot encode.
pipe_cabin = Pipeline(steps=[
    ('room_replacer', Room_Replacer()),
    ('imputer_cabin', SimpleImputer(fill_value='D', strategy='constant')),
    ('o_h_encoder_cabin', OneHotEncoder()),
])

# Route every column group through its own branch; columns that are not listed
# in any branch are passed through unchanged.
col_tran = ColumnTransformer(
    [
        ('clmn_dropper', 'drop', drop_rows),   # discard the unused columns
        ('cabins', pipe_cabin, ['Cabin']),
        ('cat', pip_cat, cat_rows),            # one-hot encoding for 'Pclass' and 'Sex'
        ('age', pipe_num, num_rows),           # handles every column in num_rows, not only 'Age'
    ],
    remainder='passthrough',
    n_jobs=1,
)

<h2>Preparing the data</h2>

In [3]:
from sklearn.model_selection import train_test_split

# Separate the target column from the features.
y = train['Survived'].copy()
X = train.drop(columns='Survived')

# NOTE(review): the transformer is fitted on all rows before the split, so the
# imputation statistics and encoder categories are also learned from the rows
# that later form the held-out set.
X_tf = col_tran.fit_transform(X)

# Hold out 20 % of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_tf,
    y,
    test_size=0.2,
    random_state=42,
)
pd.DataFrame(X_tf)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.948873,0.043131,0.000000,0.312697
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.470381,0.012378,0.000000,0.882377
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.956551,0.000000,0.000000,0.291564
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.550270,0.015722,0.000000,0.834839
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.974555,0.000000,0.000000,0.224148
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.901002,0.000000,0.000000,0.433816
887,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.535052,0.000000,0.000000,0.844819
888,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.783473,0.026380,0.052761,0.618619
889,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.654931,0.000000,0.000000,0.755689


<h1>Machine Learning</h1>
<h2>#1 Lineare Regression</h2>

In [4]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training split; fit() returns the estimator itself.
lin_reg = LinearRegression().fit(X_train, y_train)
lin_reg

LinearRegression()

In [5]:
from sklearn.metrics import mean_squared_error
# Root-mean-squared error of the linear regression on the held-out split.
survival_predictions = lin_reg.predict(X_test)
lin_mse = mean_squared_error(y_true=y_test, y_pred=survival_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

0.38183189806038553

In [6]:
# Score of the linear regression on the held-out split.
lin_reg.score(X=X_test, y=y_test)

0.39878548679818204

## #2 Support Vector Machine

In [7]:
from sklearn.svm import LinearSVR

# Linear support vector regression on the training split.
# random_state pins the estimator's internal randomness so the scores below are
# reproducible from run to run; 42 matches the seed used for train_test_split.
svr = LinearSVR(random_state=42)
svr.fit(X_train, y_train)

LinearSVR()

In [8]:
# Root-mean-squared error of the support vector regression on the held-out split.
svr_predict = svr.predict(X_test)
svr_mse = mean_squared_error(y_true=y_test, y_pred=svr_predict)
svr_rmse = np.sqrt(svr_mse)
svr_rmse

0.4727221086692939

In [9]:
# Score of the support vector regression on the held-out split.
svr.score(X=X_test, y=y_test)

0.07849674920658778

## #3 DecisionTree

In [10]:
from sklearn.tree import DecisionTreeRegressor
# Decision tree regression on the training split.
# random_state pins the estimator's internal randomness so the fitted tree and
# its scores are reproducible; 42 matches the seed used for train_test_split.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(X_train, y_train)


DecisionTreeRegressor()

In [11]:
# Root-mean-squared error of the decision tree on the held-out split.
tree_predict = tree_reg.predict(X_test)
tree_mse = mean_squared_error(y_true=y_test, y_pred=tree_predict)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.48222105816501565

In [12]:
# Score of the decision tree on the held-out split.
tree_reg.score(X=X_test, y=y_test)

0.04109101813183447

## #4 RidgeRegression

In [13]:
from sklearn.linear_model import Ridge
# Ridge regression (alpha=1, Cholesky solver) on the training split;
# fit() returns the estimator itself.
ridge_reg = Ridge(alpha=1, solver='cholesky').fit(X_train, y_train)
ridge_reg

Ridge(alpha=1, solver='cholesky')

In [14]:
# Root-mean-squared error of the ridge regression on the held-out split.
ridge_predict = ridge_reg.predict(X_test)
ridge_mse = mean_squared_error(y_true=y_test, y_pred=ridge_predict)
ridge_rmse = np.sqrt(ridge_mse)
ridge_rmse

0.38331085790005753

In [15]:
# Score of the ridge regression on the held-out split.
ridge_reg.score(X=X_test, y=y_test)

0.3941190660417444