In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics

import os
TITANIC_PATH = os.path.join("datasets", "titanic")



def load_titanic_data(filename, titanic_path=TITANIC_PATH):
    csv_path = os.path.join(titanic_path, filename)
    return pd.read_csv(csv_path)



train_data = load_titanic_data("C:\\Users\\aleks\\Desktop\\datasets\\titanic\\train.csv")
test_data = load_titanic_data("C:\\Users\\aleks\\Desktop\\datasets\\titanic\\test.csv")

train_data.head()

train_data.info()

train_data.loc[train_data.Age.isna(), 'Age'] = train_data['Age'].mean()


import re

cabin = [re.sub('\d', '', v)[0] if type(v) == str else 'Z' for v in train_data['Cabin']]
train_data['CabinLvl'] = [ord(v) for v in cabin]
train_data.head()

cabin = [re.sub('\d', '', v)[0] if type(v) == str else 'Z' for v in test_data['Cabin']]
test_data['CabinLvl'] = [ord(v) for v in cabin]
test_data.head()

train_data = train_data.drop(columns=['Cabin'])


train_data = train_data.drop(columns=['Name'])


train_data.describe()

train_data['Survived'].unique()

X = train_data.drop(['Survived'], axis=1)
y = train_data['Survived'].values

from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

print("X.shape: {} y.shape: {}".format(X.shape, y.shape))

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names]

num_pipeline = Pipeline([
        ("select_numeric", DataFrameSelector(["Age", "SibSp", "Parch", "Fare","Pclass","CabinLvl"])),
        ("imputer", SimpleImputer(strategy="median")),
    ])

num_pipeline.fit_transform(X_train)

class MostFrequentImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.most_frequent_ = pd.Series([X[c].value_counts().index[0] for c in X],
                                        index=X.columns)
        return self
    def transform(self, X, y=None):
        return X.fillna(self.most_frequent_)

from sklearn.preprocessing import OneHotEncoder

cat_pipeline = Pipeline([
        ("select_cat", DataFrameSelector(["Embarked","Sex","Ticket"])),
        ("imputer", MostFrequentImputer()),
        ("cat_encoder", OneHotEncoder(sparse=False, handle_unknown = 'ignore')),
    ])

cat_pipeline.fit_transform(X_train)
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import VotingClassifier


preprocess_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

pipe1 = Pipeline([
    ('preprocessing', preprocess_pipeline),
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression())])

param_grid = {
            'classifier__C': [0.001, 0.01, 0.1,0.2,0.5, 1, 10, 100],
            'classifier__penalty': [None, 'l1', 'l2', 'elasticnet','newton-cg']
}

grid_1 = GridSearchCV(pipe1, param_grid)

grid_1.fit(X_train, y_train)
grid_1.best_params_

pipe_2 = Pipeline([('preprocessing', preprocess_pipeline), ('classifier', SVC(probability=True))])

param_grid_2 = {
            'classifier__C': [100000,1e6,1e7],
            'classifier__gamma': [1e-07,1e-6,0.000001,0.00001]
}

grid_2 = GridSearchCV(pipe_2, param_grid_2, return_train_score=True)

grid_2.fit(X_train, y_train)
grid_2.best_params_

pipe = Pipeline([
    ('preprocessing', preprocess_pipeline), 
    ('classifier', SVC(kernel='linear',probability=True))])


param_grid = {
            'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100]
}

grid_4 = GridSearchCV(pipe, param_grid)

grid_4.fit(X_train, y_train)
grid_4.best_params_

from sklearn import  metrics

voting_clf = VotingClassifier(
    estimators=[('grid1', grid_1.best_estimator_),
                ('grid2', grid_2.best_estimator_),
                ('grid3', grid_4.best_estimator_)],
    voting='soft')
voting_clf.fit(X_train, y_train)
y_pred = voting_clf.predict(test_data)


models = []
models.append(('SVM linear', grid_4.best_estimator_))
models.append(('SVM rbf', grid_2.best_estimator_))
#models.append(('SVM poly', grid_3.best_estimator_))
models.append(('Logical Regression', grid_1.best_estimator_))



precision_score = []
recall_score = []
f1_score = []
accuracy_score = []
for name, model in models:
    print(name)
    print("precision_score: {}".format(metrics.precision_score(y_test, model.predict(X_test)) ))
    print("recall_score: {}".format( metrics.recall_score(y_test, model.predict(X_test)) ))
    print("f1_score: {}".format( metrics.f1_score(y_test, model.predict(X_test)) ))
    print("accuracy_score: {}".format( metrics.accuracy_score(y_test, model.predict(X_test)) ))
    precision_score.append(metrics.precision_score(y_test, model.predict(X_test)))
    recall_score.append(metrics.recall_score(y_test, model.predict(X_test)))
    f1_score.append( metrics.f1_score(y_test, model.predict(X_test)))
    accuracy_score.append(metrics.accuracy_score(y_test, model.predict(X_test)))

    
    
import pandas as pd
d = {'precision_score': precision_score, 
     'recall_score': recall_score, 
     'f1_score': f1_score,
     'accuracy_score' : accuracy_score
    }
df = pd.DataFrame(data=d)
df.insert(loc=0, column='Method', value=['SVM linear','SVM rbf','Logical Regression'])
print(df)

def generate_csv(prediction, file_name):
    # test_data = load_titanic_data("test.csv")
    df = pd.DataFrame({'PassengerId': test_data['PassengerId'], 'Survived': prediction})
    df.to_csv(file_name, index=False)


generate_csv(y_pred, "konkursTitanic.csv")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
X.shape: (891, 9) y.shape: (891,)














120 fits failed out of a total of 200.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\aleks\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\aleks\anaconda3\lib\site-packages\sklearn\pipeline.py", line 406, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\aleks\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\aleks\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 5









SVM linear
precision_score: 0.7647058823529411
recall_score: 0.7027027027027027
f1_score: 0.7323943661971832
accuracy_score: 0.7877094972067039
SVM rbf
precision_score: 0.7714285714285715
recall_score: 0.7297297297297297
f1_score: 0.75
accuracy_score: 0.7988826815642458
Logical Regression
precision_score: 0.7681159420289855
recall_score: 0.7162162162162162
f1_score: 0.7412587412587412
accuracy_score: 0.7932960893854749
               Method  precision_score  recall_score  f1_score  accuracy_score
0          SVM linear         0.764706      0.702703  0.732394        0.787709
1             SVM rbf         0.771429      0.729730  0.750000        0.798883
2  Logical Regression         0.768116      0.716216  0.741259        0.793296
