Titanic konkurs

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics

Pobranie danych

In [2]:
import os
TITANIC_PATH = os.path.join("datasets", "titanic")

def load_titanic_data(filename, titanic_path=TITANIC_PATH):
    csv_path = os.path.join(titanic_path, filename)
    return pd.read_csv(csv_path)

train_data = load_titanic_data("train.csv")
test_data = load_titanic_data("test.csv")

Sprawdzenie danych

In [3]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


Age, Cabin i Embarked mają niekiedy wartość null. Age można uzuepłnić, Cabin będziemy ignorować
Atrybuty liczbowe:

In [7]:
train_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


Przewidujemy czy pasażer przeżył katastrofę. Dzielimy train_data:

In [8]:
x_train = train_data.drop(['Survived'], axis=1)
y_train = train_data['Survived'].values

Uzupełniamy Age - mediana

In [28]:
x_train['Age'] = train_data.groupby(['Pclass', 'Sex', 'Parch'], group_keys=False)['Age'].apply(lambda x : x.fillna(x.mean()))
test_data['Age'] = test_data.groupby(['Pclass', 'Sex', 'Parch'], group_keys=False)['Age'].apply(lambda x : x.fillna(x.mean()))

In [29]:
train_data.groupby(['Pclass', 'Sex', 'Parch'])['Age'].value_counts()

Pclass  Sex     Parch  Age 
1       female  0      35.0    6
                       30.0    5
                       24.0    4
                       33.0    3
                       38.0    3
                              ..
3       male    2      26.0    1
                       40.5    1
                3      16.0    1
                4      40.0    1
                5      39.0    1
Name: Age, Length: 374, dtype: int64

Atrybuty kategoryczne, to:
* Sex
* Embarked
* Ticket

In [20]:
train_data['Sex'].value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [21]:
train_data['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [22]:
train_data['Ticket'].value_counts()

347082      7
CA. 2343    7
1601        7
3101295     6
CA 2144     6
           ..
9234        1
19988       1
2693        1
PC 17612    1
370376      1
Name: Ticket, Length: 681, dtype: int64

Budujemy pipeline dla atrybutów numerycznych:

In [32]:
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names]

In [33]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

num_pipeline = Pipeline([
        ("select_numeric", DataFrameSelector(['Age', 'Parch',  'Fare', 'SibSp', 'Pclass'])),
        ("imputer", SimpleImputer(strategy="median")),
    ])

In [34]:
num_pipeline.fit_transform(x_train)

array([[22.    ,  0.    ,  7.25  ,  1.    ,  3.    ],
       [38.    ,  0.    , 71.2833,  1.    ,  1.    ],
       [26.    ,  0.    ,  7.925 ,  0.    ,  3.    ],
       ...,
       [13.5   ,  2.    , 23.45  ,  1.    ,  3.    ],
       [26.    ,  0.    , 30.    ,  0.    ,  1.    ],
       [32.    ,  0.    ,  7.75  ,  0.    ,  3.    ]])

Oraz dla atrybutów kategorycznych:

In [36]:
class MostFrequentImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.most_frequent_ = pd.Series([X[c].value_counts().index[0] for c in X],
                                        index=X.columns)
        return self
    def transform(self, X, y=None):
        return X.fillna(self.most_frequent_)

In [37]:
from sklearn.preprocessing import OneHotEncoder

cat_pipeline = Pipeline([
        ("select_cat", DataFrameSelector(["Sex", "Embarked", "Ticket"])),
        ("imputer", MostFrequentImputer()),
        ("cat_encoder", OneHotEncoder(sparse=False, handle_unknown = 'ignore')),
    ])

In [38]:
cat_pipeline.fit_transform(x_train)



array([[0., 1., 0., ..., 0., 0., 0.],
       [1., 0., 1., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])

Łączenie

In [39]:
from sklearn.pipeline import FeatureUnion
preprocess_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])

Testowanie modeli:

In [41]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

seed=123
kfold = StratifiedKFold(n_splits=5, random_state=seed,shuffle=True)

In [42]:
pipe_rbf = Pipeline([('preprocessing', preprocess_pipeline),
                     ('scaler', StandardScaler()), ('classifier', SVC(kernel='rbf'))])

param_grid_rbf = {
            'classifier__gamma': np.logspace(-6, 6, num=10),
            'classifier__C': np.logspace(-6, 6, num=10)}

grid_rbf = GridSearchCV(pipe_rbf, param_grid_rbf, cv=kfold, return_train_score=True)

grid_rbf.fit(x_train, y_train)
grid_rbf.best_params_

# {'classifier__C': 100.0, 'classifier__gamma': 0.00046415888336127773}



{'classifier__C': 100.0, 'classifier__gamma': 0.00046415888336127773}

In [48]:
pred = grid_rbf.predict(test_data)
passenger_id = test_data['PassengerId'].values
res = pd.DataFrame({'PassengerId':passenger_id,'Survived':pred})
res.to_csv('prediction/pred_svm_rbf.csv', index = False)

# Score: 0.79186

In [50]:
pipe_poly = Pipeline([('preprocessing', preprocess_pipeline),
                     ('scaler', StandardScaler()), ('classifier', SVC(kernel='poly'))])

param_grid_poly = {
            'classifier__gamma': [0.001, 0.01, 0.1, 1, 10],
            'classifier__C': [0.1, 1, 10, 100, 1000],
            'classifier__degree': [1, 2, 3, 4],
            'classifier__coef0': [0, 1]}

grid_poly = GridSearchCV(pipe_poly, param_grid_poly, cv=kfold, return_train_score=True)

grid_poly.fit(x_train, y_train)
grid_poly.best_params_

# {'classifier__C': 10,
#  'classifier__coef0': 1,
#  'classifier__degree': 2,
#  'classifier__gamma': 0.01}



{'classifier__C': 10,
 'classifier__coef0': 1,
 'classifier__degree': 2,
 'classifier__gamma': 0.01}

In [51]:
pred = grid_poly.predict(test_data)
passenger_id = test_data['PassengerId'].values
res = pd.DataFrame({'PassengerId':passenger_id,'Survived':pred})
res.to_csv('prediction/pred_svm_poly.csv', index = False)

# Score: 0.77272

In [52]:
pipe_lin = Pipeline([('preprocessing', preprocess_pipeline),
                     ('scaler', StandardScaler()), ('classifier', SVC(kernel='linear'))])

param_grid_lin = {
            'classifier__C': np.logspace(-6, 6, num=13)}

grid_lin = GridSearchCV(pipe_lin, param_grid_lin, cv=kfold, return_train_score=True)

grid_lin.fit(x_train, y_train)
grid_lin.best_params_



{'classifier__C': 0.1}

In [53]:
pred = grid_lin.predict(test_data)
passenger_id = test_data['PassengerId'].values
res = pd.DataFrame({'PassengerId':passenger_id,'Survived':pred})
res.to_csv('prediction/pred_svm_lin.csv', index = False)

# Score: 0.78708

In [54]:
pipe_log_reg = Pipeline([('preprocessing', preprocess_pipeline),
                     ('scaler', StandardScaler()), ('classifier', LogisticRegression())])

param_grid_log_reg = {
            'classifier__penalty': ['l1', 'l2', 'elasticnet', None],
            'classifier__solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
            'classifier__C': np.logspace(-6, 6, num=13)}

grid_log_reg = GridSearchCV(pipe_log_reg, param_grid_log_reg, cv=kfold, return_train_score=True)

grid_log_reg.fit(x_train, y_train)
grid_log_reg.best_params_

# {'classifier__C': 1.0,
#  'classifier__penalty': 'l1',
 # 'classifier__solver': 'liblinear'}

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

{'classifier__C': 1.0,
 'classifier__penalty': 'l1',
 'classifier__solver': 'liblinear'}

In [55]:
pred = grid_log_reg.predict(test_data)
passenger_id = test_data['PassengerId'].values
res = pd.DataFrame({'PassengerId':passenger_id,'Survived':pred})
res.to_csv('prediction/pred_log_reg.csv', index = False)

# Score: 0.78468

In [57]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
from scipy.stats.distributions import uniform, randint

pipe_xgb = Pipeline([
  ('preprocessing', preprocess_pipeline),
  ('scaler', StandardScaler()),
  ('classifier', XGBClassifier()),
])

param_distribution = {
    'classifier__max_depth': randint(3, 11),
    'classifier__learning_rate': uniform(0.001, 0.1-0.001),
    'classifier__n_estimators': randint(50, 400),
    'classifier__gamma': uniform(0,2),
    'classifier__colsample_bytree': uniform(0.5, 0.5),
    'classifier__subsample': uniform(0.5, 0.5),
    'classifier__min_child_weight': randint(1, 11)
}

clf2 = RandomizedSearchCV(XGBClassifier(), param_distributions=param_distribution)
clf2.fit(x_train, y_train)
clf2.best_params_

ValueError: 
All the 50 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
50 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\agsud_z3nlru4\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\agsud_z3nlru4\AppData\Local\Programs\Python\Python39\lib\site-packages\xgboost\core.py", line 620, in inner_f
    return func(**kwargs)
  File "C:\Users\agsud_z3nlru4\AppData\Local\Programs\Python\Python39\lib\site-packages\xgboost\sklearn.py", line 1471, in fit
    train_dmatrix, evals = _wrap_evaluation_matrices(
  File "C:\Users\agsud_z3nlru4\AppData\Local\Programs\Python\Python39\lib\site-packages\xgboost\sklearn.py", line 448, in _wrap_evaluation_matrices
    train_dmatrix = create_dmatrix(
  File "C:\Users\agsud_z3nlru4\AppData\Local\Programs\Python\Python39\lib\site-packages\xgboost\sklearn.py", line 908, in _create_dmatrix
    return DMatrix(**kwargs, nthread=self.n_jobs)
  File "C:\Users\agsud_z3nlru4\AppData\Local\Programs\Python\Python39\lib\site-packages\xgboost\core.py", line 620, in inner_f
    return func(**kwargs)
  File "C:\Users\agsud_z3nlru4\AppData\Local\Programs\Python\Python39\lib\site-packages\xgboost\core.py", line 743, in __init__
    handle, feature_names, feature_types = dispatch_data_backend(
  File "C:\Users\agsud_z3nlru4\AppData\Local\Programs\Python\Python39\lib\site-packages\xgboost\data.py", line 970, in dispatch_data_backend
    return _from_pandas_df(data, enable_categorical, missing, threads,
  File "C:\Users\agsud_z3nlru4\AppData\Local\Programs\Python\Python39\lib\site-packages\xgboost\data.py", line 417, in _from_pandas_df
    data, feature_names, feature_types = _transform_pandas_df(
  File "C:\Users\agsud_z3nlru4\AppData\Local\Programs\Python\Python39\lib\site-packages\xgboost\data.py", line 391, in _transform_pandas_df
    _invalid_dataframe_dtype(data)
  File "C:\Users\agsud_z3nlru4\AppData\Local\Programs\Python\Python39\lib\site-packages\xgboost\data.py", line 283, in _invalid_dataframe_dtype
    raise ValueError(msg)
ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, The experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:Name: object, Sex: object, Ticket: object, Cabin: object, Embarked: object
