# Loading the data

In [1]:
import pandas as pd

titanic =  pd.read_csv('/kaggle/input/titanic/train.csv')
titanic_test = pd.read_csv('/kaggle/input/titanic/test.csv')

In [2]:
titanic.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
titanic_test.head(5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


# Discovering how many missing values are in every column

In [4]:
titanic.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [5]:
titanic_test.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

# Dropping Useless or problematic features and train test split
Dropping cabin because there are too many missing values, and dropping the rows will cause too many of the data to be missing. Dropping the few missing rows on embarked column on training data and fare column on test data.

In [6]:
from sklearn.model_selection import train_test_split 
import numpy as np

titanic.drop(columns = ['PassengerId', 'Name', 'Ticket', 'Cabin'], inplace=True)
titanic = titanic[titanic.Embarked.notna()]

tpid = titanic_test.PassengerId
titanic_test.drop(columns = ['PassengerId', 'Name', 'Ticket', 'Cabin'], inplace=True)
titanic_test.Fare.fillna(titanic_test.Fare.mean(), inplace=True)

y = titanic.Survived
X = titanic.drop(columns=['Survived'])

X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.7, test_size=0.2, random_state=0)

In [7]:
titanic_test.isnull().sum()

Pclass       0
Sex          0
Age         86
SibSp        0
Parch        0
Fare         0
Embarked     0
dtype: int64

# Pipeline
Using OrdinalEncoder in categorical data and simple imputer for missing values.
Random forest classifier will be the model.

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier


categorical_cols = ['Embarked', 'Sex']
numerical_cols = X_train.drop(columns=['Embarked', 'Sex']).columns

In [9]:
preprocessor = ColumnTransformer([
    ('num_cols', StandardScaler(), numerical_cols),
    ('cat_cols', OrdinalEncoder(), categorical_cols)
    ])

pipe_transformer = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('imp', SimpleImputer(strategy='mean')),
    ('model', RandomForestClassifier(random_state=0))
])


pipe_transformer.fit(X_train, y_train)
pipe_transformer.score(X_valid, y_valid)

0.7584269662921348

Cross Validation to measure modeling quality

In [10]:
from sklearn.model_selection import cross_val_score

score = cross_val_score(pipe_transformer, X, y, cv=5)
score.mean()

0.8155652891512727

# Predictions

In [11]:
predictions = pipe_transformer.predict(titanic_test)
predictions

array([0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [12]:
df = pd.DataFrame({'PassengerId': tpid, 'Survived':predictions})
df.to_csv('results.csv', index=False)