# In this notebook I am using machine learning to create a model that predicts which passengers survived the Titanic shipwreck. Here is the [link](https://www.kaggle.com/c/titanic) to the Kaggle competition.

In [1]:
# modules for reading the data

import os # environment variables (data directory override)

import numpy as np # linear algebra
import pandas as pd # data processing

In [2]:
# Getting the data and loading it into pandas DataFrames.
# The folder holding train.csv / test.csv is defined once here; it can be
# overridden with the TITANIC_DATA_DIR environment variable, and falls back
# to the original local folder when that variable is not set.
DATA_DIR = os.environ.get('TITANIC_DATA_DIR', 'F://bizz//DATA SETS/TITANIC DATA SET')

train_df = pd.read_csv(f'{DATA_DIR}/train.csv', index_col = 'PassengerId') # data to train our model
test_df = pd.read_csv(f'{DATA_DIR}/test.csv', index_col = 'PassengerId') # data to make predictions


In [28]:
# Preview the first rows of the training data (the data the model is fitted on)
train_df.head(n=5)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [29]:
# Preview the first rows of the test data.
# It has no 'Survived' column: the model makes predictions for these passengers.
test_df.head(n=5)

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
# 'Survived' is the label (target) column, taken from the training data.
labl = train_df.loc[:, 'Survived']
type(labl)

pandas.core.series.Series

# LABEL is the thing we're predicting

In [30]:
# Convert the label Series into a one-column DataFrame
# (0 = did not survive, 1 = survived)
y = labl.to_frame()
y.head(n=5)

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
1,0
2,1
3,1
4,1
5,0


In [7]:
# Overview of the available columns: dtype and non-null count for each one.
# The first model below uses only a few of the numerical columns.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Name        891 non-null object
Sex         891 non-null object
Age         714 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Ticket      891 non-null object
Fare        891 non-null float64
Cabin       204 non-null object
Embarked    889 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 66.1+ KB


# Creating a pipeline for numerical features only

In [8]:
from sklearn.pipeline import Pipeline
#Pipeline is used for tying together these dynamic parts of the ML process.
from sklearn.compose import ColumnTransformer
#transformations on the column ie choosing only numerical features
from sklearn.tree import DecisionTreeClassifier
#algorithm for predicting the results

In [9]:
# selecting columns to use
columns = ['Pclass', 'Parch', 'SibSp']

# Pass the selected columns through unchanged; every other column is dropped
ct = ColumnTransformer(remainder = 'drop',
                       transformers = [
                           ('select', 'passthrough', columns)])

# creating the model: column selection followed by a decision tree.
# random_state is fixed so that re-running the notebook fits the same tree
# (the tree builder is randomized unless a seed is given).
model_1 = Pipeline([
    ('selector', ct),
    ('predictor', DecisionTreeClassifier(random_state=42)),
])

In [10]:
# Fit (train) the pipeline on the training data and its labels
model_1.fit(X=train_df, y=y);

In [11]:
# Give the test data the same columns, in the same order, as the training data.
# Columns that exist only in the training data are added and filled with NaN.
test_correct_columns = test_df.reindex(columns=train_df.columns)


In [12]:
# custom function to make submissions

def make_submission(model, test_correct_columns):
    """Predict on the given rows and save the result as a submission CSV.

    The file is written to the current working directory and named
    ``<YYYY-MM-DD_HH-MM>_submission.csv``.

    Parameters
    ----------
    model : object with a ``predict`` method
        Called as ``model.predict(test_correct_columns)``.
    test_correct_columns : pandas.DataFrame
        Rows to predict on. Its index labels the rows of the submission.

    Returns
    -------
    pandas.Series
        The predictions, named ``'Survived'`` and indexed like
        ``test_correct_columns`` (the same data that is written to the CSV).
    """
    y_test_pred = model.predict(test_correct_columns)

    # Label predictions with the index of the frame that was actually
    # predicted on, rather than relying on a global DataFrame.
    predictions = pd.Series(data = y_test_pred,
                            index = test_correct_columns.index,
                            name = 'Survived')
    # Timestamp prefix in the file name (minute resolution)
    date = pd.Timestamp.now().strftime('%Y-%m-%d_%H-%M_')
    predictions.to_csv(f'{date}submission.csv',
                       index=True, header=True)
    return predictions
    

In [13]:
#submission to local machine
#make_submission(model_1, test_correct_columns)

In [14]:
# Accuracy on the training data itself (the same rows the model was fitted on)
model_1.score(X=train_df, y=y)

0.7216610549943884

# 0.68 on Kaggle (68%), compared with 0.72 on the training data above

# model 2

In [25]:
from sklearn.preprocessing import OneHotEncoder
# encodes each category as its own 0/1 indicator column, e.g. Sex becomes one column for female and one for male
from sklearn.impute import SimpleImputer
# fills in the missing values (e.g. in Embarked); most machine learning models do not handle missing values well

# categorical features
categorical_features = [ 'Sex', 'Embarked']

# Missing categories are replaced by the literal value 'missing' and then
# one-hot encoded; categories not seen during fitting are ignored at predict time.
categorical_transformer = Pipeline([
    ('impute', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# selecting columns to use (numerical)
columns = ['Pclass', 'Parch', 'SibSp']

# Numerical columns pass through unchanged, categorical columns go through
# the imputer + encoder, and every other column is dropped.
ct = ColumnTransformer(remainder = 'drop',
                       transformers = [
                           ('select', 'passthrough', columns),
                           ('onehot', categorical_transformer, categorical_features)])

# creating the model: preprocessing followed by a decision tree.
# random_state is fixed so that re-running the notebook fits the same tree
# (the tree builder is randomized unless a seed is given).
model_2 = Pipeline([
    ('selector', ct),
    ('predictor', DecisionTreeClassifier(random_state=42)),
])

In [26]:
# Fit (train) the second pipeline on the training data and its labels
model_2.fit(X=train_df, y=y);

In [27]:
# Accuracy on the training data itself (the same rows the model was fitted on)
model_2.score(X=train_df, y=y)

0.8372615039281706

- We get an improved score of about 83.7% (measured on the training data).
- The model is trained on five features ('Pclass', 'Parch', 'SibSp', 'Sex', 'Embarked').