I. Import

In [1]:
# Simple plan:
# - Clean the data
# - Train a simple model

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [3]:
import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [4]:
# Read the training split and the test split from the Kaggle input directory.
train_path = "/kaggle/input/titanic/train.csv"
test_path = "/kaggle/input/titanic/test.csv"
df_train, df_test = pd.read_csv(train_path), pd.read_csv(test_path)

II. EDA

In [5]:
# Preview the first five training rows (rich display as the cell's last expression).
df_train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
# Report (rows, columns) for the training split, then the test split.
for frame in (df_train, df_test):
    print(frame.shape)

(891, 12)
(418, 11)


In [7]:
# Number of missing values in each training column.
print(df_train.isnull().sum())
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() emitted a stray "None" after the report.
df_train.info()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
Non

In [8]:
# Number of missing values in each test column.
test_missing_counts = df_test.isnull().sum()
print(test_missing_counts)

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


In [9]:
# Remove the columns that are not used as features from both splits.
# The membership check makes the cell safe to re-run: a column that has already
# been dropped is simply skipped.
for column in ("Cabin", "Name", "Ticket", "Embarked"):
    if column in df_train.columns:
        df_train = df_train.drop(columns=column)
        df_test = df_test.drop(columns=column)

# PassengerId is deliberately kept: the submission cell reads it from df_test.

print(df_train.columns)
print(df_test.columns)

Index(['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch',
       'Fare'],
      dtype='object')
Index(['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object')


In [10]:
# Binary-encode the gender column: male -> 0, female -> 1.

gender_dic = dict(male=0, female=1)
list_values = df_train['Sex'].unique()

# The raw label 'male' is only present before encoding, so the mapping is
# applied once even if this cell is executed again.
if "male" in list_values:
    for frame in (df_train, df_test):
        frame['Sex'] = frame['Sex'].map(gender_dic)

print(df_train)

     PassengerId  Survived  Pclass  Sex   Age  SibSp  Parch     Fare
0              1         0       3    0  22.0      1      0   7.2500
1              2         1       1    1  38.0      1      0  71.2833
2              3         1       3    1  26.0      0      0   7.9250
3              4         1       1    1  35.0      1      0  53.1000
4              5         0       3    0  35.0      0      0   8.0500
..           ...       ...     ...  ...   ...    ...    ...      ...
886          887         0       2    0  27.0      0      0  13.0000
887          888         1       1    1  19.0      0      0  30.0000
888          889         0       3    1   NaN      1      2  23.4500
889          890         1       1    0  26.0      0      0  30.0000
890          891         0       3    0  32.0      0      0   7.7500

[891 rows x 8 columns]


In [11]:
# Fill missing Age values (both splits) and the missing test Fare value with the
# mean of the corresponding column. This is a deliberately simple imputation.

mean_age_train = df_train['Age'].mean()
mean_age_test = df_test['Age'].mean()
mean_fare_test = df_test['Fare'].mean()

# Each column is imputed independently. Previously the test-set fills only ran
# when the *training* Age column had missing values (and the Fare fill was nested
# inside that same branch), so test NaNs could be silently left in place.
# fillna() is a no-op on a column without NaNs, so no guard is required and the
# cell stays safe to re-run.
df_train['Age'] = df_train['Age'].fillna(mean_age_train)
df_test['Age'] = df_test['Age'].fillna(mean_age_test)
df_test['Fare'] = df_test['Fare'].fillna(mean_fare_test)

In [12]:
from sklearn.model_selection import train_test_split

# Separate the target from the features. pop() removes 'Survived' from df_train,
# so the membership check keeps the cell re-runnable: on a second run the `y`
# produced by the first run is reused.
if 'Survived' in df_train.columns:
    y = df_train.pop('Survived')

# Hold out a validation split; the fixed seed makes the partition repeatable.
x_train, x_val, y_train, y_val = train_test_split(df_train, y, random_state=100)

# Shapes in the order: train features, validation features, train target, validation target.
for part in (x_train, x_val, y_train, y_val):
    print(part.shape)

(668, 7)
(223, 7)
(668,)
(223,)


In [13]:
from sklearn.tree import DecisionTreeClassifier
# Fit the validation model on the training part of the split.
# random_state is fixed (same seed as the train/validation split) because the
# tree permutes features randomly when searching for splits; without a seed the
# fitted tree, and therefore the validation accuracy, can change between runs.
tree = DecisionTreeClassifier(random_state=100)
tree.fit(x_train, y_train)

In [14]:
# Measuring the accuracy of our model
from sklearn.metrics import accuracy_score

# Predict the held-out validation rows and report the fraction labelled correctly.
tree_pred = tree.predict(x_val)
val_accuracy = accuracy_score(y_val, tree_pred)
print(val_accuracy)

0.7309417040358744


In [15]:
# Refit on all labelled data (training + validation rows) and predict the test set.

# Fixed seed so the final model, and hence the submission, is reproducible.
tree_final = DecisionTreeClassifier(random_state=100)
tree_final.fit(df_train, y)

# Select the test columns in the training feature order so the model sees the
# features in the layout it was fitted on.
# NOTE: `tree_pred` now holds the test-set predictions (it is read by the
# submission cell), replacing the validation predictions of the same name.
tree_pred = tree_final.predict(df_test[df_train.columns])

In [16]:
# Build the two-column submission (passenger id + predicted label) and write it
# to disk without the index column.
submission = df_test[["PassengerId"]].assign(Survived=tree_pred)

submission.to_csv('submission.csv', index=False)