In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/test.csv
/kaggle/input/titanic/train.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
raw_train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
raw_test_data = pd.read_csv("/kaggle/input/titanic/test.csv")

In [3]:
raw_train_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [4]:
raw_test_data.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


In [5]:
## Let's see the volume of missing data in each column
for i in raw_train_data.columns:
    print('Variable: ', '{:<15}'.format(i), 'Missing Values: ', raw_train_data[i].isna().sum())


Variable:  PassengerId     Missing Values:  0
Variable:  Survived        Missing Values:  0
Variable:  Pclass          Missing Values:  0
Variable:  Name            Missing Values:  0
Variable:  Sex             Missing Values:  0
Variable:  Age             Missing Values:  177
Variable:  SibSp           Missing Values:  0
Variable:  Parch           Missing Values:  0
Variable:  Ticket          Missing Values:  0
Variable:  Fare            Missing Values:  0
Variable:  Cabin           Missing Values:  687
Variable:  Embarked        Missing Values:  2


In [6]:
## Notably, the cabin feature is missing for 687 of the total 891 observances
## Let's drop this variable
## Let's also drop PaseengerId, Name, and Ticket 
## We create a checkpoint here

train_data = raw_train_data.drop(['Cabin'], axis=1)
train_data = train_data.drop(['PassengerId'], axis=1)
train_data = train_data.drop(['Name'], axis=1)
train_data = train_data.drop(['Ticket'], axis=1)
train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [7]:
## For the test data, we need to keep the PasengerId column
test_data = raw_test_data.drop(['Cabin'], axis=1)
test_data = test_data.drop(['Name'], axis=1)
test_data = test_data.drop(['Ticket'], axis=1)
test_data.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,male,34.5,0,0,7.8292,Q
1,893,3,female,47.0,1,0,7.0,S
2,894,2,male,62.0,0,0,9.6875,Q
3,895,3,male,27.0,0,0,8.6625,S
4,896,3,female,22.0,1,1,12.2875,S


In [8]:
## The fare paid is an indication of placement on the ship
## A more expensive fare should correspond to a superior placement on the ship
## This likely effects probability of survival
## Instead of the exact fare, we are more interested in categories of fares
## Let's create 4 categories, based on quartiles

train_data['Fare'].describe()

count    891.000000
mean      32.204208
std       49.693429
min        0.000000
25%        7.910400
50%       14.454200
75%       31.000000
max      512.329200
Name: Fare, dtype: float64

In [9]:
for df in (train_data, test_data):
    df['Fare'] = df['Fare'].fillna(0)
    df['Fare'] = df['Fare'].astype(int)

    df.loc[train_data['Fare'] <= 7.91, 'Fare'] = 0
    df.loc[(df['Fare'] > 7.91) & (df['Fare'] <= 14.4542), 'Fare'] = 1
    df.loc[(df['Fare'] > 14.4542) & (df['Fare'] <= 31), 'Fare'] = 2
    df.loc[df['Fare'] > 31, 'Fare'] = 3
    df['Fare'] = df['Fare'].astype(int)

train_data['Fare'].describe()

count    891.000000
mean       1.453423
std        1.124206
min        0.000000
25%        0.000000
50%        1.000000
75%        2.000000
max        3.000000
Name: Fare, dtype: float64

In [10]:
## We need to fill in the missing age values
## This can be done many ways, but let us use the average age for a given combination of Sex and Pclass
## This will provide a more accurate estimate for age
train_data['Age'] = train_data.groupby(['Sex', 'Pclass']).transform(lambda x: x.fillna(x.mean()))['Age']
test_data['Age'] = test_data.groupby(['Sex', 'Pclass']).transform(lambda x: x.fillna(x.mean()))['Age']

train_data.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.318643,0.523008,0.381594,1.453423
std,0.486592,0.836071,13.281103,1.102743,0.806057,1.124206
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,21.75,0.0,0.0,0.0
50%,0.0,3.0,26.507589,0.0,0.0,1.0
75%,1.0,3.0,36.0,1.0,0.0,2.0
max,1.0,3.0,80.0,8.0,6.0,3.0


In [11]:
## The exact age of a person doesn't tell us much
## Categorizing passengers based on age into toddler, children, adults, and seniors would be more useful

for df in (train_data, test_data):

    df.loc[df['Age'] <= 5, 'Age'] = 0 # toddlers
    df.loc[(df['Age'] > 5) & (df['Age'] <= 18), 'Age'] = 1 # children
    df.loc[(df['Age'] > 18) & (df['Age'] <= 65), 'Age'] = 2 # adults
    df.loc[df['Age'] > 65, 'Age'] = 3 # seniors

train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,2.0,1,0,0,S
1,1,1,female,2.0,1,0,3,C
2,1,3,female,2.0,0,0,0,S
3,1,1,female,2.0,1,0,3,S
4,0,3,male,2.0,0,0,1,S


In [12]:
## Finally, let's get the required dummy variables, for Sex and Embarked
## Note there are two missing values for Embarked
## Let's fill these with the most common Port of Embarkation

train_data['Embarked'] = train_data['Embarked'].fillna(train_data['Embarked'].mode()[0])
test_data['Embarked'] = test_data['Embarked'].fillna(test_data['Embarked'].mode()[0])

dummy_train_data = pd.get_dummies(train_data[['Sex','Embarked']])
dummy_test_data = pd.get_dummies(test_data[['Sex','Embarked']])

dummy_train_data.head()

Unnamed: 0,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,1,0,0,1
1,1,0,1,0,0
2,1,0,0,0,1
3,1,0,0,0,1
4,0,1,0,0,1


In [13]:
## Let's drop Sex and Embarked from our dataframe and replace with our dummies
train_data['Female'] = pd.Series(dummy_train_data['Sex_female'])
train_data['Male'] = pd.Series(dummy_train_data['Sex_male'])
train_data['Port C'] = pd.Series(dummy_train_data['Embarked_C'])
train_data['Port Q'] = pd.Series(dummy_train_data['Embarked_Q'])
train_data['Port S'] = pd.Series(dummy_train_data['Embarked_S'])

test_data['Female'] = pd.Series(dummy_test_data['Sex_female'])
test_data['Male'] = pd.Series(dummy_test_data['Sex_male'])
test_data['Port C'] = pd.Series(dummy_test_data['Embarked_C'])
test_data['Port Q'] = pd.Series(dummy_test_data['Embarked_Q'])
test_data['Port S'] = pd.Series(dummy_test_data['Embarked_S'])

final_train_data = train_data.drop(['Sex','Embarked'], axis=1)
final_test_data = test_data.drop(['Sex','Embarked'], axis=1)

final_train_data.head()


Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Female,Male,Port C,Port Q,Port S
0,0,3,2.0,1,0,0,0,1,0,0,1
1,1,1,2.0,1,0,3,1,0,1,0,0
2,1,3,2.0,0,0,0,1,0,0,0,1
3,1,1,2.0,1,0,3,1,0,0,0,1
4,0,3,2.0,0,0,1,0,1,0,0,1


In [14]:
final_test_data.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Female,Male,Port C,Port Q,Port S
0,892,3,2.0,0,0,0,0,1,0,1,0
1,893,3,2.0,1,0,0,1,0,0,0,1
2,894,2,2.0,0,0,0,0,1,0,1,0
3,895,3,2.0,0,0,0,0,1,0,0,1
4,896,3,2.0,1,1,0,1,0,0,0,1


In [15]:
X_train = final_train_data.iloc[:,1:]
X_test = final_test_data.iloc[:,1:]

y = final_train_data['Survived']

In [16]:
# ## Finally, let's set up and run our model
# from sklearn.ensemble import RandomForestClassifier

# model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
# model.fit(X_train, y)
# predictions = model.predict(X_test)

# output = pd.DataFrame({'PassengerId': final_test_data.PassengerId, 'Survived': predictions})
# output.to_csv('Titanic_Submission2.csv', index=False)

from sklearn.linear_model import LogisticRegression
reg = LogisticRegression()
reg.fit(X_train, y)
predictions = reg.predict(X_test)
output = pd.DataFrame({'PassengerId': final_test_data.PassengerId, 'Survived': predictions})
output.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [17]:
output.to_csv('Titanic_Submission3.csv', index=False)