# Kaggle Titanic challenge

In [1]:
# Put these at the top of every notebook, to get automatic reloading and inline plotting.
# %reload_ext (re)loads the autoreload extension, `%autoreload 2` re-imports edited modules
# before each cell is executed, and `%matplotlib inline` renders figures in the cell output.
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import os
import pandas as pd
import numpy  as np

## Data modification

In [3]:
# Fetch the competition data from Kaggle with kaggle-cli (shell command, kept for reference):
# kg download -c titanic -u Bobox214 -p XXXXX

In [4]:
# Directory holding the Kaggle CSV files and the generated submission.
# The value ends with '/', and the cells below append '/<file>' to it, so the resulting
# paths contain a double slash (e.g. 'data//train.csv').
PATH = "data/"

In [5]:
# Load the labelled training set and the unlabelled test set from the data directory.
train_df = pd.read_csv(f'{PATH}/train.csv')
test_df  = pd.read_csv(f'{PATH}/test.csv')

In [6]:
# Discard the columns that are not used as features. PassengerId is removed from the
# training frame only; the test frame keeps it because the submission needs it.
train_df = train_df.drop(columns=['PassengerId', 'Cabin', 'Name', 'Ticket'])
test_df = test_df.drop(columns=['Cabin', 'Name', 'Ticket'])

In [7]:
# Peek at the first two rows of the test frame after the column drop.
test_df.head(2)

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,male,34.5,0,0,7.8292,Q
1,893,3,female,47.0,1,0,7.0,S


In [8]:
# Encode Sex as an integer flag: 0 for 'male', 1 for any other value.
train_df['Sex'] = train_df['Sex'].ne('male').astype('int64')
test_df['Sex'] = test_df['Sex'].ne('male').astype('int64')

In [9]:
# Encode the port of embarkation as an integer: C -> 0, S -> 1, Q -> 2, missing -> 3.
EMBARKED_CODES = {'C': 0, 'S': 1, 'Q': 2}
EMBARKED_MISSING_CODE = 3


def encode_embarked(ports):
    """Return the integer codes for a Series of Embarked values.

    Known ports are translated through EMBARKED_CODES and missing entries become
    EMBARKED_MISSING_CODE. Missing values are detected with pandas' own null check
    instead of a dict lookup keyed on np.nan, which only succeeds when the missing
    value happens to be the very same NaN object.

    Raises:
        KeyError: if the Series holds a non-missing value that is not a known port
            (this also happens when the cell is re-run on already-encoded data).
    """
    codes = ports.map(EMBARKED_CODES)
    # Anything that failed to map but was not missing to begin with is unexpected input.
    unexpected = ports[codes.isna() & ports.notna()]
    if not unexpected.empty:
        raise KeyError(f'Unexpected Embarked value(s): {list(unexpected.unique())}')
    return codes.fillna(EMBARKED_MISSING_CODE).astype('int64')


train_df['Embarked'] = encode_embarked(train_df['Embarked'])
test_df['Embarked'] = encode_embarked(test_df['Embarked'])

In [10]:
# Fill missing ages with the mean age of the training set. The same training-set mean is
# applied to the test set, so both frames are imputed with one value.
age_mean = train_df['Age'].mean()
train_df['Age'] = train_df['Age'].fillna(age_mean)
test_df['Age'] = test_df['Age'].fillna(age_mean)

In [11]:
# Fill missing fares with the mean fare of the training set. The same training-set mean is
# applied to the test set, so both frames are imputed with one value.
fare_mean = train_df['Fare'].mean()
train_df['Fare'] = train_df['Fare'].fillna(fare_mean)
test_df['Fare'] = test_df['Fare'].fillna(fare_mean)

In [12]:
# Inspect the first rows of the training frame after encoding and imputation.
train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0,22.0,1,0,7.25,1
1,1,1,1,38.0,1,0,71.2833,0
2,1,3,1,26.0,0,0,7.925,1
3,1,1,1,35.0,1,0,53.1,1
4,0,3,0,35.0,0,0,8.05,1


## Logistic regression

In [35]:
from sklearn.model_selection import train_test_split

In [36]:
# Hold out 25% of the labelled data for validation. The seed is fixed so that the split,
# and therefore the validation score, is the same on every run.
# NOTE(review): train_X / train_y are not defined in any cell visible here — presumably the
# feature columns of train_df and its 'Survived' column; confirm they exist on a fresh kernel.
dev_X, val_X, dev_y, val_y = train_test_split(train_X, train_y, test_size=0.25, random_state=42)

In [37]:
from sklearn import linear_model as lm
# Logistic regression classifier created with the library's default settings.
clf = lm.LogisticRegression()

In [38]:
# Fit the classifier on the development split; the validation split is not used here.
clf.fit(dev_X,dev_y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [39]:
# Predict labels for the held-out validation sample.
val_preds = clf.predict(val_X)

In [40]:
from sklearn.metrics import accuracy_score
# Fraction of validation rows whose predicted label equals the true label.
accuracy_score(val_y, val_preds)

0.7488789237668162

## Submissions

In [20]:
# Predict labels for the Kaggle test set with the fitted classifier.
# NOTE(review): test_X is not defined in any cell visible here — presumably the feature
# columns of test_df (without PassengerId); confirm. This cell's recorded execution count is
# lower than that of the cell creating clf, so the saved predictions may come from an earlier
# model — re-run the notebook top to bottom before submitting.
test_preds = clf.predict(test_X)

In [21]:
# Wrap the predicted labels in a one-column frame named after the target.
test_y_df = pd.DataFrame({'Survived': test_preds})

In [22]:
# Place the passenger ids next to their predictions; the two frames are aligned on the
# row index. The shape is displayed as a quick sanity check.
my_sub = pd.concat([test_df[['PassengerId']], test_y_df], axis=1)
my_sub.shape

(418, 2)

In [23]:
# Show the first four rows of the submission frame.
my_sub.head(4)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0


In [24]:
# Write the submission to the data directory, leaving out the row index column.
my_sub.to_csv(f'{PATH}/logisticRegression.csv',index=False)

In [25]:
# List the data directory sorted by modification time (newest last) to confirm that the
# submission file was written.
!ls -lart {PATH}

total 108
-rw-rw-r-- 1 paperspace paperspace 61194 Feb 13 13:59 train.csv
-rw-rw-r-- 1 paperspace paperspace 28629 Feb 13 13:59 test.csv
-rw-rw-r-- 1 paperspace paperspace  3258 Feb 13 13:59 gender_submission.csv
-rw-rw-r-- 1 paperspace paperspace  2839 Feb 13 15:38 test_sub.csv
drwxrwxr-x 2 paperspace paperspace  4096 Feb 15 15:36 .
drwxrwxr-x 4 paperspace paperspace  4096 Feb 15 15:39 ..
-rw-rw-r-- 1 paperspace paperspace  2839 Feb 15 15:39 logisticRegression.csv


In [26]:
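# Submit the predictions to Kaggle with kaggle-cli (shell command, kept for reference).
# NOTE: the command below uploads test_sub.csv, whereas this notebook writes logisticRegression.csv.
# kg submit data/test_sub.csv -c titanic -u Bobox214 -p XXXXX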