# packages used

In [1]:
# misc
from IPython.display import display, HTML
import numpy as np

# DATA - prep
#kaggle
import pandas as pd
import sklearn.model_selection

# ML - models 
import sklearn.linear_model
import sklearn.tree
import sklearn.ensemble
import xgboost.sklearn

# ML - accuracy
import sklearn.metrics



# Get data

Setup:
- follow "API credential step" listed here: https://github.com/Kaggle/kaggle-api
    - go to https://www.kaggle.com/ (login)
    - go to my_profile (download kaggle.json)
    - put it in ~/.kaggle/kaggle.json
    - `cp ~/Downloads/kaggle.json ~/.kaggle/kaggle.json`
    - `chmod 600 ~/.kaggle/kaggle.json`
- Go to kaggle and join competition: 
    - https://www.kaggle.com/c/titanic
- install kaggle
- download data
- profit!!!

In [2]:
!pip install kaggle -q
# -q is just for quite, so we don't spam the notebook

You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [3]:

metadata = {
    'basepath' : '../data/',
    'dataset':'titanic',
    'train' : 'train.csv',
    'test' : 'test.csv'}


In [4]:
# make folder
# download .zip
# unzip
# remove the .zip
# (data is placed ../data/titanic)

!mkdir -p {metadata['basepath']}
!kaggle competitions download -c dataset {metadata['dataset']} -p {metadata['basepath']}
!unzip -o {metadata['basepath']}{metadata['dataset']}.zip -d {metadata['basepath']}{metadata['dataset']}/
!rm {metadata['basepath']}{metadata['dataset']}.zip

Downloading titanic.zip to ../data
  0%|                                               | 0.00/33.9k [00:00<?, ?B/s]
100%|███████████████████████████████████████| 33.9k/33.9k [00:00<00:00, 760kB/s]
Archive:  ../data/titanic.zip
  inflating: ../data/titanic/train.csv  
  inflating: ../data/titanic/test.csv  
  inflating: ../data/titanic/gender_submission.csv  


# Load and explore

In [5]:
# load
train = pd.read_csv("{basepath}/{dataset}/{train}".format(**metadata))
test = pd.read_csv("{basepath}/{dataset}/{test}".format(**metadata))


In [6]:
# Train
display(HTML("<h1>train</h1>"))
# example data
display(train.head(3))
# summary stats
display(train.describe())
# list missing values
display(pd.DataFrame(train.isna().mean() ,columns=["is na fraction"]))
# list types of column
display(train.dtypes)
# list dimenstion
display(train.shape)


# TODO check test

display(HTML("<h1>test</h1>"))
display(pd.DataFrame(test.isna().mean() ,columns=["is na fraction"]))


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


Unnamed: 0,is na fraction
PassengerId,0.0
Survived,0.0
Pclass,0.0
Name,0.0
Sex,0.0
Age,0.198653
SibSp,0.0
Parch,0.0
Ticket,0.0
Fare,0.0


PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

(891, 12)

Unnamed: 0,is na fraction
PassengerId,0.0
Pclass,0.0
Name,0.0
Sex,0.0
Age,0.205742
SibSp,0.0
Parch,0.0
Ticket,0.0
Fare,0.002392
Cabin,0.782297


In [7]:
# clean data / build feature 
# to_expand

target = "Survived"
# keep numeric features without missing vals
keep_features = ["Pclass","SibSp","Parch"]

y = train[target]
X = train[keep_features]

In [8]:
# Split data in train and validation
target = "Survived"

seed = 42
test_size = 0.7

X_train, X_val, y_train, y_val = sklearn.model_selection.train_test_split(
    X,
    y,
    random_state = seed,
    test_size = test_size)


# ML

In [9]:


# default models
# Logistic regression 
model_logreg = sklearn.linear_model.LogisticRegression()
model_logreg.fit(X_train, y_train);

# decision tree
model_decision_tree = sklearn.tree.DecisionTreeClassifier()
model_decision_tree.fit(X_train, y_train);

# randomForest
model_random_forest = sklearn.ensemble.RandomForestClassifier()
model_random_forest.fit(X_train, y_train);

# xgboost
model_xgboost = xgboost.sklearn.XGBClassifier()
model_xgboost.fit(X_train, y_train);





# Eval ML

In [10]:
# naive model
class naive_model():
    # everyone dies
    def predict(self, df):
        return np.zeros(df.shape[0])

model_naive = naive_model()

models = {
    "model_naive" : model_naive,
    "model_logreg" : model_logreg,
    "model_decision_tree": model_decision_tree,
    "model_random_forest": model_random_forest,
    "model_xgboost" :model_xgboost
}

for name,model in zip(models.keys(),models.values()):
    acc = sklearn.metrics.accuracy_score(
     y_true = y_val,
     y_pred = model.predict(X_val)
    )

    print(name,round(acc,4))



model_naive 0.609
model_logreg 0.6859
model_decision_tree 0.6683
model_random_forest 0.6763
model_xgboost 0.6843


# Output

In [11]:
# passengerid
id = "PassengerId"
out = pd.DataFrame(data = test[id], columns = [id])

# target
out_target = model_logreg.predict(test[keep_features])
out[target] = pd.DataFrame(out_target
                          ,columns = [target]
                          ,dtype=np.int32
                          )


# put them out
outfile = metadata["basepath"] + "output_logreg.csv"
out.to_csv(path_or_buf = outfile,
           index = False)



In [12]:
# Submit
#!kaggle competitions submit {metadata['dataset']} -f {outfile} -m "minimal model"

In [13]:
# See submission 
!kaggle competitions submissions "{metadata['dataset']}"

fileName               date                 description                                          status    publicScore  privateScore  
---------------------  -------------------  ---------------------------------------------------  --------  -----------  ------------  
output_logreg.csv      2019-12-01 21:51:33  minimal model                                        complete  0.65550      None          
output_logreg.csv      2019-12-01 21:50:01  minimal model                                        complete  0.65550      None          
output_logreg.csv      2019-12-01 21:33:42  minimal model                                        error     None         None          
predict_rf_1.csv       2019-11-28 11:36:40  TEST TEST                                            complete  0.76076      None          
myfirstforest.csv      2018-08-20 19:09:53  Old random forest                                    complete  0.73205      None          
predict_xgboost_1.csv  2018-08-20 19:06:02  simple xgbo