In [None]:
import os

def scale_input_data(scale_factor, file_bases=('./input/train', './input/test')):
  """Write a row-scaled copy of each input CSV next to the original.

  For every path stem in ``file_bases`` this reads ``<stem>.csv`` and writes
  ``<stem>.scaled.csv``:

  * ``scale_factor == 1.0``: the file is copied unchanged (no pandas round trip).
  * ``scale_factor < 1.0``: only the first ``int(scale_factor * n_rows)`` rows
    are kept.
  * ``scale_factor > 1.0``: rows are repeated from the top, in their original
    order, until there are exactly ``int(scale_factor * n_rows)`` rows.

  Args:
    scale_factor: Non-negative multiplier applied to each file's row count.
    file_bases: Iterable of path stems (without the ``.csv`` suffix) to scale.
      Defaults to the notebook's train/test inputs.

  Raises:
    ValueError: If ``scale_factor`` is negative. A negative factor would
      otherwise turn into a negative slice bound and silently drop rows from
      the end of the file instead of scaling it.
  """
  # Kept function-local so the helper is self-contained and adds no
  # module-level names; hoisted out of the loop so they run once per call.
  import shutil

  import pandas as pd

  if scale_factor < 0:
    raise ValueError('scale_factor must be non-negative, got %r' % (scale_factor,))

  for file_base in file_bases:
    source_path = file_base + '.csv'
    scaled_path = file_base + '.scaled.csv'

    if scale_factor == 1.0:
      # Nothing to scale: a plain copy keeps the file byte-identical.
      shutil.copyfile(source_path, scaled_path)
      continue

    df_to_scale = pd.read_csv(source_path)
    new_num_rows = int(scale_factor * len(df_to_scale))
    if scale_factor < 1.0:
      df_to_scale = df_to_scale.iloc[:new_num_rows]
    else:
      # Each pass appends a prefix of the current frame (at most doubling it),
      # so the result is the original rows repeated cyclically.
      while len(df_to_scale) < new_num_rows:
        num_missing = new_num_rows - len(df_to_scale)
        df_to_scale = pd.concat([df_to_scale, df_to_scale.iloc[:num_missing]])
    df_to_scale.to_csv(scaled_path, index=False)

# Rescale the inputs only when the runner supplies INPUT_SCALE_FACTOR;
# without the variable this cell writes no *.scaled.csv files.
if os.environ.get('INPUT_SCALE_FACTOR') is not None:
  scale_input_data(float(os.environ['INPUT_SCALE_FACTOR']))

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load in

import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# The plain pandas import above is disabled; the import statement(s) are
# instead read from the IREWR_IMPORTS environment variable and executed.
# Presumably this binds `pd` (later cells call pd.read_csv) -- confirm
# against the runner. Raises KeyError if IREWR_IMPORTS is not set.
# NOTE(review): exec runs whatever code the variable contains, so only run
# this notebook under a trusted runner/environment.
exec(os.environ['IREWR_IMPORTS'])

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

# ALEX: remove path printing
# from subprocess import check_output
# print(check_output(["ls", "../input"]).decode("utf8"))

# Any results you write to the current directory are saved as output.
# ALEX: make notebook work with runner
from IPython.display import display

# WE'RE GOING TO START WITH SOME DATA EXPLORATION
### Read in the data from Kaggle

In [2]:
# Load the scaled train and test CSVs, in that order.
trainData, testData = (
    pd.read_csv(csv_path)
    for csv_path in ("./input/train.scaled.csv", "./input/test.scaled.csv")
)

### Reformat some of the columns

In [3]:
# Convert Sex to a categorical column and expose its integer category codes
# as IsMale (the preview below shows male -> 1, female -> 0).
trainData["Sex"] = trainData["Sex"].astype("category")
trainData["IsMale"] = trainData["Sex"].cat.codes

# Same encoding for the test set; its categories are derived independently.
testData["Sex"] = testData["Sex"].astype("category")
testData["IsMale"] = testData["Sex"].cat.codes

### Show a window of the data

In [4]:
# Preview the first rows (up to 20) of the training data.
trainData.head(20)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,IsMale
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,1
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,1
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,1
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S,1
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S,0
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C,0


### Get some commonly-used stats on the table columns

In [5]:
# First table: default describe() summary rounded to 2 decimals.
# Second table: summary of the object-dtype columns.
display(
    trainData.describe().round(2),
    trainData.describe(include=["O"]),
)

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,IsMale
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0,891.0
mean,446.0,0.38,2.31,29.7,0.52,0.38,32.2,0.65
std,257.35,0.49,0.84,14.53,1.1,0.81,49.69,0.48
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.12,0.0,0.0,7.91,0.0
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.45,1.0
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0,1.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.33,1.0


Unnamed: 0,Name,Ticket,Cabin,Embarked
count,891,891,204,889
unique,891,681,147,3
top,"Braund, Mr. Owen Harris",347082,B96 B98,S
freq,1,7,4,644


### Graph some of the columns

In [6]:
# ALEX: remove plotting
# import matplotlib.pyplot as plt
# trainData.Sex.value_counts(normalize=True).plot(kind="bar", title="Sex")
# plt.show()
# trainData.Pclass.value_counts().plot(kind="barh", title="Pclass")
# plt.show()
# trainData.Fare.plot(kind="density", title="Fare", xlim=(0,800))
# plt.show()
# These expressions replace the removed plots. Only the last one (Fare) is
# rendered as the cell output; the two value_counts results are computed and
# then discarded.
trainData["Sex"].value_counts(normalize=True)
trainData["Pclass"].value_counts()
trainData["Fare"]

0       7.2500
1      71.2833
2       7.9250
3      53.1000
4       8.0500
        ...   
886    13.0000
887    30.0000
888    23.4500
889    30.0000
890     7.7500
Name: Fare, Length: 891, dtype: float64

### We can also do pivot tables automagically

In [7]:
# Count passengers for each (Sex, Survived) combination.
(
    trainData
    .groupby(["Sex", "Survived"])["Survived"]
    .count()
)

Sex     Survived
female  0            81
        1           233
male    0           468
        1           109
Name: Survived, dtype: int64

### And this incredibly powerful function that shows how well each column correlates with the target label

In [8]:
# Correlation of each listed column with Survived, rounded to 4 decimals and
# expressed in percent.
(
    trainData[["PassengerId", "Survived", "Pclass", "Age", "SibSp", "Parch", "Fare", "IsMale"]]
    .corrwith(trainData["Survived"])
    .round(4)
    .mul(100)
)

PassengerId     -0.50
Survived       100.00
Pclass         -33.85
Age             -7.72
SibSp           -3.53
Parch            8.16
Fare            25.73
IsMale         -54.34
dtype: float64

### Based on these correlation values, let's select a subset of these columns to use as features for our ML model
#### (We also need to do some cleanup to remove the pesky blank values)

In [9]:
# Columns used as model features (for both the training and test frames).
testFeatures = ["IsMale", "Pclass", "Fare"]
# ALEX: make notebook run
# X = trainData[testFeatures].fillna(0).as_matrix()
# Y = trainData["Survived"].fillna(0).as_matrix()
# Blank values are replaced with 0 before converting to NumPy arrays.
X = trainData.loc[:, testFeatures].fillna(0).to_numpy()
Y = trainData.loc[:, "Survived"].fillna(0).to_numpy()

# OKAY LET'S MACHINE LEARN
### This is literally the entire machine learning thing

In [10]:
# ALEX: remove ML code
# from sklearn import tree
# dtree = tree.DecisionTreeClassifier(max_depth=3)
# dtree.fit(X, Y) #ALL OF THE MAGIC HAPPENS RIGHT HERE LADIES AND GENTLEMEN AND OTHERS

### Visualize our shiny new decision tree

In [11]:
# ALEX: remove ML code
# import graphviz
# dot_data = tree.export_graphviz(dtree, out_file=None, feature_names=testFeatures)
# graph = graphviz.Source(dot_data)
# graph

### Use our model to predict on the test data

In [12]:
# ALEX: remove ML code, make notebook run
# testData["Survived"] = dtree.predict(testData[testFeatures].fillna(0).as_matrix())
# Placeholder for the removed model: the second column of the zero-filled
# feature matrix ("Pclass" with the feature list defined above) is copied
# into Survived so the export cell still has a column to write.
testData["Survived"] = (
    testData[testFeatures]
    .fillna(0)
    .to_numpy()[:, 1]
)

### Format our predictions the way Kaggle wants

In [13]:
# Keep only the two submission columns and write them without the index.
predictions = testData.loc[:, ["PassengerId", "Survived"]]
predictions.to_csv(
    "dtree_predictions.csv",
    index=False,
)
# ALEX: remove path printing
# print(check_output(["ls", "."]).decode("utf8"))