# Review, chapter 6 of 'Python for Data Analysis'

In [19]:
# Notebook setup: plotting backend, core libraries and IPython configuration.
# Render matplotlib figures inline in the notebook, using the SVG figure format.
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
import matplotlib.pyplot as plt
plt.style.use('ggplot')

import pandas as pd
import numpy as np

# Display the value of every top-level expression in a cell, not only the last one
# (later cells rely on this to show df.head() and df.tail() together).
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

from IPython import get_ipython
ipython = get_ipython()

# Load the autoreload extension only if it is not already loaded,
# so that re-running this cell does not try to load it a second time.
if 'autoreload' not in ipython.extension_manager.loaded:
    %load_ext autoreload

# Mode 2 — per the IPython autoreload docs, reloads all modules before executing code.
%autoreload 2

## Constants and Functions
Migrate these to a separate package when done so they can be used with scripts

In [20]:
# Paths (relative to this notebook) of the raw input and of the two
# processed artifacts this notebook writes: a CSV and a feather file.
FILE_RAW, FILE_PROCESSED, FILE_FEATHER = (
    "../datasets/titanic/raw/titanic.csv",
    "../datasets/titanic/processed/titanic_processed.csv",
    "../datasets/titanic/processed/titanic_processed.feather",
)

## Loading data

In [22]:
# Read the raw Titanic CSV (it lives under ../datasets/titanic) into a DataFrame.
df = pd.read_csv(filepath_or_buffer=FILE_RAW)

## Navigating the directory structure with magics

What if you cannot find the data?  And, by the way, where is this Jupyter notebook running from?<br>
It's easy to tell if it is running on your local machine.  It's a bit harder if your notebook is running on a remote server <br>
(especially Kaggle kernels, for those of you who have tried their data science competitions). 

In [23]:
#this is where jupyter magics can help, these should work regardless of underlying OS
%pwd     #what is our current directory?
# %ls    #list files in this directory
%ls -la ../datasets     #list files in the datasets folder 

'/home/keith/AA_jupyter_tuts/DATA301_CODE/week_1'

total 133000
drwxrwxr-x  4 keith keith      4096 Jan 14 14:01 [0m[01;34m.[0m/
drwxrwxr-x 12 keith keith      4096 Jan 14 18:59 [01;34m..[0m/
-rw-r--r--  1 keith keith    731856 Aug  1  2019 airqual.csv
-rw-rw-r--  1 keith keith  29612211 Dec 13 14:28 [01;31mgroupby-data.zip[0m
drwxrwxr-x  2 keith keith      4096 Jan 14 14:01 [01;34mkaggle[0m/
-rw-r--r--  1 keith keith   1431833 Aug  1  2019 legislators-historical.csv
-rw-rw-r--  1 keith keith   2091239 Sep 21  2019 melb_data.csv
-rw-r--r--  1 keith keith 102297000 Feb 27  2016 news.csv
drwxr-xr-x  5 keith keith      4096 Jan 15 22:48 [01;34mtitanic[0m/


## Look at the data

In [24]:
# Peek at both ends of the manifest. Both frames are rendered because the setup
# cell sets ast_node_interactivity = 'all'.

# First 5 rows (pass a different n, e.g. df.head(10), to see more).
df.head(n=5)

# Last 5 rows.
df.tail(n=5)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.925
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.05


Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
882,0,2,Rev. Juozas Montvila,male,27.0,0,0,13.0
883,1,1,Miss. Margaret Edith Graham,female,19.0,0,0,30.0
884,0,3,Miss. Catherine Helen Johnston,female,7.0,1,2,23.45
885,1,1,Mr. Karl Howell Behr,male,26.0,0,0,30.0
886,0,3,Mr. Patrick Dooley,male,32.0,0,0,7.75


### It's a passenger manifest, which includes the grisly column 'Survived'

In [25]:
# Number of passengers = number of rows in the manifest.
df.shape[0]

887

## Process the data as needed (future lecture)

## Save data
Usually to a different filename (titanic_processed.csv) in a different directory (processed).<br>
This preserves the original data and separates it from the processed data (always preserve original data!)<br>
Also, you now only have to run pre-processing on your original data once.

In [26]:
# Write the frame to the processed CSV; index=False keeps the row index out of the file.
df.to_csv(path_or_buf=FILE_PROCESSED, index=False)

# Faster ways to load and save a pandas dataframe
Have a larger dataset that takes a while to load or save? Use the <a href="https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#feather">feather</a> format for a substantial speedup.  See <a href="https://medium.com/@black_swan/pandas-i-o-benchmarking-56cd688f832b">this Pandas I/O benchmarking</a> article for more detail.<br> 
Feather is a binary format with some minor restrictions (see docs).  <mark>To use it, just load the dataframe with a pandas.read_* function (e.g. pandas.read_csv) the first time, then save and load it as a feather file from then on.</mark>

In [27]:
# feather support depends on pyarrow; if it is missing, uncomment the next line to install it
# !conda install pyarrow -y

In [28]:
# Persist the frame in feather (binary) format.
df.to_feather(path=FILE_FEATHER)

In [29]:
# Reload the frame from the feather file and preview the first 5 rows
# to confirm the round trip worked.
df = pd.read_feather(path=FILE_FEATHER)
df.head(n=5)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.925
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.05


In [33]:
# Delete the generated feather file and processed CSV to conserve disk space.
# In a real application you would keep the processed file.
import os

# Remove each file independently: with a single shared try-block, a missing
# feather file would raise FileNotFoundError and skip the processed CSV,
# leaving it behind on disk.
for _generated_file in (FILE_FEATHER, FILE_PROCESSED):
    try:
        os.remove(_generated_file)
    except FileNotFoundError:
        # Already gone (e.g. the cell was run twice) — nothing to clean up.
        pass