In [1]:
import sys  # module for python interpreter
sys.path.append('../')  # necessary for relative import of utils file e.g.

import os
import requests

import pandas as pd

from IPython.core.interactiveshell import InteractiveShell
# %load_ext autoreload
# %autoreload 2
# # %matplotlib inline
# %config IPCompleter.greedy=True

# pd.options.display.max_columns = None
# pd.options.display.max_rows = 200
# sns.set_palette("bright")
# sns.set(style="darkgrid")

# Display the value of every bare expression in a cell, not only the last one
# (several cells below rely on this, e.g. showing `df.columns` and `df` together).
InteractiveShell.ast_node_interactivity = "all"

# Load dataset

In [2]:
def download(url: str, dest_folder: str, timeout: float = 30.0) -> None:
    """Download ``url`` into ``dest_folder``, streaming the body to disk.

    The local file name is the last ``/``-separated segment of ``url`` with
    spaces replaced by underscores. ``dest_folder`` is created if missing.
    On a non-OK HTTP status nothing is written; the status code and response
    text are printed instead.

    Args:
        url: Address of the file to fetch.
        dest_folder: Directory the file is saved into.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``, so a
            stalled server cannot block the notebook indefinitely.

    Raises:
        requests.exceptions.RequestException: On connection errors or timeout.
        OSError: If the folder or the file cannot be created or written.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(dest_folder, exist_ok=True)

    filename = url.split('/')[-1].replace(" ", "_")
    file_path = os.path.join(dest_folder, filename)

    # The context manager releases the streamed connection on every exit path.
    with requests.get(url, stream=True, timeout=timeout) as r:
        if not r.ok:
            print("Download failed: status code {}\n{}".format(r.status_code, r.text))
            return

        print("saving to", os.path.abspath(file_path))
        with open(file_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024 * 8):
                if chunk:  # skip empty keep-alive chunks
                    f.write(chunk)
            # Sync once after the last chunk instead of once per 8 KiB chunk.
            f.flush()
            os.fsync(f.fileno())

In [3]:
# Titanic passenger CSV hosted on the Stanford CS109 course archive.
url_to_titanic_data = (
    'https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv'
)

# Fetch it into ./data; download() derives the file name from the URL.
download(url=url_to_titanic_data, dest_folder='./data')

saving to /Users/danieldeutsch/Desktop/code/pandas_transform_format/data/titanic.csv


# Transform data

In [5]:
# Load the downloaded CSV, then write the same frame back out as JSON.
df = pd.read_csv(filepath_or_buffer='./data/titanic.csv')
df.to_json(path_or_buf=r'./data/titanic.json')

# Two bare expressions: the column index first, then the frame itself.
df.columns
df

Index(['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'Siblings/Spouses Aboard',
       'Parents/Children Aboard', 'Fare'],
      dtype='object')

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.2500
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.9250
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1000
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.0500
...,...,...,...,...,...,...,...,...
882,0,2,Rev. Juozas Montvila,male,27.0,0,0,13.0000
883,1,1,Miss. Margaret Edith Graham,female,19.0,0,0,30.0000
884,0,3,Miss. Catherine Helen Johnston,female,7.0,1,2,23.4500
885,1,1,Mr. Karl Howell Behr,male,26.0,0,0,30.0000


In [6]:
# Read the JSON file written above back into a frame and display it.
df = pd.read_json(path_or_buf='./data/titanic.json')
df

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.2500
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.9250
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1000
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.0500
...,...,...,...,...,...,...,...,...
882,0,2,Rev. Juozas Montvila,male,27.0,0,0,13.0000
883,1,1,Miss. Margaret Edith Graham,female,19.0,0,0,30.0000
884,0,3,Miss. Catherine Helen Johnston,female,7.0,1,2,23.4500
885,1,1,Mr. Karl Howell Behr,male,26.0,0,0,30.0000


In [10]:
# Render the frame as CSV text: with no path argument, to_csv() returns the
# CSV as a string. (A bare df.to_hdf() raises TypeError, since to_hdf needs
# both a path and a key.)
df.to_csv()

",Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare\n0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.25\n1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cumings,female,38.0,1,0,71.2833\n2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.925\n3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1\n4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.05\n5,0,3,Mr. James Moran,male,27.0,0,0,8.4583\n6,0,1,Mr. Timothy J McCarthy,male,54.0,0,0,51.8625\n7,0,3,Master. Gosta Leonard Palsson,male,2.0,3,1,21.075\n8,1,3,Mrs. Oscar W (Elisabeth Vilhelmina Berg) Johnson,female,27.0,0,2,11.1333\n9,1,2,Mrs. Nicholas (Adele Achem) Nasser,female,14.0,1,0,30.0708\n10,1,3,Miss. Marguerite Rut Sandstrom,female,4.0,1,1,16.7\n11,1,1,Miss. Elizabeth Bonnell,female,58.0,0,0,26.55\n12,0,3,Mr. William Henry Saundercock,male,20.0,0,0,8.05\n13,0,3,Mr. Anders Johan Andersson,male,39.0,1,5,31.275\n14,0,3,Miss. Hulda Amanda Adolfina Vestrom,female,14.0,0,0,7.8542000000000005\n15,1,2,

In [13]:


# Write the frame to an HDF5 file under the key 'titanic_data'.
# mode='w' creates the file, replacing any existing file of that name.
# `key` is passed by keyword: pandas deprecates positional arguments to
# to_hdf other than the path.
df.to_hdf(
    './titanic_data.h5',
    key='titanic_data',
    mode='w')

In [16]:
# Load the frame back from the HDF5 file; no key is given here.
pd.read_hdf(path_or_buf='./titanic_data.h5')

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.2500
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.9250
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1000
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.0500
...,...,...,...,...,...,...,...,...
882,0,2,Rev. Juozas Montvila,male,27.0,0,0,13.0000
883,1,1,Miss. Margaret Edith Graham,female,19.0,0,0,30.0000
884,0,3,Miss. Catherine Helen Johnston,female,7.0,1,2,23.4500
885,1,1,Mr. Karl Howell Behr,male,26.0,0,0,30.0000
