In [8]:
import os
import pandas as pd
from azureml.core import Workspace, Datastore, Dataset

DATA_PATH = "../data"
filename = "titanic_dataset.csv"

In [9]:
df = pd.read_csv(f"{DATA_PATH}/002_cleaned/{filename}")

In [10]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [11]:
# Select target and relevant features
df = df[['Survived', 'Pclass', 'Sex', 'Age', 'Fare', 'Embarked']]

In [12]:
# One hot encoding for categorical variables
df = pd.get_dummies(data=df, columns=['Pclass', 'Sex', 'Embarked'], drop_first=True)

df.head()

Unnamed: 0,Survived,Age,Fare,Pclass_2,Pclass_3,Sex_male,Embarked_Q,Embarked_S
0,0,22.0,7.25,0,1,1,0,1
1,1,38.0,71.2833,0,0,0,0,0
2,1,26.0,7.925,0,1,0,0,1
3,1,35.0,53.1,0,0,0,0,1
4,0,35.0,8.05,0,1,1,0,1


In [13]:
output_path = f"{DATA_PATH}/003_preprocessed/"

if not os.path.exists(output_path):
    os.makedirs(output_path)

df.to_csv(f"{output_path}/{filename}", index=False)

In [17]:
# connect to workspace
ws = Workspace.from_config()

# upload file to BLOB Storage
datastore = Datastore.get(ws, datastore_name='bc_blob')

datastore.upload(
    src_dir=f"{output_path}", 
    target_path=f"ml/preprocessed/",
    overwrite=True)

# register preprocessed dataset as TabularDataset in AzureML Workspace
file_dataset = Dataset.File.from_files((datastore, 'ml/preprocessed/**'))
file_dataset.register(ws, name='ds-titanic-preprocessed')

tabular_dataset = Dataset.Tabular.from_delimited_files((datastore, f"ml/preprocessed/{filename}"))
tabular_dataset.register(ws, name='ds-titanic-preprocssed-tabular')

Uploading an estimated of 3 files
Uploading ../data/003_preprocessed/.amlignore
Uploaded ../data/003_preprocessed/.amlignore, 1 files out of an estimated total of 3
Uploading ../data/003_preprocessed/.amlignore.amltmp
Uploaded ../data/003_preprocessed/.amlignore.amltmp, 2 files out of an estimated total of 3
Uploading ../data/003_preprocessed/titanic_dataset.csv
Uploaded ../data/003_preprocessed/titanic_dataset.csv, 3 files out of an estimated total of 3
Uploaded 3 files


{
  "source": [
    "('bc_blob', 'bronze/preprocessed/titanic_dataset.csv')"
  ],
  "definition": [
    "GetDatastoreFiles",
    "ParseDelimited",
    "DropColumns",
    "SetColumnTypes"
  ],
  "registration": {
    "id": "a94c8d27-1431-4d3a-aefd-1bac1a06da83",
    "name": "ds-titanic-preprocssed-tabular",
    "version": 1,
    "workspace": "Workspace.create(name='bc-aml-weu-001', subscription_id='47cc8772-2fef-4fc8-bb9f-9905aa79b595', resource_group='azureml-csa-bootcamp')"
  }
}