In [1]:
from typing import Dict, Tuple, List, Any
import datetime as dt
import json
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn import metrics
import xgboost as xgb
import mlflow
from category_encoders import CountEncoder
from kedro.framework.session import KedroSession
from kedro.framework.startup import bootstrap_project
import sys
sys.path.append("../../src/hellow_kedro/pipelines/data_processing")
from helpers import _is_true, _parse_percentage, _parse_money

In [2]:
# Bootstrap the Kedro project located two directories above the notebook's
# working directory, then open a session and grab its context.
project_root = Path.cwd().parent.parent
metadata = bootstrap_project(project_root)

session_options = {
    "project_path": metadata.project_path,
    "env": None,  # no explicit configuration environment requested
    # Optional settings, intentionally left disabled:
    # "save_on_close": True,
    # "extra_params": extra_params,
}
with KedroSession.create(metadata.package_name, **session_options) as session:
    context = session.load_context()


2021-12-25 06:26:40,428 - kedro.framework.session.store - INFO - `save()` not implemented for `BaseSessionStore`. Skipping the step.


In [3]:
# Display the parameters exposed by the loaded Kedro context.
context.params

{'test_size': 0.2,
 'random_state': 3,
 'features': ['engines',
  'passenger_capacity',
  'crew',
  'd_check_complete',
  'moon_clearance_complete',
  'iata_approved',
  'company_rating',
  'review_scores_rating']}

In [4]:
def preprocess_companies(companies: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of the companies table.

    The ``iata_approved`` column is passed through ``_is_true`` and the
    ``company_rating`` column through ``_parse_percentage`` (both helpers are
    imported from the project's ``helpers`` module; presumably they yield a
    boolean flag and a numeric rating respectively -- confirm against the
    helper implementations).

    Args:
        companies: Raw companies data containing at least the
            ``iata_approved`` and ``company_rating`` columns.

    Returns:
        A new DataFrame with the two columns converted. The input frame is
        left untouched so the caller's raw data is not silently altered and
        re-running the cell stays idempotent.

    Raises:
        KeyError: If either required column is missing.
    """
    # Work on a copy: assigning columns on the argument would mutate the
    # caller's DataFrame as a side effect.
    cleaned = companies.copy()
    cleaned["iata_approved"] = _is_true(cleaned["iata_approved"])
    cleaned["company_rating"] = _parse_percentage(cleaned["company_rating"])
    return cleaned

In [5]:
def preprocess_shuttles(shuttles: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of the shuttles table.

    The ``d_check_complete`` and ``moon_clearance_complete`` columns are
    passed through ``_is_true`` and the ``price`` column through
    ``_parse_money`` (helpers imported from the project's ``helpers`` module;
    presumably they yield boolean flags and a numeric price -- confirm
    against the helper implementations).

    Args:
        shuttles: Raw shuttles data containing at least the
            ``d_check_complete``, ``moon_clearance_complete`` and ``price``
            columns.

    Returns:
        A new DataFrame with the three columns converted. The input frame is
        left untouched so the caller's raw data is not silently altered and
        re-running the cell stays idempotent.

    Raises:
        KeyError: If any required column is missing.
    """
    # Work on a copy: assigning columns on the argument would mutate the
    # caller's DataFrame as a side effect.
    cleaned = shuttles.copy()
    cleaned["d_check_complete"] = _is_true(cleaned["d_check_complete"])
    cleaned["moon_clearance_complete"] = _is_true(cleaned["moon_clearance_complete"])
    cleaned["price"] = _parse_money(cleaned["price"])
    return cleaned

In [6]:
def create_model_input_table(
    shuttles: pd.DataFrame, companies: pd.DataFrame, reviews: pd.DataFrame
) -> pd.DataFrame:
    """Join shuttles, reviews and companies into a single modelling table.

    Shuttles are matched to reviews on ``shuttles.id == reviews.shuttle_id``;
    the result is then matched to companies on
    ``company_id == companies.id``. Both joins use pandas' default (inner)
    merge, and any row containing a missing value is dropped afterwards.

    Args:
        shuttles: Shuttle records with ``id`` and ``company_id`` columns.
        companies: Company records with an ``id`` column.
        reviews: Review records with a ``shuttle_id`` column.

    Returns:
        The joined table with incomplete rows removed.
    """
    shuttles_with_reviews = pd.merge(
        shuttles, reviews, left_on="id", right_on="shuttle_id"
    )
    joined = pd.merge(
        shuttles_with_reviews, companies, left_on="company_id", right_on="id"
    )
    return joined.dropna()

In [7]:
# Load the companies dataset, clean it, then persist the result to the catalog.
companies_df = context.catalog.load("companies")
preprocessed_companies = preprocess_companies(companies_df)
context.catalog.save("preprocessed_companies", preprocessed_companies)

No files found in ['/kedro-sample/own_examples/conf/base', '/kedro-sample/own_examples/conf/local'] matching the glob pattern(s): ['credentials*', 'credentials*/**', '**/credentials*']
  warn(f"Credentials not found in your Kedro project config.\n{str(exc)}")


2021-12-25 06:26:47,383 - kedro.io.data_catalog - INFO - Loading data from `companies` (CSVDataSet)...
2021-12-25 06:26:47,510 - kedro.io.data_catalog - INFO - Saving data to `preprocessed_companies` (CSVDataSet)...


In [8]:
# Load the shuttles dataset, clean it, then persist the result to the catalog.
shuttles_df = context.catalog.load("shuttles")
preprocessed_shuttles = preprocess_shuttles(shuttles_df)
context.catalog.save("preprocessed_shuttles", preprocessed_shuttles)

2021-12-25 06:30:33,047 - kedro.io.data_catalog - INFO - Loading data from `shuttles` (ExcelDataSet)...
2021-12-25 06:30:49,879 - kedro.io.data_catalog - INFO - Saving data to `preprocessed_shuttles` (CSVDataSet)...


  x = x.str.replace("$", "").str.replace(",", "")


In [9]:
# Reload the three inputs from the catalog (in the same order as before),
# join them into the model input table, and persist the result.
shuttles_input = context.catalog.load("preprocessed_shuttles")
companies_input = context.catalog.load("preprocessed_companies")
reviews_input = context.catalog.load("reviews")

model_input_table = create_model_input_table(
    shuttles=shuttles_input,
    companies=companies_input,
    reviews=reviews_input,
)
context.catalog.save("model_input_table", model_input_table)

2021-12-25 06:31:42,290 - kedro.io.data_catalog - INFO - Loading data from `preprocessed_shuttles` (CSVDataSet)...
2021-12-25 06:31:42,453 - kedro.io.data_catalog - INFO - Loading data from `preprocessed_companies` (CSVDataSet)...
2021-12-25 06:31:42,515 - kedro.io.data_catalog - INFO - Loading data from `reviews` (CSVDataSet)...
2021-12-25 06:31:44,908 - kedro.io.data_catalog - INFO - Saving data to `model_input_table` (CSVDataSet)...
