In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
%%time

# Third-party imports
import os.path as op
import pandas as pd
import great_expectations as ge

# Project imports
from ta_lib.core.api import display_as_tabs, initialize_environment

# Initialization
# Set up the ta_lib notebook environment with debug mode off and warnings
# hidden (per the keyword arguments; see ta_lib.core.api for the exact effects).
# NOTE(review): the saved output of this cell is a ValueError ("ClassSelector
# parameter None value must be an instance of (function, tuple) ..."). The same
# error is recorded for the bare `ta_lib.core.api` import in the next code
# cell, so it presumably originates while importing ta_lib rather than in this
# call — confirm, and fix the environment before relying on later cells.
initialize_environment(debug=False, hide_warnings=True)

ValueError: ClassSelector parameter None value must be an instance of (function, tuple), not <function size at 0x00000120E7CCE530>.

# Data

## Background

This is a house price prediction package using machine learning.

In [3]:
from ta_lib.core.api import create_context, list_datasets, load_dataset

ValueError: ClassSelector parameter None value must be an instance of (function, tuple), not <function size at 0x00000120E7CCE530>.

In [None]:
# Build the ta_lib context from the project config file. The path is relative,
# so it resolves against the kernel's current working directory.
config_path = op.join('conf', 'config.yml')
context = create_context(config_path)

In [None]:
# List the datasets known to the context; the return value is shown as the
# cell output.
list_datasets(context)

In [None]:
# Download the housing archive and unpack it into the raw-data folder.
# The archive itself is always deleted afterwards, even when the download or
# the extraction fails, so no partial .tgz is left next to the raw data.
import tarfile
import urllib.request
import os

housing_url = "https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.tgz"
# Relative path: resolved against the kernel's current working directory.
housing_path = "../../data/raw/housing/"

os.makedirs(housing_path, exist_ok=True)
tgz_path = op.join(housing_path, "housing.tgz")
try:
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even if extraction raises.
    with tarfile.open(tgz_path) as housing_tgz:
        if hasattr(tarfile, "data_filter"):
            # Extraction filters are supported by this Python: reject members
            # that would land outside housing_path (absolute paths, "..",
            # unsafe links). The archive comes from the network, so it is
            # treated as untrusted input.
            housing_tgz.extractall(path=housing_path, filter="data")
        else:
            # Older Python without extraction filters: plain extraction.
            housing_tgz.extractall(path=housing_path)
finally:
    # Guard with exists(): if the download never created the file, a failing
    # os.remove() must not mask the original exception.
    if op.exists(tgz_path):
        os.remove(tgz_path)

In [None]:
# load datasets
# '/raw/housing' is the dataset key resolved through the context; presumably
# it maps to the files extracted by the download cell — verify against
# conf/config.yml.
housing_df = load_dataset(context, '/raw/housing')

## Exploratory Analysis

Given the raw data from the data ingestion step, we now explore it to learn more about its contents.


The output of this step is a summary report and a discussion of any pertinent findings.

In [None]:
# Import the eda API
import ta_lib.eda.api as eda

### Variable Summary

In [None]:
# Display housing_df.shape, paired with the label 'housing', via the ta_lib
# tab helper.
display_as_tabs([('housing', housing_df.shape)])

In [None]:
# Variable summary of the housing data, computed by the ta_lib EDA helper.
# NOTE(review): `sum1` is re-bound by several later cells, so its value
# depends on which cell ran last.
sum1 = eda.get_variable_summary(housing_df)

display_as_tabs([('housing', sum1)])

In [None]:
# Missing-value count per column (assumes housing_df is a pandas DataFrame —
# TODO confirm the type returned by load_dataset).
housing_df.isna().sum()

**Dev NOTES**

<details>

1. Datatypes: We have both numeric and other types. The bulk of them seem to be numeric. `Numeric` is defined to be one of [float|int|date] and the rest are categorized as `Others`.

</details>

## Health Analysis

Get an overview of the overall health of the dataset. This is usually quick to compute and hopefully highlights some problems to focus on.

### Summary Plot

Provides a high level summary of the dataset health.

**Watch out for:**

* too few numeric values
* high % of missing values
* high % of duplicate values
* high % of duplicate columns 

In [None]:
# Compute the data-health summary. With return_plot=True the helper returns
# two values (summary, plot); only the plot is displayed here.
sum1, plot1 = eda.get_data_health_summary(housing_df, return_plot=True)

display_as_tabs([('housing', plot1)])

**Dev NOTES**

<details>

1. Datatypes: We have both numeric and other types. The bulk of them seem to be numeric. `Numeric` is defined to be one of [float|int|date] and the rest are categorized as `Others`. A column is assumed to have `date` values if it has the string `date` in the column name.

2. The missing-value plot seems to indicate that no missing values are present, but the dataset does contain some.

</details>

### Missing Values summary

This provides an overall view focusing on the amount of missing values in the dataset.

**Watch out for:**
* A few columns have a significant number of missing values
* Most columns have a significant number of missing values

In [None]:
# Compute the missing-values summary. With return_plot=True the helper returns
# two values (summary, plot); only the plot is displayed here.
sum1, plot1 = eda.get_missing_values_summary(housing_df, return_plot=True)

display_as_tabs([('housing', plot1)])

**Dev notes:**

<details>

* By default, the following are considered missing/NA values: `[np.nan, pd.NaT, 'NA', None]`.
* Additional values can be passed to tigerml (`add_additional_na_values`).
* These are applied to all columns.
* Some of the above information can be learnt from the data discovery step (see discussion below).

</details>

In [None]:
# Result of the ta_lib duplicate-columns check on the housing data
# (presumably a table of columns that duplicate one another — confirm against
# the ta_lib.eda documentation).
sum1 = eda.get_duplicate_columns(housing_df)

display_as_tabs([('housing', sum1)])

In [None]:
# Result of the ta_lib outlier check on the housing data, using the helper's
# default settings (the outlier definition is not visible here — see the
# ta_lib.eda documentation).
sum1 = eda.get_outliers(housing_df)

display_as_tabs([('housing', sum1)])

## Health Analysis report

Generate a report that collects all of the above analyses in a single HTML file. This can be useful for sharing with a client.

In [None]:
from ta_lib.reports.api import summary_report

# Generate the combined health report for housing_df. The second argument is
# presumably the output path; being relative, './housing.html' would resolve
# against the kernel's current working directory.
summary_report(housing_df, './housing.html')