In this notebook, we present various use cases for interacting with ptype to handle:

- incorrect column type predictions,
- incorrect missing type predictions,
- incorrect anomaly type predictions.

In [None]:
# Preamble to run notebook in context of source package.
# NBVAL_IGNORE_OUTPUT
import sys
# Put the parent directory at the front of the module search path so that
# imports in the following cells look there first.
sys.path.insert(0, '../')
# IPython shell escape: run pip through the interpreter executing this
# notebook and install everything listed in ../requirements.txt.
!{sys.executable} -m pip install -r ../requirements.txt


In [None]:
from IPython.core.display import display, HTML

import matplotlib.pyplot as plt
%matplotlib inline
plt.rcdefaults()

from ptype.Ptype import Ptype
from ptype.utils import evaluate_types
import pandas as pd
import numpy as np
from utils import *

In [None]:
# Create the Ptype instance that every following cell reuses for inference.
ptype = Ptype()

# 1. Incorrect Column Type Prediction

## 1.a Incorrect Type Prediction

In [None]:
# Column under study in the "accident2016" dataset.
column = "Time (24hr)"
df = read_data(dataset_name="accident2016", header=0)

# Subsample the frame via subsample_df, sampling from the chosen column.
df_subsample = subsample_df(
    df,
    column_to_sample_from=column,
    sample_num=10,
)
# Last expression of the cell, so the notebook renders the subsample.
df_subsample

In [None]:
# Run ptype's inference on the subsampled frame.
ptype.run_inference(_data_frame=df_subsample)

# Call the plotting helper for the column-type posterior of the selected column.
plot_column_type_posterior(ptype, column=column)

In [None]:
# Manually set the predicted type of this column to 'date'.
ptype.cols[column].predicted_type = 'date'


## 1.b Uniform posterior distribution

In [None]:
# Column under study in the 'inspection_outcomes' dataset.
column = 'Provision type'
df = read_data(dataset_name='inspection_outcomes', header=0)

# Subsample the frame via subsample_df, sampling from the chosen column.
df_subsample = subsample_df(df, column_to_sample_from=column, sample_num=10)
# Last expression of the cell, so the notebook renders the subsample.
df_subsample

In [None]:
# Run ptype's inference on the subsampled frame.
ptype.run_inference(_data_frame=df_subsample)

# Call the plotting helper for the column-type posterior of the selected column.
plot_column_type_posterior(ptype, column=column)

In [None]:
# Manually set the predicted type of this column to 'string'.
ptype.cols[column].predicted_type = 'string'
# TODO(review): open question from the original author — add the character
# to the alphabet?


# 2. Incorrect Missing Data Prediction

In [None]:
# No header argument is passed for the 'auto' dataset, and the column under
# study is addressed by the label 0.
column = 0
df = read_data(dataset_name='auto')

# Subsample the frame via subsample_df, sampling from the chosen column.
df_subsample = subsample_df(df, column_to_sample_from=column, sample_num=10)
# Last expression of the cell, so the notebook renders the subsample.
df_subsample

In [None]:
# Run ptype's inference on the subsampled frame.
ptype.run_inference(_data_frame=df_subsample)

# Call the plotting helper for the column-type posterior of the selected column.
plot_column_type_posterior(ptype, column=column)

# Call the row-type posterior plotting helper with t='missing'.
plot_row_type_posterior(ptype, column=column, t='missing')

In [None]:
# Pass ['-1'] to change_missing_data_annotations for this column.
# NOTE(review): presumably this corrects how '-1' is annotated as missing
# data — confirm against the ptype API.
ptype.cols[column].change_missing_data_annotations(['-1'])


# TODO(review): open question from the original author — should the column
# type posterior be updated and re-plotted here as well?
# plot_column_type_posterior(ptype, column=column)

# Plot the row-type posterior with t='missing' again, after the change above.
plot_row_type_posterior(ptype, column=column, t='missing')

In [None]:
# Show ptype's results, passing a one-element list holding the current column.
ptype.show_results([column,])

# 3.a Incorrect Anomaly Prediction

In [None]:
# Load the 'data_gov_10151_1' dataset and choose the column under study.
df = read_data(dataset_name='data_gov_10151_1', header=0)
column = 'Status'

# Subsample the frame via subsample_df, sampling from the chosen column.
df_subsample = subsample_df(df, column_to_sample_from=column, sample_num=20)
# Last expression of the cell, so the notebook renders the subsample.
df_subsample

In [None]:
# Convert every entry of the column to a string, then collect the distinct
# values together with how often each one occurs.
unique_values, counts = np.unique(
    list(map(str, df_subsample[column].tolist())), return_counts=True
)

# Bar chart of the occurrence counts per distinct value.
plot_bar(
    unique_values,
    counts,
    title="counts of the unique data values",
    xlabel="Unique Value",
    ylabel="Counts",
    y_lim_max=None,
)

In [None]:
# Run ptype's inference on the subsampled frame.
ptype.run_inference(_data_frame=df_subsample)

# Call the plotting helper for the column-type posterior of the selected column.
plot_column_type_posterior(ptype, column=column)

# unique_values was computed in the preceding cell; ptype.normal_types[column]
# and ptype.anomaly_types[column] are used as indexers into it.
print('Normal Values', unique_values[ptype.normal_types[column]])
print('Anomalous Values', unique_values[ptype.anomaly_types[column]])
# Call the row-type posterior plotting helper with t='anomaly'.
plot_row_type_posterior(ptype, column=column, t='anomaly')

In [None]:
# Pass ['T', 'U'] to change_anomaly_annotations for this column.
# NOTE(review): presumably this corrects how 'T' and 'U' are annotated as
# anomalies — confirm against the ptype API.
ptype.cols[column].change_anomaly_annotations(['T', 'U'])

# Print the normal/anomalous values and plot the row-type posterior again,
# after the change above.
print('Normal Values', unique_values[ptype.normal_types[column]])
print('Anomalous Values', unique_values[ptype.anomaly_types[column]])
plot_row_type_posterior(ptype, column=column, t='anomaly')

# 3.b Incorrect Anomaly Prediction

In [None]:
# Column under study in the "survey" dataset.
column = "Gender"
df = read_data(dataset_name="survey", header=0)

# Subsample the frame via subsample_df and render it explicitly, because it
# is not the last expression of the cell.
df_subsample = subsample_df(
    df,
    column_to_sample_from=column,
    sample_num=10,
)
display(df_subsample)

# Convert every entry of the column to a string, then collect the distinct
# values together with how often each one occurs.
unique_values, counts = np.unique(
    list(map(str, df_subsample[column].tolist())), return_counts=True
)

# Bar chart of the occurrence counts per distinct value.
plot_bar(
    unique_values,
    counts,
    title="counts of the unique data values",
    xlabel="Unique Value",
    ylabel="Counts",
    y_lim_max=None,
)

In [None]:
# Run ptype's inference on the subsampled frame.
ptype.run_inference(_data_frame=df_subsample)

# Call the plotting helper for the column-type posterior of the selected column.
plot_column_type_posterior(ptype, column=column)

# unique_values was computed in the preceding cell; ptype.normal_types[column]
# and ptype.anomaly_types[column] are used as indexers into it.
print("Normal Values", unique_values[ptype.normal_types[column]])
print("Anomalous Values", unique_values[ptype.anomaly_types[column]])

# `column` is passed by keyword here, as in every other call to this helper.
plot_row_type_posterior(ptype, column=column, t='anomaly')

# 4. Multiple Missing Data Encodings

In [None]:
# Column under study in the "mass_6" dataset.
column = "LRE Ages 3-5 - Full Incl #"
# The dataset name is passed by keyword, as in the other read_data calls.
df = read_data(dataset_name="mass_6", header=0)

# Subsample the frame via subsample_df and render it explicitly, because it
# is not the last expression of the cell.
df_subsample = subsample_df(
    df,
    column_to_sample_from=column,
    sample_num=20,
)
display(df_subsample)

# Convert every entry of the column to a string, then collect the distinct
# values together with how often each one occurs.
unique_values, counts = np.unique(
    list(map(str, df_subsample[column].tolist())), return_counts=True
)

# Bar chart of the occurrence counts per distinct value.
plot_bar(
    unique_values,
    counts,
    title="counts of the unique data values",
    xlabel="Unique Value",
    ylabel="Counts",
    y_lim_max=None,
)

In [None]:
# Run ptype's inference on the subsampled frame.
ptype.run_inference(_data_frame=df_subsample)

# Call the plotting helper for the column-type posterior of the selected column.
plot_column_type_posterior(ptype, column=column)

# unique_values was computed in the preceding cell; ptype.normal_types[column]
# and ptype.missing_types[column] are used as indexers into it.
print('Normal Values', unique_values[ptype.normal_types[column]])
print('Missing Values', unique_values[ptype.missing_types[column]])

# Call the row-type posterior plotting helper with t='missing'.
plot_row_type_posterior(ptype, column=column, t='missing')

In [None]:
# NOTE(review): presumably replace_missing rewrites the missing-data entries
# of this column to `new_encoding` — confirm against Ptype.replace_missing.
new_encoding = 'NA'
ptype.replace_missing(column, new_encoding)

# Recompute the distinct values (as strings) and their counts, this time from
# the data held by ptype.model rather than from df_subsample.
unique_values, counts = np.unique(
    list(map(str, ptype.model.data[column].tolist())), return_counts=True
)

# ptype.normal_types[column] and ptype.missing_types[column] are used as
# indexers into the recomputed unique_values.
print('Normal Values', unique_values[ptype.normal_types[column]])
print('Missing Values', unique_values[ptype.missing_types[column]])

# Call the row-type posterior plotting helper with t='missing'.
plot_row_type_posterior(ptype, column=column, t='missing')