In this notebook, we present several use cases that show how to use ptype to predict column types.

In [None]:
# Preamble to run notebook in context of source package.
# NBVAL_IGNORE_OUTPUT
import sys
# Put the parent directory at the front of the import path so that imports are
# resolved against it first (presumably the repository root holding `ptype`).
sys.path.insert(0, '../')
# !{sys.executable} -m pip install -r ../requirements.txt


In [None]:
from IPython.core.display import display, HTML
from sklearn.linear_model import LinearRegression

import matplotlib.pyplot as plt
%matplotlib inline
plt.rcdefaults()
import numpy as np
import pandas as pd
import seaborn as sns

from ptype.Ptype import Ptype, Column2ARFF
from ptype.utils import evaluate_types
from utils import *

### UCI Automobile Dataset

In [None]:
# Column names for the UCI Automobile table; they are passed to read_csv
# explicitly through `names=`.
headers = [
    "symboling", "normalized-losses", "make", "fuel-type", "aspiration",
    "num-of-doors", "body-style", "drive-wheels", "engine-location",
    "wheel-base", "length", "width", "height", "curb-weight", "engine-type",
    "num-of-cylinders", "engine-size", "fuel-system", "bore", "stroke",
    "compression-ratio", "horsepower", "peak-rpm", "city-mpg", "highway-mpg",
    "price",
]

# Load the data with pandas' default dtype inference and preview the first rows.
df = pd.read_csv("../data/auto.csv", names=headers)
df.head()

### The Analytical Task

This dataset is commonly used for a regression task, where the goal is to predict the price of an automobile given its attributes.

### A Solution using Standard Python Libraries
Let's now develop a simple solution to this problem. The solution is inspired by two Kaggle notebooks (see https://www.kaggle.com/fazilbtopal/data-wrangling and https://www.kaggle.com/fazilbtopal/model-development-and-evaluation-with-python).

In [None]:
# Predictor columns and target column for the price regression.
features = ['horsepower', 'curb-weight', 'engine-size', 'highway-mpg']
target = ['price']

# Keep only the modelling columns. The explicit copy makes `df` an independent
# frame, so cleaning it later does not operate on a slice of the full table
# (which is what triggers pandas' SettingWithCopyWarning).
df = df[features + target].copy()

X = df[features]
y = df[target]

df.head()

In [None]:
lm = LinearRegression()

# The raw columns still contain the placeholder string "?", so the fit is
# expected to fail at this point. The failure is the point of this cell, so it
# is caught and shown instead of aborting a "Restart & Run All".
try:
    lm.fit(X, y)
    y_hat = lm.predict(X)
except ValueError as err:
    print("LinearRegression could not be fitted on the raw data: " + str(err))

We notice that some data entries have the value `?`, which leads pandas to infer the type of two columns as `object` rather than `int64`.

In [None]:
# Show the dtype pandas assigned to each column of the working frame.
df.dtypes

We need to "clean" the horsepower and price columns by handling their missing values. Let's first have a look at what we can do without ptype:

In [None]:
# Columns in which missing values are encoded as the string "?".
cols_with_missing = ["horsepower", "price"]

# replace missing data encoding
# assign() returns a new frame, which avoids a chained in-place replace on a
# column (unreliable on slices and a no-op under pandas Copy-on-Write).
df = df.assign(**{col: df[col].replace("?", np.nan) for col in cols_with_missing})

# check counts of missing data
missing_data = df.isnull().sum().sort_values(ascending=False)
# Only the last expression of a cell is rendered, so display this one explicitly.
display(missing_data.head())

# drop rows with a missing value in either column and update the indices
n = df.shape[0]
df = df.dropna(subset=cols_with_missing, axis=0).reset_index(drop=True)
print("# rows deleted = " + str(n - df.shape[0]))

In [None]:
X = df[features]
y = df[target]

# Fit on the cleaned data with a fresh estimator so the cell does not depend on
# the state of an earlier one.
lm = LinearRegression()
lm.fit(X, y)
y_hat = lm.predict(X)

fig, ax1 = plt.subplots(figsize=(6, 4))

# sns.distplot is deprecated; with hist=False it only drew a KDE curve, which
# sns.kdeplot draws directly. The inputs are flattened to 1-D float arrays,
# mirroring the conversion distplot applied to its input.
sns.kdeplot(np.asarray(y, dtype=float).ravel(), color="r", label="Actual Value", ax=ax1)
sns.kdeplot(np.asarray(y_hat, dtype=float).ravel(), color="b", label="Fitted Values", ax=ax1)

ax1.set_title('Actual vs Fitted Values for Price')
ax1.set_xlabel('Price (in dollars)')
ax1.set_ylabel('Proportion of Cars')
# Draw the legend explicitly; recent seaborn versions do not add one for `label`.
ax1.legend()

plt.show()

Let's now reproduce the error and see how we can use ptype to resolve the issue.

In [None]:
headers = ["symboling","normalized-losses","make","fuel-type","aspiration", "num-of-doors","body-style",
         "drive-wheels","engine-location","wheel-base", "length","width","height","curb-weight","engine-type",
         "num-of-cylinders", "engine-size","fuel-system","bore","stroke","compression-ratio","horsepower",
         "peak-rpm","city-mpg","highway-mpg","price"]

# Read every column as a string so that no dtype is inferred by pandas, then
# keep an independent copy of the modelling columns.
df = pd.read_csv('../data/auto.csv', names=headers, dtype='str')
df = df[features + target].copy()
# Only the last expression of a cell is rendered, so show this preview explicitly.
display(df.head())

# Run ptype on the string-typed frame and take the frame it returns.
# NOTE(review): presumably get_final_df() returns the data with inferred types
# applied and missing entries normalised -- confirm against the ptype API.
ptype = Ptype()
ptype.run_inference(df)

df = ptype.get_final_df()
df.head()

In [None]:
# check counts of missing data
missing_data = df.isnull().sum().sort_values(ascending=False)
# Only the last expression of a cell is rendered, so display this one explicitly.
display(missing_data.head())

# drop rows with a missing value in either column and update the indices;
# a new frame is built instead of mutating `df` in place, so the cell can be
# re-run safely.
n = df.shape[0]
df = df.dropna(subset=["horsepower", "price"], axis=0).reset_index(drop=True)
print("# rows deleted = " + str(n - df.shape[0]))

In [None]:
X = df[features]
y = df[target]

lm = LinearRegression()
lm.fit(X, y)
y_hat = lm.predict(X)

fig, ax1 = plt.subplots(figsize=(6, 4))

# sns.distplot is deprecated; with hist=False it only drew a KDE curve, which
# sns.kdeplot draws directly. The inputs are flattened to 1-D float arrays,
# mirroring the conversion distplot applied to its input.
sns.kdeplot(np.asarray(y, dtype=float).ravel(), color="r", label="Actual Value", ax=ax1)
sns.kdeplot(np.asarray(y_hat, dtype=float).ravel(), color="b", label="Fitted Values", ax=ax1)

ax1.set_title('Actual vs Fitted Values for Price')
ax1.set_xlabel('Price (in dollars)')
ax1.set_ylabel('Proportion of Cars')
# Draw the legend explicitly; recent seaborn versions do not add one for `label`.
ax1.legend()

plt.show()

# 1. Incorrect Column Type Prediction

## 1.a Incorrect Type Prediction

In [None]:
# Create the Column2ARFF helper, passing it the "../models/" directory.
# NOTE(review): presumably the pre-trained ARFF-type models are loaded from
# that directory -- confirm against the Column2ARFF constructor.
column2ARFF = Column2ARFF("../models/")

In [None]:
# Column examined in this example.
column = "Time (24hr)"

# Load the dataset, then draw a small sample based on that column.
df = read_data(dataset_name="accident2016", header=0)
df_subsample = subsample_df(
    df,
    column_to_sample_from=column,
    sample_num=10,
)
df_subsample

In [None]:
# Run ptype on the sampled rows.
ptype.run_inference(_data_frame=df_subsample)

# Plot the type posterior stored for the selected column.
column_posterior = ptype.all_posteriors["demo"][column]
plot_column_type_posterior(p_t=column_posterior, types=ptype.types.items())

In [None]:
# Use a dedicated name for ptype's per-column features: rebinding `features`
# would overwrite the list of regression feature names defined earlier and
# break the regression cells if they are re-run afterwards.
column_features = ptype.features[column]
arff_type, arff_post = column2ARFF.get_arff(column_features)

plot_arff_type_posterior(arff_post)

In [None]:
# Override the predicted type of the column with 'date'.
ptype.reclassify_column(column, 'date')

# Plot the column's type posterior again after the override.
updated_posterior = ptype.all_posteriors["demo"][column]
plot_column_type_posterior(p_t=updated_posterior, types=ptype.types.items())

# TODO: do the same thing for arff type

## 1.b Uniform posterior distribution

In [None]:
# Column examined in this example.
column = 'Provision type'

# Load the dataset, then draw a small sample based on that column.
df = read_data(dataset_name='inspection_outcomes', header=0)
df_subsample = subsample_df(
    df,
    column_to_sample_from=column,
    sample_num=10,
)
df_subsample

In [None]:
# Run ptype on the sampled rows.
ptype.run_inference(_data_frame=df_subsample)

# Plot the type posterior stored for the selected column.
column_posterior = ptype.all_posteriors["demo"][column]
plot_column_type_posterior(p_t=column_posterior, types=ptype.types.items())

In [None]:
# Use a dedicated name for ptype's per-column features: rebinding `features`
# would overwrite the list of regression feature names defined earlier and
# break the regression cells if they are re-run afterwards.
column_features = ptype.features[column]
arff_type, arff_post = column2ARFF.get_arff(column_features)

plot_arff_type_posterior(arff_post)

In [None]:
# Override the predicted type of the column with 'string'.
ptype.reclassify_column(column, 'string')

# Plot the column's type posterior again after the override.
updated_posterior = ptype.all_posteriors["demo"][column]
plot_column_type_posterior(p_t=updated_posterior, types=ptype.types.items())

# TODO: do the same thing for arff type
# TODO: add the character to the alphabet?