Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Generalize infer_variable_type, convert_variable_data and convert_all_variable_data methods #423

Merged
merged 6 commits into from Feb 19, 2019
@@ -1,8 +1,8 @@
from __future__ import division, print_function
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function

import logging
from builtins import range
from datetime import datetime

import numpy as np
import pandas as pd
@@ -12,6 +12,13 @@

from featuretools import variable_types as vtypes
from featuretools.utils import is_string
from featuretools.utils.entity_utils import (
col_is_datetime,
convert_all_variable_data,
convert_variable_data,
get_linked_vars,
infer_variable_types
)
from featuretools.utils.wrangle import (
_check_time_type,
_check_timedelta,
@@ -207,71 +214,16 @@ def convert_variable_type(self, variable_id, new_type,
"""
if convert_data:
# first, convert the underlying data (or at least try to)
self.convert_variable_data(
variable_id, new_type, **kwargs)
self.df = convert_variable_data(df=self.df,
column_id=variable_id,
new_type=new_type,
**kwargs)

# replace the old variable with the new one, maintaining order
variable = self._get_variable(variable_id)
new_variable = new_type.create_from(variable)
self.variables[self.variables.index(variable)] = new_variable

def convert_all_variable_data(self, variable_types):
    """Coerce every dataframe column whose current pandas dtype does not
    already satisfy its desired variable type.

    Args:
        variable_types (dict[str -> type or tuple]): Maps variable ids to a
            desired type, or to ``(type, kwargs)`` where the kwargs are
            forwarded to the conversion.

    Raises:
        LookupError: If a variable id is not a column of ``self.df``.
    """
    # Each target vtype family paired with the set of pandas dtype names
    # that already satisfy it (module-level constants).
    conversions = ((vtypes.Numeric, _numeric_types),
                   (vtypes.Discrete, _categorical_types),
                   (vtypes.Datetime, _datetime_types))

    for var_id, desired_type in variable_types.items():
        type_args = {}
        if isinstance(desired_type, tuple):
            # (type, kwargs) form: unpack keyword arguments before use
            desired_type, type_args = desired_type[0], desired_type[1]

        if var_id not in self.df.columns:
            raise LookupError("Variable ID %s not in DataFrame" % (var_id))
        current_type = self.df[var_id].dtype.name

        for base_type, satisfying_dtypes in conversions:
            if issubclass(desired_type, base_type) and \
                    current_type not in satisfying_dtypes:
                self.convert_variable_data(var_id, desired_type, **type_args)

def convert_variable_data(self, column_id, new_type, **kwargs):
    """
    Convert variable in data set to different type.

    Args:
        column_id (str): Column of ``self.df`` to convert in place.
        new_type (type): Target variable type (a :mod:`variable_types` class).
        **kwargs: Type-specific options: ``format`` for Datetime,
            ``true_val``/``false_val`` for Boolean.

    Raises:
        TypeError: If a Numeric conversion coerced every previously
            non-null value to NaN (the column was all non-numeric strings).
        Exception: If ``new_type`` is not a supported conversion target.
    """
    df = self.df
    if df[column_id].empty:
        # Nothing to convert; leave the (empty) column untouched.
        return
    if new_type == vtypes.Numeric:
        orig_nonnull = df[column_id].dropna().shape[0]
        df[column_id] = pd.to_numeric(df[column_id], errors='coerce')
        # to_numeric coerces non-numeric strings to NaN. If the column
        # had non-null values and ALL of them became NaN, the column was
        # entirely strings and should not have been marked Numeric.
        nonnull = df[column_id].dropna().shape[0]
        if nonnull == 0 and orig_nonnull != 0:
            raise TypeError("Attempted to convert all string column {} to numeric".format(column_id))
    elif issubclass(new_type, vtypes.Datetime):
        # renamed local so the builtin `format` is not shadowed;
        # the kwargs key "format" is unchanged for callers
        datetime_format = kwargs.get("format", None)
        # TODO: if float convert to int?
        df[column_id] = pd.to_datetime(df[column_id], format=datetime_format,
                                       infer_datetime_format=True)
    elif new_type == vtypes.Boolean:
        map_dict = {kwargs.get("true_val", True): True,
                    kwargs.get("false_val", False): False,
                    True: True,
                    False: False}
        # TODO: what happens to nans?
        # np.bool was deprecated and removed in NumPy 1.24; the builtin
        # bool is the correct dtype argument here.
        df[column_id] = df[column_id].map(map_dict).astype(bool)
    elif not issubclass(new_type, vtypes.Discrete):
        raise Exception("Cannot convert column %s to %s" %
                        (column_id, new_type))

def query_by_values(self, instance_vals, variable_id=None, columns=None,
time_last=None, training_window=None):
"""Query instances that have variable with given value
@@ -345,9 +297,12 @@ def _create_variables(self, variable_types, index, time_index, secondary_time_in
if index not in variable_types:
variable_types[index] = vtypes.Index

inferred_variable_types = self.infer_variable_types(variable_types,
time_index,
secondary_time_index)
link_vars = get_linked_vars(self)
inferred_variable_types = infer_variable_types(self.df,
This conversation was marked as resolved by bukosabino

This comment has been minimized.

Copy link
@kmax12

kmax12 Feb 8, 2019

Member

use the keywords when calling the function to avoid errors in the order of arguments

link_vars,
variable_types,
time_index,
secondary_time_index)
inferred_variable_types.update(variable_types)

for v in inferred_variable_types:
@@ -360,92 +315,14 @@ def _create_variables(self, variable_types, index, time_index, secondary_time_in
_v = inferred_variable_types[v](v, self)
variables += [_v]
# convert data once we've inferred
self.convert_all_variable_data(inferred_variable_types)
self.df = convert_all_variable_data(df=self.df,
variable_types=inferred_variable_types)
# make sure index is at the beginning
index_variable = [v for v in variables
if v.id == index][0]
self.variables = [index_variable] + [v for v in variables
if v.id != index]

def infer_variable_types(self, variable_types, time_index, secondary_time_index):
    '''Infer variable types from dataframe

    Args:
        variable_types (dict[str -> dict[str -> type]]) : An entity's
            variable_types dict maps string variable ids to types (:class:`.Variable`)
            or (type, kwargs) to pass keyword arguments to the Variable.
        time_index (str or None): Name of time_index column
        secondary_time_index (dict[str: [str]]): Dictionary of secondary time columns
            that each map to a list of columns that depend on that secondary time
    '''
    # Relationships in which this entity participates on either side.
    link_relationships = [r for r in self.entityset.relationships
                          if r.parent_entity.id == self.id or
                          r.child_entity.id == self.id]

    # Ids of this entity's variables that link it to another entity.
    link_vars = [v.id for rel in link_relationships
                 for v in [rel.parent_variable, rel.child_variable]
                 if v.entity.id == self.id]

    # TODO: set pk and pk types here
    inferred_types = {}
    df = self.df
    # Columns assumed to hold datetimes: the time index plus (at most)
    # the first secondary time index key.
    vids_to_assume_datetime = [time_index]
    if len(list(secondary_time_index.keys())):
        vids_to_assume_datetime.append(list(secondary_time_index.keys())[0])
    # NOTE(review): inferred_type is initialized once, outside the loop, so a
    # column that matches none of the branches below inherits the type
    # inferred for the PREVIOUS column (Unknown only for the first) —
    # confirm this carry-over is intended.
    inferred_type = vtypes.Unknown
    for variable in df.columns:
        if variable in variable_types:
            # Caller supplied an explicit type; skip inference for it.
            continue

        elif variable in vids_to_assume_datetime:
            if col_is_datetime(df[variable]):
                inferred_type = vtypes.Datetime
            else:
                inferred_type = vtypes.Numeric

        elif df[variable].dtype == "object":
            if variable in link_vars:
                inferred_type = vtypes.Categorical
            elif len(df[variable]):
                if col_is_datetime(df[variable]):
                    inferred_type = vtypes.Datetime
                else:
                    # heuristic: long average string length suggests free
                    # text rather than a categorical code
                    sample = df[variable].sample(min(10000, df[variable].nunique()))
                    avg_length = sample.str.len().mean()
                    if avg_length > 50:
                        inferred_type = vtypes.Text
                    else:
                        inferred_type = vtypes.Categorical

        elif df[variable].dtype == "bool":
            inferred_type = vtypes.Boolean

        elif pdtypes.is_categorical_dtype(df[variable].dtype):
            inferred_type = vtypes.Categorical

        elif col_is_datetime(df[variable]):
            inferred_type = vtypes.Datetime

        elif variable in link_vars:
            inferred_type = vtypes.Ordinal

        elif len(df[variable]):
            # Sample up to 10000 rows (capped by the unique count) and
            # compare row count against the number of distinct values.
            sample = df[variable] \
                .sample(min(10000, df[variable].nunique(dropna=False)))

            unique = sample.unique()
            percent_unique = sample.size / len(unique)

            # NOTE(review): despite the name, this ratio is rows-per-unique
            # (size / uniques >= 1), so `< .05` can never be true and the
            # Categorical branch looks unreachable — verify the intended
            # ratio (uniques / size?) against the original design.
            if percent_unique < .05:
                inferred_type = vtypes.Categorical
            else:
                inferred_type = vtypes.Numeric

        inferred_types[variable] = inferred_type

    return inferred_types

def update_data(self, df, already_sorted=False,
recalculate_last_time_indexes=True):
'''Update entity's internal dataframe, optionally making sure data is sorted,
@@ -663,23 +540,6 @@ def _handle_time(self, df, time_last=None,
return df


def col_is_datetime(col):
    """Heuristically decide whether a pandas Series holds datetime values.

    Returns True when the dtype is already a datetime dtype, when the first
    element is a ``datetime`` instance, or when a small prefix of a
    string/object column parses cleanly with ``pd.to_datetime``.
    """
    dtype_name = col.dtype.name

    # Already a datetime dtype, or the first element is a datetime object.
    if 'datetime' in dtype_name:
        return True
    if len(col) and isinstance(col.iloc[0], datetime):
        return True

    # TODO: not sure this is ideal behavior.
    # it converts int columns that have dtype=object to datetimes starting from 1970
    if 'str' in dtype_name or 'object' in dtype_name:
        try:
            pd.to_datetime(col.dropna().iloc[:10], errors='raise')
        except Exception:
            return False
        return True

    return False


def _create_index(index, make_index, df):
'''Handles index creation logic base on user input'''
created_index = None
@@ -0,0 +1,106 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function

import pandas as pd

from featuretools import variable_types as vtypes
from featuretools.utils.entity_utils import (
convert_all_variable_data,
convert_variable_data,
infer_variable_types
)


def test_infer_variable_types():
    """infer_variable_types should deduce a sensible vtype for every column
    that is not explicitly annotated in variable_types."""
    df = pd.DataFrame({'id': [0, 1, 2],
                       'category': ['a', 'b', 'a'],
                       'ints': ['1', '2', '1'],
                       'boolean': [True, False, True],
                       'date': ['3/11/2000', '3/12/2000', '3/13/2000'],
                       'integers': [1, 2, 1]})

    # 'id' is pre-annotated, so it must be skipped by inference.
    variable_types = ['id']

    inferred_variable_types = infer_variable_types(df=df,
                                                   link_vars=[],
                                                   variable_types=variable_types,
                                                   time_index=None,
                                                   secondary_time_index={})

    # Annotated + inferred columns together cover the whole dataframe.
    assert len(variable_types) + len(inferred_variable_types) == len(df.columns)

    # Check the inferred type of each column.
    assert inferred_variable_types['category'] == vtypes.Categorical
    assert inferred_variable_types['ints'] == vtypes.Categorical
    assert inferred_variable_types['boolean'] == vtypes.Boolean
    assert inferred_variable_types['date'] == vtypes.Datetime
    assert inferred_variable_types['integers'] == vtypes.Numeric


def test_convert_all_variable_data():
    """convert_all_variable_data should coerce each column's dtype to match
    the requested variable type."""
    df = pd.DataFrame({'id': [0, 1, 2],
                       'category': ['a', 'b', 'a'],
                       'ints': ['1', '2', '1'],
                       'boolean': [True, False, True],
                       'date': ['3/11/2000', '3/12/2000', '3/13/2000'],
                       'integers': [1, 2, 1]})

    variable_types = {
        'id': vtypes.Numeric,
        'category': vtypes.Categorical,
        'ints': vtypes.Numeric,
        'boolean': vtypes.Boolean,
        'date': vtypes.Datetime,
        'integers': vtypes.Numeric
    }

    converted = convert_all_variable_data(df, variable_types)

    numeric_dtypes = vtypes.PandasTypes._pandas_numerics
    for numeric_col in ('id', 'ints', 'integers'):
        assert converted[numeric_col].dtype.name in numeric_dtypes
    assert converted['category'].dtype.name == 'object'
    assert converted['boolean'].dtype.name == 'bool'
    assert converted['date'].dtype.name in vtypes.PandasTypes._pandas_datetimes


def test_convert_variable_data():
    """convert_variable_data should change a column's dtype for each
    supported target variable type."""
    df = pd.DataFrame({'id': [0, 1, 2],
                       'category': ['a', 'b', 'a'],
                       'ints': ['1', '2', '1'],
                       'boolean': [True, False, True],
                       'date': ['3/11/2000', '3/12/2000', '3/13/2000'],
                       'integers': [1, 2, 1]})

    # Categorical -> Numeric
    before = df['ints'].dtype.name
    df = convert_variable_data(df=df, column_id='ints',
                               new_type=vtypes.Numeric)
    assert df['ints'].dtype.name != before
    assert df['ints'].dtype.name in vtypes.PandasTypes._pandas_numerics

    # Numeric -> Boolean
    before = df['ints'].dtype.name
    df = convert_variable_data(df=df, column_id='ints',
                               new_type=vtypes.Boolean,
                               true_val=1, false_val=2)
    assert df['ints'].dtype.name != before

    # Categorical -> Datetime
    before = df['date'].dtype.name
    df = convert_variable_data(df=df, column_id='date',
                               new_type=vtypes.Datetime)
    assert df['date'].dtype.name != before
    assert df['date'].dtype.name in vtypes.PandasTypes._pandas_datetimes
Oops, something went wrong.
ProTip! Use n and p to navigate between commits in a pull request.
You can’t perform that action at this time.