Related to user story: [SP11-Item04: General Data Wrapper PoC](https://gitlab.inria.fr/fedbiomed/fedbiomed/-/issues/164)

## Tabular dataset

Workflow of data pre processing:

1. Columns name should be shared with the researcher
2. Data format file to be filled by clinicians.
3. Specify if missing data are allowed for a given columns (Exception). The file will be used for data verification during FL pre-processing,
4. Outlier verification for quantitative data, continuous and discrete, and for dates (Critical warning),
5. Missing data imputation by local mean (or optional NN), or majority voting for discrete labels. Give warnings when missing data are found (for verification a posteriori).
6. Give critical warning when too many missing are found (>50%),
7. Verify that number of available data is greater then minimum required (Error)

Critical warnings have different levels of disclosure to the researcher (1) only the warning, 2) type of warning, 3) type of warning and column affected).

In [60]:
!pip install prettytable

Collecting prettytable
  Downloading prettytable-2.4.0-py3-none-any.whl (24 kB)
Installing collected packages: prettytable
Successfully installed prettytable-2.4.0


In [3]:
#1. load  a single view dataset


import pandas as pd
import pprint
import csv
import numpy as np
from typing import List, Tuple, Union, Dict, Any, Iterator, Optional, Callable
import os
from tabulate import tabulate
import json



In [2]:
from enum import Enum, auto

class ExcelSignatures(Enum):
    XLSX = (b'\x50\x4B\x05\x06', 2, -22, 4)
    LSX1 = (b'\x09\x08\x10\x00\x00\x06\x05\x00', 0, 512, 8)
    LSX2 = (b'\x09\x08\x10\x00\x00\x06\x05\x00', 0, 1536, 8)
    LSX3 = (b'\x09\x08\x10\x00\x00\x06\x05\x00', 0, 2048, 8)
    
    def __init__(self, sig, whence, offset, size):
        self._sig = sig
        self._whence = whence
        self._offset = offset
        self._size = size

    @property 
    def signature(self) -> bytes:
        return self._sig
    
    @property
    def whence(self) -> int:
        return self._whence
    
    @property
    def offset(self) -> int:
        return self._offset
    
    @property
    def size(self) -> int:
        return self._size



def load_tabular_datasets(path:str) -> Dict[str, pd.DataFrame]:
    tabular_datasets = {}

    if os.path.isdir(path):
        print('directory found')
        _is_folder = True
        
        _tabular_data_files = os.listdir(path)
    else:
        print('file found')
        _is_folder = False
        _tabular_data_files = (path,)
        
    for tabular_data_file in _tabular_data_files:
        if _is_folder:
            tabular_data_file = os.path.join(path, tabular_data_file)
        
        _is_excel = excel_sniffer(tabular_data_file)
        _csv_delimiter, _csv_header = csv_sniffer(tabular_data_file)
        _view_name = os.path.basename(tabular_data_file)
        if _is_excel:
            tabular_datasets[_view_name] = load_excel_file(tabular_data_file)
        elif _csv_delimiter is not None:
            tabular_datasets[_view_name] = load_csv_file(tabular_data_file,
                                                               _csv_delimiter, 
                                                               _csv_header)
        else:
            print(f'warning: cannot parse {tabular_data_file}: not a tabular data file')
        
    return tabular_datasets

def load_csv_file(path:str, delimiter:str, header:int) -> pd.DataFrame:
    try:
        dataframe = pd.read_csv(path, delimiter=delimiter, header=header)
    except csv.Error as err:
        print('err', err, 'in file', path)
            
    return dataframe

#https://stackoverflow.com/questions/23515791/how-to-check-the-uploaded-file-is-csv-or-xls-in-python/23515973




def load_excel_file(path:str, sheet_name: Union[str, int]=0) -> pd.DataFrame:
    """May rely on openpyxl package"""
    #with open(path, 'r') as excl:
    #    _c = csv.DictReader(excl, dialect=csv.excel_tab)
    #    _delimiter = _c.dialect.delimiter
    
    dataframe = pd.read_excel(path, sheet_name=sheet_name)
    return dataframe


def excel_sniffer(path: str) -> bool:
    
    for excel_sig in ExcelSignatures:
        with open(path, 'rb') as f:
            f.seek(excel_sig.offset, excel_sig.whence)
            bytes = f.read(excel_sig.size)

            if bytes == excel_sig.signature:
                return True
            else:
                return False
            

def csv_sniffer(path:str) :
        
    with open(path, 'r') as csvfile:
        try:
            # do some operation on file using sniffer to make sure considered file
            # is a CSV file
            dialect = csv.Sniffer().sniff(csvfile.readline())
            delimiter = dialect.delimiter
            dialect.lineterminator
            has_header = csv.Sniffer().has_header(csvfile.readline())
            if has_header:
                header = 0
            else:
                header = None
        except (csv.Error, UnicodeDecodeError) as err:
            delimiter, header = None, None
            print('err', err, 'in file', path)
    return delimiter, header

In [52]:
!pip install openpyxl



In [30]:
load_tabular_datasets('../../Exceltest.xlsx')

file found
err 'utf-8' codec can't decode byte 0x8c in position 15: invalid start byte in file ../../Exceltest.xlsx


{'Exceltest.xlsx':    ID   Age Eligibility
 0    1   45           Y
 1    2   45           Y
 2    3   33           N
 3    4   54           Y
 4    5   45           Y
 5    6   54         NaN
 6    7   34           N
 7    8   54         NaN
 8    9   45         NaN
 9   10   44           Y}

In [16]:
single_view_dataset = load_tabular_datasets(r'/user/ybouilla/home/Documents/data/pseudo_adni_mod/pseudo_adni_mod.csv')

file found


In [32]:
multi_view_dataframe =  load_tabular_datasets('test7')
multi_view_dataframe

directory found


{'file1':      a   e   i   o      0      1      2      3                 time  pressure  \
 0   48  98  65   5  False   True  False  False  2018-01-01 00:00:00  0.088082   
 1   87  83  13  70   True  False   True  False  2018-01-01 01:00:00  0.774788   
 2   46  73  81  96  False  False  False   True  2018-01-01 02:00:00  0.514092   
 3   84  45  81  39  False   True   True   True  2018-01-01 03:00:00  0.832881   
 4   94  84   0  15  False   True  False  False  2018-01-01 04:00:00  0.696152   
 ..  ..  ..  ..  ..    ...    ...    ...    ...                  ...       ...   
 95  14  66  25  64   True   True   True  False  2018-01-04 23:00:00  0.295578   
 96  91  81  48  53  False  False   True   True  2018-01-05 00:00:00  0.474322   
 97  15  82  12  51   True   True   True   True  2018-01-05 01:00:00  0.927511   
 98  51  18   4  52  False  False   True   True  2018-01-05 02:00:00  0.494798   
 99  51  70  63  77  False  False   True  False  2018-01-05 03:00:00  0.316395   
 
     

Data format file to be filled by clinicians (step 2 int he workflow):

Data format file will be a dictionary specifying the type: 
* for single view datasets:
```{<feature_name>: {'data_type': <data_type>, 'type':<values_taken>, 'range': <value_range>}```
 * for multiview datatset
```{{<view_name>: <feature_name>: {'data_type': <data_type>, 'type':<values_taken>, 'range': <value_range>}}```

where
* `<view_name>` is the name of the view
* `<feature_name>` is the name of the feature
* `<data_type>` can be categorical or continuous or missing_data or datetime
* `<value_taken>` is the type of the value (eg int, char, float, signed, unsigned ...)
* `<value_range>` represent either a list of bounds, an upper or a lower bound, or None

In [3]:
# 3. create data format file

import numpy as np
from enum import Enum
import enum
import datetime

# the use of Enum classes will prevent incorrect combination of values
class QuantitativeDataType(Enum):
    CONTINUOUS = [float, np.float64]
    DISCRETE = [int, np.int64]

class CategoricalDataType(Enum):
    BOOLEAN = [bool]
    NUMERICAL = [float, int, np.float64, np.int64]
    CHARACTER = [str, object]
    
class KeyDataType(Enum):
    NUMERICAL = [int, np.int64]
    CHARACTER = [str, object]
    DATETIME = [pd.Timestamp,
                pd.Timedelta,
                pd.Period,
                datetime.datetime,
                np.datetime64]


class CustomDataType(Enum):
    """for demo purpose: here a custom datatype"""
    DISCRETE = [int, np.int64]
    CHARACTER = [str, object]
    
    
class DataType(Enum):
    """

    """
    # what about 
    KEY = KeyDataType
    QUANTITATIVE = QuantitativeDataType
    CATEGORICAL = CategoricalDataType
    DATETIME = [pd.Timestamp,
                pd.Timedelta,
                pd.Period,
                datetime.datetime,
                np.datetime64]
    CUSTOM = CustomDataType  # custom data type (should be defined by user)
    UNKNOWN = 'UNKNOWN'
    
    @staticmethod
    def get_names():
        return tuple(n for n, _ in DataType.__members__.items())

class MissingValueAllowedDefault(Enum):
    KEY = False
    QUANTITATIVE = True
    CATEGORICAL = True
    DATETIME = False
    
    @staticmethod
    def get_names():
        return tuple(n for n, _ in MissingValueAllowedDefault.__members__.items())
    
    
class DataTypeProperties(Enum):
    """Data Type possible modification (whithin CLI editing)"""
    CATEGORICAL = (False, False, True, False, True)
    QUANTITATIVE = (True, True, False, False, True)
    DATETIME = (True, True, False, True, False)
    UNKNOWN = (False, False, False, False, True)
    CUSTOM = (True, True, True, False, True)
    KEY = (True, True, False, True, False)

    def __init__(self,
                 lower_bound: bool,
                 upper_bound: bool,
                 set_of_values: bool,
                 date_format:bool,
                 allow_missing_values: bool):
        self._lower_bound = lower_bound
        self._upper_bound = upper_bound
        self._set_of_values = set_of_values
        self._date_format = date_format
        self._allow_missing_values = allow_missing_values
    
    @property
    def lower_bound(self):
        return self._lower_bound
    
    @property
    def upper_bound(self):
        return self._upper_bound
    
    @property
    def set_of_values(self):
        return self._set_of_values
    
    @property
    def date_format(self):
        return self._date_format
    
    @property
    def allow_missing_values(self):
        return self._allow_missing_values



In [20]:
ImputationMethods.MODE_IMPUTATION

<function __main__.impute_missing_values_mode(data: Union[pandas.core.series.Series, pandas.core.frame.DataFrame])>

In [29]:
isinstance(next(iter(DataType.DATETIME.value)), Enum)

False

In [4]:
def get_data_type(
                  #avail_data_types: enum.EnumMeta,
                  d_format: Enum,
                  d_type: type) ->  Tuple[Enum, List[Union[type, str]]]:
    # varibales initialisation
    present_d_types = []
    sub_d_type_format = d_format
    
    
    for avail_data_type in DataType:
        if d_format is avail_data_type:
            sub_dtypes = avail_data_type.value
            #if not isinstance(sub_dtypes, str) and hasattr(sub_dtypes, '__getitem__') and isinstance(sub_dtypes[0], Enum):
            if not isinstance(sub_dtypes, str):
                # check if dtype has subtypes
                #(eg if datatype is QUANTITATIVE, subtype will be CONTINOUS or DISCRETE)
                if isinstance(next(iter(sub_dtypes)), Enum):
                    
                    for sub_dtype in sub_dtypes:
                        if any(d_type == t for t in tuple(sub_dtype.value)):
                            present_d_types.append(d_type)
                            sub_d_type_format = sub_dtype
                            print(sub_dtype, d_type)
                else:
                    # case where datatype doesnot have subtypes, eg DATETIME
                    if any(d_type == t for t in sub_dtypes):
                        present_d_types.append(d_type)
                    sub_d_type_format = d_format
            else:
                # case where d_format is a string of character
                sub_d_type_format = d_format
    print(sub_d_type_format, '|', present_d_types)
    return  sub_d_type_format, present_d_types


def find_data_type(data_format_name: str, data_type_name: str=None) -> Enum:
    """Retrieves from a given data_format and data_type,
    the corresponding Enum class describing data"""
    
    ## varible initialisation
    data_type = None
    _is_data_format_unrecognized = True
    _is_data_type_unrecognized = True
    
    _available_data_types = [t for t in DataType]
    
    for a_data_type in _available_data_types:
        if data_format_name == a_data_type.name:
            _is_data_format_unrecognized = False
            data_type = a_data_type
            
            for sub_type in a_data_type.value:
                
                if data_type_name is not None and  isinstance(sub_type, Enum):
                    # check if sub data type exist (it shouldnot if variable is UNKNOWN)
                    if data_type_name == sub_type.name:
                        
                        _is_data_type_unrecognized = False
                        data_type = sub_type
                
                    
                else:
                    _is_data_type_unrecognized = False
    # check for data formt file consistancy error
    if any((_is_data_format_unrecognized, _is_data_type_unrecognized)):
        if _is_data_format_unrecognized:
            raise ValueError(f'error: {data_format_name} not recognized as a valid data type')
        else:
            raise ValueError(f'error {data_type_name} not recognized as a valid data type')
            
    return data_type

def check_data_type_consistancy(sub_data_type: Enum, d_type: type):
    """ checks if `sub_data_type` folds within """
    is_consistant = False
    for data_type in sub_data_type:
        is_type_in_data_type = any(d_type == t for t in data_type.value)
        if is_type_in_data_type:
            is_consistant = True
            continue
            
    return is_consistant

In [15]:
check_data_type_consistancy(CategoricalDataType, object)

True

In [5]:

def check_missing_data(column: pd.Series)->bool:
    is_missing_data = column.isna().any()
    return is_missing_data
df = pd.DataFrame({'w': [1, 2, 3, 4,  'jj', None]})
print(check_missing_data(df['w']))




True


CLI details:

1. open  csv file
2. for each columns in file ask type of variable or if variable should be excluded
3. automatically detect the type given values in columns 
4. ask for each columns if missing data are allowed


eg :

assume a column is of type discrete with integers


1. user select it is quantitative
2. then system will label it as quantitative-discrete


**Question** : do we want an auto selection parameter choice?

In [5]:
def get_yes_no_msg() -> str:
    msg_yes_or_no_question = '1) YES\n2) NO\n'   
    return msg_yes_or_no_question

def parse_yes_no_msg(resp: str) -> bool:
    """implements logic to parse yes or no msg"""
    yes_or_no_question_key = {'1': True,
                    '2': False}
    return yes_or_no_question_key.get(resp)

def get_data_type_selection_msg(available_data_type:List[Enum],
                               ign_msg: str = 'ignore this column') ->Tuple[str, int]:
    
    
    n_available_data_type = len(available_data_type)
    msg = ''

    
    for i, dtype in enumerate(available_data_type):
        msg += '%d) %s \n' %  (i+1, dtype.name)
    
    ignoring_key = i+2  # add ingoring entry
    msg += f'%d) %s\n' % (ignoring_key, ign_msg)
    
    
    
    return msg, ignoring_key

def unique(iterable: Iterator, number: bool = False) -> int:
    """returns number of unique values"""
    set_of_values = set(iterable)
    if number:
        return len(set_of_values)
    else:
        return set_of_values

In [6]:
from functools import partial

# imputation methods

def impute_missing_values_mean(data):
    try:
        if type(data) == pd.core.frame.DataFrame:
            for col in data.columns:
                if (data[col].isnull().sum()>0):
                    if any(data[col].dtype in x2 for x2 in  [x.value for x in QuantitativeDataType]):
                        data[col].fillna(value=data[col].mean(),inplace=True)
        else:
            data = data.fillna(data.mean())
        return data
    except Exception as err:
        print(err)
        print('Error encountered in loading data file')
    
 #Categorical Data
def impute_missing_values_mode(data: Union[pd.Series, pd.DataFrame]):
    try:
        if type(data) == pd.core.frame.DataFrame:
            for col in data.columns:
                if (data[col].isnull().sum()>0):
                    categorical_data_type = [x.value for x in CategoricalDataType]
                    data_type_condition = any(data[col].dtype in x2 for x2 in categorical_data_type)
                    if data_type_condition:
                        print(col)
                        data[col].fillna(value=data[col].mode()[0],inplace=True)
                        
        else:
            data = data.fillna(data.value_counts().index[0])
            
        return data
    except Exception as err:
        print(err)
        print('Error encountered in imputing missing values - mode')
        
    # Impute missing values with KNN            

def impute_missing_values_knn(data,k=2):
    try:
        if type(data) == pd.core.frame.DataFrame:
            missing_cols = data.columns[data.isnull().any()]
            if len(missing_cols)>0:
                    imputer =KNNImputer(n_neighbors=k)
                    data = pd.DataFrame(imputer.fit_transform(data),columns=data.columns) 
        else:
            imputer =KNNImputer(n_neighbors=k)
            data =pd.DataFrame( (imputer.fit_transform(np.array(data).reshape(1,-1))).reshape(-1,1),columns=[data.name])                    

        return data  
    except Exception as err:
        print(err)
        print('Error encountered in imputing missing values - knn')

#Impute missing values with Interpolate
def impute_missing_values_interpolate(data):
    try:
        data_filled = data.interpolate()
            
        return data_filled
    except Exception as err:
        print(err)
        print('Error encountered in imputing missing values - interpolate')

def ask_for_data_imputation_parameters(parameters_to_ask_for: List[str]) -> Dict[str, Any]:
    """asks for user parameters for data imputation"""
    params_retrieved_from_user = {}
    for param_name in parameters_to_ask_for:
        param = input(f"please specify {param_name} value:\n")
        params_retrieved_from_user[param_name] = param
        
    return params_retrieved_from_user

class ImputationMethods(Enum):
    MEAN_IMPUTATION = (partial(impute_missing_values_mean), QuantitativeDataType, None)
    MODE_IMPUTATION = (partial(impute_missing_values_mode), CategoricalDataType, None)
    KNN_IMPUTATION = (partial(impute_missing_values_knn), CategoricalDataType, ['k'])
    INTERPOLATION_IMPUTATION = (partial(impute_missing_values_interpolate), QuantitativeDataType, None)
    
    def __init__(self, method: Callable,
                 data_type: Enum,
                 parameters_to_ask_user: List[str]):
        self._method = method
        self._data_type = data_type
        self._parameters_to_ask_user = parameters_to_ask_user
    
    def __call__(self, *args):
        """method avoiding to specify `value` when using an enum class"""
        self.value(*args)

    
    def method(self, *args):
        val = self._method(*args)
        return val
    
    @property
    def data_type(self):
        return self._data_type
    
    @property
    def parameters_to_ask_user(self):
        return self._parameters_to_ask_user

In [67]:
ImputationMethods.MEAN_IMPUTATION.method(pd.Series([1, 3, 4]))

0    1
1    3
2    4
dtype: int64

In [80]:
ImputationMethods.KNN_IMPUTATION.parameters_to_ask_user is not None


True

In [18]:
# CLI for clinicians for setting up data format file

get_data_imputation_methods_msg(bool)

('Please select the following method for filling missing values (if some are found)\n1) MODE_IMPUTATION\n2) KNN_IMPUTATION\n3) No method\n',
 {'1': <ImputationMethods.MODE_IMPUTATION: (functools.partial(<function impute_missing_values_mode at 0x7fba57063af0>), <enum 'CategoricalDataType'>, None)>,
  '2': <ImputationMethods.KNN_IMPUTATION: (functools.partial(<function impute_missing_values_knn at 0x7fba57063b80>), <enum 'CategoricalDataType'>, ['k'])>,
  '3': None})

In [25]:
def get_data_imputation_methods_msg(d_type: type = None) -> Tuple[str, Dict[str, Enum]]:
    msg = 'Please select the following method for filling missing values (if some are found)\n'
    
    #available_methods = [method for ethod in ImputationMethods]
    select_action = {}
    i = 1
    
    for  imput_method in ImputationMethods:
        if d_type is not None:
            
            is_d_type_in_sub_type = check_data_type_consistancy(imput_method.data_type, 
                                                               d_type)
            if not is_d_type_in_sub_type:
                continue # data type doesnot match method amputation requirments
        msg += '%d) %s\n' % (i, imput_method.name)
        select_action[str(i)] = imput_method
        i += 1

    # ignore key
    msg += '%d) No method\n' % i 
    select_action[str(i)] = None
    return msg, select_action
    
def no_methods(*kwargs):
    return None

    
    
def get_from_user_dataframe_format_file(dataset: pd.DataFrame) -> Dict[str, Any]:
    ##
    # variable initialisation
    data_format_file = {}
    
    dataset_columns = dataset.columns
    dataset_columns_length = len(dataset_columns)
    
    
    available_data_type = [d_type for d_type in DataType]  # get all available data types
    
    for n_feature, feature in enumerate(dataset_columns):
        print(f'displaying first 10 values of feature {feature} (n_feature: {n_feature+1}/{dataset_columns_length})') 
        #_file_name = os.path.basename(tabular_data_file)
        #data_format_files[_file_name] = data_format_file
        #print(tabulate(dataset[feature].head(10).values()))
        pprint.pprint(dataset[feature].head(10))  # print first 10 lines of feature value
        print(f'number of differents samples: {unique(dataset[feature], number=True)} / total of samples: {dataset[feature].shape[0]}')
        
        msg_data_type_selection, ignoring_id = get_data_type_selection_msg(available_data_type)
        msg_data_type_selection = f'specify data type for {feature}:\n' + msg_data_type_selection
        
        # ask user about data type
        data_format_id = get_user_input(msg_data_type_selection,
                                       
                                       n_answers=ignoring_id)
        
        if int(data_format_id) > ignoring_id - 1:
            # case where user decide to ingore column: go to next iteration (next feature)
            print(f"Ignoring feature {feature}")
            continue
        else:
            # case where user selected a data type: add data type and info to the format file
            data_format = available_data_type[int(data_format_id)-1]
            d_type = dataset[feature].dtype  
            # TODO: rename data_type into d_type for consistancy sake
            n_data_type, types = get_data_type(data_format, d_type)
            
        # KEY and DATETIME type 
        if data_format is DataType.KEY or data_format is DataType.DATETIME:  
            # for these data type, missing values are disabled by default
            is_missing_values_allowed = False
        else: 
            # ask user if missing values are allowed for this specific variable
            ## message definition
            msg_yes_or_no_question = get_yes_no_msg()
            msg_data_imputation_methods, data_imputation_methods = get_data_imputation_methods_msg(d_type=d_type)
            n_data_imputation_method = len(data_imputation_methods)
            msg_yes_or_no_question = f'Allow {feature} to have missing values:\n' + msg_yes_or_no_question
            
            missing_values_user_selection = get_user_input(msg_yes_or_no_question,
                                                        n_answers=2)
            is_missing_values_allowed = parse_yes_no_msg(missing_values_user_selection)
            
            amputation_method = None
            amputation_method_parameters = None
            if is_missing_values_allowed:
                # let user select amputation method if missing data are allowed
                amputation_method_user_selection = get_user_input(msg_data_imputation_methods,
                                                                n_answers=n_data_imputation_method)
                
                amputation_method_selected = data_imputation_methods.get(amputation_method_user_selection)
                
                if amputation_method_selected is not None:
                    amputation_method = amputation_method_selected.name
                if amputation_method_selected.parameters_to_ask_user is not None:
                    print(f'Selected: {amputation_method}\n')
                    amputation_method_parameters = ask_for_data_imputation_parameters(amputation_method_selected.parameters_to_ask_user)
                    print('amput param', amputation_method_parameters)
                    
        data_format_file[feature] = {'data_format': data_format.name,
                                     'data_type': n_data_type.name,
                                     'values': str(d_type),
                                     'is_missing_values': is_missing_values_allowed,
                                     'data_amputation_method': amputation_method,
                                     'data_amputation_parameters': amputation_method_parameters
                                    }
        
    return data_format_file
            
def get_user_input(msg:str,  n_answers:int) -> str:
    """"""
    is_column_parsed = False
    while not is_column_parsed:
        #data_format_id = input(f'specify data type for {feature}:\n' + msg )
        resp = input(msg)
        if resp.isdigit() and int(resp) <= n_answers and int(resp)>0:
            # check if value passed by user is correct (if it is integer,
            # and whithin range [1, n_available_data_type])
            is_column_parsed = True

        else:
            print(f'error ! {resp} value not understood')
            
    return resp

In [71]:
### CLI to use when dataset is available


def get_from_user_multi_view_dataset_fromat_file(datasets: Dict[str, pd.DataFrame])-> Dict[str, pd.DataFrame]:
    
    data_format_files = {}
    
    for tabular_data_file in datasets.keys():
        print("++++++++++++++++++++++++++++++++++++++++++++++++++++++")
        print(f"+++++++ Now parsing view: {tabular_data_file} +++++++")
        print("++++++++++++++++++++++++++++++++++++++++++++++++++++++")
        data_format_file = get_from_user_dataframe_format_file(datasets[tabular_data_file])
        if data_format_file:
            # (above condition avoids adding empty views)
            _file_name = os.path.basename(tabular_data_file)
            data_format_files[_file_name] = data_format_file
        
    return data_format_files


In [33]:
data_format_file = get_from_user_multi_view_dataset_fromat_file(single_view_dataset)

NameError: name 'single_view_dataset' is not defined

data_fromat_ref (read only)

CLI editer data_format_file

review : 
- specify lower / upper bound NUMERICAL
- Specify categorical (BOOLEAN, CHARACTER, NUMERICAL)

- save different categorical values 
 a posteriori ex SEX -> male or female, NOT FEMALE
- 

In [86]:
multi_data_format_file = get_from_user_multi_view_dataset_fromat_file(multi_view_dataframe)


++++++++++++++++++++++++++++++++++++++++++++++++++++++
+++++++ Now parsing view: file1 +++++++
++++++++++++++++++++++++++++++++++++++++++++++++++++++
displaying first 10 values of feature a (n_feature: 1/18)
0    48
1    87
2    46
3    84
4    94
5    18
6    15
7    30
8    54
9    46
Name: a, dtype: int64
number of differents samples: 57 / total of samples: 100
specify data type for a:
1) KEY 
2) QUANTITATIVE 
3) CATEGORICAL 
4) DATETIME 
5) CUSTOM 
6) UNKNOWN 
7) ignore this column
2
QuantitativeDataType.DISCRETE int64
QuantitativeDataType.DISCRETE | [dtype('int64')]
Allow a to have missing values:
1) YES
2) NO
1
Please select the following method for filling missing values (if some are found)
1) MEAN_IMPUTATION
2) MODE_IMPUTATION
3) KNN_IMPUTATION
4) INTERPOLATION_IMPUTATION
5) No method
3
method ImputationMethods.KNN_IMPUTATION
please specify k value:
6
amput param {'k': '6'}
{'a': {'data_format': 'QUANTITATIVE', 'data_type': 'DISCRETE', 'values': 'int64', 'is_missing_values': Tr

KeyboardInterrupt: Interrupted by user

multi_data_format_file

In [116]:
save_format_file_ref(multi_data_format_file, 'multi_format_file')

Model successfully saved at multi_format_file


In [48]:
# saving data format file

json_file_name = "format_file_ref"

with open(json_file_name, "w") as format_file:
    json.dump(data_format_file, format_file)

In [7]:
def save_format_file_ref(format_file_ref: Dict[str, Dict[str, Any]], path: str):
    # save `format_file_ref` into a JSON file
    with open(path, "w") as format_file:
        json.dump(format_file_ref, format_file)
    print(f"Model successfully saved at {path}")

In [1]:
def load_format_file_ref(path: str) -> Dict[str, Dict[str, Any]]:
    # retrieve data format file
    with open(path, "r") as format_file:
        format_file_ref = json.load(format_file)
    return format_file_ref

NameError: name 'Dict' is not defined

In [2]:

json_file_name = "format_file_ref"


format_file = load_format_file_ref(json_file_name)

NameError: name 'load_format_file_ref' is not defined

In [10]:
format_file

{'pseudo_adni_mod.csv': {'CDRSB.bl': {'data_format': 'CATEGORICAL',
   'data_type': 'NUMERICAL',
   'values': 'int64',
   'is_missing_values': True},
  'ADAS11.bl': {'data_format': 'QUANTITATIVE',
   'data_type': 'DISCRETE',
   'values': 'int64',
   'is_missing_values': True},
  'MMSE.bl': {'data_format': 'CATEGORICAL',
   'data_type': 'NUMERICAL',
   'values': 'float64',
   'is_missing_values': False},
  'RAVLT.immediate.bl': {'data_format': 'QUANTITATIVE',
   'data_type': 'CONTINUOUS',
   'values': 'float64',
   'is_missing_values': True},
  'RAVLT.learning.bl': {'data_format': 'CATEGORICAL',
   'data_type': 'NUMERICAL',
   'values': 'float64',
   'is_missing_values': False},
  'FAQ.bl': {'data_format': 'QUANTITATIVE',
   'data_type': 'DISCRETE',
   'values': 'int64',
   'is_missing_values': False},
  'TAU.MEDIAN.bl': {'data_format': 'QUANTITATIVE',
   'data_type': 'CONTINUOUS',
   'values': 'float64',
   'is_missing_values': False},
  'AGE': {'data_format': 'CATEGORICAL',
   'data_t

In [25]:
create_msg_action_selection(DataTypeProperties.CATEGORICAL)

('1)data_type\n2) Values taken\n3) Cancel Operation\n', 3)

In [45]:
ask_for_data_type()

1) KEY 
2) QUANTITATIVE 
3) CATEGORICAL 
4) DATETIME 
5) CUSTOM 
6) UNKNOWN 
7) ignore this column
4


{'data_format': 'DATETIME',
 'values': ["<class 'pandas._libs.tslibs.timestamps.Timestamp'>",
  "<class 'pandas._libs.tslibs.timedeltas.Timedelta'>",
  "<class 'pandas._libs.tslibs.period.Period'>",
  "<class 'datetime.datetime'>",
  "<class 'numpy.datetime64'>"]}

In [30]:
import dateutil
from dateutil.parser._parser import ParserError


def create_msg_action_selection(data_type_propreties: Enum) -> Tuple[str, int, Dict[int, Callable]]:
    # create edit selection message for user given data type
    # number of possible action depend of data type properties
    msg = ""
    action_counter = 1
    actions = {}
    
    # data type change command
    msg += "%d) data_type\n" % action_counter
    actions[str(action_counter)] = ask_for_data_type
    action_counter += 1
    
    if data_type_propreties.lower_bound:
        # lower bound edit command
        msg += "%d) lower bound\n" % action_counter
        actions[str(action_counter)] = ask_for_lower_bound
        action_counter += 1
        
    if data_type_propreties.upper_bound:
        # upper bound edit command
        msg += "%d)upper bound\n" % action_counter
        actions[str(action_counter)] = ask_for_upper_bound
        action_counter += 1
        
    if data_type_propreties.set_of_values:
        # value taken edit command
        msg += "%d) Values taken\n" % action_counter
        actions[str(action_counter)] = ask_for_categorical_values
        action_counter += 1
        
    if data_type_propreties.date_format:
        # date formatter edit command
        msg += "%d) Date format\n" % action_counter
        actions[str(action_counter)] = ask_for_date_format
        action_counter += 1
        
    if data_type_propreties.allow_missing_values:
        # change data method for data imputation
        msg += "%d) Data Value amputation method\n" % action_counter
        actions[str(action_counter)] = ask_for_data_amputation_method
        action_counter += 1
        
    msg += "%d) Cancel Operation\n" % action_counter
    actions[str(action_counter)] = cancel_operation
    return msg, action_counter, actions

def select_action(
                  action: str,
                  possible_actions: Dict[str, Callable],
                  ) -> Tuple[Dict[str, Any], bool]:
    
    # variable initialization
    is_cancelled = False
    new_field = None
    _action_counter = 1  # integer for dynamic cli management
    print('action', action, str(len(possible_actions.keys())),possible_actions)
    if action == str(len(possible_actions.keys())):
        is_cancelled = True
        
    else:
        # define action among the pool of possible actions
        _action_func = possible_actions[action]
        new_field = _action_func()
    
    return new_field, is_cancelled


def isfloat(value:str) ->bool:
    """checks if string represents a float or int"""
    is_float = True
    try:
        float(value)
    except ValueError as e:
        is_float = False
    return is_float



def is_datetime(date: str) -> bool:
    """checks if date is a date"""
    is_date_parsable = True
    try:
        dateutil.parser.parse(date)
    except (ParserError, ValueError) as err:
        is_date_parsable = False
        
    return is_date_parsable

def cancel_operation():
    print("operation cancelled")
        
def ask_for_lower_bound() -> Dict[str, float]:
    _is_entered_value_correct = False
    while not _is_entered_value_correct:
        lower_bound = input('enter lower bound')
        if isfloat(lower_bound) or is_datetime(lower_bound):
            # check if entered value is correct (is a numerical value)
            _is_entered_value_correct = True
        else:
            print('Value not a Number! please retry')
    return {'lower_bound': float(lower_bound)}

def ask_for_upper_bound() -> Dict[str, float]:
    
    _is_entered_value_correct = False
    while not _is_entered_value_correct:
        upper_bound = input('enter upper bound')
        if isfloat(upper_bound) or is_datetime(upper_bound):
            # check if entered value is correct (is a numerical value)
            _is_entered_value_correct = True
        else:
            print('Value not a Number! please retry')
    return {'upper_bound': float(upper_bound)}


def _ask_for_data_type(data_type: Enum) -> Enum:
    """asks user for datatype contains in `data_type`
    If user selects `cancel`, it will return None
    """
    
    _available_data_type = [t for t in data_type]  # get all keys contain in data_type
    n_avail_data_type = len(_available_data_type)
    msg, _n_answer = get_data_type_selection_msg(data_type, ign_msg="cancel operation")
    data_type_selection = get_user_input(msg, _n_answer)
    
    if str(_n_answer) != data_type_selection:
        return _available_data_type[int(data_type_selection) - 1]
    
    
def ask_for_data_type() -> Dict[str, Any]:

    updates = None
    
    new_data_format = _ask_for_data_type(DataType)
    
    if new_data_format is not None: 
        # case where 'cancel operation' hasnot been selected
                
        updates = {'data_format': new_data_format.name}
        if  isinstance(next(iter(new_data_format.value)), Enum):
            # if subtypes are available
            new_data_type= _ask_for_data_type(new_data_format.value)
            new_values = list(map(lambda x: str(x), new_data_type.value))
            updates.update({'data_type': new_data_type, 'values': new_values})
        else:
            new_values = list(map(lambda x: str(x), new_data_format.value))
            updates.update({'values': new_values})
    return updates


def ask_for_data_amputation_method(d_type: type=None) -> Dict[str, str]:
    
    # variable initialisation
    _amputation_method = None
    _amputation_method_parameters = None
    
    # get user message + dictionary mapping user responses to data amputation methods
    _msg_data_imputation_methods, _data_imputation_methods = get_data_imputation_methods_msg(d_type)
    
    # ask for user selection
    _amputation_method_user_selection = get_user_input(_msg_data_imputation_methods,
                                                      n_answers=len(_data_imputation_methods))
    
    # select user data amputation method given user command
    _amputation_method_selected = _data_imputation_methods.get(_amputation_method_user_selection)

    if _amputation_method_selected is not None:
        _amputation_method = _amputation_method_selected.name
        
        if _amputation_method_selected.parameters_to_ask_user is not None:
            print(f'Method amputation selected: {_amputation_method}\n')
            _amputation_method_parameters = ask_for_data_imputation_parameters(_amputation_method_selected.parameters_to_ask_user)
            print('amput param', _amputation_method_parameters)
    updates = {'data_amputation_method': _amputation_method,
               'data_amputation_parameters': _amputation_method_parameters}
    return updates

def ask_for_categorical_values() -> Dict[str, Any]:
    possible_values = input('enter possible values (separated by ",")')
    possible_values = possible_values.split(",")  # separate values passed by user into a list
    return {'categorical_values': possible_values}


def ask_for_date_format() -> Dict[str, Any]:
    # TODO : ask for date format (UTC, ....)
    msg = 'please enter date format:\n1)timetsamp\n2)ISO date format (YYYY-MM-DD)\n3)custom date format\n'
    user_selection = input(msg)
    # default date format
    msg = {'1': 'timestamp',
          '2': '(American default date format) mm/dd/yy',
          '3': '(Europeen default date format) dd/mm/yy',
           '4': 'ISO date format (YYYY-MM-DD)',
          '5': 'custom date format',
          '6': 'select timezone'}
    pass

def edit_feature_format_file_ref(feature_content: Dict[str, Any],
                                  feature_name: str,
                                  available_categorical_data_type: List[Enum],
                                  messages: Dict[str, str],
                                  ignore_keystroke: int) -> Dict[str, Any]:
    """Edits a specific feature that belongs to a specific view within a format file"""
    

    _is_feature_unparsed = True  
    _is_cancelled = False  # whether parsing of current column has been cancelled or not
    _is_first_edit = True
    _avail_data_type_properties = [dtype for dtype in DataTypeProperties]
    
    # iterate over number of feature contained in view, and ask for each feature if changes are needed
    while _is_feature_unparsed:
        if _is_cancelled or not _is_first_edit:
            _f_answer = True
        else:
            _f_answer = get_user_input(f"Edit variable: {feature_name}?\n" + messages['yes_or_no'], 2)
            # ask if user wants to edit feature variables
            _f_answer = parse_yes_no_msg(_f_answer)
            _is_operation_cancelled = False  # for cancelling feature edition
            _is_first_edit = False
        if _f_answer:
            # case where user wants to edit the current feature
            
            _msg = messages['edit']
            
            data_format = feature_content.get('data_format')
            for data_type_properties in _avail_data_type_properties:
                if data_format == data_type_properties.name:
                    # get data property from data_format
                    select_msg, n_actions, possible_actions = create_msg_action_selection(data_type_properties)
                    _msg += select_msg
                    _edit_selection = get_user_input(_msg, n_actions)

                    _edited_field, _is_cancelled = select_action(
                                                                  _edit_selection,
                                                                  possible_actions
                                                                  #available_categorical_data_type,
                                                                  #messages['data_type_select']
                                                                )

            if not _is_cancelled:
                # if user has not cancelled field edition
                if _edited_field is not None:
                    feature_content.update(_edited_field)
             
                _c_answer = get_user_input(f"Continue Editing variable: {feature_name}?\n" + messages['yes_or_no'], 2)
                _is_feature_unparsed = parse_yes_no_msg(_c_answer)
            else:
                _is_feature_unparsed = False
                
        else:
            _is_feature_unparsed = False
            
    return feature_content

In [21]:
def edit_format_file_ref(format_file_ref: Dict[str, Dict[str, Any]]) -> Dict[str, Dict[str, Any]]:
    
    # CLI for editing `format_file_ref`, a file containing information about each variable
    # in a tabular dataset
    print(f'Now editing format file ref')
    
    ## variables initialization
    available_categorical_data_types = [t for t in CategoricalDataType]
    _file_names = list(format_file_ref.keys())
    _n_tot_files = len(_file_names)
    
    ## messages definition
    _data_type_selection_msg, ign_key = get_data_type_selection_msg(available_categorical_data_types)
    
    _messages = {
        'yes_or_no': get_yes_no_msg(),
        'edit': 'Which field should be modified?\n',
    }

    
    
    # iterate over name of files (ie views)
    for i_file in range(_n_tot_files):
        # ask for each file if user wants to edt it
        _answer = get_user_input(f"Edit file: {_file_names[i_file]}?\n" + _messages['yes_or_no'], 2)
        _answer = parse_yes_no_msg(_answer)
        
        if _answer:
            # case where user wants to modify current view scheme
            _file_content = format_file_ref[_file_names[i_file]]  # get file (ie view) content
            
            ## variables initialization for parsing current view
            _features_names = list(_file_content.keys())
            _n_tot_feature = len(_features_names)
            
            # iterate over features found in view
            for i_feature in range(_n_tot_feature):
                feature_name = _features_names[i_feature]
                feature_content = _file_content[feature_name]
                feature_content = edit_feature_format_file_ref(feature_content,
                                                               feature_name,
                                                               available_categorical_data_types,
                                                               _messages,
                                                               ign_key)
            format_file_ref[_file_names[i_file]].update({feature_name: feature_content})
            
    return format_file_ref

In [90]:
format_file

{'pseudo_adni_mod.csv': {'CDRSB.bl': {'data_format': 'CATEGORICAL',
   'data_type': 'BOOLEAN',
   'values': ["<class 'bool'>"],
   'is_missing_values': True,
   'categorical_values': ['1', ' 2', ' 4']},
  'ADAS11.bl': {'data_format': 'QUANTITATIVE',
   'data_type': 'DISCRETE',
   'values': 'int64',
   'is_missing_values': True},
  'MMSE.bl': {'data_format': 'CATEGORICAL',
   'data_type': 'NUMERICAL',
   'values': 'float64',
   'is_missing_values': False},
  'RAVLT.immediate.bl': {'data_format': 'QUANTITATIVE',
   'data_type': 'CONTINUOUS',
   'values': 'float64',
   'is_missing_values': True},
  'RAVLT.learning.bl': {'data_format': 'CATEGORICAL',
   'data_type': 'NUMERICAL',
   'values': 'float64',
   'is_missing_values': False},
  'FAQ.bl': {'data_format': 'QUANTITATIVE',
   'data_type': 'DISCRETE',
   'values': 'int64',
   'is_missing_values': False,
   'lower_bound': 0.0},
  'TAU.MEDIAN.bl': {'data_format': 'QUANTITATIVE',
   'data_type': 'CONTINUOUS',
   'values': 'float64',
   'is

In [None]:
edit_format_file_ref(format_file)

# TODO: create possiblity for user to update data type when editing format_file

In [32]:
multi_format_file_ref = load_format_file_ref('multi_format_file')
edit_format_file_ref(multi_format_file_ref)

Now editing format file ref
Edit file: file1?
1) YES
2) NO
1
Edit variable: e?
1) YES
2) NO
1
Which field should be modified?
1) data_type
2) lower bound
3)upper bound
4) Data Value amputation method
5) Cancel Operation
4
action 4 5 {'1': <function ask_for_data_type at 0x7fba5611e550>, '2': <function ask_for_lower_bound at 0x7fba55aaeca0>, '3': <function ask_for_upper_bound at 0x7fba55aae5e0>, '4': <function ask_for_data_amputation_method at 0x7fba55aae670>, '5': <function cancel_operation at 0x7fba55aae9d0>}
Please select the following method for filling missing values (if some are found)
1) MEAN_IMPUTATION
2) MODE_IMPUTATION
3) KNN_IMPUTATION
4) INTERPOLATION_IMPUTATION
5) No method
5
Continue Editing variable: e?
1) YES
2) NO
1
Which field should be modified?
1) data_type
2) lower bound
3)upper bound
4) Data Value amputation method
5) Cancel Operation
4
action 4 5 {'1': <function ask_for_data_type at 0x7fba5611e550>, '2': <function ask_for_lower_bound at 0x7fba55aaeca0>, '3': <funct

{'file1': {'e': {'data_format': 'QUANTITATIVE',
   'data_type': 'DISCRETE',
   'values': 'int64',
   'is_missing_values': True,
   'data_amputation_method': 'MODE_IMPUTATION',
   'data_amputation_parameters': None},
  '1': {'data_format': 'CATEGORICAL',
   'data_type': 'BOOLEAN',
   'values': 'bool',
   'is_missing_values': False},
  '2': {'data_format': 'CATEGORICAL',
   'data_type': 'BOOLEAN',
   'values': 'bool',
   'is_missing_values': True},
  'time': {'data_format': 'DATETIME',
   'data_type': 'DATETIME',
   'values': 'object',
   'is_missing_values': False},
  'pressure': {'data_format': 'QUANTITATIVE',
   'data_type': 'CONTINUOUS',
   'values': 'float64',
   'is_missing_values': False},
  'e.1': {'data_format': 'QUANTITATIVE',
   'data_type': 'DISCRETE',
   'values': 'int64',
   'is_missing_values': False},
  'gender': {'data_format': 'CATEGORICAL',
   'data_type': 'CHARACTER',
   'values': 'object',
   'is_missing_values': False},
  'blood type': {'data_format': 'CATEGORICAL',

## tabular data sanity check using file format ref

In [134]:
    



# utility functions for multi view dataframe
def rename_variables_before_joining(multi_view_datasets: Dict[str, pd.DataFrame],
                                    views_name: List[Union[str, int]],
                                    primary_key:Union[str, int]=None) -> Tuple[Dict[str, pd.DataFrame]]:
    """
    Renames variables that have same name but different views using the following naming convention:
    if `a` is the name of a feature of `view1` and `a` is the name of a feature of `view2`,
    features names will be updated into `view1.a` and `view2.a`
    """
    _features_names = {}
    _views_length = len(views_name)
    
    # check for each variable name existing in one view, that it doesnot exist in another
    # view. if it is, rename both variables
    # for this purpose, parse every combination once
    for i_left in range(0, _views_length-1):
        _left_view = views_name[i_left]
        _left_features_name = multi_view_datasets[_left_view].columns.tolist()
        for i_right in range(i_left+1, _views_length):
        
            _right_view = views_name[i_right]
            _right_features_name = multi_view_datasets[_right_view].columns.tolist()
            
            for _f in _left_features_name:
                if primary_key and _f == primary_key:
                    # do not affect primary key (if any)
                    continue
                if _f  in _right_features_name:
                    
                    if _left_view  not in _features_names:
                        _features_names[_left_view] = {}
                        
                    if _right_view not in _features_names:
                        _features_names[_right_view] = {}
                        
                    _features_names[_left_view].update({_f: _left_view + '.' + str(_f)})
                    _features_names[_right_view].update({_f: _right_view + '.' + str(_f)})
    
    for i in range(_views_length):
        _view = views_name[i]
        _new_features = _features_names.get(_view)
        if _new_features:
            multi_view_datasets[_view] = multi_view_datasets[_view].rename(columns=_new_features)
        
    
    return multi_view_datasets


def create_multi_view_dataframe(datasets: Dict[str, pd.DataFrame]) -> pd.DataFrame:
    _header_labels = ['views', 'feature_name']
    # 1. create multiindex header

    _feature_name_array = np.array([])  # store all feature names
    _view_name_array = []  # store all views (ie modalities) names

    _concatenated_datasets = np.array([])  # store dataframe values

    for key in datasets.keys():
        _feature_name_array = np.concatenate([_feature_name_array,
                                              datasets[key].columns.values])
        if len(_concatenated_datasets) <= 0:
            # first pass 
            _concatenated_datasets = datasets[key].values
        else:
            # next passes
            try:
                _concatenated_datasets = np.concatenate(
                                        [_concatenated_datasets,
                                         datasets[key].to_numpy()
                                         ], axis=1)
            except ValueError as val_err:
                # catching case where nb_samples are differents
                raise ValueError(
                    'Cannot create multi view dataset: different number of samples for each modality have been detected'\
                        + 'Details: ' + str(val_err)
                    )
        for _ in datasets[key].columns.values:
            _view_name_array.append(key)

    _header = pd.MultiIndex.from_arrays([_view_name_array,
                                         _feature_name_array],
                                        names=_header_labels)


    # 2. create multi index dataframe

    multi_view_df = pd.DataFrame(_concatenated_datasets,
                                  columns = _header)
    return multi_view_df


def join_muti_view_dataset(multi_view_dataset: pd.DataFrame,
                           primary_key: str=None,
                          as_multi_index: bool = True) -> pd.DataFrame:
    """Concatenates a multi view dataset into a plain pandas dataframe,
    by doing a join operation along specified primary_key"""
    
    _views_names = sorted(set(multi_view_dataset.columns.get_level_values(0)))  # get views name
    
    joined_dataframe = multi_view_dataset[_views_names[0]]  # retrieve the first view
    # (as a result of join operation)
    for x in range(1, len(_views_names)):
        joined_dataframe = joined_dataframe.merge(multi_view_dataset[_views_names[x]],
                                                    on=primary_key,
                                                    suffixes=('', '.'+_views_names[x]))
    
    if as_multi_index:
        # convert plain dataframe into multi index dataframe
        # primary key will have its own view
        _header_labels = ['views', 'feature_name']
        _primary_key_label = 'primary_key'
        
        _multi_index = multi_view_dataset.columns
        _key_values = joined_dataframe[primary_key].values  # storing primary key

        _all_features_names = []
        _new_views_names = []
        for view_name in _views_names:
            # get all columns name for each view, and remove primary key
            _features_names = list(multi_view_dataset[view_name].columns)
            _features_names.remove(primary_key)
            _all_features_names.extend(_features_names)

            for feature_name in _features_names:
                _new_views_names.append(view_name)
                # appending as much as there are feature within each view
            #features_name[name].remove(primary_key)

        _header = pd.MultiIndex.from_arrays([ _new_views_names, _all_features_names],
                                            names=_header_labels)
        joined_dataframe  = pd.DataFrame(joined_dataframe[_all_features_names].values, columns=_header)
        joined_dataframe[_primary_key_label, primary_key] = _key_values
        
    return joined_dataframe


def search_primary_key(format_file_ref: Dict[str, Dict[str, Any]]) -> Optional[str]: 
    """"""
    _views_names = list(format_file_ref.keys())
    primary_key = None
    _c_view = None
    for view_name in _views_names:
        file_content = format_file_ref[view_name]
        _features_names = list(file_content.keys())
        for feature_name in _features_names:
            feature_content  = file_content[feature_name]
            _d_format = feature_content.get('data_format')
            
            if _d_format == DataType.KEY.name:
                if _c_view is None:
                    primary_key = feature_name
                    _c_view = view_name
                    print(f'found primary key {primary_key}')
                else:
                    print(f'error: found 2 primary keys is same view {view_name}')
        _c_view = None
    return primary_key



def select_data_from_format_file_ref(datasets: Dict[str, Dict[str, Any]],
                                     format_file: Dict[str, Dict[str, Any]]) -> Dict[str, Dict[str, Any]]:
    """returns an updated dataset containing only the features detailed in format_file"""
    # variables initialisation
    
    updated_dataset = {}
    _views_format_file = list(format_file.keys())
    
    for view in _views_format_file:
        if view in datasets.keys():
            # only extract features from format_file
            _format_file_features = list(format_file[view].keys())
            _current_dataset_feature = datasets[view].columns.tolist()
            try:
                updated_dataset[view] = datasets[view][_format_file_features]
            except KeyError as ke:
                # catch error if a column is specified in data format file
                # but not found in dataset
                _missing_feature = []
                for feature in _format_file_features:
                    if feature not in _current_dataset_feature:
                        _missing_feature.append(feature)
                print('Error: th following features', *_missing_feature, f'are not found in view: {view}')
        else:
            # trigger error
            print(f'error!: missing view {view} in dataset')
            
    return updated_dataset

NameError: name 'np' is not defined

In [20]:
df_to_check[n_feature_name]dataset_to_check = load_tabular_datasets(r'/user/ybouilla/home/Documents/data/pseudo_adni_mod/pseudo_adni_mod.csv')



file found


In [132]:
# extract views names
views_names = list(format_file.keys())



# look for primary key
primary_key = search_primary_key(format_file)
print('primary key', primary_key)

# select only features in dataset that will be checked
pre_parsed_dataset_to_check = select_data_from_format_file_ref(dataset_to_check, format_file)
# rename columns names before join operation
pre_parsed_dataset_to_check = rename_variables_before_joining(pre_parsed_dataset_to_check, views_names)
pre_parsed_dataset_to_check

multi_df_to_check = create_multi_view_dataframe(pre_parsed_dataset_to_check)
multi_df_to_check

#if primary_key is not None:
# jointure operation (takesplace only if primary key has been specfied in foramt_file)
df_to_check = join_muti_view_dataset(multi_df_to_check)
    
df_to_check

NameError: name 'format_file' is not defined

In [80]:
# sanity check on columns using format_file



_views_format_file = list(format_file.keys())
    
for view in _views_format_file:
    _features_format_file = format_file[view]
    
    for feature in _features_format_file:
        feature_content = df_to_check[feature]
        feature_format =  _features_format_file[feature]
        # 1. check for datatype consistency
        
        data_types = find_data_type(feature_format['data_format'], feature_format['data_type'])
        
        print(dtype[-1].value, feature_content.dtype, feature_format)
        if any(t == feature_content.dtype for t in data_types[-1].value ):
            print('ok')
        


[<class 'float'>, <class 'int'>, <class 'numpy.float64'>, <class 'numpy.int64'>] float64 {'data_format': 'CATEGORICAL', 'data_type': 'BOOLEAN', 'values': ["<class 'bool'>"], 'is_missing_values': True, 'categorical_values': ['1', ' 2', ' 4']}
[<class 'float'>, <class 'int'>, <class 'numpy.float64'>, <class 'numpy.int64'>] float64 {'data_format': 'QUANTITATIVE', 'data_type': 'DISCRETE', 'values': 'int64', 'is_missing_values': True}
[<class 'float'>, <class 'int'>, <class 'numpy.float64'>, <class 'numpy.int64'>] float64 {'data_format': 'CATEGORICAL', 'data_type': 'NUMERICAL', 'values': 'float64', 'is_missing_values': False}
ok
[<class 'float'>, <class 'int'>, <class 'numpy.float64'>, <class 'numpy.int64'>] float64 {'data_format': 'QUANTITATIVE', 'data_type': 'CONTINUOUS', 'values': 'float64', 'is_missing_values': True}
ok
[<class 'float'>, <class 'int'>, <class 'numpy.float64'>, <class 'numpy.int64'>] float64 {'data_format': 'CATEGORICAL', 'data_type': 'NUMERICAL', 'values': 'float64', 'i

In [34]:
multi_format_file_ref = load_format_file_ref('multi_format_file')
multi_dataset_to_check = load_tabular_datasets(r'test7')

directory found


In [14]:
multi_format_file_ref['file1']['2']

{'data_format': 'CATEGORICAL',
 'data_type': 'BOOLEAN',
 'values': 'bool',
 'is_missing_values': True}

In [151]:
# extract views names
views_names = list(multi_format_file_ref.keys())



# look for primary key
primary_key = search_primary_key(multi_format_file_ref)
print('primary key', primary_key)

# select only features in dataset that will be checked
pre_parsed_dataset_to_check = select_data_from_format_file_ref(multi_dataset_to_check, multi_format_file_ref)
# rename columns names before join operation
pre_parsed_dataset_to_check = rename_variables_before_joining(pre_parsed_dataset_to_check, views_names,
                                                             primary_key)
pre_parsed_dataset_to_check

multi_df_to_check = create_multi_view_dataframe(pre_parsed_dataset_to_check)
multi_df_to_check

#if primary_key is not None:
# jointure operation (takesplace only if primary key has been specfied in foramt_file)
multi_df_joined = join_muti_view_dataset(multi_df_to_check, primary_key)
    
df_to_check = multi_df_to_check.droplevel(0, axis=1)  # remove views from dataset
df_to_check

found primary key pkey
found primary key pkey
found primary key pkey
primary key pkey


feature_name,e,file1.1,2,file1.time,pressure,e.1,gender,blood type,pkey,discrete,city,pkey.1,file2.1,file2.time,pH,pkey.2
0,98,True,False,2018-01-01 00:00:00,0.088082,63,MAN,A,zmixzrgvxrjqxoe sluk,64.0,Lille,qpqorfhylu gmfjy bdj,True,2018-01-01 00:00:00,0.023107,kkmjozalfyirgsire ui
1,83,False,True,2018-01-01 01:00:00,0.774788,20,MAN,O,vrzahnpfluspdcbfnaqt,26.0,Lille,kkmjozalfyirgsire ui,False,2018-01-01 01:00:00,,xkdawggpnuulcewuoyzz
2,73,False,False,2018-01-01 02:00:00,0.514092,2,WOMAN,A,pnrepvmrxqabdlvisclv,61.0,Paris,ezfasuuycdda foisjte,True,2018-01-01 02:00:00,0.407279,khuulhwgwnjggrfoefce
3,45,True,True,2018-01-01 03:00:00,0.832881,70,WOMAN,AB,gwj luzejwdxzsiljxzd,29.0,Paris,faxiqkt xggzmwzoidbg,True,2018-01-01 03:00:00,0.536301,xxysdmwwmjsmyhaswfdb
4,84,True,False,2018-01-01 04:00:00,0.696152,90,MAN,B,jjdvcnofivbqhirxzdyo,99.0,Lille,znwhlj rwzdutnagwasy,True,2018-01-01 04:00:00,0.749443,ldejfuij mnbnf wwmms
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,66,True,True,2018-01-04 23:00:00,0.295578,41,WOMAN,A,hrvepmqjn llgbzplshv,9.0,Paris,zeqhcikzdodus jn qjf,True,2018-01-04 23:00:00,,wrmdecb s pohtmrcdj
96,81,False,True,2018-01-05 00:00:00,0.474322,41,WOMAN,B,wroevwyuamxibzshlxxh,98.0,Marseille,iicthcvfmkajbvr gzir,False,2018-01-05 00:00:00,0.388389,whmwrpvqmerdpwwzxasf
97,82,True,True,2018-01-05 01:00:00,0.927511,7,MAN,B,ywadcykylymkdtzfctpg,21.0,Lille,ztjakcsk bhjoksdz lm,True,2018-01-05 01:00:00,0.889067,pnrepvmrxqabdlvisclv
98,18,False,True,2018-01-05 02:00:00,0.494798,11,MAN,O,ruchbfa zwgenxslegrl,42.0,Marseille,sabunaa opt vpulnxj,True,2018-01-05 02:00:00,0.402979,iicthcvfmkajbvr gzir


In [143]:
df_to_check

views,contatct,contatct,file1,file1,file1,file1,file1,file1,file1,file1,file2,file2,file2,primary_key
feature_name,discrete,city,e,file1.1,2,file1.time,pressure,e.1,gender,blood type,file2.1,file2.time,pH,pkey
0,64.0,Lille,16,False,False,2018-01-03 04:00:00,0.98667,98,WOMAN,A,False,2018-01-02 06:00:00,,qpqorfhylu gmfjy bdj
1,26.0,Lille,96,True,True,2018-01-02 04:00:00,0.996889,35,MAN,AB,True,2018-01-01 00:00:00,0.023107,kkmjozalfyirgsire ui
2,61.0,Paris,8,True,True,2018-01-01 09:00:00,0.777026,65,MAN,A,False,2018-01-02 10:00:00,0.587685,ezfasuuycdda foisjte
3,29.0,Paris,6,True,False,2018-01-04 20:00:00,0.877527,81,MAN,AB,True,2018-01-03 12:00:00,0.894073,faxiqkt xggzmwzoidbg
4,99.0,Lille,79,True,True,2018-01-04 09:00:00,0.447389,88,WOMAN,O,True,2018-01-01 10:00:00,0.026831,znwhlj rwzdutnagwasy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,9.0,Paris,62,True,True,2018-01-02 13:00:00,0.953184,53,MAN,AB,False,2018-01-02 05:00:00,0.78856,zeqhcikzdodus jn qjf
96,98.0,Marseille,49,False,True,2018-01-02 21:00:00,0.442283,35,MAN,,True,2018-01-05 02:00:00,0.402979,iicthcvfmkajbvr gzir
97,21.0,Lille,14,False,False,2018-01-02 06:00:00,0.988543,67,MAN,B,False,2018-01-01 12:00:00,,ztjakcsk bhjoksdz lm
98,42.0,Marseille,10,True,False,2018-01-02 01:00:00,0.059791,48,MAN,B,True,2018-01-02 09:00:00,0.651801,sabunaa opt vpulnxj


In [105]:
def check_key_variable_compliance(column: pd.Series,
                                  
                                  col_name:str=None,
                                  warning=None) -> bool:
    """performs data sanity check over variable of type `KEY`
    warning should be Critical warnings
    """
    # variables initialisation
    is_test_passed = True 
    
    # 1. check unicity of values in column
    
    n_unique_samples = unique(column, number=True)
    n_samples = column.shape[0]
    
    if n_unique_samples != n_samples:
        is_test_passed = False
        print(f'error: keys not unique ! b of samples= {n_samples} and unique values {n_unique_samples}')
    else:
        print('test 1 passed')
    # 2. check if missing database contained in key (key should not contain any missing data)
    if check_missing_data(column):
        is_test_passed = False
        print('error: missing data found in key')
    else:
        
        print('test 2 passed')

                
    return is_test_passed




In [47]:
np.all(df_to_check['file1.time'] > '2018-01-04 09:00:00')

False

In [57]:
np.all(df_to_check['file1.time'].apply(is_datetime))

True

In [50]:
check_variable_compliance(df_to_check['2'], multi_format_file_ref['file1']['2'])



test 1 passed
test 2 passed
test 3 skipped 
test 4 skipped
test 5: passed


In [35]:
multi_format_file_ref['file1']['2'].update({'categorical_values': [True,False]})

In [36]:
multi_format_file_ref['file1']['2']

{'data_format': 'CATEGORICAL',
 'data_type': 'BOOLEAN',
 'values': 'bool',
 'is_missing_values': True,
 'categorical_values': [True, False]}

In [38]:
def check_variable_compliance(column: pd.Series,
                               format_file_ref: Dict[str, Any],
                               col_name:str=None,
                               warning=None) -> Tuple[bool, bool]:
    """performs a data sanity check on variable `col_name` given instruction in 
    data_file_ref
    """
    is_test_passed = True
    
    
    data_format_name = format_file_ref.get('data_format')
    data_type_name = format_file_ref.get('data_type')
    # remove nan (missing values) from 
    column_without_nan = column.dropna()
    
    
    if data_format_name is None:
        print(f'critical wraning: data fromat {data_format_name} not understood')
    # 1. check data sub type
    try:
        data_type = find_data_type(data_format_name, data_type_name)
    except ValueError as err:
        data_type = None
        print('Critical warning: data format and data type mismatch')
    does_column_have_correct_data_type = any(t for t in data_type.value)
    if not does_column_have_correct_data_type:
        print(f'error: data type {column.dtype} doesnot have the data type specified in format reference file')
    else:
        print('test 1 passed')

    # 2. check if missing values are allowed
    is_missing_data = check_missing_data(column)
    is_missing_values_authorized = format_file_ref.get('is_missing_values', 'test_skipped')
    if is_missing_values_authorized == 'test_skipped':
        print('missing_value test skipped')
    elif not is_missing_values_authorized and is_missing_data:

        print('Error found missing data but missing data are not authorized')
    else:
        print('test 2 passed')
    
    
    # 3. check lower bound
    print(format_file_ref)
    lower_bound = format_file_ref.get('lower_bound')
    
    if lower_bound is not None:
        
        # should work for both numerical and datetime data sets
        
        is_lower_bound_correct = np.all(column_without_nan >= lower_bound)
        
            
        if not is_lower_bound_correct:
            print('Warning: found some data below lower bound')
        else:
            print('test 3 passed')
    else:
        print('test 3 skipped ')
    # 4. check upper bound
    upper_bound = format_file_ref.get('upper_bound')
    if upper_bound is not None:
         # should work for both numerical and datetime data sets
        is_upper_bound_correct = np.all(column_without_nan <= lower_bound)
        
            
        if not is_upper_bound_correct:
            print('Warning: found some data  above upper bound')
        else:
            print('test 4 passed')
            
    else:
        print('test 4 skipped')
    # 5. check if possible_values are contained in variable
    categorical_values = format_file_ref.get('categorical_values')    
    if categorical_values is None:
        print('categorical value check test skipped')
    else:
        unique_values = unique(column)
        _is_error_found = False
        for val in unique_values:
            if val not in categorical_values and not np.isnan(val):
                print(f'critical warning: {val} not in possible values')
                _is_error_found = True
        if not _is_error_found:
            print('test 5: passed')
    
 
        
    
def check_datetime_variable_compliance(column: pd.Series):
    """additional data sanity checks for datetime variable"""
    # test 1. check if datetime is parsable
    
    # remove nan
    column_without_nan = column.dropna()
    are_datetime_parsables =  np.all(column.apply(is_datetime))
    if not are_datetime_parsables:
        print('Warning: at least one variable is not a datetime')
        
    else:
        print('datetime parsed')

In [106]:
# Data sanity check

new_feature_name = { v: list(pre_parsed_dataset_to_check[v].columns) for v in views_names}
new_feature_name

for view in views_names:
    print(view)
    
    feature_names = list(multi_format_file_ref[view].keys())
    for n_feature_name, feature_name in zip(new_feature_name[view], feature_names):
        check_variable_compliance(df_to_check[n_feature_name], multi_format_file_ref[view][feature_name])
        data_format = multi_format_file_ref[view][feature_name].get('data_format')
        if data_format == DataType.DATETIME.name:
            # addtional check for DATETIME data format
            check_datetime_variable_compliance(df_to_check[n_feature_name])
            
        if data_format == DataType.KEY.name:
            check_key_variable_compliance(df_to_check[n_feature_name])
            
        

file1
test 1 passed
test 2 passed
{'data_format': 'QUANTITATIVE', 'data_type': 'DISCRETE', 'values': 'int64', 'is_missing_values': True}
test 3 skipped 
test 4 skipped
categorical value check test skipped
test 1 passed
test 2 passed
{'data_format': 'CATEGORICAL', 'data_type': 'BOOLEAN', 'values': 'bool', 'is_missing_values': False}
test 3 skipped 
test 4 skipped
categorical value check test skipped
test 1 passed
test 2 passed
{'data_format': 'CATEGORICAL', 'data_type': 'BOOLEAN', 'values': 'bool', 'is_missing_values': True, 'categorical_values': [True, False]}
test 3 skipped 
test 4 skipped
test 5: passed
test 1 passed
test 2 passed
{'data_format': 'DATETIME', 'data_type': 'DATETIME', 'values': 'object', 'is_missing_values': False}
test 3 skipped 
test 4 skipped
categorical value check test skipped
datetime parsed
test 1 passed
test 2 passed
{'data_format': 'QUANTITATIVE', 'data_type': 'CONTINUOUS', 'values': 'float64', 'is_missing_values': False}
test 3 skipped 
test 4 skipped
categor

In [71]:
multi_format_file_ref['file2']['pH'].update({'lower_bound': -10})
multi_format_file_ref['file2']['pH']

{'data_format': 'QUANTITATIVE',
 'data_type': 'CONTINUOUS',
 'values': 'float64',
 'is_missing_values': False,
 'lower_bound': -10}

In [97]:
multi_format_file_ref[view][feature_name]

{'data_format': 'DATETIME',
 'data_type': 'DATETIME',
 'values': 'object',
 'is_missing_values': False}

In [None]:
find_data_type(

In [80]:
check_variable_compliance(df_to_check['pH'], multi_format_file_ref['file2']['pH'])

test 1 passed
Error found missing data but missing data are not authorized
{'data_format': 'QUANTITATIVE', 'data_type': 'CONTINUOUS', 'values': 'float64', 'is_missing_values': False, 'lower_bound': -10}
test 3 passed
test 4 skipped
categorical value check test skipped


In [77]:
df_to_check['pH'] > np.nan

0     False
1     False
2     False
3     False
4     False
      ...  
95    False
96    False
97    False
98    False
99    False
Name: pH, Length: 100, dtype: bool

In [66]:
multi_format_file_ref

{'file1': {'e': {'data_format': 'QUANTITATIVE',
   'data_type': 'DISCRETE',
   'values': 'int64',
   'is_missing_values': True},
  '1': {'data_format': 'CATEGORICAL',
   'data_type': 'BOOLEAN',
   'values': 'bool',
   'is_missing_values': False},
  '2': {'data_format': 'CATEGORICAL',
   'data_type': 'BOOLEAN',
   'values': 'bool',
   'is_missing_values': True,
   'categorical_values': [True, False]},
  'time': {'data_format': 'DATETIME',
   'data_type': 'DATETIME',
   'values': 'object',
   'is_missing_values': False},
  'pressure': {'data_format': 'QUANTITATIVE',
   'data_type': 'CONTINUOUS',
   'values': 'float64',
   'is_missing_values': False},
  'e.1': {'data_format': 'QUANTITATIVE',
   'data_type': 'DISCRETE',
   'values': 'int64',
   'is_missing_values': False},
  'gender': {'data_format': 'CATEGORICAL',
   'data_type': 'CHARACTER',
   'values': 'object',
   'is_missing_values': False},
  'blood type': {'data_format': 'CATEGORICAL',
   'data_type': 'CHARACTER',
   'values': 'obj

In [19]:
check_variable_compliance(df_to_check['pkey'], multi_format_file_ref['file2']['pkey'])

test 1 passed
test 2 passed
categorical value check test skipped


## data_format_file

In [33]:
format_file

{'pseudo_adni_mod.csv': {'CDRSB.bl': {'data_format': 'CATEGORICAL',
   'data_type': 'NUMERICAL',
   'values': 'int64',
   'is_missing_values': True},
  'ADAS11.bl': {'data_format': 'QUANTITATIVE',
   'data_type': 'DISCRETE',
   'values': 'int64',
   'is_missing_values': True},
  'MMSE.bl': {'data_format': 'CATEGORICAL',
   'data_type': 'NUMERICAL',
   'values': 'float64',
   'is_missing_values': False},
  'RAVLT.immediate.bl': {'data_format': 'QUANTITATIVE',
   'data_type': 'CONTINUOUS',
   'values': 'float64',
   'is_missing_values': True},
  'RAVLT.learning.bl': {'data_format': 'CATEGORICAL',
   'data_type': 'NUMERICAL',
   'values': 'float64',
   'is_missing_values': False},
  'FAQ.bl': {'data_format': 'QUANTITATIVE',
   'data_type': 'DISCRETE',
   'values': 'int64',
   'is_missing_values': False},
  'TAU.MEDIAN.bl': {'data_format': 'QUANTITATIVE',
   'data_type': 'CONTINUOUS',
   'values': 'float64',
   'is_missing_values': False},
  'AGE': {'data_format': 'CATEGORICAL',
   'data_t

In [33]:
msg_yes_or_no_question

'1) YES\n2) NO\n'

In [40]:
is_views_finished = False


views_format_file = {}

while not is_views_finished:
    is_features_finished = False
    resp = input('do you want to add a new view (file)?\n' + msg_yes_or_no_question)
    resp = yes_or_no_question_key.get(resp)
    if not resp:
        is_views_finished = True
        print('process done')
        continue
    new_view = input('please add new view name:\n')
    while not is_features_finished:
        feature_format_file = {}
        new_feature = input('please add new feature name:\n')
        feature_format_file[new_feature] = {}categorical_values
        is_column_parsed = False
        try:
            while not is_column_parsed:
                data_format_id = input(f'specify data type for {feature}:\n' + msg )
                if data_format_id.isdigit() and int(data_format_id) <= n_available_data_type+1:
                    # check if value passed by user is correct (if it is integer,
                    # and whithin range [1, n_available_data_type])
                    is_column_parsed = True
                
                else:
                    print(f'error ! {data_format_id} value not understood')
                    
        except KeyboardInterrupt as e:
            print('stopping now' + str(e))
        resp = input('do you want to add a new variable (feature) ?' + msg_yes_or_no_question)
        resp = yes_or_no_question_key.get(resp)
        if not resp:
            is_features_finished = True
            print('process done')
            continue
    views_format_file[new_view] = feature_format_file

do you want to add a new view (file)?
1) YES
2) NO
1
please add new view name:
ll
please add new feature name:
ll
specify data type for 0:
1) KEY 
2) QUANTITATIVE 
3) CATEGORICAL 
4) DATETIME 
5) UNKNOWN 
6) ignore this column
4
do you want to add a new variable (feature) ?1) YES
2) NO
2
process done
do you want to add a new view (file)?
1) YES
2) NO
1
please add new view name:
kk
please add new feature name:
vkeof
specify data type for 0:
1) KEY 
2) QUANTITATIVE 
3) CATEGORICAL 
4) DATETIME 
5) UNKNOWN 
6) ignore this column
3
do you want to add a new variable (feature) ?1) YES
2) NO
2
process done
do you want to add a new view (file)?
1) YES
2) NO
2
process done


In [75]:
type(np.datetime64("2018-01-01"))
import datetime
type(datetime.datetime(2018, 1, 1))

pd.datetime64[ns]

AttributeError: module 'pandas' has no attribute 'datetime64'

In [91]:
t = type(pd.to_datetime('13000101', format='%Y%m%d', errors='ignore'))

In [107]:
t = pd.Series(pd.date_range("1/1/2011", freq="H", periods=3)).dtype

t =type(t).type

In [108]:
any(t == t1 for t1 in [pd.Timestamp, pd.Timedelta, pd.Period, datetime.datetime,np.datetime64] )

True

In [105]:
t.type

numpy.datetime64

In [28]:
data_format_file

{'CDRSB.bl': {'data_type': <CategoricalDataType.NUMERICAL: [<class 'float'>, <class 'int'>, <class 'numpy.float64'>, <class 'numpy.int64'>]>,
  'values': int,
  'is_missing_values': False},
 'ADAS11.bl': {'data_type': <CategoricalDataType.NUMERICAL: [<class 'float'>, <class 'int'>, <class 'numpy.float64'>, <class 'numpy.int64'>]>,
  'values': int,
  'is_missing_values': False},
 'MMSE.bl': {'data_type': <CategoricalDataType.NUMERICAL: [<class 'float'>, <class 'int'>, <class 'numpy.float64'>, <class 'numpy.int64'>]>,
  'values': int,
  'is_missing_values': False},
 'RAVLT.immediate.bl': {'data_type': <QuantitativeDataType.CONTINUOUS: [<class 'float'>, <class 'numpy.float64'>]>,
  'values': float,
  'is_missing_values': False},
 'RAVLT.learning.bl': {'data_type': <CategoricalDataType.NUMERICAL: [<class 'float'>, <class 'int'>, <class 'numpy.float64'>, <class 'numpy.int64'>]>,
  'values': float,
  'is_missing_values': False},
 'RAVLT.forgetting.bl': {'data_type': <QuantitativeDataType.CON

In [29]:

type(dataset[feature].dtype)

numpy.dtype[float64]

In [28]:
dir(dataset[feature])

['T',
 '_AXIS_LEN',
 '_AXIS_ORDERS',
 '_AXIS_REVERSED',
 '_AXIS_TO_AXIS_NUMBER',
 '_HANDLED_TYPES',
 '__abs__',
 '__add__',
 '__and__',
 '__annotations__',
 '__array__',
 '__array_priority__',
 '__array_ufunc__',
 '__array_wrap__',
 '__bool__',
 '__class__',
 '__contains__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__divmod__',
 '__doc__',
 '__eq__',
 '__finalize__',
 '__float__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__iand__',
 '__ifloordiv__',
 '__imod__',
 '__imul__',
 '__init__',
 '__init_subclass__',
 '__int__',
 '__invert__',
 '__ior__',
 '__ipow__',
 '__isub__',
 '__iter__',
 '__itruediv__',
 '__ixor__',
 '__le__',
 '__len__',
 '__long__',
 '__lt__',
 '__matmul__',
 '__mod__',
 '__module__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__nonzero__',
 '__or__',
 '__pos__',
 '__pow__',
 '__radd__',
 '__rand__',
 '__r