In [78]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import regex as re
from sklearn.preprocessing import StandardScaler, Binarizer, LabelEncoder, Normalizer, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pipe

# Preprocessing Pipeline, uveitis_data.xlsx

In [79]:
# Load the raw data export; the path is relative to the notebook's directory.
df = pd.read_excel("../data/uveitis_data.xlsx")
# Sanity check against a truncated file: the full export has 1075 rows
# (see the RangeIndex reported by the .info() output further down).
assert len(df) >= 1075, "Data is not complete"

# Rename the columns with the project-local `pipe` helper and the mapping workbook.
# NOTE(review): `pipe.rename` is not visible here — presumably it maps the raw
# headers to the short names used below (gender, race, loc, ...); confirm in pipe.py.
df = pipe.rename(df, "../data/col_names&data_type-Copy1.xlsx")

# TODO: wrap the per-column handling below in one function that would
#     - get the list of categorical variables
#     - split columns in which numeric and categorical values are mixed
#     - convert the dtype to categorical
#     - encode the result

In [80]:
# Show dtype and non-null count for the first five columns only.
df.iloc[:,:5].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1075 entries, 0 to 1074
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      1075 non-null   int64 
 1   gender  1075 non-null   object
 2   race    1075 non-null   object
 3   loc     1064 non-null   object
 4   cat     1074 non-null   object
dtypes: int64(1), object(4)
memory usage: 42.1+ KB


In [81]:
def dtype_transform(df, path):
    '''
    Strips leading and trailing whitespace from every string value of a DataFrame.

    Values that are not strings (numbers, NaN, timestamps, ...) are passed
    through unchanged. Despite the function name, no dtype conversion is
    performed here yet.

    Arguments
    ---------
    df: df, Original DataFrame
    path: str, Path to excel-file with dtypes linked to features.
        Currently unused; kept so that existing calls keep working.

    Returns
    -------
    df: df, New DataFrame with whitespace-stripped string values;
        the input DataFrame is not modified
    '''
    # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 and the old
    # name is deprecated. The method is looked up on the class rather than the
    # instance so that a column that happens to be called "map" cannot shadow it.
    frame_cls = type(df)
    elementwise = getattr(frame_cls, 'map', None)
    if elementwise is None:
        # older pandas: only the original name exists
        elementwise = frame_cls.applymap

    # remove leading or trailing whitespace in all categorical or text features
    return elementwise(df, lambda x: x.strip() if isinstance(x, str) else x)
# Clean the loaded frame; the path argument is passed for future use but the
# function does not read it.
df = dtype_transform(df, "../data/col_names&data_type-Copy1.xlsx")

In [82]:
def val_to_nan() -> None:
    '''
    Placeholder that is not implemented yet: it takes no arguments, has no
    body besides this docstring, and therefore returns None.

    NOTE(review): the name suggests it is meant to convert placeholder values
    to NaN. An earlier draft of this docstring listed the following contract,
    none of which exists in the signature or body yet — confirm the intended
    behaviour before implementing:
        df: df, Original DataFrame
        path: str, Path to excel-file with dtypes linked to features
        returns df: df, original DataFrame with changed dtypes

    Arguments
    ---------
    None

    Returns
    -------
    None
    '''

In [83]:
# Categorical features
# gender: store with pandas' categorical dtype
df['gender'] = df['gender'].astype('category')

**Feature Race**
The categorical variable "race" includes the category "Race or Ethnic Group Data Not Provided by Source". These entries are treated as missing values (NaN), because they carry no information about the person's race.

In [85]:
# Treat the "not provided" placeholder as a missing value, then store the
# column as categorical. np.nan (lower case) is used because the np.NaN alias
# was removed in NumPy 2.0 and raises AttributeError there.
df['race'] = (
    df['race']
    .replace('Race or Ethnic Group Data Not Provided by Source', np.nan)
    .astype('category')
)

In [86]:
# Lower-case the labels so spelling variants that differ only in case collapse
# into one category, then store as categorical. Bracket access is required
# here because `df.loc` is the DataFrame indexer, not the column.
# NOTE(review): the recorded output lists both 'pan' and 'panuveitis' as
# separate categories — presumably the same location spelled two ways; confirm
# with the data owner before merging them.
df['loc'] = df['loc'].str.lower().astype('category')

['anterior', 'intermediate', 'pan', 'posterior', 'scleritis', 'panuveitis', NaN]
Categories (6, object): ['anterior', 'intermediate', 'pan', 'posterior', 'scleritis', 'panuveitis']

In [87]:
# cat: normalise case, then store as categorical
df['cat'] = df['cat'].str.lower().astype('category')

In [88]:
# specific_diagnosis: normalise case, then store as categorical
df['specific_diagnosis'] = df['specific_diagnosis'].str.lower().astype('category')

In [89]:
# Frequency of each free-text note, with missing entries counted as well
df['notes'].value_counts(dropna=False)

NaN              850
pars planitis    142
WDS               80
lymphoma           2
PIC                1
Name: notes, dtype: int64

**TODO:** describe why the placeholder value 'C' in the grading columns below is replaced with NaN.

In [90]:
# Grading columns that are converted to ordered categoricals below.
col = ['ac_abn_od_cells', 'ac_abn_os_cells', 'vit_abn_od_cells',
       'vit_abn_os_cells', 'vit_abn_od_haze', 'vit_abn_os_haze']
for c in col:
    # replace 'C' (for missing) with NaN so the column can be cast to float
    grades = df[c].replace('C', np.nan).astype('float')
    # The category levels are passed explicitly, in ascending order and without
    # NaN (Categorical rejects null categories). ndarray.sort() cannot be used
    # inline for this: it sorts in place and returns None.
    levels = np.sort(grades.dropna().unique())
    df[c] = pd.Categorical(values=grades, categories=levels, ordered=True)

In [91]:
# Text result columns that are recoded to 0 / 1 / NaN below.
col = ['hbc__ab', 'hbs__ag', 'hcv__ab']

# Lower-cased raw label -> code; applied to every column in exactly this order.
recode = {
    'negative': 0,
    'see note | positive result s/co ratio is >5.0.  confirmatory testing i': 1,
    'see below | positive result s/co ratio is >5.0.  confirmatory testing': 1,
    'reactive': 1,
    'repeat reactive': 1,
    'invalid result': np.nan,
    'note:': np.nan,
}

for c in col:
    # compare case-insensitively by normalising the labels first
    df[c] = df[c].str.lower()
    for raw_label, code in recode.items():
        df.loc[df[c] == raw_label, c] = code
    # labels not listed above are kept as they are and become their own category
    df[c] = df[c].astype('category')

[0, 1, NaN]
Categories (2, int64): [0, 1]
[0, NaN, 1]
Categories (2, int64): [0, 1]
[0, NaN, 1]
Categories (2, int64): [0, 1]


In [None]:
def preprocessing(cat_features, num_features, imputer):
    '''
    Builds an unfitted ColumnTransformer that imputes and standard-scales the
    numeric columns and imputes and one-hot-encodes the categorical columns.

    Arguments
    ---------
    cat_features: list or None, Columns handled as categorical; None skips
        the categorical branch
    num_features: list or None, Columns handled as numeric; None skips the
        numeric branch
    imputer: dict, Imputation settings. The numeric settings are read from
        imputer['numeric']['strategy'] (the spelling 'numerical' is accepted
        as well). Optional categorical settings are read from
        imputer['categorical'] ('strategy', 'fill_value') and default to a
        constant fill with the value 'missing'.

    Returns
    -------
    preprocessor: ColumnTransformer, numeric transformer first (if any),
        then the categorical transformer (if any)

    Raises
    ------
    ValueError: if both cat_features and num_features are None
    KeyError: if num_features is given but imputer holds no numeric settings
    '''
    # Without this guard the function used to fail with an UnboundLocalError
    # on the return statement.
    if cat_features is None and num_features is None:
        raise ValueError("At least one of cat_features and num_features must be given")

    transformers = []

    if num_features is not None:
        numeric_cfg = imputer['numeric'] if 'numeric' in imputer else imputer['numerical']
        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy=numeric_cfg['strategy'])),
            ('scaler', StandardScaler())])
        transformers.append(('num', numeric_transformer, num_features))

    if cat_features is not None:
        # imputer may be None/empty when only categorical columns are requested
        cat_cfg = (imputer or {}).get('categorical', {})
        try:
            # scikit-learn >= 1.2 spells the option `sparse_output`
            encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        except TypeError:
            # older scikit-learn only knows `sparse`
            encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy=cat_cfg.get('strategy', 'constant'),
                                      fill_value=cat_cfg.get('fill_value', 'missing'))),
            ('onehot', encoder)])
        transformers.append(('cat', categorical_transformer, cat_features))

    return ColumnTransformer(transformers=transformers)

# The column lists are still empty placeholders.
numeric_features = []
categorical_features = []
# preprocessing() reads imputer['numeric']['strategy'], so the numeric settings
# must be stored under the key 'numeric'.
imputer = {
    'categorical': {'strategy': 'constant', 'fill_value': 'missing'},
    'numeric': {'strategy': 'median'},
}
preprocessor = preprocessing(categorical_features, numeric_features, imputer)