In [201]:
import numpy as np
import pandas as pd

def preprocess_data(data):
    data = data.replace('?', np.NaN)

    # Drop unnecessary columns
    # weights, payer_code, diag_1_desc, diag_2_desc, diag_3_desc
    data.drop(labels=['weight', 'payer_code', 'diag_1_desc', 'diag_2_desc', 'diag_3_desc'], axis=1, inplace=True)

    data['diag_1'] = group_diagnoses(data['diag_1'])
    data['diag_2'] = group_diagnoses(data['diag_2'])
    data['diag_3'] = group_diagnoses(data['diag_3'])

    # Encode string data to numericals
    to_cat = list(data.select_dtypes(['object']).columns)
    data[to_cat] = data[to_cat].astype('category')
    cat_columns = data.select_dtypes(['category']).columns
    data[cat_columns] = data[cat_columns].apply(lambda x: x.cat.codes)

    # Get Readmitted as labels
    labels = data['readmitted']
    data.drop(labels=['readmitted'], axis=1, inplace=True)
    
    columns = data.columns
    numerical_columns = ['num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient']
    for column in columns:
        if column in numerical_columns:
            continue
        ohe = pd.get_dummies(data[column], drop_first=False)
        data = pd.concat([data, ohe], axis=1)
        data.drop(labels=[column], axis=1, inplace=True)

    
    return labels.values.ravel(), data

def group_diagnoses(df):
    # Create mapping from
    l_old = []
    l_new = []

    idx = 0
    tmp_list1 = list(range(390, 460))
    tmp_list1 += [785]
    tmp_list2 = [idx] * len(tmp_list1)
    idx += 1

    l_old = [*l_old, *tmp_list1]
    l_new = [*l_new, *tmp_list2]

    tmp_list1 = list(range(460, 520))
    tmp_list1 += [786]
    tmp_list2 = [idx] * len(tmp_list1)
    idx += 1

    l_old = [*l_old, *tmp_list1]
    l_new = [*l_new, *tmp_list2]

    tmp_list1 = list(range(520, 579))
    tmp_list1 += [787]
    tmp_list2 = [idx] * len(tmp_list1)
    idx += 1

    l_old = [*l_old, *tmp_list1]
    l_new = [*l_new, *tmp_list2]

    tmp_list1 = [str(i) for i in list(np.arange(250, 251, 0.01))]
    tmp_list2 = [idx] * len(tmp_list1)
    idx += 1
    l_old = [*l_old, *tmp_list1]
    l_new = [*l_new, *tmp_list2]

    tmp_list1 = range(800, 1000)
    tmp_list2 = [idx] * len(tmp_list1)
    idx += 1
    l_old = [*l_old, *tmp_list1]
    l_new = [*l_new, *tmp_list2]

    tmp_list1 = range(710, 740)
    tmp_list2 = [idx] * len(tmp_list1)
    idx += 1
    l_old = [*l_old, *tmp_list1]
    l_new = [*l_new, *tmp_list2]

    tmp_list1 = list(range(580, 630))
    tmp_list1 += [788]
    tmp_list2 = [idx] * len(tmp_list1)
    idx += 1
    l_old = [*l_old, *tmp_list1]
    l_new = [*l_new, *tmp_list2]

    tmp_list1 = range(140, 240)
    tmp_list2 = [idx] * len(tmp_list1)
    idx += 1
    l_old = [*l_old, *tmp_list1]
    l_new = [*l_new, *tmp_list2]

    l_old = [str(i) for i in l_old]
    d = dict(zip(l_old, l_new))

    df_new = df.copy()
    
    df_new = df_new.map(d)
    df_new = df_new.replace(df_new[pd.isna(df_new)], 8)
    df_new = df_new.astype(int)
    return df_new



In [205]:
data = pd.read_csv('task1/data/diab_train.csv', index_col=0)
data.reset_index(drop=True, inplace=True)
#data.reset_index(drop=True, inplace=True)
#data = preprocess_data(data)


In [206]:
data.drop(labels=['weight', 'payer_code', 'diag_1_desc', 'diag_2_desc', 'diag_3_desc'], axis=1, inplace=True)

data['diag_1'] = group_diagnoses(data['diag_1'])
data['diag_2'] = group_diagnoses(data['diag_2'])
data['diag_3'] = group_diagnoses(data['diag_3'])

to_cat = list(data.select_dtypes(['object']).columns)
data[to_cat] = data[to_cat].astype('category')
cat_columns = data.select_dtypes(['category']).columns
data[cat_columns] = data[cat_columns].apply(lambda x: x.cat.codes)

labels = data['readmitted']
data.drop(labels=['readmitted'], axis=1, inplace=True)

In [212]:
data.columns
for col in data.columns:
    print(data[col].values_count())

AttributeError: 'Series' object has no attribute 'values_count'

In [208]:
columns = data.columns
numerical_columns = ['time_in_hospital', 'number_diagnoses', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient']
for column in columns:
    if column in numerical_columns:
        continue
    ohe = pd.get_dummies(data[column], drop_first=False)
    data = pd.concat([data, ohe], axis=1)
    data.drop(labels=[column], axis=1, inplace=True)


In [209]:
data

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,0,1,...,1.1,2,0.1,0.2,0.3,0.4,0.5,1.2,0.6,1.3
0,4,40,4,10,0,0,0,9,0,1,...,0,0,1,1,1,1,0,1,1,0
1,1,24,1,5,0,0,0,3,0,0,...,0,0,1,1,1,1,1,0,0,1
2,2,59,0,12,0,0,0,8,0,0,...,0,0,1,1,1,1,1,0,0,1
3,4,60,1,14,0,0,1,5,0,1,...,0,0,1,1,1,1,1,0,0,1
4,13,54,6,49,0,0,0,6,0,0,...,0,0,1,1,1,1,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,4,64,0,17,0,1,1,8,0,0,...,0,0,1,1,1,1,0,1,0,1
5996,12,48,0,15,0,0,0,4,0,1,...,0,0,1,1,1,1,0,1,0,1
5997,2,55,3,10,0,0,0,4,0,0,...,0,0,1,1,1,1,0,1,0,1
5998,1,26,0,7,0,0,1,7,0,1,...,0,0,1,1,1,1,1,0,0,1


In [199]:
# data.to_csv('train_ohe.csv', index=False)
# labels.to_csv('train_ohe.csv', index=False)

AttributeError: module 'pandas' has no attribute 'to_csv'