# Baseline model

In [1]:
import pandas as pd
import numpy as np

## Load the data

In [2]:
data_dir = "./data/"

# The competition splits are indexed by "id"; the original table by "ID".
train = pd.read_csv(f"{data_dir}train.csv", index_col="id")
test = pd.read_csv(f"{data_dir}test.csv", index_col="id")
original = pd.read_csv(f"{data_dir}cirrhosis.csv", index_col="ID")

# Record which dataset every row came from.
train["Source"], test["Source"] = "Competition", "Competition"
original["Source"] = "Original"

# Offset the original row ids by the largest id found in the test split.
original.index = original.index + test.index.max()
# Give the original table the same columns, in the same order, as `train`.
original = original.reindex(columns=train.columns)

In [4]:
# Stack the original rows under the competition rows, renumber all rows
# from 0 and label the resulting index "id".
train = pd.concat([train, original], ignore_index=True).rename_axis("id")

## Preprocess the data

In [5]:
# "Status" is the target; every other column of `train` is kept as a feature.
y_train = train["Status"]
X_train = train.drop("Status", axis=1)
# The test frame is used as-is (same object, not a copy).
X_test = test

In [6]:
def process_features(df):
    """Impute missing values and encode the categorical columns as small integers.

    The input frame is not modified: the initial ``ffill().bfill()`` produces a
    new frame and every later assignment is made on that new frame.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns ``Age``, ``Drug``, ``Sex``, ``Ascites``,
        ``Hepatomegaly``, ``Spiders``, ``Edema`` and ``Stage``.

    Returns
    -------
    pd.DataFrame
        The imputed frame with the columns above encoded as ``uint8`` plus two
        new columns: ``Age_Yr`` (``Age / 365``) and ``Diseases``.

    Raises
    ------
    ValueError
        If ``Edema`` still holds a value other than "N", "S" or "Y" after
        imputation (the unmapped value becomes NaN and cannot be cast).
    """
    # Fill gaps from the neighbouring rows (previous row first, then next row).
    df = df.ffill().bfill()
    df['Age_Yr'] = df['Age'] / 365

    # encode categorical features
    df['Drug'] = np.where(df['Drug'] == "Placebo", 0, 1).astype(np.uint8)
    df['Sex'] = np.where(df['Sex'] == "M", 1, 0).astype(np.uint8)
    # 1 means the finding is present ("Y"). The previous encoding set 1 for "N",
    # which made the `Diseases` sum below count absent findings.
    for finding in ('Ascites', 'Hepatomegaly', 'Spiders'):
        df[finding] = np.where(df[finding] == "Y", 1, 0).astype(np.uint8)
    df['Edema'] = df['Edema'].map({"N": 0, "S": 1, "Y": 2}).astype(np.uint8)
    # Fill before casting: casting a NaN to uint8 raises, so the fill has to
    # come first to be of any use (e.g. when the whole column is missing).
    df['Stage'] = df['Stage'].fillna(1).astype(np.uint8)

    # Simple burden score: sum of the encoded findings.
    # NOTE(review): `Drug` (treatment arm) is part of this sum - confirm intended.
    df['Diseases'] = (
        df['Ascites'] + df['Hepatomegaly'] + df['Spiders'] + df['Edema'] + df['Drug']
    ).astype(np.uint8)

    # Candidate risk-bucket features, currently disabled:
    # df['Age_Risk'] = np.where(df['Age_Yr'] >= 45, 1,0)
    # df['Bilirubin_Risk'] = np.select([df.Bilirubin < 1.2, df.Bilirubin < 2], [0,1],2)
    # df['Cholesterol_Risk'] = np.select([df.Cholesterol < 200, df.Cholesterol <= 239], [0,1], 2)
    # df['Albumin_Risk'] = np.select([df.Albumin.between(3.4, 5.4, inclusive = "left"),df.Albumin < 3.4], [0,1], 2)
    # df['Copper_Risk'] = np.select([df.Copper.between(62, 140, inclusive = "left"), df.Copper < 62], [0,1], 2)
    # df['Tryglicerides_Risk'] = np.select([df.Tryglicerides < 150, df.Tryglicerides < 200, df.Tryglicerides < 499], [0,1,2], 3)
    # df['Platelets_Risk'] = np.select([df.Platelets.between(150, 450, inclusive = "left"),df.Platelets < 150], [0,1], 2)
    # df['Prothrombin_Risk'] = np.select([df.Prothrombin.between(11,13.5, inclusive = "left"),df.Prothrombin < 11], [0,1], 2)
    return df


In [7]:
 def _reduce_mem(df: pd.DataFrame):
    "This method reduces memory for numeric columns in the dataframe";
    
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2;
    
    for col in df.columns:
        col_type = df[col].dtypes
        
        if col_type in numerics:
            c_min = df[col].min();
            c_max = df[col].max();

            if "int" in str(col_type):
                if c_min >= np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min >= np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min >= np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min >= np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min >= np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                if c_min >= np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)  

    end_mem = df.memory_usage().sum() / 1024**2

    print(f"Start - end memory:- {start_mem:5.2f} - {end_mem:5.2f} Mb");
    return df;

In [8]:
# Encode the features, then shrink the numeric dtypes - train first, then test,
# so the two memory reports are printed in that order.
X_train = X_train.pipe(process_features).pipe(_reduce_mem)
X_test = X_test.pipe(process_features).pipe(_reduce_mem)


Start - end memory:-  0.89 -  0.48 Mb
Start - end memory:-  0.60 -  0.34 Mb
