In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys
import random
import seaborn as sns
import math
from sklearn.impute import KNNImputer
from sklearn.utils import shuffle

In [2]:
# Fixed seed so the shuffled row order (and every output derived from it)
# is identical after Restart Kernel -> Run All.
SEED = 42

train = pd.read_csv("../resources/train.csv")
train = shuffle(train, random_state=SEED)
test = pd.read_csv("../resources/test.csv")

# Stack train on top of test; ignore_index=True rebuilds a clean 0..n-1 index
# after the shuffle. Rows coming from `test` carry no `outcome` value.
df = pd.concat([train, test], ignore_index=True)
df

Unnamed: 0,id,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,...,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data,outcome
0,1018,no,adult,529424,37.8,56.0,28.0,normal,normal,normal_pink,...,46.0,5.9,cloudy,7.0,no,0,0,0,yes,lived
1,88,no,adult,5289419,38.4,52.0,20.0,cool,normal,pale_cyanotic,...,48.0,7.7,serosanguious,4.3,no,2124,0,0,yes,lived
2,892,yes,young,5290409,37.2,150.0,80.0,cool,reduced,dark_cyanotic,...,66.0,7.5,serosanguious,5.3,yes,2205,0,0,yes,died
3,408,yes,adult,535292,37.5,120.0,36.0,cold,reduced,dark_cyanotic,...,69.0,82.0,clear,2.0,yes,3205,0,0,no,lived
4,175,no,adult,530242,37.2,60.0,12.0,cool,absent,pale_cyanotic,...,52.0,7.5,cloudy,4.1,yes,2207,0,0,yes,euthanized
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2054,2054,no,adult,529461,40.3,114.0,36.0,cool,reduced,normal_pink,...,57.0,8.1,serosanguious,4.5,yes,3205,0,0,yes,
2055,2055,yes,adult,535338,37.2,100.0,20.0,cool,reduced,pale_cyanotic,...,50.0,66.0,serosanguious,2.0,yes,2209,0,0,no,
2056,2056,yes,adult,529640,39.2,132.0,12.0,cool,reduced,dark_cyanotic,...,53.0,7.6,serosanguious,4.5,yes,2205,0,0,no,
2057,2057,no,adult,5287179,38.3,54.0,66.0,normal,normal,normal_pink,...,49.0,8.6,clear,5.0,no,3111,0,0,yes,


In [4]:
# Column dtypes and non-null counts of the combined frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2059 entries, 0 to 2058
Data columns (total 29 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     2059 non-null   int64  
 1   surgery                2059 non-null   object 
 2   age                    2059 non-null   object 
 3   hospital_number        2059 non-null   int64  
 4   rectal_temp            2059 non-null   float64
 5   pulse                  2059 non-null   float64
 6   respiratory_rate       2059 non-null   float64
 7   temp_of_extremities    1985 non-null   object 
 8   peripheral_pulse       1952 non-null   object 
 9   mucous_membrane        2025 non-null   object 
 10  capillary_refill_time  2047 non-null   object 
 11  pain                   1986 non-null   object 
 12  peristalsis            2020 non-null   object 
 13  abdominal_distention   2014 non-null   object 
 14  nasogastric_tube       1915 non-null   object 
 15  naso

In [5]:
# Summary statistics for every column (numeric and non-numeric alike),
# transposed so that each row of the table describes one column.
df.describe(include = "all").T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
id,2059.0,,,,1029.0,594.526422,0.0,514.5,1029.0,1543.5,2058.0
surgery,2059.0,2.0,yes,1476.0,,,,,,,
age,2059.0,2.0,adult,1942.0,,,,,,,
hospital_number,2059.0,,,,1016073.009713,1441049.027701,521399.0,528800.0,529796.0,534293.0,5305129.0
rectal_temp,2059.0,,,,38.219136,0.787378,35.4,37.8,38.2,38.6,40.8
pulse,2059.0,,,,79.836328,29.125777,30.0,54.0,76.0,100.0,184.0
respiratory_rate,2059.0,,,,30.320544,16.849999,8.0,18.0,28.0,36.0,96.0
temp_of_extremities,1985.0,4.0,cool,1172.0,,,,,,,
peripheral_pulse,1952.0,4.0,reduced,1202.0,,,,,,,
mucous_membrane,2025.0,6.0,pale_cyanotic,462.0,,,,,,,


In [6]:
# Missing values per column, displayed together with the grand total.
aux = df.isna().sum()
(aux, aux.sum())

(id                         0
 surgery                    0
 age                        0
 hospital_number            0
 rectal_temp                0
 pulse                      0
 respiratory_rate           0
 temp_of_extremities       74
 peripheral_pulse         107
 mucous_membrane           34
 capillary_refill_time     12
 pain                      73
 peristalsis               39
 abdominal_distention      45
 nasogastric_tube         144
 nasogastric_reflux        35
 nasogastric_reflux_ph      0
 rectal_exam_feces        315
 abdomen                  367
 packed_cell_volume         0
 total_protein              0
 abdomo_appearance         79
 abdomo_protein             0
 surgical_lesion            0
 lesion_1                   0
 lesion_2                   0
 lesion_3                   0
 cp_data                    0
 outcome                  824
 dtype: int64,
 2148)

In [7]:
def plot_two_vars(df: pd.DataFrame, main_col: str, cols):
    """Draw one bar plot per column in ``cols`` against ``main_col``.

    The plots are laid out on a grid that is at most three panels wide.

    * Numeric columns: ``sns.barplot`` of the column (y) against
      ``main_col`` (x), using seaborn's default aggregation.
    * Other columns: the within-group share of each category (NaN kept as
      its own category), drawn as bars grouped by ``main_col`` and coloured
      by category.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing ``main_col`` and every column listed in ``cols``.
    main_col : str
        Column placed on the x-axis of every panel.
    cols : iterable of str
        Columns to plot, one panel each. An empty iterable is a no-op.
    """
    cols = list(cols)
    n = len(cols)
    if n == 0:
        # plt.subplots(0, 0) would raise; there is simply nothing to draw.
        return

    ratio = 3  # maximum number of panels per row
    n_cols = min(ratio, n)
    n_rows = math.ceil(n / ratio)

    # squeeze=False always yields a 2-D array of axes, so the single-panel,
    # single-row and multi-row layouts can all be walked the same way.
    fig, axes = plt.subplots(
        n_rows, n_cols, figsize=(7 * n_cols, 5 * n_rows), squeeze=False
    )
    flat_axes = axes.ravel()  # row-major: panel i sits at (i // ratio, i % ratio)

    for ax, col in zip(flat_axes, cols):
        if pd.api.types.is_numeric_dtype(df[col].dtype):
            sns.barplot(ax=ax, x=main_col, y=col, data=df)
        else:
            shares = (
                df.groupby(main_col)[col]
                .value_counts(normalize=True, dropna=False)
                .reset_index(name='NormalizedCounts')
            )
            sns.barplot(ax=ax, data=shares, x=main_col, y='NormalizedCounts', hue=col)

    # Hide the unused panels of the last row instead of showing empty frames.
    for ax in flat_axes[n:]:
        ax.set_visible(False)

In [8]:
# Column used as the x-axis / grouping variable for the exploratory plots.
col = 'surgical_lesion'
# Plot call currently disabled; uncomment to draw `col` against every other column.
#plot_two_vars(df, col, df.columns.drop(col))

In [9]:
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.preprocessing import MinMaxScaler

# Category -> numeric code for every column that is encoded by a direct lookup.
# Values missing from a mapping become NaN and are filled in by the imputer.
_YES_NO = {'yes': 1.0, 'no': 0.0}
_VALUE_MAPS = {
    'surgery': _YES_NO,
    'age': {'adult': 1.0, 'young': 0.0},
    'temp_of_extremities': {'cold': 0.0, 'cool': 1.0, 'normal': 2.0, 'warm': 3.0},
    'peripheral_pulse': {'absent': 0.0, 'reduced': 1.0, 'normal': 2.0, 'increased': 3.0},
    'mucous_membrane': {'normal_pink': 0.0, 'bright_pink': 1.0, 'pale_pink': 2.0,
                        'pale_cyanotic': 3.0, 'bright_red': 4.0, 'dark_cyanotic': 5.0},
    'capillary_refill_time': {'less_3_sec': 0.0, '3': 1.0, 'more_3_sec': 2.0},
    'peristalsis': {'absent': 0.0, 'hypomotile': 1.0, 'normal': 2.0, 'hypermotile': 3.0},
    'abdominal_distention': {'none': 0.0, 'slight': 1.0, 'moderate': 2.0, 'severe': 3.0},
    'nasogastric_tube': {'none': 0.0, 'slight': 1.0, 'significant': 2.0},
    'nasogastric_reflux': {'none': 0.0, 'slight': 1.0, 'less_1_liter': 2.0, 'more_1_liter': 3.0},
    'rectal_exam_feces': {'absent': 0.0, 'decreased': 1.0, 'normal': 2.0, 'increased': 3.0},
    'abdomo_appearance': {'clear': 0.0, 'cloudy': 1.0, 'serosanguious': 2.0},
    'surgical_lesion': _YES_NO,
    'cp_data': _YES_NO,
}

# Leading digit of the lesion code -> label that is later one-hot encoded.
_LESION_SITES = {0: 'none', 1: 'gastric', 2: 'smintestine', 3: 'lgcolon', 4: 'lgcoloncecum',
                 5: 'cecum', 6: 'transversecolon', 7: 'retumdesccolon', 8: 'uterus', 9: 'bladder'}


def pre_process_data(df: pd.DataFrame) -> pd.DataFrame:
    """Turn the raw frame into a fully numeric, imputed and scaled feature frame.

    Steps, in order:

    1. Add ``hospital_dig`` (number of digits of ``hospital_number``) and drop
       ``id``, ``hospital_number``, ``lesion_2`` and ``lesion_3``.
    2. Replace category strings by numeric codes for the columns listed in
       ``_VALUE_MAPS``; reduce ``lesion_1`` to the label of its first digit.
    3. One-hot encode every remaining object column and discard the indicator
       columns the encoder creates for missing values (suffix ``_nan``).
    4. Fill remaining NaNs with ``KNNImputer(n_neighbors=4)``.
    5. Rescale every feature to ``[-1, 1]`` with ``MinMaxScaler``.
    6. Re-attach the untouched ``outcome`` column as the last column.

    The input frame is not modified, and the returned frame keeps the input's
    row order and index.

    Parameters
    ----------
    df : pd.DataFrame
        Raw data containing the columns referenced above plus ``outcome``.

    Returns
    -------
    pd.DataFrame
        Processed features followed by ``outcome``.
    """
    # .assign returns a new frame, so the caller's DataFrame does not silently
    # gain a `hospital_dig` column.
    df = df.assign(hospital_dig=df.hospital_number.astype(str).str.len())
    df = df.drop(columns=['id', 'hospital_number', 'lesion_2', 'lesion_3'])

    for column, mapping in _VALUE_MAPS.items():
        df[column] = df[column].map(mapping)

    df['lesion_1'] = df['lesion_1'].apply(lambda code: int(str(code)[0])).map(_LESION_SITES)

    target = df['outcome']
    df = df.drop(columns='outcome')

    cat_cols = df.select_dtypes(["object"]).columns
    if len(cat_cols) > 0:
        enc = OneHotEncoder()
        # index=df.index is essential: pd.concat(axis=1) aligns on index labels,
        # so a default RangeIndex here would pair encoded rows with the wrong
        # records whenever `df` is not indexed 0..n-1 (e.g. a shuffled frame).
        enc_data = pd.DataFrame(
            enc.fit_transform(df[cat_cols]).toarray(),
            columns=enc.get_feature_names_out(cat_cols),
            index=df.index,
        )
        # A missing value is represented by all-zero indicators rather than by
        # a dedicated "<feature>_nan" column.
        enc_data = enc_data.drop(columns=[c for c in enc_data.columns if c.endswith('_nan')])
        df = pd.concat([df.drop(columns=cat_cols), enc_data], axis=1)

    # NOTE(review): imputation runs on unscaled features, so neighbour distances
    # are dominated by wide-range columns -- consider scaling first.
    imputer = KNNImputer(n_neighbors=4)
    df = pd.DataFrame(imputer.fit_transform(df), columns=df.columns, index=df.index)

    scaler = MinMaxScaler((-1, 1))
    df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns, index=df.index)

    # Positional assignment: row order has been preserved through every step.
    df['outcome'] = target.to_numpy()
    return df

In [10]:
# Run the preprocessing function on the combined train + test frame and show the result.
pre_df = pre_process_data(df)
pre_df

Unnamed: 0,surgery,age,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,peristalsis,...,lesion_1_cecum,lesion_1_gastric,lesion_1_lgcolon,lesion_1_lgcoloncecum,lesion_1_none,lesion_1_retumdesccolon,lesion_1_smintestine,lesion_1_transversecolon,lesion_1_uterus,outcome
0,1.0,1.0,0.000000,0.324675,-0.636364,-0.333333,-0.333333,1.0,1.0,-1.000000,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,died
1,1.0,1.0,-0.222222,-0.246753,-0.909091,-0.333333,0.333333,0.2,1.0,-1.000000,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,euthanized
2,1.0,1.0,0.074074,0.168831,-0.545455,-0.333333,-0.333333,-0.2,-1.0,-0.333333,...,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,lived
3,1.0,1.0,-0.370370,-0.454545,-0.500000,-1.000000,-0.333333,-0.2,1.0,-0.333333,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,lived
4,-1.0,1.0,-0.037037,-0.714286,-0.090909,0.333333,0.333333,-1.0,-1.0,-0.333333,...,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,lived
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2054,-1.0,1.0,0.814815,0.090909,-0.363636,-0.333333,-0.333333,-1.0,1.0,-0.333333,...,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,
2055,1.0,1.0,-0.333333,-0.090909,-0.727273,-0.333333,-0.333333,0.2,1.0,-1.000000,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,
2056,1.0,1.0,0.407407,0.324675,-0.909091,-0.333333,-0.333333,1.0,1.0,-0.333333,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,
2057,-1.0,1.0,0.074074,-0.688312,0.318182,0.333333,0.333333,-1.0,-1.0,-0.333333,...,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,
