In [15]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set()

import warnings
warnings.filterwarnings("ignore") 

from sklearn.preprocessing import LabelEncoder, StandardScaler

# Input data files are available in the read-only "../input/" directory
# For example, running this will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# other libraries, dependencies, ect....



### Loading and Initial Data Exploration

In [16]:
train_data = pd.read_csv("train.csv")
train_data.drop('id', axis=1, inplace=True)
test_data = pd.read_csv("test.csv")
test_data.drop('id', axis=1, inplace=True)
train_data.head()

Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data,outcome
0,yes,adult,530001,38.1,132,24,cool,reduced,dark_cyanotic,more_3_sec,...,57.0,8.5,serosanguious,3.4,yes,2209,0,0,no,died
1,yes,adult,533836,37.5,88,12,cool,normal,pale_cyanotic,more_3_sec,...,33.0,64.0,serosanguious,2.0,yes,2208,0,0,no,euthanized
2,yes,adult,529812,38.3,120,28,cool,reduced,pale_pink,less_3_sec,...,37.0,6.4,serosanguious,3.4,yes,5124,0,0,no,lived
3,yes,adult,5262541,37.1,72,30,cold,reduced,pale_pink,more_3_sec,...,53.0,7.0,cloudy,3.9,yes,2208,0,0,yes,lived
4,no,adult,5299629,38.0,52,48,normal,normal,normal_pink,less_3_sec,...,47.0,7.3,cloudy,2.6,no,0,0,0,yes,lived


Notes on data:

- id: standard ID value
- surgery: Yes or No of whether the horse in question received surgery
- age: Either Adult or Young
- hospital_number: number ID of hospital the horse attended
- rectal_temp: Temperature taken, details on to how self explanatory lol
- pulse: heart rate
- respiratory_rate: breathing rate
- temp_of_extremities: Cool, Normal, or other
- peripheral pulse: Reduced, Normal, or other
- mucous_membrane: color of mucous membrane
- capillary_refill_time: less then 3 sec, more then 3 sec, other
- pain: expression of amount of pain horse could be in
- peristalsis: hypomotile, absent, other. (apparently peristalsis is the wave like muscle contraction of the digestive tract, who knew! Vets in this case I assume)
- abdominal_distention: Abdominal swelling
- nasogastric_tube: according to google: It is used by a vet to identify if there are any abnormal contents in the horse's stomach, and to administer fluids and some treatments directly into the stomach. 
- nasogastric_reflux: Fluid or gas build-up in the stomach
- nasogastric_reflux_ph: ph level of nasogastric fluid
- rectal_exam_feces: measure of feces during rectal exam
- abdomen: small, large, normal, ect...
- packed_cell_volume: score for composition of blood cells
- total_protein: number value
- abdomo_apperance: color of fluid
- abdomo_protein: number value
- surgical_lesion: true or false
- lesion_1: number value
- lesion_2: number value
- lesion_3: number value
- cp_data: true or false
- outcome: lived, died, euthanized

### Missing Data



In [17]:
# Finding null/na values if there are any
null_vals = train_data.isna().sum().sort_values(ascending=False)
null_vals.head


<bound method NDFrame.head of nasogastric_tube         355
nasogastric_reflux       352
abdominal_distention     235
abdomen                  213
rectal_exam_feces        190
peripheral_pulse          60
abdomo_appearance         48
pain                      44
temp_of_extremities       39
mucous_membrane           21
peristalsis               20
capillary_refill_time      6
total_protein              0
lesion_1                   0
lesion_2                   0
surgical_lesion            0
abdomo_protein             0
lesion_3                   0
cp_data                    0
surgery                    0
packed_cell_volume         0
nasogastric_reflux_ph      0
age                        0
respiratory_rate           0
pulse                      0
rectal_temp                0
hospital_number            0
outcome                    0
dtype: int64>

All the missing columns are categorical, so we can use imputation to fill missing values.

In [18]:
# Hospital number is categorical even though represented as a integer, below corrects this
train_data['hospital_number'] = train_data['hospital_number'].astype('object')
test_data['hospital_number'] = test_data['hospital_number'].astype('object')

In [19]:
# Scaler for numerical data

scaler = StandardScaler()
object_columns = train_data.select_dtypes(include=['object'])
number_columns = train_data.select_dtypes(exclude=['object'])
train_data.loc[:,list(number_columns.columns)] = scaler.fit_transform(train_data.loc[:,list(number_columns.columns)])
test_data.loc[:,list(number_columns.columns)] = scaler.transform(test_data.loc[:,list(number_columns.columns)])

In [20]:
train_data.head()

Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data,outcome
0,yes,adult,530001,-0.129621,1.801773,-0.368142,cool,reduced,dark_cyanotic,more_3_sec,...,0.702421,-0.483319,serosanguious,0.068659,yes,-0.298737,-0.075465,-0.040275,no,died
1,yes,adult,533836,-0.890705,0.289582,-1.09783,cool,normal,pale_cyanotic,more_3_sec,...,-1.576449,1.59801,serosanguious,-0.812647,yes,-0.298921,-0.075465,-0.040275,no,euthanized
2,yes,adult,529812,0.124074,1.389357,-0.124913,cool,reduced,pale_pink,less_3_sec,...,-1.196638,-0.562072,serosanguious,0.068659,yes,0.237648,-0.075465,-0.040275,no,lived
3,yes,adult,5262541,-1.398095,-0.260306,-0.003299,cold,reduced,pale_pink,more_3_sec,...,0.322609,-0.539571,cloudy,0.383411,yes,-0.298921,-0.075465,-0.040275,yes,lived
4,no,adult,5299629,-0.256468,-0.947666,1.091232,normal,normal,normal_pink,less_3_sec,...,-0.247108,-0.528321,cloudy,-0.434944,no,-0.705212,-0.075465,-0.040275,yes,lived


In [21]:
train_data.describe(include=['O'])

Unnamed: 0,surgery,age,hospital_number,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,pain,peristalsis,abdominal_distention,nasogastric_tube,nasogastric_reflux,rectal_exam_feces,abdomen,abdomo_appearance,surgical_lesion,cp_data,outcome
count,1235,1235,1235,1196,1175,1214,1229,1191,1215,1000,880,883,1045,1022,1187,1235,1235,1235
unique,2,2,255,4,4,6,3,6,5,3,2,3,5,5,3,2,2,3
top,yes,adult,529461,cool,reduced,pale_pink,less_3_sec,depressed,hypomotile,moderate,slight,more_1_liter,absent,distend_small,serosanguious,yes,yes,lived
freq,887,1160,46,700,724,284,834,429,664,543,758,604,493,482,570,929,668,574


In [22]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1235 entries, 0 to 1234
Data columns (total 28 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   surgery                1235 non-null   object 
 1   age                    1235 non-null   object 
 2   hospital_number        1235 non-null   object 
 3   rectal_temp            1235 non-null   float64
 4   pulse                  1235 non-null   float64
 5   respiratory_rate       1235 non-null   float64
 6   temp_of_extremities    1196 non-null   object 
 7   peripheral_pulse       1175 non-null   object 
 8   mucous_membrane        1214 non-null   object 
 9   capillary_refill_time  1229 non-null   object 
 10  pain                   1191 non-null   object 
 11  peristalsis            1215 non-null   object 
 12  abdominal_distention   1000 non-null   object 
 13  nasogastric_tube       880 non-null    object 
 14  nasogastric_reflux     883 non-null    object 
 15  naso