# Horse Survival
### Horse survival prediction based on various observed medical conditions. Dataset: "horses.csv"

In [1]:
# libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import preprocessing
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

In [2]:
# increasing max columns display from the default
pd.options.display.max_columns = 30

#### Let’s attempt to predict the survival of a horse based on various observed medical conditions. 
#### Loading the data from ‘horses.csv’ and observing whether it contains missing values.

In [3]:
df = pd.read_csv(r'horse.csv')

In [4]:
df.head()

Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,pain,peristalsis,abdominal_distention,nasogastric_tube,nasogastric_reflux,nasogastric_reflux_ph,rectal_exam_feces,abdomen,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,outcome,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,no,adult,530101,38.5,66.0,28.0,cool,reduced,,more_3_sec,extreme_pain,absent,severe,,,,decreased,distend_large,45.0,8.4,,,died,no,11300,0,0,no
1,yes,adult,534817,39.2,88.0,20.0,,,pale_cyanotic,less_3_sec,mild_pain,absent,slight,,,,absent,other,50.0,85.0,cloudy,2.0,euthanized,no,2208,0,0,no
2,no,adult,530334,38.3,40.0,24.0,normal,normal,pale_pink,less_3_sec,mild_pain,hypomotile,none,,,,normal,normal,33.0,6.7,,,lived,no,0,0,0,yes
3,yes,young,5290409,39.1,164.0,84.0,cold,normal,dark_cyanotic,more_3_sec,depressed,absent,severe,none,less_1_liter,5.0,decreased,,48.0,7.2,serosanguious,5.3,died,yes,2208,0,0,yes
4,no,adult,530255,37.3,104.0,35.0,,,dark_cyanotic,more_3_sec,,,,,,,,,74.0,7.4,,,died,no,4300,0,0,no


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 28 columns):
surgery                  299 non-null object
age                      299 non-null object
hospital_number          299 non-null int64
rectal_temp              239 non-null float64
pulse                    275 non-null float64
respiratory_rate         241 non-null float64
temp_of_extremities      243 non-null object
peripheral_pulse         230 non-null object
mucous_membrane          252 non-null object
capillary_refill_time    267 non-null object
pain                     244 non-null object
peristalsis              255 non-null object
abdominal_distention     243 non-null object
nasogastric_tube         195 non-null object
nasogastric_reflux       193 non-null object
nasogastric_reflux_ph    53 non-null float64
rectal_exam_feces        197 non-null object
abdomen                  181 non-null object
packed_cell_volume       270 non-null float64
total_protein            266 non-null

In [6]:
# null values per feature
df.isnull().sum()

surgery                    0
age                        0
hospital_number            0
rectal_temp               60
pulse                     24
respiratory_rate          58
temp_of_extremities       56
peripheral_pulse          69
mucous_membrane           47
capillary_refill_time     32
pain                      55
peristalsis               44
abdominal_distention      56
nasogastric_tube         104
nasogastric_reflux       106
nasogastric_reflux_ph    246
rectal_exam_feces        102
abdomen                  118
packed_cell_volume        29
total_protein             33
abdomo_appearance        165
abdomo_protein           198
outcome                    0
surgical_lesion            0
lesion_1                   0
lesion_2                   0
lesion_3                   0
cp_data                    0
dtype: int64

#### Replacing the missing values by the most frequent value in each column.

In [7]:
for col in df.columns:
    if df[col].isnull().sum() > 0:
        df[col].fillna(df[col].mode()[0], inplace=True)

In [8]:
df.isnull().sum()

surgery                  0
age                      0
hospital_number          0
rectal_temp              0
pulse                    0
respiratory_rate         0
temp_of_extremities      0
peripheral_pulse         0
mucous_membrane          0
capillary_refill_time    0
pain                     0
peristalsis              0
abdominal_distention     0
nasogastric_tube         0
nasogastric_reflux       0
nasogastric_reflux_ph    0
rectal_exam_feces        0
abdomen                  0
packed_cell_volume       0
total_protein            0
abdomo_appearance        0
abdomo_protein           0
outcome                  0
surgical_lesion          0
lesion_1                 0
lesion_2                 0
lesion_3                 0
cp_data                  0
dtype: int64

#### This dataset contains many categorical features, replacing them with label encoding.

In [9]:
for col_name in df.columns:
    if df[col_name].dtype == 'object':
        df[col_name] = preprocessing.LabelEncoder().fit_transform(df[col_name])

In [10]:
df.dtypes

surgery                    int32
age                        int32
hospital_number            int64
rectal_temp              float64
pulse                    float64
respiratory_rate         float64
temp_of_extremities        int32
peripheral_pulse           int32
mucous_membrane            int32
capillary_refill_time      int32
pain                       int32
peristalsis                int32
abdominal_distention       int32
nasogastric_tube           int32
nasogastric_reflux         int32
nasogastric_reflux_ph    float64
rectal_exam_feces          int32
abdomen                    int32
packed_cell_volume       float64
total_protein            float64
abdomo_appearance          int32
abdomo_protein           float64
outcome                    int32
surgical_lesion            int32
lesion_1                   int64
lesion_2                   int64
lesion_3                   int64
cp_data                    int32
dtype: object

#### Fitting a decision tree classifier and observing the accuracy.

In [11]:
# train/test split
train,test = train_test_split(df,test_size=0.2, random_state=1)
x_train = train.drop(['outcome'],axis=1)
y_train = train.outcome

x_test = test.drop(['outcome'],axis=1)
y_test = test.outcome

In [12]:
# fit
model = tree.DecisionTreeClassifier()
model.fit(x_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [13]:
# predict
pred_y = model.predict(x_test)
df_Ys = pd.DataFrame(pred_y,y_test)
df_Ys.head(10)

Unnamed: 0_level_0,0
outcome,Unnamed: 1_level_1
0,0
2,2
0,1
0,1
2,2
2,2
0,2
0,0
2,2
1,2


In [14]:
# accuracy
metrics.accuracy_score(pred_y, y_test)

0.6333333333333333

#### Fitting a random forest classifier and observe the accuracy.

In [15]:
# fit
model = RandomForestClassifier(n_estimators=100)
model.fit(x_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [16]:
# predict
pred_y = model.predict(x_test)
df_Ys = pd.DataFrame(pred_y,y_test)
df_Ys.head(10)

Unnamed: 0_level_0,0
outcome,Unnamed: 1_level_1
0,2
2,2
0,2
0,2
2,2
2,2
0,2
0,0
2,2
1,2


In [17]:
# accuracy
metrics.accuracy_score(pred_y, y_test)

0.7833333333333333