##### PROBLEM STATEMENT : Predict the survival of a horse based on various observed medical conditions. 

#### Step 1 : Import  required Libraries and the dataset 

In [16]:
#Import the libraries
import pandas as pd

In [17]:
df = pd.read_csv("D:\\Downloads\\horse.csv")
df.head()

Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,outcome,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,no,adult,530101,38.5,66.0,28.0,cool,reduced,,more_3_sec,...,45.0,8.4,,,died,no,11300,0,0,no
1,yes,adult,534817,39.2,88.0,20.0,,,pale_cyanotic,less_3_sec,...,50.0,85.0,cloudy,2.0,euthanized,no,2208,0,0,no
2,no,adult,530334,38.3,40.0,24.0,normal,normal,pale_pink,less_3_sec,...,33.0,6.7,,,lived,no,0,0,0,yes
3,yes,young,5290409,39.1,164.0,84.0,cold,normal,dark_cyanotic,more_3_sec,...,48.0,7.2,serosanguious,5.3,died,yes,2208,0,0,yes
4,no,adult,530255,37.3,104.0,35.0,,,dark_cyanotic,more_3_sec,...,74.0,7.4,,,died,no,4300,0,0,no


In [18]:
df.shape

(299, 28)

In [58]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 79 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   hospital_number                   299 non-null    int64  
 1   rectal_temp                       239 non-null    float64
 2   pulse                             275 non-null    float64
 3   respiratory_rate                  241 non-null    float64
 4   nasogastric_reflux_ph             53 non-null     float64
 5   packed_cell_volume                270 non-null    float64
 6   total_protein                     266 non-null    float64
 7   abdomo_protein                    101 non-null    float64
 8   lesion_1                          299 non-null    int64  
 9   lesion_2                          299 non-null    int64  
 10  lesion_3                          299 non-null    int64  
 11  surgery_no                        299 non-null    uint8  
 12  surgery_

In [20]:
Target = df.outcome

df = df.drop('outcome',axis=1)


In [21]:
Target.unique()

array(['died', 'euthanized', 'lived'], dtype=object)

In [36]:
df.keys()

Index(['surgery', 'age', 'hospital_number', 'rectal_temp', 'pulse',
       'respiratory_rate', 'temp_of_extremities', 'peripheral_pulse',
       'mucous_membrane', 'capillary_refill_time', 'pain', 'peristalsis',
       'abdominal_distention', 'nasogastric_tube', 'nasogastric_reflux',
       'nasogastric_reflux_ph', 'rectal_exam_feces', 'abdomen',
       'packed_cell_volume', 'total_protein', 'abdomo_appearance',
       'abdomo_protein', 'surgical_lesion', 'lesion_1', 'lesion_2', 'lesion_3',
       'cp_data'],
      dtype='object')

#### Step 2 : Preprocess the data using get_dummies and label encoders

In [40]:
# define a list of categorical variable names
categorical_variables = ['surgery', 'age', 'temp_of_extremities', 'peripheral_pulse',
       'mucous_membrane', 'capillary_refill_time', 'pain', 'peristalsis',
       'abdominal_distention', 'nasogastric_tube', 'nasogastric_reflux',
       'rectal_exam_feces', 'abdomen','abdomo_appearance','surgical_lesion', 'cp_data']

# iterate through each variable name in the list
for category in categorical_variables:
    
    # handle missing values by filling them with a default value
    df[category].fillna('Unknown', inplace=True)
    
    # convert the column to numerical data using get_dummies()
    dummies = pd.get_dummies(df[category], prefix=category)
    
    # ensure that all categorical variables have the same length
    if len(dummies.columns) != len(df[category].unique()):
        missing_columns = set(df[category].unique()) - set(dummies.columns)
        for column in missing_columns:
            dummies[column] = 0
            
    # add the new columns to the dataframe
    df = pd.concat([df, dummies], axis=1)
    
    # drop the original categorical variable
    df.drop(category, axis=1, inplace=True)


In [41]:
from sklearn.model_selection import train_test_split

In [42]:
from sklearn.preprocessing import LabelEncoder
X,y = df.values,Target.values

le = LabelEncoder()
y = le.fit_transform(y)
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2 ,random_state =1)

#### Step 3 : Impute the missing values using the most frequent values

In [44]:
from sklearn.impute import SimpleImputer
import numpy as np

imp = SimpleImputer(missing_values = np.nan, strategy = 'most_frequent')

X_train = imp.fit_transform(X_train)
X_test = imp.fit_transform(X_test)

#### Step 4: Fit decision tree classifier on the transformed data

In [45]:
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier()

In [46]:
classifier.fit(X_train,y_train)

In [47]:
predictions = classifier.predict(X_test)
predictions

array([1, 0, 0, 1, 2, 2, 2, 0, 0, 2, 2, 2, 1, 1, 0, 1, 0, 0, 0, 2, 2, 2,
       1, 0, 2, 1, 1, 2, 0, 1, 2, 1, 2, 0, 2, 0, 2, 2, 2, 2, 1, 2, 2, 2,
       0, 0, 1, 2, 1, 2, 2, 2, 0, 1, 0, 0, 2, 2, 0, 0])

#### Step 5: Check for accuracy

In [52]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test,predictions)*100 ,"%")

61.66666666666667 %


In [53]:
#Using RandomForest

In [54]:
from sklearn.ensemble import RandomForestClassifier
RF =  RandomForestClassifier()

In [55]:
RF.fit(X_train,y_train)

In [56]:
RF_predict = RF.predict(X_test)
RF_predict

array([2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 0, 0, 1, 0, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 0,
       0, 0, 0, 2, 0, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2])

In [57]:
accuracy_score(y_test,RF_predict)*100

70.0