In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler


In [2]:
# Load your data into a pandas dataframe
data = pd.read_csv('features.csv',index_col=0)
data = data[~data.isnull().any(axis=1)]

In [3]:
# Split the data into training and testing sets
train, test = train_test_split(data, test_size=0.2, random_state=42)


# Separate the target variable from the input features
X_train = train.drop('OUTCOMETYPE', axis=1)
y_train = train['OUTCOMETYPE']

X_test = test.drop('OUTCOMETYPE', axis=1)
y_test = test['OUTCOMETYPE']

cols = X_train.columns


In [4]:
# Scale the input features using StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [5]:
# Train a random forest classifier with default parameters
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = rfc.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.76


In [6]:
from sklearn.metrics import classification_report, confusion_matrix

# Print the confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Print the classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Get feature importances
importances = rfc.feature_importances_

# Sort feature importances in descending order
indices = np.argsort(importances)[::-1]

Confusion Matrix:
[[5562   36  954]
 [ 221   96  121]
 [1306   22 2623]]
Classification Report:
              precision    recall  f1-score   support

       ADOPT       0.78      0.85      0.82      6552
        DIED       0.62      0.22      0.32       438
         RTO       0.71      0.66      0.69      3951

    accuracy                           0.76     10941
   macro avg       0.71      0.58      0.61     10941
weighted avg       0.75      0.76      0.75     10941



In [7]:
# Print the feature ranking
print(70*"#")
print("FEATURE RANKING USING IMPORTANCE SCORE")
print(70*"#")
print("RANK","\tIMPORTANCE", "\tFEATURE")
print("----","\t----------", "\t--------------------------")
for f in range(X_train.shape[1]):    
    #print(f"{f + 1}. feature {indices[f]} ({importances[indices[f]]:.2f})")
    print(f"{f + 1}. \t({importances[indices[f]]:.2f}) \t\t{cols[f]} ")
print(70*"#")

######################################################################
FEATURE RANKING USING IMPORTANCE SCORE
######################################################################
RANK 	IMPORTANCE 	FEATURE
---- 	---------- 	--------------------------
1. 	(0.31) 		AGEOUTCOME 
2. 	(0.16) 		PREVHIST 
3. 	(0.14) 		IS_PUREBREED 
4. 	(0.08) 		INTAKETYPE_ABANDONED 
5. 	(0.06) 		INTAKETYPE_EUTHANASIAREQUEST 
6. 	(0.05) 		INTAKETYPE_OWNERSURRENDER 
7. 	(0.04) 		INTAKETYPE_PUBLICASSIST 
8. 	(0.04) 		INTAKETYPE_STRAY 
9. 	(0.03) 		INTAKECONDITION_AGED 
10. 	(0.02) 		INTAKECONDITION_BEHAVIOR 
11. 	(0.02) 		INTAKECONDITION_FERAL 
12. 	(0.01) 		INTAKECONDITION_INJURED 
13. 	(0.01) 		INTAKECONDITION_MEDICAL 
14. 	(0.01) 		INTAKECONDITION_NORMAL 
15. 	(0.01) 		INTAKECONDITION_NURSING 
16. 	(0.00) 		INTAKECONDITION_OTHER 
17. 	(0.00) 		INTAKECONDITION_PREGNANT 
18. 	(0.00) 		INTAKECONDITION_SICK 
19. 	(0.00) 		SEXINTAKE_INTACTFEMALE 
20. 	(0.00) 		SEXINTAKE_INTACTMALE 
21. 	(0.00) 		SEXINTAKE_NEUTERED