In [1]:
# import required packages
import os 
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

# https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html#sphx-glr-auto-examples-classification-plot-classifier-comparison-py

In [2]:
# Import the data 
file_path = os.path.join(os.getcwd(), 'data')
sample_submission = os.listdir(file_path)[0]
test_file = os.listdir(file_path)[1]
train_file = os.listdir(file_path)[2]

# Check files are correctly referenced
print(sample_submission)
print(test_file)
print(train_file)

sample_submission.csv
test.csv
train.csv


In [3]:
# Import as pandas dataframe
file_name = os.listdir(file_path)[0]
train_data = pd.read_csv(os.path.join(file_path, train_file))
test_data = pd.read_csv(os.path.join(file_path, test_file))

test_data = test_data.set_index("id")
train_data = train_data.set_index("id")

In [4]:
# Add some analysis/charting of the data
# Including correlation matrix graphs

In [5]:
# Drop NA values from dataset (needed this step as observed some rows had NA figures)
train_data = train_data.dropna()

In [6]:
train_data

Unnamed: 0_level_0,MO HLADR+ MFI (cells/ul),Neu CD64+MFI (cells/ul),CD3+T (cells/ul),CD8+T (cells/ul),CD4+T (cells/ul),NK (cells/ul),CD19+ (cells/ul),CD45+ (cells/ul),Age,Sex 0M1F,Mono CD64+MFI (cells/ul),label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,3556.0,2489.0,265.19,77.53,176.55,0.00,4.20,307.91,52,0,7515.0,1
1,1906.0,134.0,1442.61,551.90,876.07,112.10,168.15,1735.48,20,1,1756.0,0
2,1586.0,71.0,1332.74,684.20,655.26,244.95,216.52,1820.04,28,1,1311.0,0
3,683.0,94.0,419.23,255.80,162.17,72.05,44.68,538.22,55,1,1443.0,0
4,1032.0,71.0,1102.72,480.27,625.30,188.78,130.77,1427.97,28,1,1542.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
82,626.0,68.0,1771.57,666.99,1117.48,360.21,118.84,2306.82,42,1,1521.0,0
83,1237.0,71.0,1348.53,428.09,924.69,120.02,48.67,1524.78,56,0,1345.0,0
84,634.0,1002.0,1300.00,558.00,724.00,67.00,105.00,1484.26,34,0,2926.0,1
85,112.0,884.0,942.83,378.49,567.06,116.77,31.81,1104.59,33,1,2352.0,1


In [7]:
# Define features and labels for training data
train_features = train_data.drop(['label'], axis = 1)
train_labels = train_data['label']

print(train_features.dtypes)
print(train_labels.dtypes)

# Convert to np array
train_features_np = np.array(train_features)
train_labels_np = np.array(train_labels)

MO HLADR+ MFI (cells/ul)    float64
Neu CD64+MFI (cells/ul)     float64
CD3+T (cells/ul)            float64
CD8+T (cells/ul)            float64
CD4+T (cells/ul)            float64
NK (cells/ul)               float64
CD19+ (cells/ul)            float64
CD45+ (cells/ul)            float64
Age                           int64
Sex 0M1F                      int64
Mono CD64+MFI (cells/ul)    float64
dtype: object
int64


In [8]:
# Define features and labels for test data
test_features = test_data

print(test_features.dtypes)

# Convert to np array
test_features_np = np.array(test_features)

MO HLADR+ MFI (cells/ul)    float64
Neu CD64+MFI (cells/ul)     float64
CD3+T (cells/ul)            float64
CD8+T (cells/ul)            float64
CD4+T (cells/ul)            float64
NK (cells/ul)               float64
CD19+ (cells/ul)            float64
CD45+ (cells/ul)            float64
Age                           int64
Sex 0M1F                      int64
Mono CD64+MFI (cells/ul)    float64
dtype: object


In [9]:
print('Training Features Shape:', train_features_np.shape)
print('Training Labels Shape:', train_labels_np.shape)

Training Features Shape: (86, 11)
Training Labels Shape: (86,)


In [10]:
train_features_np

array([[3.55600e+03, 2.48900e+03, 2.65190e+02, 7.75300e+01, 1.76550e+02,
        0.00000e+00, 4.20000e+00, 3.07910e+02, 5.20000e+01, 0.00000e+00,
        7.51500e+03],
       [1.90600e+03, 1.34000e+02, 1.44261e+03, 5.51900e+02, 8.76070e+02,
        1.12100e+02, 1.68150e+02, 1.73548e+03, 2.00000e+01, 1.00000e+00,
        1.75600e+03],
       [1.58600e+03, 7.10000e+01, 1.33274e+03, 6.84200e+02, 6.55260e+02,
        2.44950e+02, 2.16520e+02, 1.82004e+03, 2.80000e+01, 1.00000e+00,
        1.31100e+03],
       [6.83000e+02, 9.40000e+01, 4.19230e+02, 2.55800e+02, 1.62170e+02,
        7.20500e+01, 4.46800e+01, 5.38220e+02, 5.50000e+01, 1.00000e+00,
        1.44300e+03],
       [1.03200e+03, 7.10000e+01, 1.10272e+03, 4.80270e+02, 6.25300e+02,
        1.88780e+02, 1.30770e+02, 1.42797e+03, 2.80000e+01, 1.00000e+00,
        1.54200e+03],
       [4.95000e+02, 1.43000e+02, 1.71932e+03, 8.85940e+02, 8.42940e+02,
        2.71420e+02, 1.19460e+02, 2.14366e+03, 4.20000e+01, 1.00000e+00,
        1.8270

In [11]:
# Check data for any inapprpriate data values (added this step after experiencing issues training model without it)
print("Data contains NaNs:" + str(np.any(np.isnan(train_features_np))))
print("Data is all finite values:" + str(np.all(np.isfinite(train_features_np))))

Data contains NaNs:False
Data is all finite values:True


In [12]:
# Implement Random Forest Classifier
clf = RandomForestClassifier(n_estimators = 1000, random_state = 1)
clf.fit(train_features_np, train_labels_np)

RandomForestClassifier(n_estimators=1000, random_state=1)

In [13]:
# Validate against training data
y_hat = pd.DataFrame(clf.predict(train_features_np), columns = ['label'])
print("Classification report for the training data:")
print(classification_report(train_labels,y_hat))
print("AUC score for the training data:")
print(roc_auc_score(train_labels,y_hat))

Classification report for the training data:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        57
           1       1.00      1.00      1.00        29

    accuracy                           1.00        86
   macro avg       1.00      1.00      1.00        86
weighted avg       1.00      1.00      1.00        86

AUC score for the training data:
1.0


In [14]:
# Create predictions for test data
y_hat = pd.DataFrame(clf.predict(test_features_np), columns = ['label'])

In [15]:
# Save prediction file to csv file
file_path_out = os.path.join(os.getcwd(), 'output')
y_hat.to_csv(file_path_out + '/predictions.csv', index_label='id')