In [88]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load Data

In [89]:
df_train = pd.read_csv('/Users/alexis/Cranfield/AI/assignment/train.csv')
df_train = df_train.drop(columns=["rad_id", "x_min", "y_min", "class_name", "x_max", "y_max"])
df_train.head()

Unnamed: 0,image_id,class_id
0,50a418190bc3fb1ef1633bf9678929b3,14
1,21a10246a5ec7af151081d0cd6d65dc9,14
2,9a5094b2563a1ef3ff50dc5c7ff71345,3
3,051132a778e61a86eb147c7c6f564dfe,0
4,063319de25ce7edb9b1c6b8881290140,14


Pixels Features CSV

In [90]:
df_pixel_features = pd.read_csv('/Users/alexis/Cranfield/AI/assignment/dicom_pixels_features.csv')
df_pixel_features = df_pixel_features.rename(columns={"filename": "image_id"})
df_pixel_features['image_id'] = df_pixel_features['image_id'].str.replace('.dicom', '')
print(df_pixel_features.shape)
df_pixel_features.head()

(15000, 40)


Unnamed: 0,intensity_mean,intensity_std,intensity_median,glcm_contrast,glcm_correlation,glcm_energy,glcm_homogeneity,mean_gabor_real,std_gabor_real,mean_gabor_imag,...,haralick_9,haralick_10,haralick_11,haralick_12,sobel_edges_count,canny_edges_count,skewness,kurtosis,entropy,image_id
0,162.508754,53.610391,170.0,50.945162,0.991132,0.01746,0.176312,13.336609,27.888411,8.551008,...,0.015018,0.046868,-0.962266,0.939101,6611401,123449,-0.744572,0.210735,7.563331,005d70155f949c7785671800f2c8e1ca
1,125.648836,67.795674,144.0,7.800591,0.999151,0.11638,0.68288,7.823448,7.411495,1.016623,...,0.020257,0.036356,-0.966909,0.928575,6556797,84878,-0.669602,-0.808662,6.91708,0061cf6d35e253b6e7f03940592cc35e
2,107.468138,74.136839,115.0,80.262698,0.992697,0.063658,0.197132,25.949493,65.264177,3.803589,...,0.030123,0.026028,-0.975495,0.928329,8505142,23409,-0.128583,-1.31742,7.448992,006501b11e04aec2d403177b9ae0f34c
3,115.759135,63.175611,140.0,88.948071,0.988853,0.025961,0.251142,7.008416,5.248002,26.063167,...,0.03973,0.026295,-0.975028,0.925514,7626872,453359,-0.453477,-1.299697,7.459437,00675cd546313f912cadd4ad54415d69
4,132.849532,66.681063,143.0,7.035192,0.999209,0.05014,0.519648,8.137124,5.306767,0.117948,...,0.06603,0.022631,-0.978496,0.927692,7163833,7488,-0.511102,-0.841245,7.485881,006e2726c6aa72f042a08b1406c39d52


Metadata CSV

In [91]:
dicom_metadata = pd.read_csv('/Users/alexis/Cranfield/AI/assignment/dicom_metadata.csv')
dicom_metadata.head()

Unnamed: 0,File Name,Transfer Syntax UID,SOP Class UID,SOP Instance UID,Version,Photometric Interpretation,Pixel Representation,High Bit,Samples per Pixel,Bits Allocated,...,Lossy Image Compression,Lossy Image Compression Ratio,Patient's Sex,Patient's Age,Rows,Columns,Pixel Spacing,Bits Stored,Window Center,Window Width
0,f7f7736401b22c94a9d2b09425adeab9.dicom,1.2.840.10008.1.2.4.90,1.2.840.10008.5.1.4.1.1.1.1,f7f7736401b22c94a9d2b09425adeab9,INF_3.9,MONOCHROME2,0,11,1,16,...,0.0,,F,037Y,3028,2517,"[0.139, 0.139]",12,2048.0,4096.0
1,93ae57ca1df1d19c6d64155e74e14d23.dicom,1.2.840.10008.1.2,1.2.840.10008.5.1.4.1.1.1.1,93ae57ca1df1d19c6d64155e74e14d23,,MONOCHROME2,0,13,1,16,...,,,O,Y,3072,3072,"[0.139000, 0.139000]",14,10287.0,5292.0
2,f1a45afaee0efd07fef17057f3942464.dicom,1.2.840.10008.1.2.4.90,1.2.840.10008.5.1.4.1.1.1,f1a45afaee0efd07fef17057f3942464,OFFIS_DCMTK_360,MONOCHROME1,0,11,1,16,...,0.0,,M,,2880,2304,"[0.15, 0.15]",12,2047.0,4095.0
3,559fdfb3b9db3eb206b9521824f716e3.dicom,1.2.840.10008.1.2,1.2.840.10008.5.1.4.1.1.1,559fdfb3b9db3eb206b9521824f716e3,OFFIS_DCMTK_360,MONOCHROME2,0,13,1,16,...,,,M,038Y,3072,2540,"[0.140, 0.140]",14,10075.0,10242.0
4,371d6f540ac601affe81e547b05c551a.dicom,1.2.840.10008.1.2,1.2.840.10008.5.1.4.1.1.1,371d6f540ac601affe81e547b05c551a,OFFIS_DCMTK_360,MONOCHROME2,0,13,1,16,...,,,M,060Y,3072,3072,"[0.140, 0.140]",14,8755.0,10228.0


In [92]:
dicom_metadata.shape

(15000, 22)

# Data Preprocessing

In [93]:
dicom_metadata = dicom_metadata.rename(columns={"SOP Instance UID": "image_id"})
dicom_metadata = dicom_metadata[
    [
        "image_id",
        "Transfer Syntax UID",
        "SOP Class UID",
        "Version",
        "Photometric Interpretation",
        "Pixel Representation",
        "High Bit",
        "Samples per Pixel",
        "Bits Allocated",
        "Rescale Intercept",
        "Rescale Slope",
        "Lossy Image Compression",
        "Lossy Image Compression Ratio",
        "Patient's Sex",
        "Patient's Age",
        "Rows",
        "Columns",
        "Pixel Spacing",
        "Bits Stored",
        "Window Center",
        "Window Width",
    ]
]

In [94]:
dicom_metadata["image_id"] = dicom_metadata["image_id"].str.replace(".dicom", "")
dicom_metadata

Unnamed: 0,image_id,Transfer Syntax UID,SOP Class UID,Version,Photometric Interpretation,Pixel Representation,High Bit,Samples per Pixel,Bits Allocated,Rescale Intercept,...,Lossy Image Compression,Lossy Image Compression Ratio,Patient's Sex,Patient's Age,Rows,Columns,Pixel Spacing,Bits Stored,Window Center,Window Width
0,f7f7736401b22c94a9d2b09425adeab9,1.2.840.10008.1.2.4.90,1.2.840.10008.5.1.4.1.1.1.1,INF_3.9,MONOCHROME2,0,11,1,16,0.0,...,0.0,,F,037Y,3028,2517,"[0.139, 0.139]",12,2048.0,4096.0
1,93ae57ca1df1d19c6d64155e74e14d23,1.2.840.10008.1.2,1.2.840.10008.5.1.4.1.1.1.1,,MONOCHROME2,0,13,1,16,0.0,...,,,O,Y,3072,3072,"[0.139000, 0.139000]",14,10287.0,5292.0
2,f1a45afaee0efd07fef17057f3942464,1.2.840.10008.1.2.4.90,1.2.840.10008.5.1.4.1.1.1,OFFIS_DCMTK_360,MONOCHROME1,0,11,1,16,0.0,...,0.0,,M,,2880,2304,"[0.15, 0.15]",12,2047.0,4095.0
3,559fdfb3b9db3eb206b9521824f716e3,1.2.840.10008.1.2,1.2.840.10008.5.1.4.1.1.1,OFFIS_DCMTK_360,MONOCHROME2,0,13,1,16,0.0,...,,,M,038Y,3072,2540,"[0.140, 0.140]",14,10075.0,10242.0
4,371d6f540ac601affe81e547b05c551a,1.2.840.10008.1.2,1.2.840.10008.5.1.4.1.1.1,OFFIS_DCMTK_360,MONOCHROME2,0,13,1,16,0.0,...,,,M,060Y,3072,3072,"[0.140, 0.140]",14,8755.0,10228.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,0c5ff01c7bfb4362fcd98f36e555b08c,1.2.840.10008.1.2.4.90,1.2.840.10008.5.1.4.1.1.1,OFFIS_DCMTK_360,MONOCHROME1,0,11,1,16,0.0,...,0.0,,F,,2880,2304,"[0.15, 0.15]",12,2047.0,4095.0
14996,3286115baacb6ed3affc5c46c9205b7a,1.2.840.10008.1.2,1.2.840.10008.5.1.4.1.1.1,OFFIS_DCMTK_360,MONOCHROME1,0,11,1,16,0.0,...,0.0,,F,,2880,2304,"[0.15, 0.15]",12,2047.0,4095.0
14997,d44766bf0eb902fbc1848f418417c45f,1.2.840.10008.1.2.4.90,1.2.840.10008.5.1.4.1.1.1,OFFIS_DCMTK_360,MONOCHROME1,0,11,1,16,0.0,...,,,F,035Y,2208,2104,,12,,
14998,cd7b83d24ee9c092a0f32cb020da6dac,1.2.840.10008.1.2,1.2.840.10008.5.1.4.1.1.1.1,,MONOCHROME2,0,11,1,16,0.0,...,0.0,,,,2926,2847,"[0.125, 0.125]",12,1202.0,5482.0


In [95]:
Photometric_Interpretation_class = {"MONOCHROME2": 2, "MONOCHROME1": 1}
dicom_metadata["Photometric_Interpretation_class"] = dicom_metadata["Photometric Interpretation"].map(
    Photometric_Interpretation_class
)

In [96]:
Transfer_Syntax_UID_class = {
    "1.2.840.10008.1.2.4.90": 1,
    "1.2.840.10008.1.2": 2,
    "1.2.840.10008.1.2.1": 3,
}
dicom_metadata["Transfer_Syntax_UID_class"] = dicom_metadata["Transfer Syntax UID"].map(Transfer_Syntax_UID_class)

In [97]:
SOP_Class_UID = {
    "1.2.840.10008.5.1.4.1.1.1": 1,
    "1.2.840.10008.5.1.4.1.1.1.1": 2,
}
dicom_metadata["SOP_Class_UID"] = dicom_metadata["SOP Class UID"].map(SOP_Class_UID)

In [98]:
versions_class = {"INF_3.9": 1, "OFFIS_DCMTK_360": 2, "dcm4che-1.4.34": 3, "INF_4.5": 4, np.nan: 0}
dicom_metadata["Version_class"] = dicom_metadata["Version"].map(versions_class)

In [99]:
sex_class = {"M": 1, "F": 2, "O": 3, np.nan: 0}
dicom_metadata["sex_class"] = dicom_metadata["Patient's Sex"].map(sex_class)

In [100]:
age = dicom_metadata["Patient's Age"].unique()
age_class = {np.nan: 0}
for i in range(1, len(age)):
    age_class[age[i]] = i
dicom_metadata["age_class"] = dicom_metadata["Patient's Age"].map(age_class)

# Data Merging


In [101]:
df_merged = pd.merge(dicom_metadata, df_pixel_features, on='image_id')
print(df_merged.shape)
df_merged.head()

(15000, 66)


Unnamed: 0,image_id,Transfer Syntax UID,SOP Class UID,Version,Photometric Interpretation,Pixel Representation,High Bit,Samples per Pixel,Bits Allocated,Rescale Intercept,...,haralick_8,haralick_9,haralick_10,haralick_11,haralick_12,sobel_edges_count,canny_edges_count,skewness,kurtosis,entropy
0,f7f7736401b22c94a9d2b09425adeab9,1.2.840.10008.1.2.4.90,1.2.840.10008.5.1.4.1.1.1.1,INF_3.9,MONOCHROME2,0,11,1,16,0.0,...,1.096679,0.02688,0.026656,-0.977429,0.936543,7452076,7192,-0.701459,-0.302151,7.576512
1,93ae57ca1df1d19c6d64155e74e14d23,1.2.840.10008.1.2,1.2.840.10008.5.1.4.1.1.1.1,,MONOCHROME2,0,13,1,16,0.0,...,1.052247,0.035452,0.030933,-0.971723,0.929042,8215117,85349,-0.488563,-0.968595,7.328269
2,f1a45afaee0efd07fef17057f3942464,1.2.840.10008.1.2.4.90,1.2.840.10008.5.1.4.1.1.1,OFFIS_DCMTK_360,MONOCHROME1,0,11,1,16,0.0,...,1.31755,0.076018,0.025446,-0.981678,0.959767,6292597,28826,-0.74097,-0.402109,7.576292
3,559fdfb3b9db3eb206b9521824f716e3,1.2.840.10008.1.2,1.2.840.10008.5.1.4.1.1.1,OFFIS_DCMTK_360,MONOCHROME2,0,13,1,16,0.0,...,0.990296,0.03422,0.032104,-0.968306,0.918775,7427926,49113,-1.328552,0.927739,6.618164
4,371d6f540ac601affe81e547b05c551a,1.2.840.10008.1.2,1.2.840.10008.5.1.4.1.1.1,OFFIS_DCMTK_360,MONOCHROME2,0,13,1,16,0.0,...,1.035567,0.019867,0.032615,-0.96953,0.926027,7785805,59297,-0.732994,-0.82517,6.647353


In [102]:
df_merged = df_merged.drop(
    columns=[
        "Pixel Representation",
        "Bits Allocated",
        "Rescale Intercept",
        "Rescale Slope",
        "Lossy Image Compression",
        "Lossy Image Compression Ratio",
        "Samples per Pixel",
        "image_id",
        "Transfer Syntax UID",
        "SOP Class UID",
        "Photometric Interpretation",
        "Pixel Spacing",
        "Version",
        "Patient's Sex",
        "Patient's Age",
    ]
)

In [103]:
import warnings

warnings.filterwarnings("ignore")

In [104]:
df_merged.shape

(15000, 51)

## Train Model

In [105]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier


_sc = StandardScaler()
_pca = PCA(n_components=6)
_model = RandomForestClassifier()
pipeline_random_forest = Pipeline([("scaler", _sc), ("pca", _pca), ("model", _model)])

In [106]:
df_train

Unnamed: 0,image_id,class_id
0,50a418190bc3fb1ef1633bf9678929b3,14
1,21a10246a5ec7af151081d0cd6d65dc9,14
2,9a5094b2563a1ef3ff50dc5c7ff71345,3
3,051132a778e61a86eb147c7c6f564dfe,0
4,063319de25ce7edb9b1c6b8881290140,14
...,...,...
67909,936fd5cff1c058d39817a08f58b72cae,14
67910,ca7e72954550eeb610fe22bf0244b7fa,14
67911,aa17d5312a0fb4a2939436abca7f9579,14
67912,4b56bc6d22b192f075f13231419dfcc8,3


In [107]:
y = df_train["class_id"]

In [108]:
y.value_counts()

class_id
14    31818
0      7162
3      5427
11     4842
13     4655
8      2580
7      2483
10     2476
9      2203
6      1247
5      1000
2       960
4       556
1       279
12      226
Name: count, dtype: int64

In [109]:
y_binary = y.apply(lambda x: 1 if x == 14 else 0)
y_binary = pd.DataFrame(y_binary, columns=["class_id"])
y_binary["image_id"] = df_train["image_id"]
y_binary = y_binary.drop_duplicates(subsets=["image_id"])
y_binary.head()

Unnamed: 0,class_id,image_id
0,1,50a418190bc3fb1ef1633bf9678929b3
1,1,21a10246a5ec7af151081d0cd6d65dc9
2,0,9a5094b2563a1ef3ff50dc5c7ff71345
3,0,051132a778e61a86eb147c7c6f564dfe
4,1,063319de25ce7edb9b1c6b8881290140


In [110]:
y_binary["class_id"].value_counts()

class_id
1    10606
0     4394
Name: count, dtype: int64

In [111]:
X = df_merged
X = X.replace(np.nan, 1)

In [112]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y_binary, test_size=0.15, stratify=y_binary["class_id"])

In [113]:
print(f"x_train shape: {X_train.shape} | x_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape} | y_test shape: {y_test.shape}")

x_train shape: (12750, 51) | x_test shape: (2250, 51)
y_train shape: (12750, 2) | y_test shape: (2250, 2)


In [114]:
y_train_image_id = y_train["image_id"]
y_test_image_id = y_test["image_id"]

y_train = y_train.drop(columns=["image_id"])
y_test = y_test.drop(columns=["image_id"])

y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

### Naive Bayes

In [115]:
from sklearn.linear_model import SGDClassifier

sgd = SGDClassifier().fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(sgd.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(sgd.score(X_test, y_test)))

Accuracy on training set: 0.707
Accuracy on test set: 0.707


### KNN

In [116]:
from sklearn.neighbors import KNeighborsClassifier

# instantiate & fit
knn = KNeighborsClassifier(algorithm="brute", n_jobs=-1).fit(X_train, y_train)

print("Accuracy on training set: {:.3f}".format(knn.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(knn.score(X_test, y_test)))

Accuracy on training set: 0.749
Accuracy on test set: 0.650


### Logistic Regression

In [117]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()

pipeline = Pipeline([("scaler", _sc), ("pca", _pca), ("model", model)])

pipeline.fit(X_train, y_train)

print("Accuracy on training set: {:.3f}".format(pipeline.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(pipeline.score(X_test, y_test)))

Accuracy on training set: 0.707
Accuracy on test set: 0.707


### Random Forest

In [118]:
import joblib

# train the model using the PCA components
pipeline = pipeline_random_forest.fit(X_train, y_train)

y_pred = pipeline_random_forest.predict(X_test)

print("Accuracy on training set: {:.3f}".format(pipeline_random_forest.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(pipeline_random_forest.score(X_test, y_test)))

Accuracy on training set: 1.000
Accuracy on test set: 0.684


### Bagging Classifier

In [121]:
# import the library
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# instantiate & fit
bg = BaggingClassifier(
    DecisionTreeClassifier(min_samples_split=10, max_depth=3),
).fit(X_train, y_train)

print("Accuracy on training set: {:.3f}".format(bg.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(bg.score(X_test, y_test)))

Accuracy on training set: 0.707
Accuracy on test set: 0.707


### Decision Tree

In [120]:
# import the library
from sklearn.tree import DecisionTreeClassifier

# instantiate & fit
clf = DecisionTreeClassifier(min_samples_split=10, max_depth=3).fit(X_train, y_train)

print("Accuracy on training set: {:.3f}".format(clf.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(clf.score(X_test, y_test)))

Accuracy on training set: 0.708
Accuracy on test set: 0.705
