In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load Data

In [2]:
df_train = pd.read_csv('/Users/alexis/Cranfield/AI/assignment/data/training/train.csv')
df_train = df_train.drop(columns=["rad_id", "x_min", "y_min", "class_name", "x_max", "y_max"])
df_train.head()

Unnamed: 0,image_id,class_id
0,8138c0bc649a534f449cf703475528ec,3
1,e8e6ff19c3be05a4ccfb20abc4202d52,11
2,de6d6efd88431d3253198106bb070471,14
3,01ee6e560f083255a630c41bba779405,13
4,da668869900c862ce12bd06fde5feb8d,13


Pixels Features CSV

In [3]:
df_pixel_features = pd.read_csv('/Users/alexis/Cranfield/AI/assignment/data/test/dicom_pixels_features.csv')
df_pixel_features = df_pixel_features.rename(columns={"filename": "image_id"})
df_pixel_features['image_id'] = df_pixel_features['image_id'].str.replace('.dicom', '')
print(df_pixel_features.shape)
df_pixel_features.head()

(15000, 40)


Unnamed: 0,intensity_mean,intensity_std,intensity_median,glcm_contrast,glcm_correlation,glcm_energy,glcm_homogeneity,mean_gabor_real,std_gabor_real,mean_gabor_imag,...,haralick_9,haralick_10,haralick_11,haralick_12,sobel_edges_count,canny_edges_count,skewness,kurtosis,entropy,image_id
0,162.508754,53.610391,170.0,50.945162,0.991132,0.01746,0.176312,13.336609,27.888411,8.551008,...,0.015018,0.046868,-0.962266,0.939101,6611401,123449,-0.744572,0.210735,7.563331,005d70155f949c7785671800f2c8e1ca
1,125.648836,67.795674,144.0,7.800591,0.999151,0.11638,0.68288,7.823448,7.411495,1.016623,...,0.020257,0.036356,-0.966909,0.928575,6556797,84878,-0.669602,-0.808662,6.91708,0061cf6d35e253b6e7f03940592cc35e
2,107.468138,74.136839,115.0,80.262698,0.992697,0.063658,0.197132,25.949493,65.264177,3.803589,...,0.030123,0.026028,-0.975495,0.928329,8505142,23409,-0.128583,-1.31742,7.448992,006501b11e04aec2d403177b9ae0f34c
3,115.759135,63.175611,140.0,88.948071,0.988853,0.025961,0.251142,7.008416,5.248002,26.063167,...,0.03973,0.026295,-0.975028,0.925514,7626872,453359,-0.453477,-1.299697,7.459437,00675cd546313f912cadd4ad54415d69
4,132.849532,66.681063,143.0,7.035192,0.999209,0.05014,0.519648,8.137124,5.306767,0.117948,...,0.06603,0.022631,-0.978496,0.927692,7163833,7488,-0.511102,-0.841245,7.485881,006e2726c6aa72f042a08b1406c39d52


Metadata CSV

In [4]:
dicom_metadata = pd.read_csv('/Users/alexis/Cranfield/AI/assignment/data/test/dicom_metadata.csv')
dicom_metadata.head()

Unnamed: 0,File Name,Transfer Syntax UID,SOP Class UID,SOP Instance UID,Version,Photometric Interpretation,Pixel Representation,High Bit,Samples per Pixel,Bits Allocated,...,Lossy Image Compression,Lossy Image Compression Ratio,Patient's Sex,Patient's Age,Rows,Columns,Pixel Spacing,Bits Stored,Window Center,Window Width
0,f7f7736401b22c94a9d2b09425adeab9.dicom,1.2.840.10008.1.2.4.90,1.2.840.10008.5.1.4.1.1.1.1,f7f7736401b22c94a9d2b09425adeab9,INF_3.9,MONOCHROME2,0,11,1,16,...,0.0,,F,037Y,3028,2517,"[0.139, 0.139]",12,2048.0,4096.0
1,93ae57ca1df1d19c6d64155e74e14d23.dicom,1.2.840.10008.1.2,1.2.840.10008.5.1.4.1.1.1.1,93ae57ca1df1d19c6d64155e74e14d23,,MONOCHROME2,0,13,1,16,...,,,O,Y,3072,3072,"[0.139000, 0.139000]",14,10287.0,5292.0
2,f1a45afaee0efd07fef17057f3942464.dicom,1.2.840.10008.1.2.4.90,1.2.840.10008.5.1.4.1.1.1,f1a45afaee0efd07fef17057f3942464,OFFIS_DCMTK_360,MONOCHROME1,0,11,1,16,...,0.0,,M,,2880,2304,"[0.15, 0.15]",12,2047.0,4095.0
3,559fdfb3b9db3eb206b9521824f716e3.dicom,1.2.840.10008.1.2,1.2.840.10008.5.1.4.1.1.1,559fdfb3b9db3eb206b9521824f716e3,OFFIS_DCMTK_360,MONOCHROME2,0,13,1,16,...,,,M,038Y,3072,2540,"[0.140, 0.140]",14,10075.0,10242.0
4,371d6f540ac601affe81e547b05c551a.dicom,1.2.840.10008.1.2,1.2.840.10008.5.1.4.1.1.1,371d6f540ac601affe81e547b05c551a,OFFIS_DCMTK_360,MONOCHROME2,0,13,1,16,...,,,M,060Y,3072,3072,"[0.140, 0.140]",14,8755.0,10228.0


In [5]:
dicom_metadata.shape

(15000, 22)

# Data Preprocessing

In [6]:
dicom_metadata = dicom_metadata.rename(columns={"SOP Instance UID": "image_id"})
dicom_metadata = dicom_metadata[
    [
        "image_id",
        "Transfer Syntax UID",
        "SOP Class UID",
        "Version",
        "Photometric Interpretation",
        "Pixel Representation",
        "High Bit",
        "Samples per Pixel",
        "Bits Allocated",
        "Rescale Intercept",
        "Rescale Slope",
        "Lossy Image Compression",
        "Lossy Image Compression Ratio",
        "Patient's Sex",
        "Patient's Age",
        "Rows",
        "Columns",
        "Pixel Spacing",
        "Bits Stored",
        "Window Center",
        "Window Width",
    ]
]

In [7]:
dicom_metadata["image_id"] = dicom_metadata["image_id"].str.replace(".dicom", "")
dicom_metadata

Unnamed: 0,image_id,Transfer Syntax UID,SOP Class UID,Version,Photometric Interpretation,Pixel Representation,High Bit,Samples per Pixel,Bits Allocated,Rescale Intercept,...,Lossy Image Compression,Lossy Image Compression Ratio,Patient's Sex,Patient's Age,Rows,Columns,Pixel Spacing,Bits Stored,Window Center,Window Width
0,f7f7736401b22c94a9d2b09425adeab9,1.2.840.10008.1.2.4.90,1.2.840.10008.5.1.4.1.1.1.1,INF_3.9,MONOCHROME2,0,11,1,16,0.0,...,0.0,,F,037Y,3028,2517,"[0.139, 0.139]",12,2048.0,4096.0
1,93ae57ca1df1d19c6d64155e74e14d23,1.2.840.10008.1.2,1.2.840.10008.5.1.4.1.1.1.1,,MONOCHROME2,0,13,1,16,0.0,...,,,O,Y,3072,3072,"[0.139000, 0.139000]",14,10287.0,5292.0
2,f1a45afaee0efd07fef17057f3942464,1.2.840.10008.1.2.4.90,1.2.840.10008.5.1.4.1.1.1,OFFIS_DCMTK_360,MONOCHROME1,0,11,1,16,0.0,...,0.0,,M,,2880,2304,"[0.15, 0.15]",12,2047.0,4095.0
3,559fdfb3b9db3eb206b9521824f716e3,1.2.840.10008.1.2,1.2.840.10008.5.1.4.1.1.1,OFFIS_DCMTK_360,MONOCHROME2,0,13,1,16,0.0,...,,,M,038Y,3072,2540,"[0.140, 0.140]",14,10075.0,10242.0
4,371d6f540ac601affe81e547b05c551a,1.2.840.10008.1.2,1.2.840.10008.5.1.4.1.1.1,OFFIS_DCMTK_360,MONOCHROME2,0,13,1,16,0.0,...,,,M,060Y,3072,3072,"[0.140, 0.140]",14,8755.0,10228.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,0c5ff01c7bfb4362fcd98f36e555b08c,1.2.840.10008.1.2.4.90,1.2.840.10008.5.1.4.1.1.1,OFFIS_DCMTK_360,MONOCHROME1,0,11,1,16,0.0,...,0.0,,F,,2880,2304,"[0.15, 0.15]",12,2047.0,4095.0
14996,3286115baacb6ed3affc5c46c9205b7a,1.2.840.10008.1.2,1.2.840.10008.5.1.4.1.1.1,OFFIS_DCMTK_360,MONOCHROME1,0,11,1,16,0.0,...,0.0,,F,,2880,2304,"[0.15, 0.15]",12,2047.0,4095.0
14997,d44766bf0eb902fbc1848f418417c45f,1.2.840.10008.1.2.4.90,1.2.840.10008.5.1.4.1.1.1,OFFIS_DCMTK_360,MONOCHROME1,0,11,1,16,0.0,...,,,F,035Y,2208,2104,,12,,
14998,cd7b83d24ee9c092a0f32cb020da6dac,1.2.840.10008.1.2,1.2.840.10008.5.1.4.1.1.1.1,,MONOCHROME2,0,11,1,16,0.0,...,0.0,,,,2926,2847,"[0.125, 0.125]",12,1202.0,5482.0


In [8]:
Photometric_Interpretation_class = {"MONOCHROME2": 2, "MONOCHROME1": 1}
dicom_metadata["Photometric_Interpretation_class"] = dicom_metadata["Photometric Interpretation"].map(
    Photometric_Interpretation_class
)

In [9]:
Transfer_Syntax_UID_class = {
    "1.2.840.10008.1.2.4.90": 1,
    "1.2.840.10008.1.2": 2,
    "1.2.840.10008.1.2.1": 3,
}
dicom_metadata["Transfer_Syntax_UID_class"] = dicom_metadata["Transfer Syntax UID"].map(Transfer_Syntax_UID_class)

In [10]:
SOP_Class_UID = {
    "1.2.840.10008.5.1.4.1.1.1": 1,
    "1.2.840.10008.5.1.4.1.1.1.1": 2,
}
dicom_metadata["SOP_Class_UID"] = dicom_metadata["SOP Class UID"].map(SOP_Class_UID)

In [11]:
versions_class = {"INF_3.9": 1, "OFFIS_DCMTK_360": 2, "dcm4che-1.4.34": 3, "INF_4.5": 4, np.nan: 0}
dicom_metadata["Version_class"] = dicom_metadata["Version"].map(versions_class)

In [12]:
sex_class = {"M": 1, "F": 2, "O": 3, np.nan: 0}
dicom_metadata["sex_class"] = dicom_metadata["Patient's Sex"].map(sex_class)

In [13]:
age = dicom_metadata["Patient's Age"].unique()
age_class = {np.nan: 0}
for i in range(1, len(age)):
    age_class[age[i]] = i
dicom_metadata["age_class"] = dicom_metadata["Patient's Age"].map(age_class)

# Data Merging


In [14]:
df_merged = pd.merge(dicom_metadata, df_pixel_features, on='image_id')
print(df_merged.shape)
df_merged.head()

(15000, 66)


Unnamed: 0,image_id,Transfer Syntax UID,SOP Class UID,Version,Photometric Interpretation,Pixel Representation,High Bit,Samples per Pixel,Bits Allocated,Rescale Intercept,...,haralick_8,haralick_9,haralick_10,haralick_11,haralick_12,sobel_edges_count,canny_edges_count,skewness,kurtosis,entropy
0,f7f7736401b22c94a9d2b09425adeab9,1.2.840.10008.1.2.4.90,1.2.840.10008.5.1.4.1.1.1.1,INF_3.9,MONOCHROME2,0,11,1,16,0.0,...,1.096679,0.02688,0.026656,-0.977429,0.936543,7452076,7192,-0.701459,-0.302151,7.576512
1,93ae57ca1df1d19c6d64155e74e14d23,1.2.840.10008.1.2,1.2.840.10008.5.1.4.1.1.1.1,,MONOCHROME2,0,13,1,16,0.0,...,1.052247,0.035452,0.030933,-0.971723,0.929042,8215117,85349,-0.488563,-0.968595,7.328269
2,f1a45afaee0efd07fef17057f3942464,1.2.840.10008.1.2.4.90,1.2.840.10008.5.1.4.1.1.1,OFFIS_DCMTK_360,MONOCHROME1,0,11,1,16,0.0,...,1.31755,0.076018,0.025446,-0.981678,0.959767,6292597,28826,-0.74097,-0.402109,7.576292
3,559fdfb3b9db3eb206b9521824f716e3,1.2.840.10008.1.2,1.2.840.10008.5.1.4.1.1.1,OFFIS_DCMTK_360,MONOCHROME2,0,13,1,16,0.0,...,0.990296,0.03422,0.032104,-0.968306,0.918775,7427926,49113,-1.328552,0.927739,6.618164
4,371d6f540ac601affe81e547b05c551a,1.2.840.10008.1.2,1.2.840.10008.5.1.4.1.1.1,OFFIS_DCMTK_360,MONOCHROME2,0,13,1,16,0.0,...,1.035567,0.019867,0.032615,-0.96953,0.926027,7785805,59297,-0.732994,-0.82517,6.647353


In [15]:
df_merged = df_merged.drop(
    columns=[
        "Pixel Representation",
        "Bits Allocated",
        "Rescale Intercept",
        "Rescale Slope",
        "Lossy Image Compression",
        "Lossy Image Compression Ratio",
        "Samples per Pixel",
        "image_id",
        "Transfer Syntax UID",
        "SOP Class UID",
        "Photometric Interpretation",
        "Pixel Spacing",
        "Version",
        "Patient's Sex",
        "Patient's Age",
    ]
)

In [16]:
import warnings

warnings.filterwarnings("ignore")

In [17]:
df_merged.shape

(15000, 51)

## Train Model

In [18]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier


_sc = StandardScaler()
_pca = PCA(n_components=6)
_model = RandomForestClassifier()
pipeline_random_forest = Pipeline([("scaler", _sc), ("pca", _pca), ("model", _model)])

In [27]:
df_train

Unnamed: 0,image_id,class_id
0,8138c0bc649a534f449cf703475528ec,0
1,e8e6ff19c3be05a4ccfb20abc4202d52,0
2,de6d6efd88431d3253198106bb070471,1
3,01ee6e560f083255a630c41bba779405,0
4,da668869900c862ce12bd06fde5feb8d,0
...,...,...
51849,b31cded9486986079d87d7212f1089a7,1
51850,8af8c2cd2865f2254195ee6b034522fe,1
51869,e8daa2e3fdc66a7c3359235762d14744,1
51880,e9e59e91a7e4419a9b6a997e004e8f9a,1


In [20]:
df_train["class_id"]= df_train["class_id"].apply(lambda x: 1 if x == 14 else 0)
df_train = df_train.drop_duplicates()

In [21]:
df_train["class_id"].value_counts()

class_id
1    10441
0     4388
Name: count, dtype: int64

In [24]:
df_merged = df_merged.replace(np.nan, 1)

In [26]:
df_merged

Unnamed: 0,High Bit,Rows,Columns,Bits Stored,Window Center,Window Width,Photometric_Interpretation_class,Transfer_Syntax_UID_class,SOP_Class_UID,Version_class,...,haralick_8,haralick_9,haralick_10,haralick_11,haralick_12,sobel_edges_count,canny_edges_count,skewness,kurtosis,entropy
0,11,3028,2517,12,2048.0,4096.0,2,1,2,1,...,1.096679,0.026880,0.026656,-0.977429,0.936543,7452076,7192,-0.701459,-0.302151,7.576512
1,13,3072,3072,14,10287.0,5292.0,2,2,2,0,...,1.052247,0.035452,0.030933,-0.971723,0.929042,8215117,85349,-0.488563,-0.968595,7.328269
2,11,2880,2304,12,2047.0,4095.0,1,1,1,2,...,1.317550,0.076018,0.025446,-0.981678,0.959767,6292597,28826,-0.740970,-0.402109,7.576292
3,13,3072,2540,14,10075.0,10242.0,2,2,1,2,...,0.990296,0.034220,0.032104,-0.968306,0.918775,7427926,49113,-1.328552,0.927739,6.618164
4,13,3072,3072,14,8755.0,10228.0,2,2,1,2,...,1.035567,0.019867,0.032615,-0.969530,0.926027,7785805,59297,-0.732994,-0.825170,6.647353
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,11,2880,2304,12,2047.0,4095.0,1,1,1,2,...,1.014098,0.052247,0.021734,-0.978877,0.925692,5750537,25917,-0.518805,-1.080867,7.210155
14996,11,2880,2304,12,2047.0,4095.0,1,2,1,2,...,1.011967,0.049573,0.028494,-0.972034,0.923297,6326310,37494,-0.938144,-0.109780,7.424021
14997,11,2208,2104,12,1.0,1.0,1,1,1,2,...,1.095924,0.108795,0.025474,-0.977443,0.936452,4353956,12830,-0.500592,-0.863967,7.475707
14998,11,2926,2847,12,1202.0,5482.0,2,2,2,0,...,1.053981,0.021599,0.031707,-0.971102,0.929106,8077520,135245,-0.794128,-0.090921,7.557350


In [25]:
y = df_merged["class_id"]
X = df_merged.drop("class_id")

KeyError: 'class_id'

In [23]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y_binary, test_size=0.15, stratify=y_binary["class_id"])

NameError: name 'y_binary' is not defined

In [None]:
print(f"x_train shape: {X_train.shape} | x_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape} | y_test shape: {y_test.shape}")

In [None]:
y_train_image_id = y_train["image_id"]
y_test_image_id = y_test["image_id"]

y_train = y_train.drop(columns=["image_id"])
y_test = y_test.drop(columns=["image_id"])

y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

### Naive Bayes

In [None]:
from sklearn.linear_model import SGDClassifier

sgd = SGDClassifier().fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(sgd.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(sgd.score(X_test, y_test)))

### KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# instantiate & fit
knn = KNeighborsClassifier(algorithm="brute", n_jobs=-1).fit(X_train, y_train)

print("Accuracy on training set: {:.3f}".format(knn.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(knn.score(X_test, y_test)))

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()

pipeline = Pipeline([("scaler", _sc), ("pca", _pca), ("model", model)])

pipeline.fit(X_train, y_train)

print("Accuracy on training set: {:.3f}".format(pipeline.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(pipeline.score(X_test, y_test)))

### Random Forest

In [None]:
import joblib

# train the model using the PCA components
pipeline = pipeline_random_forest.fit(X_train, y_train)

y_pred = pipeline_random_forest.predict(X_test)

print("Accuracy on training set: {:.3f}".format(pipeline_random_forest.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(pipeline_random_forest.score(X_test, y_test)))

### Bagging Classifier

In [None]:
# import the library
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# instantiate & fit
bg = BaggingClassifier(
    DecisionTreeClassifier(min_samples_split=10, max_depth=3),
).fit(X_train, y_train)

print("Accuracy on training set: {:.3f}".format(bg.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(bg.score(X_test, y_test)))

### Decision Tree

In [None]:
# import the library
from sklearn.tree import DecisionTreeClassifier

# instantiate & fit
clf = DecisionTreeClassifier(min_samples_split=10, max_depth=3).fit(X_train, y_train)

print("Accuracy on training set: {:.3f}".format(clf.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(clf.score(X_test, y_test)))