In [1]:
import pandas as pd
from matplotlib import image
from matplotlib import pyplot
from PIL import Image

In [2]:
from matplotlib.pyplot import imread

In [3]:
def check_missing(dataframe):
    """Print a per-column report of missing values.

    Prints 'There are no missing values.' when no column contains a null.
    Otherwise prints a header followed by one row per affected column with
    its null count and that count as a percentage of the total number of
    rows (rounded to two decimals).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to inspect; it is not modified.
    """
    # Count nulls once and reuse the result for both the check and the report.
    missing_counts = dataframe.isnull().sum()
    missing_counts = missing_counts[missing_counts > 0]

    if len(missing_counts) == 0:
        print('There are no missing values.')
        return

    total_rows = len(dataframe)
    print('Column Name', ' ', 'Number of Missing Values', ' ', 'Percentage of Total')
    for column, count in missing_counts.items():
        # Column labels are not always strings (e.g. integer labels), so
        # stringify before measuring the padding width.
        label = str(column)
        print(label, ' '*(23-len(label)), count, ' '*(18-len(str(count))), '  ',
              round(count*100/total_rows, 2), "%")
                
def check_eda(dataframe):
    """Print a quick exploratory overview of a DataFrame.

    Shows, in order: a rich preview of the first three rows, the row and
    column counts (via ``find_shape``), and the ``DataFrame.info()`` summary
    of column dtypes and non-null counts.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to summarise; it is not modified.
    """
    print('Preview of data: ')
    display(dataframe.head(3))
    print('\n', '-'*50, '\n')
    find_shape(dataframe)
    print('\n', '-'*50)
    print('\n Total number of rows, column types and null values:\n')
    # info() writes its report itself and returns None; wrapping it in
    # print() appended a stray "None" line to the output.
    dataframe.info()
    
def find_shape(dataframe):
    """Print the row count and the column count of ``dataframe``, one per line."""
    for label, axis in (('Rows:', 0), ('Columns:', 1)):
        print(label, dataframe.shape[axis])
    
def check_integrity(dataframe):
    """Run basic integrity checks on ``dataframe`` and print the findings.

    Steps, in order:

    1. Count fully duplicated rows and, if any exist, drop them from
       ``dataframe`` in place.
    2. Display the transposed ``describe(include='all')`` summary.
    3. Print value counts (missing values included) for every object-dtype
       column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to check. Duplicated rows are removed from this object itself.
    """
    print('Checking for duplicated rows...')
    duplicate_rows = dataframe[dataframe.duplicated()]
    n_duplicates = duplicate_rows.shape[0]
    print('\nNumber of duplicated rows: ', n_duplicates)
    if n_duplicates > 0:
        print('\nDropping duplicates...')
        # Mutates the caller's frame on purpose; nothing is returned.
        dataframe.drop_duplicates(inplace=True)
        print('\nDropped.')

    print('\nChecking min, max, mean and STD of numerical features to see if any values are illogical.')
    summary = dataframe.describe(include='all')
    display(summary.transpose())

    print('\nCounting the unique values of non numerical features (including missing values):\n')
    object_columns = dataframe.select_dtypes(include='object').columns
    for column_name in object_columns:
        print('Count of: ', column_name)
        counts = dataframe[column_name].value_counts(dropna=False)
        print(counts, '\n')

In [4]:
# Load the two CSV tables from the working directory: the hemorrhage
# diagnosis table and the patient demographics table.
df_hemorrhage = pd.read_csv('hemorrhage_diagnosis.csv')
df_patient = pd.read_csv('patient_demographics.csv')

In [5]:
# Peek at the first 35 rows of the diagnosis table.
df_hemorrhage.head(35)

Unnamed: 0,PatientNumber,SliceNumber,Intraventricular,Intraparenchymal,Subarachnoid,Epidural,Subdural,No_Hemorrhage,Fracture_Yes_No
0,49,1,0,0,0,0,0,1,0
1,49,2,0,0,0,0,0,1,0
2,49,3,0,0,0,0,0,1,0
3,49,4,0,0,0,0,0,1,0
4,49,5,0,0,0,0,0,1,0
5,49,6,0,0,0,0,0,1,0
6,49,7,0,0,0,0,0,1,0
7,49,8,0,0,0,0,0,1,0
8,49,9,0,0,0,0,0,1,0
9,49,10,0,0,0,0,0,1,0


In [6]:
# Preview, shape and dtype/non-null summary of the diagnosis table.
check_eda(df_hemorrhage)

Preview of data: 


Unnamed: 0,PatientNumber,SliceNumber,Intraventricular,Intraparenchymal,Subarachnoid,Epidural,Subdural,No_Hemorrhage,Fracture_Yes_No
0,49,1,0,0,0,0,0,1,0
1,49,2,0,0,0,0,0,1,0
2,49,3,0,0,0,0,0,1,0



 -------------------------------------------------- 

Rows: 2501
Columns: 9

 --------------------------------------------------

 Total number of rows, column types and null values:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2501 entries, 0 to 2500
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   PatientNumber     2501 non-null   int64
 1   SliceNumber       2501 non-null   int64
 2   Intraventricular  2501 non-null   int64
 3   Intraparenchymal  2501 non-null   int64
 4   Subarachnoid      2501 non-null   int64
 5   Epidural          2501 non-null   int64
 6   Subdural          2501 non-null   int64
 7   No_Hemorrhage     2501 non-null   int64
 8   Fracture_Yes_No   2501 non-null   int64
dtypes: int64(9)
memory usage: 176.0 KB
None


In [7]:
# Duplicate-row check, summary statistics and object-column value counts
# for the diagnosis table (duplicates, if any, are dropped in place).
check_integrity(df_hemorrhage)

Checking for duplicated rows...

Number of duplicated rows:  0

Checking min, max, mean and STD of numerical features to see if any values are illogical.


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
PatientNumber,2501.0,89.591363,23.76236,49.0,69.0,90.0,110.0,130.0
SliceNumber,2501.0,15.909636,9.071176,1.0,8.0,16.0,23.0,40.0
Intraventricular,2501.0,0.009596,0.097508,0.0,0.0,0.0,0.0,1.0
Intraparenchymal,2501.0,0.029188,0.168368,0.0,0.0,0.0,0.0,1.0
Subarachnoid,2501.0,0.007197,0.084547,0.0,0.0,0.0,0.0,1.0
Epidural,2501.0,0.069172,0.253798,0.0,0.0,0.0,0.0,1.0
Subdural,2501.0,0.022391,0.147981,0.0,0.0,0.0,0.0,1.0
No_Hemorrhage,2501.0,0.872851,0.333207,0.0,1.0,1.0,1.0,1.0
Fracture_Yes_No,2501.0,0.077969,0.268176,0.0,0.0,0.0,0.0,1.0



Counting the unique values of non numerical features (including missing values):



In [8]:
# Per-column missing-value report for the diagnosis table.
check_missing(df_hemorrhage)

There are no missing values.


In [9]:
# Preview, shape and dtype/non-null summary of the demographics table.
check_eda(df_patient)

Preview of data: 


Unnamed: 0,Patient Number,Age\n(years),Gender,Intraventricular,Intraparenchymal,Subarachnoid,Epidural,Subdural,Fracture (Yes/No),Condition on file,Note
0,49,35.0,Male,,1.0,,1.0,,1.0,Intracranial HGE+ Extradural HGE,
1,50,0.583333,Female,,1.0,,,,1.0,Subdural HGE,
2,51,5.0,Male,,1.0,,,1.0,1.0,Extadural HGE,



 -------------------------------------------------- 

Rows: 82
Columns: 11

 --------------------------------------------------

 Total number of rows, column types and null values:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82 entries, 0 to 81
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Patient Number     82 non-null     int64  
 1   Age
(years)        82 non-null     float64
 2   Gender             82 non-null     object 
 3   Intraventricular   5 non-null      float64
 4   Intraparenchymal   16 non-null     float64
 5   Subarachnoid       7 non-null      float64
 6   Epidural           21 non-null     float64
 7   Subdural           4 non-null      float64
 8   Fracture (Yes/No)  22 non-null     float64
 9   Condition on file  82 non-null     object 
 10  Note               1 non-null      object 
dtypes: float64(7), int64(1), object(3)
memory usage: 7.2+ KB
None


In [10]:
# Duplicate-row check, summary statistics and object-column value counts
# for the demographics table (duplicates, if any, are dropped in place).
check_integrity(df_patient)

Checking for duplicated rows...

Number of duplicated rows:  0

Checking min, max, mean and STD of numerical features to see if any values are illogical.


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Patient Number,82,,,,89.5,23.8153,49.0,69.25,89.5,109.75,130.0
Age\n(years),82,,,,27.8437,19.5209,0.00274725,11.25,26.0,40.0,72.0
Gender,82,2.0,Male,46.0,,,,,,,
Intraventricular,5,,,,1.0,0.0,1.0,1.0,1.0,1.0,1.0
Intraparenchymal,16,,,,1.0,0.0,1.0,1.0,1.0,1.0,1.0
Subarachnoid,7,,,,1.0,0.0,1.0,1.0,1.0,1.0,1.0
Epidural,21,,,,1.0,0.0,1.0,1.0,1.0,1.0,1.0
Subdural,4,,,,1.0,0.0,1.0,1.0,1.0,1.0,1.0
Fracture (Yes/No),22,,,,1.0,0.0,1.0,1.0,1.0,1.0,1.0
Condition on file,82,13.0,Normal CT,47.0,,,,,,,



Counting the unique values of non numerical features (including missing values):

Count of:  Gender
Male      46
Female    36
Name: Gender, dtype: int64 

Count of:  Condition on file
Normal CT                                              47
Extradural HGE                                         13
Intracranial HGE                                        7
Subdural HGE                                            3
Intracranial HGE+ Extradural HGE                        2
Extadural HGE                                           2
Intracranial HGE + IVH                                  2
Extradural HGE + Intracranial HGE + Extradural HGE      1
ICH                                                     1
Subdural HGE+ Intracranial HGE                          1
Chronic Intracranial HGE                                1
Subdural HGE +Intracranial HGE + IVH                    1
Intracranial HGE + Subdural HGE                         1
Name: Condition on file, dtype: int64 

Count of:  Note
NaN  

In [11]:
# Per-column missing-value report for the demographics table.
check_missing(df_patient)

Column Name   Number of Missing Values   Percentage of Total
Intraventricular         77                     93.9 %
Intraparenchymal         66                     80.49 %
Subarachnoid             75                     91.46 %
Epidural                 61                     74.39 %
Subdural                 78                     95.12 %
Fracture (Yes/No)        60                     73.17 %
Note                     81                     98.78 %


In [12]:
# Read every image in each patient's `brain` folder into a flat list of pixel
# values, keyed by '<zero-padded patient number>-<slice number>' (e.g. '049-1').
CT_ROOT = 'computed-tomography-images-for-intracranial-hemorrhage-detection-and-segmentation-1.0.0/Patients_CT'

patient = {}
for i in range(49, 131):
    # Patient folders are zero-padded to three digits ('049' ... '130').
    patient_id = f'{i:03d}'
    # Slice files are read as 1..n, where n is the number of diagnosis rows
    # recorded for this patient.
    n_slices = len(df_hemorrhage.loc[df_hemorrhage['PatientNumber']==i, 'SliceNumber'])
    for j in range(1, n_slices+1):
        # The context manager closes each file handle as soon as its pixels
        # have been copied out, instead of leaving thousands of files open.
        with Image.open(f'{CT_ROOT}/{patient_id}/brain/{j}.jpg', 'r') as im:
            patient[f'{patient_id}-{j}'] = list(im.getdata())

In [13]:
# Build a frame from the image dict: each '<patient>-<slice>' key becomes a column.
df_brain = pd.DataFrame(patient)

In [14]:
# Flip rows and columns so each image key becomes a row.
# NOTE(review): this reassigns df_brain from itself, so re-running this cell
# on its own transposes the frame back to the previous orientation.
df_brain = df_brain.transpose()

In [15]:
# Display the pixel frame.
df_brain

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,422490,422491,422492,422493,422494,422495,422496,422497,422498,422499
049-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
049-2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
049-3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
049-4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
049-5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130-27,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
130-28,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
130-29,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
130-30,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [50]:
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, \
                            precision_score, classification_report, f1_score

In [53]:
from sklearn.model_selection import train_test_split

In [54]:
# Features: the pixel frame. Target: the No_Hemorrhage column of the
# diagnosis table.
X = df_brain
y = df_hemorrhage['No_Hemorrhage']

# X is indexed by '<patient>-<slice>' keys while y keeps the CSV's own index,
# so the two are paired purely by position downstream. Verify that row k of X
# really is the slice described by row k of the diagnosis table.
expected_keys = (df_hemorrhage['PatientNumber'].astype(str).str.zfill(3)
                 + '-' + df_hemorrhage['SliceNumber'].astype(str))
assert list(X.index) == expected_keys.tolist(), \
    'Image rows are not in the same order as the rows of hemorrhage_diagnosis.csv'

In [55]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                   random_state=42,
                                                   test_size=0.2)

# Sanity-check the split proportions (the second line previously carried
# the "Train set" label by mistake).
print(f"Train set: {len(X_train)/len(X):.2%}")
print(f"Test set: {len(X_test)/len(X):.2%}")

Train set: 79.97%
Train set: 20.03%


In [56]:
from sklearn.linear_model import LogisticRegression

In [57]:
# Fit a logistic regression (lbfgs solver, up to 10000 iterations) on the
# training split.
clf = LogisticRegression(solver='lbfgs', max_iter=10000)
clf.fit(X_train, y_train)

LogisticRegression(max_iter=10000)

In [58]:
# Predict labels for the held-out split.
y_pred = clf.predict(X_test)

In [59]:
# Classification report comparing held-out labels with the predictions.
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.80      0.79      0.80        62
           1       0.97      0.97      0.97       439

    accuracy                           0.95       501
   macro avg       0.89      0.88      0.88       501
weighted avg       0.95      0.95      0.95       501



In [44]:
# Number of held-out predictions that equal the true label.
# NOTE(review): this cell's execution count (44) is lower than the cells
# above it, so its saved output may come from an earlier run — re-execute
# before relying on the number.
(y_pred == y_test).sum()

378

In [None]:
# Open slice 1 from patient 049's `bone` folder (unexecuted scratch cell).
im = Image.open('computed-tomography-images-for-intracranial-hemorrhage-detection-and-segmentation-1.0.0/Patients_CT/049/bone/1.jpg', 'r')

In [None]:
# Copy the image's pixel data into a flat Python list.
pix_val = list(im.getdata())

In [None]:
# Show only the first few pixel values; echoing the whole list would flood
# the cell output with one entry per pixel.
pix_val[:20]