In [None]:
#imports
from sklearn.linear_model import LogisticRegression
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import os 
import cv2
from random import shuffle 
from tqdm import tqdm 
from PIL import Image
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow import keras
from keras.layers import Dense
from keras.models import Sequential

In [None]:
# Testing the file path: open one known training image so that a wrong working
# directory or a missing dataset download shows up immediately.
Image.open('data/chest_xray/train/NORMAL/IM-0115-0001.jpeg')

In [None]:
labels = ['PNEUMONIA', 'NORMAL']  # position in this list is the integer class id
img_size = 210  # every image is resized to img_size x img_size pixels


def get_data(data_dir):
    """Load every image under ``data_dir/<label>`` as a resized grayscale array.

    Parameters
    ----------
    data_dir : str
        Directory that contains one sub-directory per entry of ``labels``.

    Returns
    -------
    numpy.ndarray
        Object array of shape ``(n_images, 2)``. Each row is
        ``[image, class_num]``: ``image`` is the ``img_size x img_size``
        grayscale array and ``class_num`` is the label's index in ``labels``.

    Notes
    -----
    Files that cannot be decoded or resized are reported and skipped.
    """
    data = []
    for label in tqdm(labels):
        path = os.path.join(data_dir, label)
        class_num = labels.index(label)
        # os.listdir returns entries in arbitrary order; sort them so the
        # sample order is the same on every run and every machine.
        for img in sorted(os.listdir(path)):
            img_path = os.path.join(path, img)
            try:
                img_arr = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
                if img_arr is None:
                    # cv2.imread reports an unreadable file by returning None
                    # instead of raising, so handle it explicitly.
                    print(f'Skipping unreadable image: {img_path}')
                    continue
                # Reshaping images to preferred size
                resized_arr = cv2.resize(img_arr, (img_size, img_size))
                data.append([resized_arr, class_num])
            except Exception as e:
                print(f'Skipping {img_path}: {e}')

    # Each row mixes a 2-D array with an int. Letting np.array() infer the
    # shape of such ragged input raises ValueError on NumPy >= 1.24, so fill
    # an object array by hand instead.
    pairs = np.empty((len(data), 2), dtype=object)
    for row, (image, class_id) in enumerate(data):
        pairs[row, 0] = image
        pairs[row, 1] = class_id
    return pairs

In [None]:
# Load the three dataset splits. Despite the ``*_path`` names, each variable
# holds the array of [image, label] rows returned by get_data, not a path.
train_path, test_path, val_path = (
    get_data(split_dir)
    for split_dir in (
        'data/chest_xray/train/',
        'data/chest_xray/test/',
        'data/chest_xray/val',
    )
)

In [None]:
# Visualise the class balance of the training split.
# Element 1 of every row is the class id, i.e. the index into `labels`
# (0 -> 'PNEUMONIA', 1 -> 'NORMAL').
mask = ["Pneumonia" if row[1] == 0 else "Normal" for row in train_path]

# Pass the values by keyword: the first positional parameter of countplot is
# `data` in current seaborn, so a bare list is no longer treated as `x`.
ax = sns.countplot(x=mask)
ax.set(title="Training set class balance", xlabel="Class", ylabel="Number of images");

In [None]:
# Separate every split into parallel lists: images go to X_*, class ids to y_*.
X_train = [image for image, _ in train_path]
y_train = [label for _, label in train_path]

X_val = [image for image, _ in val_path]
y_val = [label for _, label in val_path]

X_test = [image for image, _ in test_path]
y_test = [label for _, label in test_path]
    


In [None]:
# Normalize the data: stack each list of images into one array and scale the
# pixel intensities by 1/255.
# NOTE: this cell overwrites its own inputs, so run it once per kernel;
# re-running it would divide the already scaled arrays by 255 again.
X_train = np.array(X_train) / 255
X_val = np.array(X_val) / 255
X_test = np.array(X_test) / 255

# Convert every label list, not only the training one, so that all three
# splits expose the same array interface (.shape, indexing) downstream.
y_train = np.array(y_train)
y_val = np.array(y_val)
y_test = np.array(y_test)

In [None]:
# Flatten every image into one feature row for the classical models and the
# dense network. The row count and pixel count are derived from the data
# instead of being hard-coded: the old literal 32400 (= 180 * 180) no longer
# matched img_size = 210 (210 * 210 = 44100) and made reshape raise.
X_train = X_train.reshape(len(X_train), -1).astype('float32')
X_test = X_test.reshape(len(X_test), -1).astype('float32')

In [None]:
# Sanity check: after flattening this should read (n_samples, n_pixels).
print(X_train.shape)

In [None]:
# Sanity check: the label array should hold one entry per training sample.
print(y_train.shape)

In [None]:
# Baseline model: L2-regularised logistic regression on the flattened pixels.
log = LogisticRegression(penalty='l2')
log.fit(X=X_train, y=y_train)

In [None]:
# Mean accuracy of the logistic-regression baseline on the test split.
log.score(X_test, y_test)

In [None]:
# One-hot encode the integer class ids (2 classes) for the softmax network.
# NOTE: y_train / y_test are overwritten in place, so every later cell sees
# the one-hot version, and this cell must run exactly once per kernel.
y_train, y_test = (
    tf.keras.utils.to_categorical(class_ids, num_classes=2)
    for class_ids in (y_train, y_test)
)

In [None]:
# Small fully connected network: one 64-unit tanh hidden layer followed by a
# 2-way softmax output.
# The input width is read from the flattened training matrix; the previous
# literal 32400 (= 180 * 180) did not match img_size = 210 and made fit() fail.
n_features = X_train.shape[1]

model = Sequential()
model.add(Dense(64, activation='tanh', input_shape=(n_features,)))
model.add(Dense(2, activation='softmax'))

In [None]:
# Configure training: plain SGD on categorical cross-entropy (labels are
# one-hot), reporting accuracy under the metric name 'acc'.
model.compile(
    optimizer='sgd',
    loss='categorical_crossentropy',
    metrics=['acc'],
)

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

In [None]:
# Train the network for 5 epochs in mini-batches of 64; `results` keeps the
# object returned by fit().
# NOTE(review): the test split doubles as validation data here, while the
# loaded validation split (X_val / y_val) is not used.
results = model.fit(
    X_train,
    y_train,
    batch_size=64,
    epochs=5,
    validation_data=(X_test, y_test),
)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# First-pass hyper-parameter grid for the random forest.
# The search is kept disabled (presumably because it is slow). The whole dict
# is commented out: previously only its first line was, which left the
# indented entries behind as a syntax error. Uncomment the dict and the three
# gs_forest lines together to re-run the search.
#param_grid = {
#    'n_estimators': [25,100,250],
#    'criterion': ['gini', 'entropy'],
#    'max_depth': [1, 2, 5, 10],
#    'min_samples_split': [2, 5, 10, 20]
#}

# Base estimator handed to the grid search.
model = RandomForestClassifier(random_state=777)

#gs_forest = GridSearchCV(model, param_grid, cv=3)
#gs_forest.fit(X_train,y_train)
#gs_forest.best_params_

In [None]:
# Great, let's do it again with more params based on the ones it picked.
# Second-pass grid, kept disabled like the first one. The whole dict is
# commented out: previously only its first line was, which left the indented
# entries behind as a syntax error. Uncomment the dict and the three
# gs_forest lines together to re-run the search.

#param_grid = {
#    'n_estimators': [250,350,500],
#    'criterion': ['entropy'],
#    'max_depth': [10,15,25,100],
#    'min_samples_split': [2,5]
#}

#gs_forest = GridSearchCV(model, param_grid, cv=3)
#gs_forest.fit(X_train,y_train)
#gs_forest.best_params_

Let's actually model now



In [None]:
# Random forest with fixed hyper-parameters (presumably the values picked by
# the grid search above). The cell displays the accuracy on the test split.
model = RandomForestClassifier(
    n_estimators=500,
    criterion='entropy',
    max_depth=15,
    min_samples_split=5,
    random_state=777,
)
model.fit(X_train, y_train)
model.score(X_test, y_test)

In [None]:
# Accuracy on the training split, to compare against the test score above.
model.score(X_train, y_train)

In [None]:
# Playing with weights
# `weights` is a list of dicts, scikit-learn's multi-output form: one dict per
# column of y (the labels are one-hot encoded at this point), mapping each
# value in that column to its weight.
weights = [{0: 1, 1: 3}, {0: 1, 1: 1}]

model = RandomForestClassifier(
    n_estimators=700,
    criterion='entropy',
    max_depth=15,
    min_samples_split=5,
    random_state=777,
    class_weight=weights,
)
model.fit(X_train, y_train)

# Training accuracy first, then test accuracy.
for features, targets in ((X_train, y_train), (X_test, y_test)):
    print(model.score(features, targets))


In [None]:
from sklearn.model_selection import KFold, cross_val_score
# Increase the depth, but only a little: the grid search preferred 15 over 25,
# so try 20.

model = RandomForestClassifier(criterion='entropy', max_depth=20, min_samples_split=5,
                               n_estimators=700, random_state=777, class_weight=weights)
model.fit(X_train, y_train)

# Cross-validate on the TRAINING split. Running cross_val_score on
# X_test / y_test refits the model on held-out data, so it neither evaluates
# the model fitted above nor keeps the test set untouched.
# The folds are shuffled because the samples are loaded class by class, so
# unshuffled folds would be almost single-class.
# NOTE: this refits the forest five times on the training data and is much
# slower than the old test-set call.
cv_folds = KFold(n_splits=5, shuffle=True, random_state=777)
print(cross_val_score(model, X_train, y_train, cv=cv_folds))
print(model.score(X_train, y_train))
print(model.score(X_test, y_test))
# Seems that made it a little worse

In [None]:
# Trying a max-features limiter: consider only log2(n_features) candidate
# features at each split.
from sklearn.model_selection import KFold, cross_val_score

model = RandomForestClassifier(criterion='entropy', max_depth=15, min_samples_split=5,
                               n_estimators=700, random_state=777, max_features='log2')
model.fit(X_train, y_train)

# Cross-validate on the TRAINING split. Running cross_val_score on
# X_test / y_test refits the model on held-out data, so it neither evaluates
# the model fitted above nor keeps the test set untouched.
# The folds are shuffled because the samples are loaded class by class.
cv_folds = KFold(n_splits=5, shuffle=True, random_state=777)
print(cross_val_score(model, X_train, y_train, cv=cv_folds))
print(model.score(X_train, y_train))
print(model.score(X_test, y_test))

In [None]:
# Same experiment with max_features='sqrt'.
# NOTE(review): in current scikit-learn 'sqrt' is already the default for
# RandomForestClassifier, so this run mostly serves as a control.
from sklearn.model_selection import KFold, cross_val_score

model = RandomForestClassifier(criterion='entropy', max_depth=15, min_samples_split=5,
                               n_estimators=700, random_state=777, max_features='sqrt')
model.fit(X_train, y_train)

# Cross-validate on the TRAINING split. Running cross_val_score on
# X_test / y_test refits the model on held-out data, so it neither evaluates
# the model fitted above nor keeps the test set untouched.
# The folds are shuffled because the samples are loaded class by class.
cv_folds = KFold(n_splits=5, shuffle=True, random_state=777)
print(cross_val_score(model, X_train, y_train, cv=cv_folds))
print(model.score(X_train, y_train))
print(model.score(X_test, y_test))

In [None]:
# Note to self on the outcome of the random-forest experiments above.
print('they each get worse idk what to do')