In [1]:
import numpy as np
import pandas as pd
from PIL import Image
import sys
import time
from csv import writer
from io import BytesIO

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

In [9]:
def confuse(y_true, y_pred):
    return pd.DataFrame(
        data = confusion_matrix(y_test, y_pred).T,
        columns = ['real false', 'real true'],
        index = ['pred false', 'pred true']
    )

In [2]:
def evl(model, y_true, y_pred, title):
    matrix = confusion_matrix(y_test, y_pred).T
    tp = matrix[1, 1]
    tn = matrix[0, 0]
    fp = matrix[1, 0]
    fn = matrix[0, 1]
    return pd.DataFrame(
        data = {
            'accuracy': [np.round((tp + tn) / (tp + tn + fp + fn), 3)],
            'sensitivity': [np.round(tp / (tp + fn), 3)],
            'specificity': [np.round(tn / (tn + fp), 3)],
            'precision': [np.round(tp / (tp + fp), 3)],
            'ROC AUC': [np.round(roc_auc_score(y_true, y_pred), 3)],
        },
        index = [title]
    )

In [2]:
img = Image.open('../img/img_0_0.jpg')

In [13]:
data = list(img.getdata())

In [14]:
data

[178,
 179,
 180,
 180,
 179,
 177,
 175,
 174,
 165,
 161,
 155,
 151,
 150,
 153,
 158,
 161,
 168,
 169,
 169,
 170,
 170,
 169,
 169,
 168,
 167,
 167,
 167,
 168,
 168,
 168,
 169,
 169,
 171,
 171,
 170,
 169,
 169,
 169,
 170,
 170,
 172,
 171,
 171,
 171,
 171,
 172,
 173,
 173,
 172,
 172,
 169,
 165,
 163,
 166,
 173,
 180,
 183,
 184,
 185,
 187,
 188,
 188,
 188,
 188,
 179,
 180,
 181,
 181,
 180,
 178,
 176,
 175,
 168,
 164,
 158,
 154,
 153,
 156,
 160,
 164,
 168,
 169,
 170,
 170,
 170,
 170,
 169,
 168,
 167,
 167,
 168,
 168,
 168,
 169,
 169,
 169,
 171,
 171,
 170,
 169,
 169,
 170,
 170,
 171,
 174,
 173,
 172,
 171,
 170,
 169,
 169,
 169,
 168,
 164,
 160,
 163,
 171,
 178,
 181,
 181,
 183,
 183,
 185,
 186,
 186,
 187,
 186,
 186,
 180,
 181,
 182,
 182,
 181,
 179,
 177,
 176,
 171,
 167,
 162,
 157,
 157,
 159,
 163,
 166,
 169,
 169,
 170,
 171,
 171,
 170,
 169,
 169,
 168,
 168,
 168,
 169,
 169,
 169,
 170,
 170,
 171,
 171,
 170,
 170,
 170,
 171,
 171

In [15]:
len(data)

4096

In [16]:
64*64

4096

In [5]:
non_flood_images = [

6, 10, 13, 33, 40, 57, 59, 72, 106, 114, 118, 
127, 129, 131, 134, 135, 137, 139, 140, 143,
144, 145, 148, 149, 150, 151, 154, 157, 159, 
165, 166, 171, 172, 173, 177, 178, 183, 187,
192, 193, 195, 196, 197, 199, 200, 201, 202,
203, 205, 206, 207, 208, 209, 211, 213, 215,
217, 219, 221, 222, 226, 229, 230, 231, 235,
236, 237, 238, 239, 240, 241, 242, 243, 244,
245, 246, 247, 248, 249, 250, 251, 252, 253,
255, 256, 258, 259, 260, 261, 262, 266, 267,
268, 271, 272, 273, 274, 275, 276, 278, 279,
281, 285, 286, 287, 289, 291, 294, 297, 299,
300, 301, 302, 303, 305, 307, 309, 311, 312,
313, 314, 318, 322, 323, 324, 325, 327, 329,
330, 331, 332, 336, 339, 341, 342, 343, 347,
348, 350, 352, 355, 356, 358, 360, 361, 363,
370, 372, 373, 376, 379, 383, 388, 389, 390, 
393, 394, 395, 399, 401, 402, 403, 405, 409,
410, 413, 414, 415, 421, 423, 424, 425, 428, 
430, 437, 439, 440, 444, 445, 447, 448, 449,
451, 455, 458, 459, 460, 461, 462, 463, 464,
465, 471, 473, 474, 475, 477, 479, 483, 485, 
487, 488, 490, 491, 494, 495, 498, 499, 502, 
503, 504, 505, 506, 507, 508, 509, 512, 513,
514, 516, 518, 520, 521, 524, 528, 529, 530,
531, 532, 533, 534, 535, 536, 539, 541, 543, 
544, 551, 555, 556, 557, 559, 566, 567, 569,
573, 574, 575, 576, 578, 579, 580, 581, 585, 
587, 592, 595, 597, 599, 600, 604, 605, 607, 
608, 609, 610, 613, 614, 616, 617, 618, 619, 
622, 623, 624, 625, 626, 627, 628, 631, 632,
633, 636, 637, 638, 640, 641, 642, 646, 647,
648, 649, 650, 652, 653, 654, 657, 659, 661, 
663, 664, 665, 666, 671, 673, 674, 678, 679,
680, 681, 682, 684, 685, 688, 689, 690, 692,
693, 695, 697, 698, 699, 700, 701, 702, 703,
708, 710, 711, 713, 716, 719, 720, 721, 722,
724, 725, 726, 727, 729, 731, 734, 737, 738,
741, 743, 746, 747, 749, 751, 752, 753, 755,
757, 758, 759, 763, 764, 766, 767, 799, 807, 
809, 888, 907

]

In [6]:
len(non_flood_images)

365

In [24]:
start_time = time.time()
label_list = []
img_list = []
for i in range(0, 908):
    if i % 100 == 0:
        print(f'Loading image {i} at time {round(time.time() - start_time)} s...')
    is_flood = int(i not in non_flood_images)
    for j in range(0, 160):
        with Image.open(f'../binary_images/img/img_{i}_{j}.jpg') as image:
            img_list.append(np.array(image.getdata(), np.uint8))
            label_list.append(is_flood)

with open('../binary_images/data/binary_img.csv', 'w') as file:
    csv_writer = writer(file)
    
    for i, image in enumerate(img_list):
        if i % 10000 == 0:
            print(f'Writing line {i} at time {round(time.time() - start_time)} s...')
        csv_writer.writerow(list(image) + [label_list[i]])

print(f'Done. Took {round(time.time() - start_time)} seconds.')
del label_list
del img_list


Loading image 0 at time 0 s...
Loading image 100 at time 5 s...
Loading image 200 at time 9 s...
Loading image 300 at time 13 s...
Loading image 400 at time 17 s...
Loading image 500 at time 21 s...
Loading image 600 at time 26 s...
Loading image 700 at time 30 s...
Loading image 800 at time 34 s...
Loading image 900 at time 39 s...
Writing line 0 at time 39 s...
Writing line 10000 at time 43 s...
Writing line 20000 at time 48 s...
Writing line 30000 at time 52 s...
Writing line 40000 at time 56 s...
Writing line 50000 at time 60 s...
Writing line 60000 at time 64 s...
Writing line 70000 at time 69 s...
Writing line 80000 at time 73 s...
Writing line 90000 at time 77 s...
Writing line 100000 at time 82 s...
Writing line 110000 at time 86 s...
Writing line 120000 at time 90 s...
Writing line 130000 at time 95 s...
Writing line 140000 at time 99 s...
Done. Took 102 seconds.


In [11]:
%%time
df = pd.read_csv('../binary_images/data/binary_img.csv', header=None)

CPU times: user 8.53 s, sys: 328 ms, total: 8.85 s
Wall time: 8.88 s


In [3]:
sys.getsizeof(df) / 1_000_000_000 # size of dataframe in gb

1.19129616

In [4]:
df.shape # (number of images, pixels per image + label)

(145280, 1025)

In [5]:
df.head() # column 1024 is the label column

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1015,1016,1017,1018,1019,1020,1021,1022,1023,1024
0,89,90,92,92,90,87,83,80,76,77,...,88,75,72,69,66,65,67,71,73,1
1,88,88,88,88,88,88,88,88,90,89,...,95,95,102,102,94,94,102,105,101,1
2,93,91,89,90,93,95,93,91,94,93,...,113,84,65,70,81,76,79,82,67,1
3,89,92,90,85,84,88,93,93,86,85,...,52,47,66,82,83,82,85,84,79,1
4,84,84,84,84,84,84,84,84,83,83,...,70,82,86,87,84,77,71,65,61,1


In [6]:
# class balance
df[1024].mean()

0.5980176211453745

In [12]:
%%time
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=[1024]), df[1024], test_size = 0.25, random_state = 2, stratify = df[1024])
del df

lr = LogisticRegression(max_iter = 100)
lr.fit(X_train, y_train)
print('train', lr.score(X_train, y_train))
print('test', lr.score(X_test, y_test))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


train 0.6066905286343612
test 0.5972191629955947
CPU times: user 53.3 s, sys: 928 ms, total: 54.2 s
Wall time: 15.4 s


In [6]:
y_test.mean()

0.5980176211453745

In [9]:
pd.DataFrame(
    data = confusion_matrix(y_test, lr.predict(X_test)).T,
    columns = ['real false', 'real true'],
    index = ['pred false', 'pred true']
)

Unnamed: 0,real false,real true
pred false,2681,2710
pred true,11919,19010


In [3]:
%%time
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=[1024]), df[1024], test_size = 0.25, random_state = 2)
del df

svc = SVC(kernel = 'rbf', max_iter = 1000)
svc.fit(X_train, y_train)
print('train', svc.score(X_train, y_train))
print('test', svc.score(X_test, y_test))



train 0.4491831864904552
test 0.44812775330396476
CPU times: user 9min 48s, sys: 619 ms, total: 9min 48s
Wall time: 9min 48s


In [15]:
%%time
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=[1024]), df[1024], test_size = 0.25, random_state = 2)
del df

forest = RandomForestClassifier(
    n_estimators = 1024,
    max_depth = 32,
    n_jobs = 7
)
forest.fit(X_train, y_train)
print('train', forest.score(X_train, y_train))
print('test', forest.score(X_test, y_test))

train 0.9971732745961821
test 0.7032764317180616
CPU times: user 1h 13min 9s, sys: 3.11 s, total: 1h 13min 12s
Wall time: 10min 31s


In [16]:
confuse(y_test, forest.predict(X_test))

Unnamed: 0,real false,real true
pred false,4487,691
pred true,10086,21056


In [20]:
evl(forest, y_test, forest.predict(X_test), 'Forest')

Unnamed: 0,accuracy,sensitivity,specificity,precision,ROC AUC
Forest,0.703,0.968,0.308,0.676,0.638


In [4]:
%%time
df = pd.read_csv('../binary_images/data/binary_img.csv', header=None)
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=[1024]), 
    df[1024], 
    test_size = 0.25, 
    random_state = 2
)
del df

ss = StandardScaler(copy = False, with_mean = 0, with_std = 1)
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

mlp = MLPClassifier(
    hidden_layer_sizes = (256, 64, 16, 4), # tuple([128] + [16] * 16), 
    activation = 'logistic',
    solver = 'adam',
    random_state = 2,
)    
#     batch_size = 100,
#     alpha = 0.00001,
#     max_iter = 1000,
#     tol = 0.00001,
#     n_iter_no_change = 1000

mlp.fit(X_train, y_train)
print('train', mlp.score(X_train, y_train))
print('test', mlp.score(X_test, y_test))



train 0.8320851688693098
test 0.7987885462555067
CPU times: user 57min 37s, sys: 55.5 s, total: 58min 33s
Wall time: 15min 11s


In [10]:
confuse(y_test, mlp.predict(X_test))

Unnamed: 0,real false,real true
pred false,11946,4681
pred true,2627,17066


In [7]:
evl(mlp, y_test, mlp.predict(X_test), 'MLP')

Unnamed: 0,accuracy,sensitivity,specificity,precision,ROC AUC
MLP,0.799,0.785,0.82,0.867,0.802


In [None]:
# (128, 16) : train 0.664 test 0.638
# (128, 32, 6) : train 0.655 test 0.637