<a href="https://colab.research.google.com/github/AhmadJamal01/Floodead-Inside/blob/main/Machine_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Imports

In [144]:
import numpy as np
from sklearn.model_selection import cross_validate
from sklearn.svm import SVC
import pandas as pd
import numpy as np
import glob
import os
import cv2
import seaborn as sns
import matplotlib.pyplot as plt
# import plotly.express as px
# from osgeo import gdal
import cv2
from skimage.feature import hog
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

NUM_FOLDS = 10

## Read the Data

In [None]:
!pip install gdal > /dev/null

In [None]:
import gdown
# Download the dataset archive from Google Drive into the working directory
# and unpack it next to the notebook.
gdown.download("https://drive.google.com/uc?id=1och-QmNa3FAiS-wssgzCwISbmpSezIi_", "dataset.zip", quiet=False)
# NOTE(review): the recorded run of this cell fetched only ~75 kB and then
# ended with BadZipFile, so the downloaded file was presumably not the real
# archive (e.g. a Drive confirmation page) -- confirm the link/sharing mode.
gdown.extractall("dataset.zip")
# NOTE(review): `path` is not referenced by the later cells visible in this
# notebook; they hardcode 'dataset/...' in their glob patterns instead.
path = 'dataset/'


Downloading...
From: https://drive.google.com/file/d/1och-QmNa3FAiS-wssgzCwISbmpSezIi_
To: /content/dataset.zip
75.5kB [00:00, 34.8MB/s]


BadZipFile: ignored

## Prepare the Data

In [108]:
# Build the image index: one row per JPEG with the label taken from the
# folder it lives in ('flooded' or 'non-flooded').
#
# All records are collected first and the frame is constructed once.  Growing
# a DataFrame row by row with `df.loc[len(df)] = ...` re-allocates on every
# insert, and the two per-class loops were copies of each other.
# Row order is unchanged: all 'flooded' paths (sorted), then all
# 'non-flooded' paths (sorted), indexed 0..n-1.
records = [
    {'image_path': image_path, 'label': label}
    for label in ('flooded', 'non-flooded')
    for image_path in sorted(glob.glob(f'dataset/{label}/*.jpg'))
]

# Passing `columns` keeps both columns present even when no image is found.
df = pd.DataFrame(records, columns=['image_path', 'label'])

In [109]:
# Sanity check: show the first rows of the image index and its
# (n_images, n_columns) shape.
print(df.head())
print("Dataset shape:", df.shape)

                image_path    label
0    dataset/flooded\0.jpg  flooded
1    dataset/flooded\1.jpg  flooded
2   dataset/flooded\10.jpg  flooded
3  dataset/flooded\100.jpg  flooded
4  dataset/flooded\101.jpg  flooded
Dataset shape: (922, 2)


### Extract Features

### HOG

In [110]:
def calculate_hog_features(image):
    """Return the Histogram-of-Oriented-Gradients descriptor of an image.

    The image is treated as single-channel.  Gradients are binned into 8
    orientations over cells of 40 x 50 pixels, and each block holds exactly
    one cell, so the descriptor has 8 values per cell.

    Parameters
    ----------
    image : array-like
        Single-channel (2-D) image.

    Returns
    -------
    numpy.ndarray
        Flat HOG feature vector as returned by ``skimage.feature.hog``.
    """
    # NOTE(review): `multichannel` is the keyword of older scikit-image
    # releases; newer releases use `channel_axis` instead -- confirm against
    # the installed version before upgrading.
    hog_params = dict(
        orientations=8,
        pixels_per_cell=(40, 50),
        cells_per_block=(1, 1),
        multichannel=False,
    )
    return hog(image, **hog_params)


### Color-based Features

In [111]:
import cv2
import numpy as np

def calculate_average_color(image):
    """Average an image over its first two axes.

    Parameters
    ----------
    image : array-like
        Image whose first two axes are the pixel grid; any trailing axis
        (e.g. colour channels) is preserved.

    Returns
    -------
    numpy.ndarray or numpy.floating
        Mean over axes 0 and 1 -- one value per channel for an
        (H, W, C) input.
    """
    pixel_axes = (0, 1)
    return np.mean(image, axis=pixel_axes)

def calculate_color_histogram(image):
    """Return a flattened, normalised joint histogram of three channels.

    The histogram uses 8 bins per channel over the value range [0, 256),
    giving 8 * 8 * 8 = 512 entries after flattening.

    Parameters
    ----------
    image : numpy.ndarray
        Three-channel image accepted by ``cv2.calcHist``.

    Returns
    -------
    numpy.ndarray
        1-D histogram, normalised with the default settings of
        ``cv2.normalize``.
    """
    channels = [0, 1, 2]
    bins_per_channel = [8, 8, 8]
    value_ranges = [0, 256, 0, 256, 0, 256]

    histogram = cv2.calcHist([image], channels, None, bins_per_channel, value_ranges)
    # The histogram is passed as both source and destination, exactly as before.
    histogram = cv2.normalize(histogram, histogram)
    return histogram.flatten()


### Texture-based Features

In [112]:
from skimage.feature import greycomatrix, greycoprops
from skimage.feature import local_binary_pattern

def get_patches(image, n = 10):
    """Split a 2-D image into an ``n`` x ``n`` grid of patches.

    Patch borders are placed at ``int(k * size / n)`` along each axis, so
    patches may differ by one pixel in size when the image size is not a
    multiple of ``n``.

    Parameters
    ----------
    image : numpy.ndarray
        2-D array; unpacking its shape fails for any other rank.
    n : int, default 10
        Number of patches along each axis.

    Returns
    -------
    list of numpy.ndarray
        ``n * n`` slices of ``image``, ordered row-block by row-block and,
        within a row-block, left to right.
    """
    n_rows, n_cols = image.shape

    # Pre-compute the n + 1 cut positions along each axis.
    row_cuts = [int(k * n_rows / n) for k in range(n + 1)]
    col_cuts = [int(k * n_cols / n) for k in range(n + 1)]

    return [
        image[row_cuts[i]:row_cuts[i + 1], col_cuts[j]:col_cuts[j + 1]]
        for i in range(n)
        for j in range(n)
    ]

# Gray-Level Co-occurrence Matrix (GLCM): Computes the distribution of co-occurring pixel values in different directions.
def calculate_glcm_features(image, n = 8):
    """Compute patch-wise GLCM texture statistics for a grayscale image.

    The image is cut into an ``n`` x ``n`` grid with ``get_patches``.  For
    every patch a normalised, symmetric grey-level co-occurrence matrix is
    built for distances 1 and 3 at angles 0, pi/4 and pi/2, and its energy,
    homogeneity and contrast are collected.

    Parameters
    ----------
    image : numpy.ndarray
        2-D image.  Assumed to hold float intensities in [0, 1], since it is
        multiplied by 255 before the cast to ``uint8`` -- confirm with caller.
    n : int, default 8
        Number of patches along each axis.

    Returns
    -------
    numpy.ndarray
        1-D vector with the statistics of all patches concatenated in
        patch order.
    """
    # The values are unused; the unpacking rejects non-2-D input up front,
    # just as the original did.
    _n_rows, _n_cols = image.shape

    # Rescale to the 0..255 integer grey levels used for the GLCM.
    image_uint8 = (image*255).astype('uint8')

    pixel_offsets = [1, 3]
    directions = [0, np.pi/4, np.pi/2]
    stat_names = ['energy', 'homogeneity', 'contrast']

    per_patch_stats = []
    for patch in get_patches(image_uint8, n):
        cooccurrence = greycomatrix(patch, pixel_offsets, directions, normed=True, symmetric=True)
        # One flat vector per patch: all energies, then homogeneities, then contrasts.
        per_patch_stats.append(
            np.hstack([greycoprops(cooccurrence, name).ravel() for name in stat_names])
        )

    return np.array(per_patch_stats).flatten()

# Local Binary Patterns (LBP): Captures the patterns in the texture of the image.
def calculate_lbp_features(image):
    """Return a normalised histogram of uniform Local Binary Pattern codes.

    The BGR image is converted to grayscale, LBP codes are computed with
    8 neighbours at radius 1 (``method='uniform'``), and the codes are
    histogrammed over the bin edges 0, 1, ..., 9.

    Parameters
    ----------
    image : numpy.ndarray
        BGR colour image accepted by ``cv2.cvtColor``.

    Returns
    -------
    numpy.ndarray
        9 float values that sum to (almost exactly) 1.
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    codes = local_binary_pattern(gray, 8, 1, method='uniform')

    # NOTE(review): edges 0..9 give 9 bins and numpy's last bin is closed on
    # both sides, so codes 8 and 9 share a bin.  Uniform LBP with 8 points
    # presumably yields codes 0..9 -- confirm whether 10 bins were intended.
    counts, _edges = np.histogram(codes.ravel(), bins=np.arange(0, 10), range=(0, 10))
    counts = counts.astype("float")

    # The small epsilon guards against division by zero for an empty histogram.
    return counts / (counts.sum() + 1e-7)


### Shape-based Features

In [113]:
def calculate_contour_area(image):
    """Sum the areas of all external contours of the Otsu-thresholded image.

    Parameters
    ----------
    image : numpy.ndarray
        BGR colour image accepted by ``cv2.cvtColor``.

    Returns
    -------
    float or int
        Total area of the outer contours; ``0`` when none are found.
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # With THRESH_OTSU set, the threshold argument (0) is replaced by the
    # automatically chosen one.
    otsu_flags = cv2.THRESH_BINARY + cv2.THRESH_OTSU
    _threshold, mask = cv2.threshold(gray, 0, 255, otsu_flags)

    outlines, _hierarchy = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    return sum(cv2.contourArea(outline) for outline in outlines)

def calculate_aspect_ratio(image):
    """Return the width-to-height ratio of a three-dimensional image array.

    Parameters
    ----------
    image : numpy.ndarray
        Array whose shape is ``(height, width, channels)``; unpacking fails
        for any other rank.

    Returns
    -------
    float
        ``width / height``.
    """
    n_rows, n_cols, _n_channels = image.shape
    return n_cols / n_rows


### Sampling

In [114]:
# Draw a reproducible random subset of the image index (fixed seed).
# NOTE(review): neither X_sampled nor X_dropped is referenced by the later
# cells visible in this notebook.
newLen = 50
X_sampled = df.sample(n=newLen, random_state=42)

# Complement of the sample: every row whose index label was not drawn.
X_dropped = df.drop(index=X_sampled.index)

# The two shapes should add up to the shape of the full index.
print("Sampled Data Shape:", X_sampled.shape)
print("Dropped Data Shape:", X_dropped.shape)

Sampled Data Shape: (50, 2)
Dropped Data Shape: (872, 2)


## Preprocessing

In [116]:
import os
import cv2
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from skimage import exposure
from skimage.transform import resize
from tqdm import tqdm


# Per-image preprocessing and feature extraction.
#
# For every row of `df` the image is loaded as grayscale, contrast-equalised,
# resized to a fixed shape, and its HOG and GLCM feature vectors are stored
# as one array per cell in `X`.  The string label is stored in `y`.

# NOTE: these two lists are not filled by this cell.  They are kept only so
# that any other cell still referring to them keeps working.
features = []
targets = []

X_columns = ['hog_features', 'glcm_features']
y_column = 'label'

X = pd.DataFrame(columns=X_columns)
y = pd.DataFrame(columns=[y_column])

# Iterate over the indexed images; `total` lets tqdm show a proper progress bar.
for index, row in tqdm(df.iterrows(), total=df.shape[0]):
    image_path = row['image_path']
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)

    # cv2.imread does not raise on a missing or unreadable file, it returns
    # None.  Fail here with the offending path instead of an obscure error
    # deep inside the equalisation step.
    if image is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")

    # Brightness correction (adaptive histogram equalisation).
    image_corr = exposure.equalize_adapthist(image, clip_limit=0.03)

    # Resize to 320 rows x 400 columns so every image yields feature vectors
    # of the same length.
    image_corr = resize(image_corr, (320, 400), anti_aliasing=True)

    # Store the feature arrays in the object-dtype frame, one array per cell.
    X.loc[index, 'hog_features'] = calculate_hog_features(image_corr)
    X.loc[index, 'glcm_features'] = calculate_glcm_features(image_corr)

    # Keep the textual label ('flooded' / 'non-flooded') as the target.
    y.loc[index, 'label'] = row['label']



100%|██████████| 922/922 [2:09:36<00:00,  8.43s/it]     


In [117]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 922 entries, 0 to 921
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   hog_features   922 non-null    object
 1   glcm_features  922 non-null    object
dtypes: object(2)
memory usage: 53.9+ KB


In [118]:
# Inspect the stored features.  `X.head()` is not the last expression of the
# cell, so its result is not displayed; only the element type of the first
# image's HOG vector is shown.
X.head()
type(X.iloc[0]['hog_features'][0])


numpy.float64

In [140]:
# Check the per-image feature lengths.  Only the last expression is displayed,
# so the recorded output is the GLCM vector shape; the HOG shape on the first
# line is evaluated but not shown.
X['hog_features'][0].shape
X['glcm_features'][0].shape

(1152,)

In [None]:
# Scale the numerical features in X using StandardScaler.
#
# X stores one feature *array* per cell (object dtype), which StandardScaler
# cannot convert to a numeric matrix.  Each row's arrays are therefore
# concatenated into one vector and the rows stacked into an
# (n_samples, n_features) matrix before scaling.
X_matrix = np.vstack([np.hstack(list(row)) for row in X.values])

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_matrix)
# NOTE(review): the scaler is fitted on all samples before the train/test
# split, so test-set statistics leak into the scaling.  X_scaled is also not
# used by the later cells visible in this notebook -- confirm the intent.

## Split the Data

In [119]:
# Hold out 20% of the samples for testing; random_state fixes the shuffle so
# the split is reproducible.
# NOTE(review): no `stratify=` argument is passed, so the class proportions
# are not forced to match between the two splits.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [120]:
# Flatten the single-column label frame into a 1-D array for the estimators.
# Only y_train is converted here; y_test stays a DataFrame.
y_train = y_train.values.ravel()

In [121]:
y_train

array(['flooded', 'non-flooded', 'flooded', 'flooded', 'flooded',
       'flooded', 'non-flooded', 'non-flooded', 'non-flooded', 'flooded',
       'flooded', 'flooded', 'flooded', 'flooded', 'flooded',
       'non-flooded', 'non-flooded', 'non-flooded', 'flooded', 'flooded',
       'flooded', 'flooded', 'non-flooded', 'flooded', 'non-flooded',
       'non-flooded', 'non-flooded', 'flooded', 'flooded', 'non-flooded',
       'non-flooded', 'flooded', 'flooded', 'non-flooded', 'flooded',
       'non-flooded', 'non-flooded', 'flooded', 'non-flooded', 'flooded',
       'flooded', 'non-flooded', 'flooded', 'non-flooded', 'flooded',
       'non-flooded', 'non-flooded', 'flooded', 'flooded', 'flooded',
       'flooded', 'flooded', 'flooded', 'flooded', 'non-flooded',
       'flooded', 'flooded', 'non-flooded', 'flooded', 'flooded',
       'non-flooded', 'flooded', 'flooded', 'non-flooded', 'non-flooded',
       'flooded', 'non-flooded', 'flooded', 'non-flooded', 'non-flooded',
       'non-floo

In [122]:
# Dump the raw training frame as an object array (one feature array per cell).
# NOTE(review): this prints a very long output; a `.shape` or a single row
# would probably be enough for inspection.
print(X_train.values)

[[array([0.35933428, 0.35933428, 0.35933428, 0.35933428, 0.35933428,
         0.35933428, 0.35933428, 0.31008409, 0.35355339, 0.35355339,
         0.35355339, 0.35355339, 0.35355339, 0.35355339, 0.35355339,
         0.35355339, 0.35355339, 0.35355339, 0.35355339, 0.35355339,
         0.35355339, 0.35355339, 0.35355339, 0.35355339, 0.35355339,
         0.35355339, 0.35355339, 0.35355339, 0.35355339, 0.35355339,
         0.35355339, 0.35355339, 0.35497466, 0.35497466, 0.35497466,
         0.34535128, 0.35311536, 0.35497466, 0.35497466, 0.35497466,
         0.35355339, 0.35355339, 0.35355339, 0.35355339, 0.35355339,
         0.35355339, 0.35355339, 0.35355339, 0.35355339, 0.35355339,
         0.35355339, 0.35355339, 0.35355339, 0.35355339, 0.35355339,
         0.35355339, 0.35355339, 0.35355339, 0.35355339, 0.35355339,
         0.35355339, 0.35355339, 0.35355339, 0.35355339, 0.28154063,
         0.36181186, 0.4062988 , 0.4062988 , 0.4062988 , 0.4062988 ,
         0.27325224, 0.23419094, 0

In [123]:
# Turn the object-dtype training frame into a plain numeric matrix.

# One list per feature family, each holding one 1-D array per training sample.
hog_features_temp = X_train['hog_features'].tolist()
glcm_features_temp = X_train['glcm_features'].tolist()

# Each list becomes an (n_samples, n_features_of_that_family) block; hstack
# places the HOG block and the GLCM block side by side, one row per sample.
features = np.hstack([hog_features_temp, glcm_features_temp])

# `features` is already 2-D, so vstack re-stacks its rows without changing
# the shape.
X_train_numpy = np.vstack(features)

In [142]:
print(X_train_numpy.shape)

(737, 1664)


## Model

In [129]:
# Fit a linear SVM on the stacked HOG + GLCM training matrix.  The iteration
# cap is increased to 1,000,000.
# NOTE(review): X_train_numpy is built from the unscaled feature frame, so the
# model is trained on unstandardised features.
svm = LinearSVC(max_iter=1_000_000)
svm.fit(X_train_numpy, y_train)


LinearSVC(max_iter=1000000)

In [131]:
# Turn the object-dtype test frame into a plain numeric matrix, using the same
# column order (HOG first, then GLCM) as for the training matrix.

# One list per feature family, each holding one 1-D array per test sample.
hog_features_test = X_test['hog_features'].tolist()
glcm_features_test = X_test['glcm_features'].tolist()

# Concatenate the two vectors of every sample into a single 1-D array.
features_test = [
    np.hstack(pair) for pair in zip(hog_features_test, glcm_features_test)
]

# Stack the per-sample vectors into an (n_samples, n_features) matrix.
X_test_numpy = np.vstack(features_test)

## Evaluation

In [132]:
# Predict with the linear SVM on the stacked numeric test matrix, which has
# the same column layout the model was trained on (not the object-dtype
# X_test frame).
y_pred = svm.predict(X_test_numpy)


In [133]:
# Fraction of held-out samples the linear SVM labelled correctly, reported as
# a percentage with two decimals.
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: {:.2f}%'.format(accuracy * 100))


Accuracy: 70.27%


In [145]:
# Random-forest baseline on the same stacked features: 100 trees, unlimited
# depth, fixed seed.  `fit` returns the estimator itself, so construction and
# training are chained.
clf = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=0).fit(
    X_train_numpy, y_train
)

# Overwrites the SVM predictions held in y_pred with the forest's predictions.
y_pred = clf.predict(X_test_numpy)

# Per-class precision / recall / F1 on the held-out set.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

     flooded       0.90      0.86      0.88        98
 non-flooded       0.85      0.90      0.87        87

    accuracy                           0.88       185
   macro avg       0.88      0.88      0.88       185
weighted avg       0.88      0.88      0.88       185



In [146]:
# Benchmark many classifiers at once with lazypredict on the same train/test
# matrices and show the first 20 rows of the resulting score table.
# NOTE(review): `clf` is rebound here from the random forest to the
# LazyClassifier, so the fitted forest is no longer reachable by that name.
from lazypredict.Supervised import LazyClassifier
clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models,predictions = clf.fit(X_train_numpy, X_test_numpy, y_train, y_test)
models.head(20)

100%|██████████| 29/29 [01:16<00:00,  2.65s/it]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LGBMClassifier,0.9,0.9,,0.9,19.69
NuSVC,0.88,0.88,,0.88,1.32
RandomForestClassifier,0.88,0.88,,0.88,2.99
ExtraTreesClassifier,0.88,0.88,,0.88,1.18
SVC,0.87,0.87,,0.87,1.06
AdaBoostClassifier,0.86,0.86,,0.86,19.59
LogisticRegression,0.81,0.81,,0.81,1.17
CalibratedClassifierCV,0.8,0.8,,0.8,6.19
LinearSVC,0.8,0.8,,0.8,1.82
PassiveAggressiveClassifier,0.8,0.8,,0.8,0.57
