In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
import tensorflow as tf
from tensorflow import keras
from keras import datasets, layers, models

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

# Preprocessing the dataset
The dataset consists of 556 different folders that need to be extracted into subfolders. For the preliminary results, I chose to import a small sample size of around 5700 images from the folders named 10-0 and 100-0, as marked in the stationId column. I will also only be evaluating the road condition label for this section. Since the 10-0 and 100-10 folder images are detailed in the first 7000 rows of the images.csv file, then dataframe will only be composed of the first 7000 rows.

In [6]:
df = pd.read_csv('/content/drive/My Drive/MAIS_202_Data/images.csv')
df = df[['stationId', 'imgPath', 'roadCondition']]
df = df.head(7000)
df.head(20)

Unnamed: 0,stationId,imgPath,roadCondition
0,10-0,10-0/83/1552428483_10-0.jpg,dry
1,10-0,10-0/82/1552429682_10-0.jpg,dry
2,10-0,10-0/84/1552430884_10-0.jpg,dry
3,10-0,10-0/81/1552432081_10-0.jpg,dry
4,10-0,10-0/82/1552433282_10-0.jpg,dry
5,10-0,10-0/81/1552434481_10-0.jpg,dry
6,10-0,10-0/81/1552435681_10-0.jpg,dry
7,10-0,10-0/82/1552436882_10-0.jpg,dry
8,10-0,10-0/82/1552438082_10-0.jpg,dry
9,10-0,10-0/83/1552439283_10-0.jpg,dry


The following section creates two lists, one that stores each image and one that stores the image's respective label.

In [7]:
import os
import cv2

# Lists to store images and labels
images = []
labels = []

# Iterate through each row in df
for index, row in df.iterrows():

  # Find the path to retrieve the image
  img_path = os.path.join('/content/drive/MyDrive/MAIS_202_Data/', row['imgPath'])

  # Read the image
  img = cv2.imread(img_path)

  # Resizing and adding the image information and label into the lists, if the image exists
  if img is not None:
    img = cv2.resize(img, (128, 128))
    images.append(img)
    labels.append(row['roadCondition'])
    print(f"success loading {img_path}")
  else:
    print(f"Error loading {img_path}") # Prints this message if the image was not loaded properly

# Check number of images in the lists
print(f"Loaded {len(images)} images successfully.")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
success loading /content/drive/MyDrive/MAIS_202_Data/100-0/86/1544927786_100-0.jpg
success loading /content/drive/MyDrive/MAIS_202_Data/100-0/68/1544928368_100-0.jpg
success loading /content/drive/MyDrive/MAIS_202_Data/100-0/10/1544929710_100-0.jpg
Error loading /content/drive/MyDrive/MAIS_202_Data/100-0/09/1544930909_100-0.jpg
success loading /content/drive/MyDrive/MAIS_202_Data/100-0/87/1544932587_100-0.jpg
success loading /content/drive/MyDrive/MAIS_202_Data/100-0/49/1544933649_100-0.jpg
success loading /content/drive/MyDrive/MAIS_202_Data/100-0/47/1544934847_100-0.jpg
success loading /content/drive/MyDrive/MAIS_202_Data/100-0/49/1544936049_100-0.jpg
success loading /content/drive/MyDrive/MAIS_202_Data/100-0/48/1544937248_100-0.jpg
success loading /content/drive/MyDrive/MAIS_202_Data/100-0/59/1544938359_100-0.jpg
success loading /content/drive/MyDrive/MAIS_202_Data/100-0/49/1544939649_100-0.jpg
success loading /content

# Creating the training and validation data sets

In [13]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

# Convert image list to a numpy array
X = np.array(images)

# Convert list of (unique) class names into list of integers
class_to_int = {}

for i, class_name in enumerate(np.unique(labels)):
  class_to_int[class_name] = i

# Replace label name with its associated integer
Y = []

for label in labels:
  Y.append(class_to_int[label])

# Convert label list to a numpy array
Y = np.array(Y)

# Convert label array to one-hot encoding
Y = to_categorical(Y)

# Split the data into training and validation sets, 20% of the data will be allocated for validation
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.2, random_state=42)


# Implementing the CNN Model

The following section implements the VGG16 model. Further into the project, I hope to implement my own CNN model instead of relying on an external model.

In [14]:
from tensorflow.keras.applications import VGG16

# VGG16 Setup
base_model = VGG16(weights='imagenet', include_top=False, input_shape=(128, 128, 3))
base_model.trainable = False

model = models.Sequential([
    base_model,
    layers.Flatten(),
    layers.Dense(256, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(len(np.unique(labels)), activation='softmax')  # Output layer
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy', 'recall'])


# Training

In [None]:
history = model.fit(
    X_train, Y_train,
    epochs=10,
    validation_data= (X_val, Y_val)
)

Epoch 1/10
[1m 60/143[0m [32m━━━━━━━━[0m[37m━━━━━━━━━━━━[0m [1m8:18[0m 6s/step - accuracy: 0.5303 - loss: 16.2516 - recall: 0.5221