# Project

In [1]:
# Name: Alexis Lizardo
# Student ID: 301318503

## Imports

In [68]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn
import sys
import os
import glob
import re
from scipy import misc
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier

## Functions

In [3]:
# Adapted from: https://stackoverflow.com/a/26912445

def remove_headers(file_list):
    pattern = r'^"Date/Time"'
    pattern = re.compile(pattern)
    path = file_list[0].rpartition('/')[0] + "/"
    for file in file_list:
        new_file = "weather-data/"+ file.split('/')[1]
        with open(file,"r") as input:
            with open(new_file,"w") as output: 
                for line in input:
                    if pattern.match(line):
                        output.write(line)
                        break
                for line in input:
                    output.write(line)
        #os.rename(new_file, path + new_file)

In [4]:
def path_to_datetime(path):
    fname = path.split('/')[1]
    fname_no_ext = fname.split('.')[0]
    date = fname_no_ext.split('-')[1]
    dt = pd.to_datetime(date, unit='ns')
    return dt

## Specify in report that having color information did not provide better accuracy

In [5]:
def img_data(filenames, dates):
    N = dates.size
    image_data = np.empty((N, 192*256), dtype=np.uint8)
    i = int(0)
    for f in filenames:
        if (i >= N):
            break
        if (path_to_datetime(f) == dates.values[i]):
            img = misc.imread(f, flatten=True)
            image_data[i, ...] = img.flatten()
            i = i + 1
    return image_data

In [6]:
def img_datetime(filenames):
    N = len(filenames)
    images_date = np.empty(N, dtype=np.dtype('M8[ns]'))
    for i, f in enumerate(filenames):
        images_date[i] = path_to_datetime(f)
    return images_date

## Get Data

### Get file names

In [7]:
images_path = 'katkam-scaled/'
orig_weather_path = 'yvr-weather/'
mod_weather_path = 'weather-data/'
orig_files = glob.glob(os.path.join(orig_weather_path, "*.csv"))
image_files = glob.glob(os.path.join(images_path, "*.jpg"))

### Clean weather data files

In [8]:
remove_headers(orig_files)
mod_files = glob.glob(os.path.join(mod_weather_path, "*.csv"))

### Load weather data into dataframe

In [9]:
# Adapted from: https://stackoverflow.com/a/36416258
columns = ['Date/Time', 'Weather']
df_from_each_file = (pd.read_csv(f, parse_dates=['Date/Time'], usecols=columns) for f in mod_files)
weather_data = pd.concat(df_from_each_file, ignore_index=True)
#weather_data = weather_data.dropna()

### Get dates of image files

In [10]:
images_dates = img_datetime(image_files)
dates_df = pd.DataFrame(images_dates, columns=['Date/Time'])

### Merge weather_data with dates_df to keep images which have weather information

In [11]:
data = pd.merge(weather_data, dates_df, on = 'Date/Time')

### Get weather categories

In [12]:
categories = ['Cloudy', 'Rain', 'Clear', 'Drizzle', 'Fog', 'Snow']
pattern = re.compile(r'\b(?:(%s))\b' % '|'.join(categories))
data['weather_mlabel'] = data['Weather'].str.findall(pattern)
data['weather_label'] = data['Weather'].str.extract(pattern, expand=False)

## Classify according to weather categories

### Load image data

In [13]:
data = data.dropna(subset = ['weather_label', 'weather_mlabel'])
image_data = img_data(image_files, data['Date/Time'])

### Single label classification

#### Split data

In [57]:
X = image_data
y = data['weather_label']

In [58]:
lbl_enc = LabelEncoder()
y_labels = lbl_enc.fit_transform(y)

In [59]:
X_train, X_test, y_train, y_test = train_test_split(X, y_labels, random_state = 21)

#### Fit model

#### "Good" accuracy, around 0.7135

In [71]:
model = make_pipeline(
    PCA(250),
    RandomForestClassifier(n_estimators=500, n_jobs=-1)
)
model.fit(X_train, y_train)
print(model.score(X_test, y_test))

0.52846975089


#### "Best" accuracy soo far, 0.779, 500 estimator

In [67]:
model = make_pipeline(
    PCA(250),
    BaggingClassifier(n_estimators=500, n_jobs=-1)
)
model.fit(X_train, y_train)
print(model.score(X_test, y_test))

0.725978647687


### Multilabel classification

#### Split data

In [41]:
X = image_data
y = data['weather_mlabel']
y = MultiLabelBinarizer(classes=categories).fit_transform(y)

In [42]:
X_train, X_test, y_train, y_test = train_test_split(X, y_labels, random_state = 21)

#### Fit model

In [56]:
model = make_pipeline(
    PCA(250),
    #RandomForestClassifier(n_estimators=500, n_jobs=-1)
    KNeighborsClassifier(n_neighbors=5, weights='distance', n_jobs=-1)
)
model.fit(X_train, y_train)
print(model.score(X_test, y_test))

0.670818505338


## Load data for other types of classification

### Load Temp and Visibility data into dataframe

In [25]:
# Adapted from: https://stackoverflow.com/a/36416258
columns = ['Date/Time', 'Temp (°C)', 'Visibility (km)']
df_from_each_file = (pd.read_csv(f, parse_dates=['Date/Time'], usecols=columns) for f in mod_files)
weather_data = pd.concat(df_from_each_file, ignore_index=True)
weather_data = weather_data.dropna()

### Get dates of image files

In [26]:
images_dates = img_datetime(image_files)
dates_df = pd.DataFrame(images_dates, columns=['Date/Time'])

### Merge weather_data with dates_df to keep images which have weather information

In [27]:
data = pd.merge(weather_data, dates_df, on = 'Date/Time')

### Load image data

In [32]:
image_data = img_data(image_files, data['Date/Time'])

## Classify by time of day

### Specify categories

In [34]:
data['t_day'] = 'Afternoon'
data.loc[data['Date/Time'].dt.hour < 12, 't_day'] = 'Morning'
data.loc[data['Date/Time'].dt.hour >= 18, 't_day'] = 'Evening'

### Split data

In [35]:
X = image_data
y = data['t_day']

In [36]:
lbl_enc = LabelEncoder()
y_labels = lbl_enc.fit_transform(y)

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X, y_labels, random_state = 21)

### Fit model

In [38]:
model = make_pipeline(
    PCA(250),
    RandomForestClassifier(n_estimators=250, n_jobs=-1)
)
model.fit(X_train, y_train)
print(model.score(X_test, y_test))

0.765500794913


## Classify by Temperature range

### Specify categories

In [39]:
data['temp_label'] = 'Cool'
data.loc[data['Temp (°C)'] < 10, 'temp_label'] = 'Cold'
data.loc[data['Temp (°C)'] >= 20 , 'temp_label'] = 'Warm'

### Split data

In [40]:
X = image_data
y = data['temp_label']

In [41]:
lbl_enc = LabelEncoder()
y_labels = lbl_enc.fit_transform(y)

In [42]:
X_train, X_test, y_train, y_test = train_test_split(X, y_labels, random_state = 21)

### Fit model

In [43]:
model = make_pipeline(
    PCA(250),
    RandomForestClassifier(n_estimators=150, n_jobs=-1)
)
model.fit(X_train, y_train)
print(model.score(X_test, y_test))

0.800476947536


In [44]:
model = make_pipeline(
    PCA(250),
    BaggingClassifier(n_estimators=300, n_jobs=-1)
)
model.fit(X_train, y_train)
print(model.score(X_test, y_test))

0.825914149444


## Classify by Visibility

### Specify categories

In [45]:
data['visibility_label'] = 'Poor'
data.loc[(data['Visibility (km)'] > 5) & (data['Visibility (km)'] <= 10), 'visibility_label'] = 'Moderate'
data.loc[(data['Visibility (km)'] > 10) & (data['Visibility (km)'] <= 20), 'visibility_label'] = 'Good'
data.loc[(data['Visibility (km)'] > 20) & (data['Visibility (km)'] <= 30), 'visibility_label'] = 'Very Good'
data.loc[data['Visibility (km)'] > 30, 'visibility_label'] = 'Excellent'

### Split data

In [46]:
X = image_data
y = data['visibility_label']

In [47]:
lbl_enc = LabelEncoder()
y_labels = lbl_enc.fit_transform(y)

In [48]:
X_train, X_test, y_train, y_test = train_test_split(X, y_labels, random_state = 21)

### Fit model

In [49]:
model = make_pipeline(
    PCA(250),
    BaggingClassifier(n_estimators=150, n_jobs=-1)
)
model.fit(X_train, y_train)
print(model.score(X_test, y_test))

0.77106518283
