# Project

In [1]:
# Name: Alexis Lizardo
# Student ID: 301318503

## Imports

In [193]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn
import sys
import os
import glob
import re
from PIL import Image
from scipy import misc
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
from sklearn.multiclass import OneVsRestClassifier

## Functions

In [3]:
# Adapted from: https://stackoverflow.com/a/26912445

def remove_headers(file_list):
    pattern = r'^"Date/Time"'
    pattern = re.compile(pattern)
    path = file_list[0].rpartition('/')[0] + "/"
    for file in file_list:
        new_file = "weather-data/"+ file.split('/')[1]
        with open(file,"r") as input:
            with open(new_file,"w") as output: 
                for line in input:
                    if pattern.match(line):
                        output.write(line)
                        break
                for line in input:
                    output.write(line)
        #os.rename(new_file, path + new_file)

In [4]:
def path_to_datetime(path):
    fname = path.split('/')[1]
    fname_no_ext = fname.split('.')[0]
    date = fname_no_ext.split('-')[1]
    dt = pd.to_datetime(date, unit='ns')
    return dt

## Specify in report that having color information did not provide better accuracy

In [184]:
def img_data(filenames, dates):
    N = dates.size
    image_data = np.empty((N, 192*256), dtype=np.uint8)
    i = int(0)
    for f in filenames:
        if (i >= N):
            break
        if (path_to_datetime(f) == dates.values[i]):
            img = misc.imread(f, flatten=True)
            image_data[i, ...] = img.flatten()
            i = i + 1
    return image_data

In [8]:
def img_datetime(filenames):
    N = len(filenames)
    images_date = np.empty(N, dtype=np.dtype('M8[ns]'))
    for i, f in enumerate(filenames):
        images_date[i] = path_to_datetime(f)
    return images_date

## Get Data

### Get file names

In [59]:
images_path = 'katkam-scaled/'
orig_weather_path = 'yvr-weather/'
mod_weather_path = 'weather-data/'
orig_files = glob.glob(os.path.join(orig_weather_path, "*.csv"))
image_files = glob.glob(os.path.join(images_path, "*.jpg"))

### Clean weather data files

In [60]:
remove_headers(orig_files)
mod_files = glob.glob(os.path.join(mod_weather_path, "*.csv"))

### Load weather data into dataframe

In [61]:
# Adapted from: https://stackoverflow.com/a/36416258
columns = ['Date/Time', 'Weather']
df_from_each_file = (pd.read_csv(f, parse_dates=['Date/Time'], usecols=columns) for f in mod_files)
weather_data = pd.concat(df_from_each_file, ignore_index=True)
weather_data = weather_data.dropna()

### Get dates of image files

In [62]:
images_dates = img_datetime(image_files)
dates_df = pd.DataFrame(images_dates, columns=['Date/Time'])

### Merge weather_data with dates_df to keep images which have weather information

In [64]:
data = pd.merge(weather_data, dates_df, on = 'Date/Time')

### Get categories

In [65]:
#weather_data.Weather.unique()

In [66]:
#categories = ['Cloudy', 'Rain', 'Clear', 'Drizzle', 'Fog', 'Thunderstorms', 'Snow', 'Ice Pellets', 'Snow Pellets']
#pattern = re.compile(r'\b(?:%s)\b' % '|'.join(categories))

In [67]:
categories = ['Cloudy', 'Rain', 'Clear', 'Drizzle', 'Fog', 'Snow']
pattern = re.compile(r'\b(?:(%s))\b' % '|'.join(categories))
data['Categories_ML'] = data['Weather'].str.findall(pattern)
data['Categories'] = data['Weather'].str.extract(pattern, expand=False)

In [68]:
data = data.dropna(subset = ['Categories', 'Categories_ML'])

### Load image data

In [185]:
image_data = img_data(image_files, data['Date/Time'])

## Train data

### Split data

In [198]:
X = image_data
y = data['Categories']

In [199]:
lbl_enc = LabelEncoder()
y_labels = lbl_enc.fit_transform(y)

In [200]:
X_train, X_test, y_train, y_test = train_test_split(X, y_labels, random_state = 21)

### Fit model

In [39]:
param_grid = { 
    'n_estimators': [200, 700],
    'max_features': ['auto', 'sqrt', 'log2', 0.50],
}

## "Good" accuracy, around 0.7135

In [209]:
model = make_pipeline(
    PCA(250),
    RandomForestClassifier(n_estimators=300, n_jobs=-1)
)
model.fit(X_train, y_train)
print(model.score(X_test, y_test))

0.725978647687


## "Best" accuracy soo far, 0.73136

In [203]:
model = make_pipeline(
    PCA(250),
    BaggingClassifier(n_estimators=300, n_jobs=-1)
)
model.fit(X_train, y_train)
print(model.score(X_test, y_test))

0.729537366548


## Try multilabel classifier

In [None]:
X = image_data
y = data['Categories_ML']
y = MultiLabelBinarizer(classes=categories).fit_transform(y)

## Determining time of day

In [187]:
data['t_day'] = 'Afternoon'
data.loc[data['Date/Time'].dt.hour < 12, 't_day'] = 'Morning'
data.loc[data['Date/Time'].dt.hour >= 18, 't_day'] = 'Evening'

In [188]:
X = image_data
y = data['t_day']

In [189]:
lbl_enc = LabelEncoder()
y_labels = lbl_enc.fit_transform(y)

In [191]:
X_train, X_test, y_train, y_test = train_test_split(X, y_labels, random_state = 21)

In [192]:
model = make_pipeline(
    PCA(250),
    RandomForestClassifier(n_estimators=250, n_jobs=-1)
)
model.fit(X_train, y_train)
print(model.score(X_test, y_test))

0.747330960854


In [167]:
model = make_pipeline(
    PCA(250),
    BaggingClassifier(n_estimators=250, n_jobs=-1)
)
model.fit(X_train, y_train)
print(model.score(X_test, y_test))

0.733096085409


### Test model