# Project

In [1]:
# Name: Alexis Lizardo
# Student ID: 301318503

## Imports

In [30]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn
import sys
import os
import glob
import re
from PIL import Image
from scipy import misc
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier

## Functions

In [3]:
# Adapted from: https://stackoverflow.com/a/26912445

def remove_headers(file_list):
    pattern = r'^"Date/Time"'
    pattern = re.compile(pattern)
    path = file_list[0].rpartition('/')[0] + "/"
    for file in file_list:
        new_file = "weather-data/"+ file.split('/')[1]
        with open(file,"r") as input:
            with open(new_file,"w") as output: 
                for line in input:
                    if pattern.match(line):
                        output.write(line)
                        break
                for line in input:
                    output.write(line)
        #os.rename(new_file, path + new_file)

In [4]:
def path_to_datetime(path):
    fname = path.split('/')[1]
    fname_no_ext = fname.split('.')[0]
    date = fname_no_ext.split('-')[1]
    dt = pd.to_datetime(date)
    return dt

In [5]:
def create_img_df(filenames):
    N = len(filenames)
    image_data = np.empty((N, 192*256), dtype=np.uint8)
    for i, f in enumerate(filenames):
        img = misc.imread(f, flatten=True)
        image_data[i, ...] = img.flatten()
    img_df = pd.DataFrame(image_data)
    return img_df

In [6]:
def img_dt(filenames):
    N = len(filenames)
    images_date = np.empty(N, dtype=np.dtype('M8[ns]'))
    for i, f in enumerate(filenames):
        images_date[i] = path_to_datetime(f)
    return images_date

## Get Data

### Get file names

In [7]:
images_path = 'katkam-scaled/'
orig_weather_path = 'yvr-weather/'
mod_weather_path = 'weather-data/'
orig_files = glob.glob(os.path.join(orig_weather_path, "*.csv"))
image_files = glob.glob(os.path.join(images_path, "*.jpg"))

### Clean weather data files

In [8]:
remove_headers(orig_files)
mod_files = glob.glob(os.path.join(mod_weather_path, "*.csv"))

### Load weather data into dataframe

In [9]:
# Adapted from: https://stackoverflow.com/a/36416258
columns = ['Date/Time', 'Weather']
df_from_each_file = (pd.read_csv(f, parse_dates=['Date/Time'], usecols=columns) for f in mod_files)
weather_data = pd.concat(df_from_each_file, ignore_index=True)
weather_data = weather_data.dropna()

### Get categories

In [10]:
#weather_data.Weather.unique()

In [11]:
#categories = ['Cloudy', 'Rain', 'Clear', 'Drizzle', 'Fog', 'Thunderstorms', 'Snow', 'Ice Pellets', 'Snow Pellets']
#pattern = re.compile(r'\b(?:%s)\b' % '|'.join(categories))

In [12]:
categories = ['Cloudy', 'Rain', 'Clear', 'Drizzle', 'Fog', 'Thunderstorms', 'Snow']
pattern = re.compile(r'\b(?:(%s))\b' % '|'.join(categories))
weather_data['Categories'] = weather_data['Weather'].str.extract(pattern)

  This is separate from the ipykernel package so we can avoid doing imports until


In [13]:
image_data = create_img_df(image_files)
image_data['Date/Time'] = img_dt(image_files)

In [14]:
#dates = img_dt(image_files)

### Merge weather data with corresponding image data

In [15]:
data = pd.merge(weather_data, image_data, on = 'Date/Time')
#data = weather_data[weather_data['Date/Time'] == dates]

## Train data

### Split data

In [16]:
cols = ['Date/Time', 'Weather', 'Categories']
X = data[data.columns.difference(cols)]
Y = data['Categories']
#Y = MultiLabelBinarizer(classes=categories).fit_transform(Y)

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state = 42)

### Fit model

In [130]:
# model = make_pipeline(
#     PCA(250),
#     OneVsRestClassifier(SVC(kernel='linear', C=2.0))
# )
# model.fit(X_train, y_train)

In [None]:
model = make_pipeline(
    PCA(250),
    #KNeighborsClassifier(6)
    SVC(kernel='linear', decision_function_shape='ovr')
)
model.fit(X_train, y_train)

### Test model

In [48]:
print(model.score(X_test, y_test))

0.674377224199
