# Project

In [16]:
# Name: Alexis Lizardo
# Student ID: 301318503

## Imports

In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn
import sys
import os
import glob
import re
from PIL import Image

## Functions

In [18]:
# Adapted from: https://stackoverflow.com/a/26912445

def remove_headers(file_list, out_dir="weather-data/"):
    """Copy each CSV into ``out_dir`` without the preamble above its header row.

    Every line before the first one that starts with ``"Date/Time"`` is
    dropped; that line and everything after it are copied unchanged. Each copy
    keeps the base name of its source file.

    Parameters
    ----------
    file_list : iterable of str
        Paths of the raw CSV files. An empty iterable writes no files.
    out_dir : str, optional
        Directory that receives the cleaned copies. It is created when it
        does not exist yet. Defaults to ``"weather-data/"``.

    Notes
    -----
    A source file that contains no ``"Date/Time"`` line produces an empty
    output file, because nothing is copied until the header row is seen.
    """
    header_re = re.compile(r'^"Date/Time"')
    os.makedirs(out_dir, exist_ok=True)
    for src_path in file_list:
        # basename works for any directory depth, unlike split('/')[1].
        dst_path = os.path.join(out_dir, os.path.basename(src_path))
        with open(src_path, "r") as src, open(dst_path, "w") as dst:
            # Skip the preamble; stop right after emitting the header row.
            for line in src:
                if header_re.match(line):
                    dst.write(line)
                    break
            # The file iterator resumes after the header, so this copies the rest.
            dst.writelines(src)

In [19]:
def path_to_datetime(path):
    """Parse the timestamp embedded in a file name.

    The file name (the last component of ``path``) is cut at its first ``.``,
    the remainder is split on ``-``, and the second piece is handed to
    ``pd.to_datetime``. For example ``dir/katkam-20160605060000.jpg`` yields
    ``2016-06-05 06:00:00``.

    Parameters
    ----------
    path : str
        Path to the file; any number of leading directories is accepted.

    Returns
    -------
    pandas.Timestamp
        The parsed timestamp.

    Raises
    ------
    IndexError
        If the file name contains no ``-``.
    """
    # Take the real file name rather than "whatever follows the first slash".
    fname = os.path.basename(path)
    stem = fname.split('.')[0]
    stamp = stem.split('-')[1]
    return pd.to_datetime(stamp)

In [30]:
def create_img_df(filename):
    """Turn one image file into a single-row DataFrame.

    The decoded image array is flattened into one row whose columns are
    labelled ``0 .. N-1``; a ``'Date/Time'`` column holding the value of
    ``path_to_datetime(filename)`` is appended.

    Parameters
    ----------
    filename : str
        Path of the image to load; also the source of the timestamp.

    Returns
    -------
    pandas.DataFrame
        One row: the flattened image values followed by ``'Date/Time'``.
    """
    # The context manager closes the file even when decoding raises.
    # np.array() forces the lazy image to load, so it must run inside the block.
    with Image.open(filename) as im:
        pixels = np.array(im).flatten()
    # Build the row directly instead of creating a column and transposing it.
    im_df = pd.DataFrame(pixels.reshape(1, -1))
    im_df['Date/Time'] = path_to_datetime(filename)
    return im_df

## Get Data

### Get file names

In [31]:
# Directory locations, relative to the notebook's working directory.
images_path = 'katkam-scaled/'
orig_weather_path = 'yvr-weather/'
mod_weather_path = 'weather-data/'
# glob returns matches in arbitrary, filesystem-dependent order; sorting keeps
# later slices such as image_files[0:50] reproducible from run to run.
orig_files = sorted(glob.glob(os.path.join(orig_weather_path, "*.csv")))
image_files = sorted(glob.glob(os.path.join(images_path, "*.jpg")))

### Clean weather data files

In [32]:
# Write header-stripped copies of the raw weather CSVs, then list the results.
remove_headers(orig_files)
# Sorted so the order in which the files are later concatenated does not
# depend on the filesystem's listing order.
mod_files = sorted(glob.glob(os.path.join(mod_weather_path, "*.csv")))

### Load weather data into dataframe

In [33]:
# Adapted from: https://stackoverflow.com/a/36416258

# Lazily read one frame per cleaned CSV (parsing 'Date/Time' as datetimes),
# then stack them into a single frame with a fresh 0..n-1 index.
df_from_each_file = map(
    lambda csv_path: pd.read_csv(csv_path, parse_dates=['Date/Time']),
    mod_files,
)
weather_data = pd.concat(df_from_each_file, ignore_index=True)

### Load image data into dataframe

In [34]:
# Only the first 50 listed images are loaded; each becomes one row.
df_from_each_image = map(create_img_df, image_files[:50])
image_data = pd.concat(df_from_each_image, ignore_index=True)

In [44]:
# Preview the first rows of the flattened-image frame.
image_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,147447,147448,147449,147450,147451,147452,147453,147454,147455,Date/Time
0,108,127,170,106,125,168,103,122,165,100,...,0,0,0,0,0,0,0,0,0,2016-06-05 06:00:00
1,89,121,180,89,121,180,89,121,180,89,...,0,0,0,0,0,0,0,0,0,2016-06-05 07:00:00
2,79,118,183,80,119,184,80,119,184,81,...,0,0,0,0,0,0,0,0,0,2016-06-05 08:00:00
3,90,119,175,90,119,175,90,119,175,89,...,0,0,0,0,0,0,0,0,0,2016-06-05 09:00:00
4,125,150,191,122,147,188,121,146,187,122,...,0,0,0,0,0,0,0,0,0,2016-06-05 10:00:00


In [49]:
# Join weather rows to image rows on their shared timestamp column; with the
# default join type only timestamps present in both frames are kept.
data = weather_data.merge(image_data, on='Date/Time')

In [50]:
# Display the merged weather + image frame.
# NOTE(review): a bare frame renders as much as pandas' display limits allow;
# consider data.head() if the join grows beyond a handful of rows.
data

Unnamed: 0,Date/Time,Year,Month,Day,Time,Data Quality,Temp (°C),Temp Flag,Dew Point Temp (°C),Dew Point Temp Flag,...,147446,147447,147448,147449,147450,147451,147452,147453,147454,147455
0,2016-06-05 06:00:00,2016,6,5,06:00,‡,17.7,,15.2,,...,0,0,0,0,0,0,0,0,0,0
1,2016-06-05 07:00:00,2016,6,5,07:00,‡,18.9,,15.6,,...,0,0,0,0,0,0,0,0,0,0
2,2016-06-05 08:00:00,2016,6,5,08:00,‡,19.6,,15.5,,...,0,0,0,0,0,0,0,0,0,0
3,2016-06-05 09:00:00,2016,6,5,09:00,,,,,,...,0,0,0,0,0,0,0,0,0,0
4,2016-06-05 10:00:00,2016,6,5,10:00,‡,21.2,,15.8,,...,0,0,0,0,0,0,0,0,0,0
5,2016-06-05 11:00:00,2016,6,5,11:00,‡,22.6,,17.0,,...,0,0,0,0,0,0,0,0,0,0
6,2016-06-05 12:00:00,2016,6,5,12:00,‡,23.5,,15.9,,...,0,0,0,0,0,0,0,0,0,0
7,2016-06-05 13:00:00,2016,6,5,13:00,‡,24.0,,15.9,,...,0,0,0,0,0,0,0,0,0,0
8,2016-06-05 14:00:00,2016,6,5,14:00,‡,24.4,,16.3,,...,0,0,0,0,0,0,0,0,0,0
9,2016-06-05 15:00:00,2016,6,5,15:00,‡,24.7,,16.6,,...,0,0,0,0,0,0,0,0,0,0
