# Data Cleaning Notebook

In [6]:
import os
import pandas as pd
import numpy as np
import datetime as dt

from utilities import extract_position_time
from image_generator import plot_sky, create_times_array

from skyfield.api import load, utc, Topos


In [2]:

# report how many entries each directory currently holds
directory_checks = [
    ('Original Directory :', '../cloud_cover1'),   # expected to be empty by now
    ('Training images :', '../images/train'),      # expected to hold ~90% of the files
    ('Validation images :', '../images/valid'),    # expected to hold ~10% of the files
]
for label, folder in directory_checks:
    print(label, len(os.listdir(folder)))


Original Directory : 0
Training images : 78679
Validation images : 0


In [3]:
# PNG file names found in the training directory
files_train = [name for name in os.listdir('../images/train') if name.endswith('.png')]

# PNG file names found in the validation directory
files_val = [name for name in os.listdir('../images/valid') if name.endswith('.png')]

# files lists every image name: training entries first, then validation entries
files = [*files_train, *files_val]



In [4]:
# total number of PNG images across the validation and training directories
len(files_val) + len(files_train)

78679

In [5]:

# Parse each file name exactly once. extract_position_time returns a pair that
# is indexed below as [0] -> position and [1] -> time.
parsed_names = [extract_position_time(f) for f in files]

# One row per image, ordered by time. Built in a single expression (no in-place
# mutation) so re-running this cell always yields the same frame.
data_df = pd.DataFrame({
    'times': [parsed[1] for parsed in parsed_names],
    'positions': [parsed[0] for parsed in parsed_names],
}).sort_values(by='times')

# Every distinct time seen across all positions; because data_df is sorted by
# 'times', unique() yields them in ascending order.
complete_times = data_df['times'].unique()



In [6]:
# for each position, find which times are missing

def fill_missing(cloud_cover=0 / 8, img_directory='../images/valid/'):
    """Generate sky images for positions that lack an image at some times.

    Reads the notebook-level ``data_df`` (columns 'positions' and 'times') and
    ``complete_times``. For every distinct position, the times present in
    ``complete_times`` but absent for that position are computed; if any are
    missing, ``create_times_array`` is asked for the span from the earliest to
    the latest missing time and ``plot_sky`` is called once per returned time.

    Parameters
    ----------
    cloud_cover
        Forwarded unchanged to ``plot_sky`` as ``cloud_cover``. Defaults to 0/8.
    img_directory
        Forwarded unchanged to ``plot_sky`` as ``img_directory``.

    Returns
    -------
    None
        The function is used for its side effect of calling ``plot_sky``.

    NOTE(review): the span between the earliest and latest missing time is
    filled, not only the missing times themselves. If the gaps are not
    contiguous, times that already have an image may be generated again --
    confirm this is acceptable.
    """
    for position in data_df['positions'].unique():
        # times that already have an image for this position
        times = data_df.loc[data_df['positions'] == position, 'times'].unique()

        # times that exist for some position but not for this one
        missing = np.setdiff1d(complete_times, times)
        if len(missing) == 0:
            continue

        # np.setdiff1d returns sorted values, so the two ends of the array are
        # the earliest and latest gaps; only those two need converting.
        first_missing = pd.to_datetime(missing[0])
        last_missing = pd.to_datetime(missing[-1])

        # NOTE(review): the meaning/units of the step argument (6) are defined
        # by create_times_array -- presumably minutes; confirm.
        times_array = create_times_array(first_missing, last_missing, 6)

        # the observer depends only on the position, so build it once per position
        observer = Topos(position[0], position[1])
        for time in times_array:
            plot_sky(t=time, observer=observer, cloud_cover=cloud_cover,
                     img_directory=img_directory)


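Every position now has an image for every time! (Note: `data_df` is built from the directory listings above, so those cells must be re-run after `fill_missing()` for the counts below to include the newly generated images.)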

In [9]:
# number of rows (one per image/position) recorded for each time
data_df.groupby(by='times').count()

Unnamed: 0_level_0,positions
times,Unnamed: 1_level_1
2020-03-13 04:00:00,1920
2020-03-13 04:06:00,1920
2020-03-13 04:12:00,1920
2020-03-13 04:18:00,1920
2020-03-13 04:24:00,1920
2020-03-13 04:30:00,1920
2020-03-13 04:36:00,1920
2020-03-13 04:42:00,1920
2020-03-13 04:48:00,1920
2020-03-13 04:54:00,1920
