In [16]:
# Import packages

import pandas as pd
import datetime
import calendar

# File names of the CSV data sets used by this notebook

city_attrib_file = 'city_attributes.csv'
humidity_file = 'humidity.csv'
pressure_file = 'pressure.csv'
temperature_file = 'temperature.csv'
weather_description_file = 'weather_description.csv'
wind_direction_file = 'wind_direction.csv'
wind_speed_file = 'wind_speed.csv'

# Every per-attribute file that values are pulled from; the dataset builders
# iterate over this list, so its order fixes the column order of each row
attrib_files = [
    humidity_file,
    pressure_file,
    temperature_file,
    weather_description_file,
    wind_direction_file,
    wind_speed_file,
]



In [17]:
# Create function that will form our training and test dataset 

# Args:
    # df = dataframe to append to
    # city_name = name of city we are creating the dataset for
    # date = date we are trying to predict (mm/dd/yyyy) as a str
# Return:
    # new df with added row
def create_training_df(df, city_name, date):
    """Append one training row (history features, then same-day targets) to ``df``.

    Args:
        df: DataFrame to append to. It is modified in place, and its columns
            must already match the row layout: the values returned by
            ``get_1yr_3days_attrib`` for every file in ``attrib_files``,
            followed by one ``get_today_attrib`` value per file.
        city_name: name of the city the row is built for.
        date: the date being predicted, as a ``'mm/dd/yyyy'`` string.

    Returns:
        The same ``df`` object, with the new row added under label ``len(df)``.

    Raises:
        ValueError: if ``date`` does not match the ``'%m/%d/%Y'`` format.
    """
    # `datetime` is imported as a module, so the parser has to be reached
    # through the `datetime.datetime` class; the module has no `strptime`.
    datetime_obj = datetime.datetime.strptime(date, '%m/%d/%Y')

    new_data_row = []

    # Independent variables: the look-back values (1 year, 3/2/1 days ago)
    # of every attribute, flattened into the row in `attrib_files` order.
    for csv_file in attrib_files:
        new_data_row.extend(get_1yr_3days_attrib(city_name, datetime_obj, csv_file))

    # Dependent variables: the value of every attribute on the target day.
    for csv_file in attrib_files:
        new_data_row.append(get_today_attrib(city_name, datetime_obj, csv_file))

    # Setting a not-yet-existing label with .loc enlarges the frame by one row.
    df.loc[len(df)] = new_data_row
    return df

def create_test_df(df, city_name, date):
    """Append one test row (history features only, no targets) to ``df``.

    Args:
        df: DataFrame to append to. It is modified in place, and its columns
            must already match the row layout: the values returned by
            ``get_1yr_3days_attrib`` for every file in ``attrib_files``.
        city_name: name of the city the row is built for.
        date: the date being predicted, as a ``'mm/dd/yyyy'`` string.

    Returns:
        The same ``df`` object, with the new row added under label ``len(df)``.

    Raises:
        ValueError: if ``date`` does not match the ``'%m/%d/%Y'`` format.
    """
    # `datetime` is imported as a module, so the parser has to be reached
    # through the `datetime.datetime` class; the module has no `strptime`.
    datetime_obj = datetime.datetime.strptime(date, '%m/%d/%Y')

    new_data_row = []

    # Independent variables: the look-back values (1 year, 3/2/1 days ago)
    # of every attribute, flattened into the row in `attrib_files` order.
    for csv_file in attrib_files:
        new_data_row.extend(get_1yr_3days_attrib(city_name, datetime_obj, csv_file))

    # Setting a not-yet-existing label with .loc enlarges the frame by one row.
    df.loc[len(df)] = new_data_row
    return df

In [18]:
# Helper function to get 1 year ago and 3 days ago data from the date
    # for a given attribute (temp/pres/humidity...etc)
    
# Args:
    # city_name = name of city you want the data of
    # date = the current date (a datetime object), from which you want 1 year ago and the past 3 days
    # csv_file = the attribute you want to get
    
# Returns:
    # tuple of (1yr,3days,2days,1day)
def get_1yr_3days_attrib(city_name, date, csv_file):
    """Collect one attribute's look-back values for a city.

    Args:
        city_name: name of the city whose data is wanted.
        date: the reference day as a ``datetime`` object; every look-back
            day is computed relative to it.
        csv_file: path (or anything ``pd.read_csv`` accepts) of the CSV
            holding the attribute.

    Returns:
        A 4-tuple ``(1 year ago, 3 days ago, 2 days ago, 1 day ago)`` holding
        whatever ``get_avg`` returns for each of those days.
    """
    df = pd.read_csv(csv_file, sep=',')

    # Look-back offsets in days, in the order of the returned tuple.
    # "1 year" is a fixed 365 days, so it lands one calendar day off
    # whenever the span crosses a Feb 29.
    lookback_days = (365, 3, 2, 1)

    # Build the tuple in one go: tuples are immutable, so filling a
    # pre-made (0, 0, 0, 0) by index raises TypeError.
    return tuple(
        get_avg(df, city_name, date - datetime.timedelta(days=days_back))
        for days_back in lookback_days
    )

In [19]:
# Get today's attribute (avg'ed) based on city and csv_file
# Args:
    # city_name = name of city you want the data of
    # date = the day whose (averaged) attribute value you want
    # csv_file = the attribute you want to get
# Returns:
    # the avg of the attribute for the given day and city
def get_today_attrib(city_name, date, csv_file):
    """Load one attribute's CSV and return its value for ``city_name`` on ``date``.

    The file is read with ``pd.read_csv`` and the frame is handed, together
    with ``city_name`` and ``date`` (both forwarded unchanged), to
    ``get_avg``, which is defined elsewhere in the notebook; its result is
    returned as-is.
    """
    attribute_frame = pd.read_csv(csv_file, sep=',')
    day_value = get_avg(attribute_frame, city_name, date)
    return day_value