In [1]:
# Import packages

import numpy as np
import pandas as pd
from datetime import *
import calendar
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LassoLars
from sklearn.svm import SVR
from sklearn.decomposition import PCA

# Create constants for dirs
training_dir = 'city_training/'
test_dir = 'city_testing/'

# Create constants for filenames
city_attrib_file = 'city_attributes.csv'
humidity_file = 'humidity.csv'
pressure_file = 'pressure.csv'
temperature_file = 'temperature.csv'
weather_description_file = 'converted.csv'
wind_direction_file = 'wind_direction.csv'
wind_speed_file = 'wind_speed.csv'

training_date_range = ('2013-10-2', '2016-12-31')
test_date_range = ('2017-1-1', '2017-11-30')
# List of all independent variable files we are pulling data from
attrib_files = [humidity_file, pressure_file, temperature_file, weather_description_file, wind_direction_file, wind_speed_file]
# List of all independent variables (not files)
attribs = []
for attrib_file in attrib_files:
    attribs.append(attrib_file[:-4])


# Create lists of 12 cities
cities = list(pd.read_csv(humidity_file, sep=',').columns.values)
cities.remove('datetime')

devin_cities = cities[:12]
ethan_cities = cities[12:24]
phil_cities = cities[24:]


In [2]:
attrib_dfs = []
for file in attrib_files:
    attrib_dfs.append(pd.read_csv(file, sep=','))

In [3]:
# Read in test or train files for the given city
def get_df_train_file(city):
    return pd.read_csv(training_dir + city + '_training.csv')
    
def get_df_test_file(city):
    return pd.read_csv(test_dir + city + '_test.csv')

In [4]:
# Parse datetime from Y-m-d strings
def get_datetime(date):
    return datetime.strptime(date, '%Y-%m-%d')

In [5]:
# Create function that will form our training and test dataset 

# Args:
    # df = dataframe to append to
    # city_name = name of city we are creating the dataset for
    # date = date we are trying to predict (mm/dd/yyyy) as a str
# Return:
    # new df with added row
def create_data_df_for_city_date(df, city_name, date):
    datetime_obj = get_datetime(date)
    
    new_data_row = []
    
    # For each independent attribute, get each feature
    for atrrib_df in attrib_dfs:
        # add the new tuple to the end of the list (row)
        attrib_1yr_3days_values = get_1yr_3days_attrib(city_name, datetime_obj, atrrib_df)
        for val in attrib_1yr_3days_values:
            new_data_row.append(val)
        
        
    # For each dependent attribute, get each feature (just 1 day)
    for atrrib_df in attrib_dfs:
        new_data_row.append(get_today_attrib(city_name, datetime_obj, atrrib_df))
        
    
    df.loc[len(df)] = new_data_row
    return df

In [6]:
# Helper function to get 1 year ago and 3 days ago data from the date
    # for a given attribute (temp/pres/humidity...etc)
    
# Args:
    # city_name = name of city you want the data of
    # date = the current date, from which you want 1 year ago and the past 3 days
    # csv_file = the attribute you want to get
    
# Returns:
    # tuple of (1yr,3days,2days,1day)
def get_1yr_3days_attrib(city_name, date, attrib_df):
    ret = []
    
    year_1_date = date - timedelta(days=365)
    ret.append(get_avg(attrib_df, city_name, year_1_date))
    
    
    day_3_date = date - timedelta(days=3)
    ret.append(get_avg(attrib_df, city_name, day_3_date))
    
    day_2_date = date - timedelta(days=2)
    ret.append(get_avg(attrib_df, city_name, day_2_date))
    
    day_1_date = date - timedelta(days=1)
    ret.append(get_avg(attrib_df, city_name, day_1_date))
    
    return ret

In [7]:
# Get today's attribute (avg'ed) based on city and csv_file
# Args:
    # city_name = name of city you want the data of
    # date = the current date in datetime, from which you want 1 year ago and the past 3 days
    # csv_file = the attribute you want to get
# Returns:
    # the avg of the attribute for the given day and city
def get_today_attrib(city_name, date, attrib_df):
    return get_avg(attrib_df, city_name, date)

In [8]:
def get_avg(dataframe, city, date):
    strdate = date.strftime("%Y-%m-%d")
    daily = dataframe[['datetime', city]].copy() #create dataframe of just datetimes and that city 
    day = daily[daily['datetime'].str.contains(strdate)]   #filter above dataframe for a specific day
    valGood = day.dropna()
    vals = list(valGood[city])     #create list of all temps for that day
    return np.mean(vals)

In [9]:
# Get a list of all dates within the date_range
def dates_list(date_range):
    dates = []
    date1 = get_datetime(date_range[0])
    date2 = get_datetime(date_range[1])
    delta = date2 - date1       # timedelta
    for i in range(delta.days + 1):
        newDate = (date1 + timedelta(days=i))
        dates.append(newDate.strftime('%Y-%m-%d'))
    return dates

In [10]:
# Create all possible combinations of cities and dates
def get_date_city_combo(city, date_range):
    #create city + date tuples in a list
    city_date_combo = []
    for date in date_range:
        tup = (city, date)
        city_date_combo.append(tup)
    return city_date_combo

In [11]:
# Create a list of all city names
def get_all_cities():
    cities = list(attrib_dfs[0].columns.values)
    cities.remove('datetime')
    return cities

In [12]:
# Create the df with all independent + dependent variables for given city
def create_training_df_for_city(city):
    columns = []
    for index in attrib_files:
        index = index.rstrip('.csv')
        columns.append(index + '_1year')
        columns.append(index + '_3days')
        columns.append(index + '_2days')
        columns.append(index + '_1days')
    for index in attrib_files:
        columns.append(index + '_today')
    df = pd.DataFrame(columns=columns)
    city_date_combos = get_date_city_combo(city, dates_list(training_date_range))
    for city_date in city_date_combos:
        df = create_data_df_for_city_date(df, city_date[0], city_date[1])
        
    return df

# Create the df with all independent variables for given city
def create_test_df_for_city(city):
    columns = []
    for index in attrib_files:
        index = index.rstrip('.csv')
        columns.append(index + '_1year')
        columns.append(index + '_3days')
        columns.append(index + '_2days')
        columns.append(index + '_1days')
    for index in attrib_files:
        columns.append(index + '_today')
    df = pd.DataFrame(columns=columns)
    city_date_combos = get_date_city_combo(city, dates_list(test_date_range))
    for city_date in city_date_combos:
        df = create_data_df_for_city_date(df, city_date[0], city_date[1])
        
    return df

In [13]:
# Create a model for the given attribute
def create_model(training_df, test_df, attrib):
    # Model types
    model_types = [LinearRegression(), SVR(), Ridge(), Lasso(), LassoLars()]

    # Find the index of the attribute we are talking about, by comparing to the attribute list we have
    attrib_file = attrib + '.csv'
    attrib_index = attrib_files.index(attrib_file)
    # Create the training and test data sets, X=all rows but last 6, y=the attrib column we are interested in
    train_X = training_df.iloc[:, :-6]
    train_y = training_df.iloc[:, 24 + attrib_index]
    
    test_X = test_df.iloc[:, :-6]
    test_y = test_df.iloc[:, -attrib_index]
    
    train_X = train_X.sample(frac=1).reset_index(drop=True)
    train_y = train_y.sample(frac=1).reset_index(drop=True)
    test_X = test_X.sample(frac=1).reset_index(drop=True)
    test_y = test_y.sample(frac=1).reset_index(drop=True)
    
    train_y = np.ravel(train_y)
    test_y = np.ravel(test_y)
    
    # Calculate k and n for AIC BIC calculation
    n = training_df.shape[0]
    k = train_X.shape[1] + 1
    
    # Run through all models and fit the data
    model_metrics = []
    for model in model_types:
        model.fit(train_X, train_y)
        # Associate the accuracy score to the given model
        cv_scores = cross_val_score(model, test_X, test_y, cv=10, scoring='neg_mean_squared_error')
        # Ensure each metric is positive before taking the mean
        for index in range(0, cv_scores.size):
            cv_scores[index] = abs(cv_scores[index])
        MSE = cv_scores.mean()
        #print('MSE', MSE)
        
        aic = (2 * k) + n * np.log(MSE)
        bic = k * np.log(n) + n * np.log(MSE)
        model_metrics.append((aic + bic)/2)
        
    best_metric = min(model_metrics)
    #print('Best AIC + BIC avg: ', best_metric)
    best_model_index = model_metrics.index(best_metric)
    
    return model_types[best_model_index]

In [14]:
# Create a list of models for each attribute for the given city
def create_all_attrib_models(city):
    models = {}
    train_df = get_df_train_file(city)
    test_df = get_df_test_file(city)
    for attrib in attribs:
        models[attrib] = create_model(train_df, test_df, attrib)
    return models

In [None]:
# Prints the explained variance of each feature in the given data
def print_pca(training_df):

    train_X = training_df.iloc[:, :-6]
    pca = PCA(n_components=24)
    pca.fit(train_X)
    print(pca.explained_variance_ratio)
print_pca(create_test_df_for_city('Boston'))

In [185]:
# Predict all attributes given a date and city,
# Returns: (prediction, true values)
def predict_all_attrib(city, date):
    
    all_models = create_all_attrib_models(city)
    test_df = get_df_test_file(city)
    
    # Get what row we want from the test file
    date = get_datetime(date)
    delta = date - get_datetime(test_date_range[0])
    row_index_from_date = delta.days
    
    test_X = test_df.loc[row_index_from_date][:-6].values.reshape(1, -1)
    
    predicted_attribs = []
    true_attribs = []
    
    
    for i in range(0, len(attrib_dfs)):
        predicted_attrib = all_models[attribs[i]].predict(test_X)
        predicted_attribs.append((attribs[i], predicted_attrib))

        true_attrib = get_today_attrib(city, date, attrib_dfs[i])
        true_attribs.append((attribs[i], true_attrib))

        
    return [predicted_attribs, true_attribs]

In [195]:
# main - test creating df:

prediction = predict_all_attrib('Boston', '2017-5-5')
print('Predicted:\n', prediction[0])
print()
print('True:\n', prediction[1])


Predicted:
 [('humidity', array([79.31785714])), ('pressure', array([1020.61538462])), ('temperature', array([283.67871556])), ('converted', array([2.69095539])), ('wind_direction', array([200.413])), ('wind_speed', array([3.29914384]))]

True:
 [('humidity', 84.33333333333333), ('pressure', 1012.25), ('temperature', 283.44166666666666), ('converted', 2.5), ('wind_direction', 145.83333333333334), ('wind_speed', 5.166666666666667)]
