In [1]:
# Import packages

import numpy as np
import pandas as pd
from datetime import *
import calendar
from sklearn.model_selection import train_test_split

# Create constants for filenames

city_attrib_file = 'city_attributes.csv'
humidity_file = 'humidity.csv'
pressure_file = 'pressure.csv'
temperature_file = 'temperature.csv'
weather_description_file = 'converted.csv'
wind_direction_file = 'wind_direction.csv'
wind_speed_file = 'wind_speed.csv'

training_date_range = ('2013-10-2', '2016-12-31')
test_date_range = ('2017-1-1', '2017-11-30')
# List of all independent variable files we are pulling data from
attrib_files = [humidity_file, pressure_file, temperature_file, weather_description_file, wind_direction_file, wind_speed_file]



# Create lists of 12 cities
cities = list(pd.read_csv(humidity_file, sep=',').columns.values)
cities.remove('datetime')

devin_cities = cities[:12]
ethan_cities = cities[12:24]
phil_cities = cities[24:]


In [2]:
attrib_dfs = []
for file in attrib_files:
    attrib_dfs.append(pd.read_csv(file, sep=','))

In [3]:
# Create function that will form our training and test dataset 

# Args:
    # df = dataframe to append to
    # city_name = name of city we are creating the dataset for
    # date = date we are trying to predict (mm/dd/yyyy) as a str
# Return:
    # new df with added row
def create_training_df_for_city_date(df, city_name, date):
    datetime_obj = datetime.strptime(date, '%Y-%m-%d')
    
    new_data_row = []
    
    # For each independent attribute, get each feature
    for atrrib_df in attrib_dfs:
        # add the new tuple to the end of the list (row)
        attrib_1yr_3days_values = get_1yr_3days_attrib(city_name, datetime_obj, atrrib_df)
        for val in attrib_1yr_3days_values:
            new_data_row.append(val)
        
        
    # For each dependent attribute, get each feature (just 1 day)
    for atrrib_df in attrib_dfs:
        new_data_row.append(get_today_attrib(city_name, datetime_obj, atrrib_df))
        
    
    df.loc[len(df)] = new_data_row
    return df

def create_test_df_for_city_date(df, city_name, date):
    datetime_obj = datetime.strptime(date, '%Y-%m-%d')
    
    new_data_row = []
    
    # For each independent attribute, get each feature
    for attrib_df in attrib_dfs:
        # add the new tuple to the end of the list (row)
        attrib_1yr_3days_values = get_1yr_3days_attrib(city_name, datetime_obj, attrib_df)
        for val in attrib_1yr_3days_values:
            new_data_row.append(val)
        
    df.loc[len(df)] = new_data_row
    return df

In [4]:
# Helper function to get 1 year ago and 3 days ago data from the date
    # for a given attribute (temp/pres/humidity...etc)
    
# Args:
    # city_name = name of city you want the data of
    # date = the current date, from which you want 1 year ago and the past 3 days
    # csv_file = the attribute you want to get
    
# Returns:
    # tuple of (1yr,3days,2days,1day)
def get_1yr_3days_attrib(city_name, date, attrib_df):
    ret = []
    
    year_1_date = date - timedelta(days=365)
    ret.append(get_avg(attrib_df, city_name, year_1_date))
    
    
    day_3_date = date - timedelta(days=3)
    ret.append(get_avg(attrib_df, city_name, day_3_date))
    
    day_2_date = date - timedelta(days=2)
    ret.append(get_avg(attrib_df, city_name, day_2_date))
    
    day_1_date = date - timedelta(days=1)
    ret.append(get_avg(attrib_df, city_name, day_1_date))
    
    return ret

In [5]:
# Get today's attribute (avg'ed) based on city and csv_file
# Args:
    # city_name = name of city you want the data of
    # date = the current date, from which you want 1 year ago and the past 3 days
    # csv_file = the attribute you want to get
# Returns:
    # the avg of the attribute for the given day and city
def get_today_attrib(city_name, date, attrib_df):
    return get_avg(attrib_df, city_name, date)

In [6]:
def get_avg(dataframe, city, date):
    strdate = date.strftime("%Y-%m-%d")
    daily = dataframe[['datetime', city]].copy() #create dataframe of just datetimes and that city 
    day = daily[daily['datetime'].str.contains(strdate)]   #filter above dataframe for a specific day
    valGood = day.dropna()
    vals = list(valGood[city])     #create list of all temps for that day
    return np.mean(vals)

In [7]:
# Get a list of all dates within the date_range
def dates_list(date_range):
    dates = []
    date1 = datetime.strptime(date_range[0], "%Y-%m-%d")
    date2 = datetime.strptime(date_range[1], "%Y-%m-%d")
    delta = date2 - date1       # timedelta
    for i in range(delta.days + 1):
        newDate = (date1 + timedelta(days=i))
        dates.append(newDate.strftime('%Y-%m-%d'))
    return dates

In [8]:
# Create all possible combinations of cities and dates
def get_date_city_combo(city, date_range):
    #create city + date tuples in a list
    city_date_combo = []
    for date in date_range:
        tup = (city, date)
        city_date_combo.append(tup)
    return city_date_combo

In [9]:
# Create a list of all city names
def get_all_cities():
    cities = list(humidity.columns.values)
    cities.remove('datetime')
    return cities

In [13]:
# Create the df with all independent + dependent variables for given city
def create_training_df_for_city(city):
    columns = []
    for index in attrib_files:
        index = index.rstrip('.csv')
        columns.append(index + '_1year')
        columns.append(index + '_3days')
        columns.append(index + '_2days')
        columns.append(index + '_1days')
    for index in attrib_files:
        columns.append(index + '_today')
    df = pd.DataFrame(columns=columns)
    city_date_combos = get_date_city_combo(city, dates_list(training_date_range))
    for city_date in city_date_combos:
        df = create_training_df_for_city_date(df, city_date[0], city_date[1])
        print(df)
        
    return df

# Create the df with all independent variables for given city
def create_test_df_for_city(city):
    columns = []
    for index in attrib_files:
        index = index.rstrip('.csv')
        columns.append(index + '_1year')
        columns.append(index + '_3days')
        columns.append(index + '_2days')
        columns.append(index + '_1days')
    df = pd.DataFrame(columns=columns)
    city_date_combos = get_date_city_combo(city, dates_list(test_date_range))
    for city_date in city_date_combos:
        df = create_test_df_for_city_date(df, city_date[0], city_date[1])
        
    return df

In [None]:
# Create a trained fatality prediction model for the given road
def create_model(training_df):
    # Model types
    model_types = [LinearRegression(), SVR()]

    # Create the training and test data sets, X=all rows but last 6, y=last 6
    X = training_df.iloc[:, :-6]
    y = training_df.iloc[:, -6:]
    
    train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.1)
    
    train_y = np.ravel(train_y)
    test_y = np.ravel(test_y)
    
    # Run through all models and fit the data
    model_accuracies = {}
    for model in model_types:
        model.fit(train_X, train_y)
        # Associate the accuracy score to the given model
        model_accuracies[model] = abs(cross_val_score(model, test_X, test_y, cv=10, scoring='mean_squared_error').mean())
        
    best_model = min(model_accuracies.items(), key=operator.itemgetter(1))[0]
    print('MSE: ', model_accuracies[best_model])
    
    return best_model

In [15]:
# main - test creating df:

for city in devin_cities:
    test_df = create_test_df_for_city(city)
    test_df.to_csv(city + '_test.csv', sep = ',', index=False)
    print(test_df)


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


KeyboardInterrupt: 