In [1]:
from datetime import datetime
from datetime import timedelta
import time
import pandas as pd
from cover_path import cover_path, get_adj
from ast import literal_eval
from pprint import pprint

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

### CHANGE THESE TO CHANGE THE TIME RANGE ###
# Snapshots are built for every hour from lower_bound up to, but not
# including, upper_bound.
# It is recommended not to use the first datapoint you have as lower_bound,
# because each row's adjacent_traffic feature is read from the hour before it.
lower_bound = datetime(2018,1,3)
upper_bound = datetime(2018,1,10)
############################################


  return f(*args, **kwds)
  return f(*args, **kwds)


In [2]:
def get_weather(path="data/weather_data.csv"):
    """Load hourly weather observations into a lookup keyed by (date, hour).

    Parameters
    ----------
    path : str, optional
        CSV file with a ``DATE`` column formatted ``%Y-%m-%d %H:%M`` and a
        ``HOURLYPRSENTWEATHERTYPE`` column of weather codes. Defaults to the
        file the notebook has always read.

    Returns
    -------
    dict
        Maps ``(datetime.date, hour)`` to a set of weather labels such as
        ``"rain"`` or ``"fog"``. Rows without a weather code map to an empty
        set. When several rows fall in the same hour, the last row wins.

    Unknown codes are reported on stdout and otherwise ignored.
    """
    code_to_label = {
        "+RA:02": "heavy rain",
        "-RA:02": "light rain",
        "BR:1": "mist",
        "FG:2": "fog",
        "FG:30": "fog",
        "FG:05": "fog",
        "HZ:05": "haze",
        "HZ:7": "haze",
        "RA:02": "rain",
        "RA:61": "rain",
        "RA:62": "rain",
        "RA:63": "rain",
        "FU:05": "smoke",
    }

    weather_df = pd.read_csv(path)
    weather_dict = {}
    for raw_date, raw_codes in zip(weather_df['DATE'],
                                   weather_df['HOURLYPRSENTWEATHERTYPE']):
        dt = datetime.strptime(raw_date, "%Y-%m-%d %H:%M")
        weather = set()
        # Missing values come back from pandas as float NaN, not str.
        if isinstance(raw_codes, str):
            # Treat "|" as a separator and split on any run of whitespace so
            # that values like "||" or double-spaced codes do not produce
            # empty tokens (which used to trigger spurious warnings).
            for code in raw_codes.replace("|", " ").split():
                label = code_to_label.get(code)
                if label is None:
                    print("Warning: unidentified weather type. Code: {}".format(code))
                else:
                    weather.add(label)
        weather_dict[(dt.date(), dt.hour)] = weather
    return weather_dict

# Build the (date, hour) -> weather-label-set lookup once; construct_snapshots reads it as a global.
weather_dict = get_weather()

In [3]:
def index_streets():
    """Return the set of street-grid entries stored in data/street_grid.csv.

    Every cell of the ``coordinates`` column is parsed with ``literal_eval``
    and collected into a set, so duplicate entries collapse. Progress is
    printed every 10000 rows.
    """
    print("Indexing streets.")
    grid = pd.read_csv("data/street_grid.csv")
    total_rows = len(grid)
    indexed = set()
    for label, raw_coordinates in grid['coordinates'].items():
        if label % 10000 == 0:
            print("{} of {}".format(label, total_rows))
        indexed.add(literal_eval(raw_coordinates))

    print("done indexing streets.")
    return indexed

# Grid entries parsed from data/street_grid.csv; construct_snapshots uses this set to drop jam segments that fall outside the grid.
streets = index_streets()


Indexing streets.
0 of 345268
10000 of 345268
20000 of 345268
30000 of 345268
40000 of 345268
50000 of 345268
60000 of 345268
70000 of 345268
80000 of 345268
90000 of 345268
100000 of 345268
110000 of 345268
120000 of 345268
130000 of 345268
140000 of 345268
150000 of 345268
160000 of 345268
170000 of 345268
180000 of 345268
190000 of 345268
200000 of 345268
210000 of 345268
220000 of 345268
230000 of 345268
240000 of 345268
250000 of 345268
260000 of 345268
270000 of 345268
280000 of 345268
290000 of 345268
300000 of 345268
310000 of 345268
320000 of 345268
330000 of 345268
340000 of 345268
done indexing streets.


In [None]:
# Creates hourly snapshots of the city.
# Jams are grouped under (date, hour) keys.
# The output is a DataFrame with one row per street and hour; the traffic column is 1 for jammed, 0 for not jammed.
def _hour_slots(start, end):
    """Yield one datetime per whole hour from ``start`` up to, but excluding, ``end``.

    Both bounds are truncated to the hour. Nothing is yielded when ``end`` is
    not after ``start``, so a reversed range cannot loop forever.
    """
    slot = start.replace(minute=0, second=0, microsecond=0)
    stop = end.replace(minute=0, second=0, microsecond=0)
    while slot < stop:
        yield slot
        slot += timedelta(hours=1)


def construct_snapshots():
    """Build the per-street, per-hour feature table used for training.

    Reads ``data/jams.csv`` and relies on these notebook globals:
    ``streets`` (grid entries), ``weather_dict`` ((date, hour) -> label set),
    ``lower_bound`` / ``upper_bound`` (time range, upper bound exclusive) and
    the ``cover_path`` / ``get_adj`` helpers.

    Returns
    -------
    pandas.DataFrame
        One row per (street, hour) for every street that was jammed at least
        once in the jam file. Columns: ``x``, ``y``, ``day_of_week``,
        ``hour``, one 0/1 column per weather label seen in the time range
        (sorted by name), ``adjacent_traffic`` (1 if the street or any street
        returned by ``get_adj`` was jammed in the previous hour) and
        ``traffic`` (1 if the street is jammed in this hour).

    Raises
    ------
    KeyError
        If ``weather_dict`` has no entry for an hour inside the time range.
    """
    print("Constructing snapshots.")
    jams_df = pd.read_csv("data/jams.csv")

    # (date, hour) -> set of grid streets jammed during that hour
    states = {}
    # every grid street that was jammed at least once; only these become rows
    new_streets = set()

    total_jams = len(jams_df)
    for i, r in jams_df.iterrows():
        if i % 5000 == 0:
            print("{} of {}".format(i, total_jams))
        # NOTE(review): fromtimestamp converts to the machine's local timezone;
        # confirm that matches the timestamps in the weather file.
        jam_time = datetime.fromtimestamp(r['pub_millis'] / 1000)
        jammed = states.setdefault((jam_time.date(), jam_time.hour), set())
        for s in cover_path(literal_eval(r['line'])):
            # we are just going to throw out our data that didn't get caught by our grid
            # this is a tiny part (~0.5%) of the data anyways
            if s in streets:
                jammed.add(s)
                new_streets.add(s)

    print("Done constructing snapshots.")

    slots = list(_hour_slots(lower_bound, upper_bound))

    # Collect every weather label that occurs inside the requested range.
    # The key must be the slot being visited, not the timestamp of the last
    # jam row (the previous code indexed with the wrong date here).
    all_weather = set()
    for slot in slots:
        all_weather.update(weather_dict[(slot.date(), slot.hour)])
    # Sorted so the column order does not depend on string hash randomisation.
    weather_columns = sorted(all_weather)

    data = {'x': [], 'y': [], 'day_of_week': [], 'hour': []}
    for aw in weather_columns:
        data[aw] = []
    data['adjacent_traffic'] = []
    data['traffic'] = []

    for slot in slots:
        date, hour = slot.date(), slot.hour
        print("{}, {}".format(date, hour))

        # Everything below is constant for the whole hour, so look it up once.
        weather = weather_dict[(date, hour)]
        weekday = date.weekday()
        jammed_now = states.get((date, hour), ())
        previous = slot - timedelta(hours=1)
        jammed_before = states.get((previous.date(), previous.hour), ())

        for s in new_streets:
            x, y = s
            data['x'].append(x)
            data['y'].append(y)
            data['day_of_week'].append(weekday)
            data['hour'].append(hour)
            for aw in weather_columns:
                data[aw].append(1 if aw in weather else 0)

            # Look back one hour: was this street or a neighbour jammed?
            if jammed_before and (s in jammed_before
                                  or any(a in jammed_before for a in get_adj(s))):
                adj = 1
            else:
                adj = 0
            data['adjacent_traffic'].append(adj)
            data['traffic'].append(1 if s in jammed_now else 0)

    return pd.DataFrame(data)
    

# Build the feature table (one row per street and hour in the configured time range) and print it.
df = construct_snapshots()
print(df)

Constructing snapshots.
0 of 12327
5000 of 12327
10000 of 12327
Done constructing snapshots.
2018-01-03, 0
2018-01-03, 1
2018-01-03, 2
2018-01-03, 3
2018-01-03, 4
2018-01-03, 5
2018-01-03, 6
2018-01-03, 7
2018-01-03, 8
2018-01-03, 9
2018-01-03, 10
2018-01-03, 11
2018-01-03, 12
2018-01-03, 13
2018-01-03, 14
2018-01-03, 15
2018-01-03, 16
2018-01-03, 17
2018-01-03, 18
2018-01-03, 19
2018-01-03, 20
2018-01-03, 21
2018-01-03, 22
2018-01-03, 23
2018-01-04, 0
2018-01-04, 1
2018-01-04, 2
2018-01-04, 3
2018-01-04, 4
2018-01-04, 5
2018-01-04, 6
2018-01-04, 7
2018-01-04, 8
2018-01-04, 9
2018-01-04, 10
2018-01-04, 11


In [None]:
# Hold out a third of the rows for evaluation; 'traffic' is the label and
# every other column is a feature. The fixed random_state makes the shuffle,
# and therefore every metric computed later, repeatable across kernel restarts.
xtr, xts, ytr, yts = train_test_split(df.drop(['traffic'], axis=1),
                                      df['traffic'], test_size=0.33,
                                      random_state=0)


In [None]:
from sklearn import linear_model

# Linear classifier trained with stochastic gradient descent. SGD shuffles the
# training rows each epoch, so pin random_state to get the same model on every
# run.
# NOTE(review): the features are unscaled (x/y coordinates next to 0/1 flags);
# SGD is sensitive to feature scale, so consider standardising first.
clf = linear_model.SGDClassifier(random_state=0)
clf.fit(xtr,ytr)

In [None]:
# Tally the confusion matrix of the classifier on the held-out rows.
predict = clf.predict(xts)
total_predictions = len(predict)

# tally[predicted_positive][actually_positive]; any non-zero value counts as positive
tally = [[0, 0], [0, 0]]
for position, guess in enumerate(predict):
    if position % 50000 == 0:
        print("{} of {}".format(position, total_predictions))
    true = yts.iloc[position]
    tally[int(guess != 0)][int(true != 0)] += 1

true_negative, false_negative = tally[0]
false_positive, true_positive = tally[1]

# this needs some work
for label, count in (("True negative", true_negative),
                     ("False negative", false_negative),
                     ("False positive", false_positive),
                     ("True positive", true_positive)):
    print("{}: {}".format(label, count))