In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import time
import math
import scipy
import scipy.stats as stats

### Read in Big Belly trash can data and 311 rodent report data (previously from full 311 dataset using OpenRefine

In [2]:
srv_url = 'https://raw.githubusercontent.com/DoxVader/RatsVsTrash/master/Data/311_Service_Requests.csv'
trash_url = 'https://raw.githubusercontent.com/DoxVader/RatsVsTrash/master/Data/Big_Belly_Alerts_2014.csv'

srvdf = pd.read_csv(srv_url)
trashdf = pd.read_csv(trash_url)

### Create a new column for the actual status (empty, green, or full) and remove the less useful colors

In [3]:
trashdf['status'] = trashdf['fullness'].map({'GREEN':'empty', 'RED':'full', 'YELLOW':'medium'}).astype(str)
trashdf = trashdf.drop('fullness',1)

### Convert the dates in the OPEN_DT column to a python datetime

In [4]:
srvdf['OPEN_DT'] = pd.to_datetime(srvdf['OPEN_DT'])


### Convert the dates in the trashdf into a python datetime column

In [5]:
trashdf['trash_time'] = pd.to_datetime(trashdf.timestamp)

### zip together latitude and longitude columns into a tuple

In [6]:
srvdf['location'] = zip(srvdf.LATITUDE, srvdf.LONGITUDE )

### cut srvdf down to the timeframe of the big belly data, removed unnecessary columns and reindexed

In [7]:
srvdf = srvdf[srvdf.OPEN_DT <= max(trashdf.trash_time)]
srvdf = srvdf[srvdf.OPEN_DT >= min(trashdf.trash_time)]
srvdf = srvdf.loc[:,['OPEN_DT','LATITUDE','LONGITUDE','LOCATION_STREET_NAME', 'location']]
srvdf = srvdf.reset_index(drop= True)

### zip location data together for trashdf

In [8]:
trashdf['location'] = zip(trashdf.Latitude, trashdf.Longitude)

# Calculating closest big belly to each rat event using relative latitude and longitude 
### Create function to compute the square of the distance from two tuples, converting degrees latitude and longitude to feet for the city of Boston

In [9]:
def lldist((lat1, long1), (lat2, long2)):
    rise = lat2 - lat1
    run =  long2 - long1

    rise = rise*364412.83
    run = run*271819.44

    return math.sqrt(rise**2 + run**2)        

In [10]:
from scipy.spatial import distance
def findMinDist(lat_long_vector1, lat_long_vector2):
    return distance.cdist(lat_long_vector1, lat_long_vector2, lldist)

### Create a dataframe of unique trash locations

In [11]:
unique_trash_locations = trashdf.groupby('description').head(1).drop(['trash_time', 'status', 'Latitude', 'Longitude'], axis=1)

### Create a matrix of distances between each rat event and Big Belly location

In [12]:
distMatrix = findMinDist(srvdf.location.tolist(), unique_trash_locations.location.tolist())

### Create a matrix of the horizontal key to the shortest values in the matrix  
#### ie. figure out which Big Belly each 311 rat event is closest to

In [13]:
closestTrash = np.argmin(distMatrix, axis=1)

# Create new columns in the distance matrix that will show the location description of the closest Big Belly and it's status at the time of each 311 rat event.
### Rat events that occur more than 350 ft from the closest Big Belly will be given the status and location description "NA"

In [14]:
srvdf['status'] = 'NA'
srvdf['BigBelly'] = 'NA'
for x in range(0,len(srvdf)):
    if distMatrix[x,closestTrash[x]]<350:
        trash_to_use = closestTrash[x]
        desc =  unique_trash_locations.iloc[trash_to_use,:].description
        trashHistory = trashdf[trashdf.description == desc]
        srvdf.status[x] = trashHistory[trashHistory.trash_time <= srvdf.iloc[x].OPEN_DT].sort_values('trash_time').tail(1).status.values
        srvdf.BigBelly[x] = trashHistory[trashHistory.trash_time <= srvdf.iloc[x].OPEN_DT].sort_values('trash_time').tail(1).description.values

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


### Export the dataframe as a csv file

In [17]:
#srvdf.to_csv('rat.csv')