In [1]:
# Imports for data manipulation
import pandas as pd
import numpy as np

In [2]:
# Imports for data visualization
from matplotlib import pyplot as plt
import seaborn as sns

In [3]:
# Read the San Francisco data from CSV into `sfdata`.
# NOTE(review): the original remark here said "NO NEED TO RUN!", but later
# cells in this notebook reference `sfdata` (adding the CrimeCount column,
# `del(sfdata)`), so on a fresh kernel this cell still has to run first.
sfdata_file = "../data/sf.csv"
sfdata = pd.read_csv(sfdata_file)

In [75]:
# Read bosdata
import re
import warnings

bos_file = '../data/boston.csv'
target_type = str  # The desired output type

# Record every warning raised while parsing, so that the "mixed types"
# warning can be inspected and the affected columns coerced afterwards.
with warnings.catch_warnings(record=True) as ws:
    warnings.simplefilter("always")

    bosData = pd.read_csv(bos_file, sep=",", header=0)
    # Every print below receives ONE pre-formatted string. Without
    # `from __future__ import print_function`, `print("a", b)` under Python 2
    # prints the tuple ("a", b) instead of "a b".
    print("Warnings raised: {}".format(ws))
    # We have an error on specific columns, try and load them as string
    for w in ws:
        s = str(w.message)
        print("Warning message: {}".format(s))
        # The warning text lists the offending column positions, e.g. "(10,12)".
        match = re.search(r"Columns \(([0-9,]+)\) have mixed types\.", s)
        if match:
            columns = [int(c) for c in match.group(1).split(',')]
            print("Applying {} dtype to columns: {}".format(target_type, columns))
            bosData.iloc[:, columns] = bosData.iloc[:, columns].astype(target_type)

("Applying <type 'str'> dtype to columns:", [10])


In [6]:
# Add a crime-count column to quickly count the data.
# Every row gets the constant 1 in a new 'CrimeCount' column.
# Both frames are modified in place.
bosData['CrimeCount'] = 1
sfdata['CrimeCount'] = 1

In [7]:
def plotCrimeCounts(data, features, city):
    '''
    Draw and show one seaborn count plot per entry of `features`.

    data is the dataframe containing the crime statistics.
    features is a dictionary mapping {description: column}; the description
    is used in the plot title, the column is the one that gets counted.
    city is the name of the city for which we are plotting (title only).

    Returns None; each figure is shown with plt.show().
    '''
    # dict.items() exists on both Python 2 and Python 3, whereas
    # dict.iteritems() is Python-2-only and raises AttributeError on Python 3.
    for description, columnName in features.items():
        sns.countplot(y=columnName, data=data)
        plt.title('Crimes in {} by {}'.format(city, description))
        plt.show()

In [None]:
# Columns of the San Francisco frame to draw count plots for.
# Keys are the English descriptions used in plot titles; values are the
# dataframe column names.
columnsToPlotSF = {
    'Police District': 'PdDistrict',
    'Crime Type': 'Category',
    'Day of the Week': 'DayOfWeek',
    'Crime Outcome': 'Resolution',
}

In [None]:
# Show one count plot per entry of columnsToPlotSF for the raw SF frame.
plotCrimeCounts(sfdata, columnsToPlotSF, 'San Francisco')

In [None]:
# Columns of the Boston frame to draw count plots for.
# Keys are the English descriptions used in plot titles; values are the
# dataframe column names.
columnsToPlotBos = {
    'Reporting District': 'REPTDISTRICT',
    'Weapon Type': 'WEAPONTYPE',
    'Shooting/No Shooting': 'Shooting',
    'Officer Shift': 'SHIFT',
    'Year': 'Year',
    'Month': 'Month',
    'Day of the Week': 'DAY_WEEK',
}

In [None]:
# Show one count plot per entry of columnsToPlotBos for the raw Boston frame.
plotCrimeCounts(bosData, columnsToPlotBos, 'Boston')

In [None]:
# When True, the `if not loadPickle:` cells below (date conversion, cleaning
# and re-pickling of the SF data) are skipped.
loadPickle = True

In [None]:
# Convert the Date and Time columns from strings to real datetimes.
# Note that we should not have to do this once the data has been cleaned!
# SKIP IF CLEAN DATA EXISTS
#
# pd.to_datetime is applied to each whole column at once. The original called
# it once per element through .apply(), which is why the cell used to
# "take a while". errors='raise' is kept so unparseable values still fail.
if not loadPickle:
    sfdata['Date'] = pd.to_datetime(sfdata['Date'], errors='raise')
    sfdata['Time'] = pd.to_datetime(sfdata['Time'], errors='raise')

In [54]:
# Helper functions to help clean SF data. We do this only once, as we
# now save the results using pickle!
def buckets(series, n):
    """
    Map every element of `series` to one of `n` equal-width buckets.

    The range [series.min(), series.max()] is cut into n intervals with
    np.linspace. Interval i is half-open, [edge_i, edge_i+1), except the last
    one, which is closed on the right so that the maximum value lands in
    bucket n - 1. Previously the maximum matched no interval and silently
    kept its raw value in the result.

    Parameters:
        series: pandas Series of numeric values.
        n: number of buckets.

    Returns:
        A copy of `series` (same index) whose values are bucket numbers
        0 .. n-1. Elements matching no interval (e.g. NaN) keep their value.
    """
    mi, ma = series.min(), series.max()
    edges = np.linspace(mi, ma, n + 1)

    res = series.copy()
    # Compare against an untouched array so that writing bucket numbers into
    # `res` cannot influence later comparisons.
    array = np.array(series)
    for i in range(n):
        in_bucket = edges[i] <= array
        if i == n - 1:
            # Last bucket includes its right edge (the series maximum).
            in_bucket &= array <= edges[i + 1]
        else:
            in_bucket &= array < edges[i + 1]
        res[in_bucket] = i
    return res

def cleanColumns(data):
    """
    Return a copy of `data` with friendlier column names and unused columns removed.

    Adds 'Latitude' (copy of 'Y'), 'Longitude' (copy of 'X') and 'Type'
    (copy of 'Category'), then drops 'IncidntNum', 'Descript', 'Resolution',
    'Address', 'X', 'Y' and 'Location'. 'Category' itself is kept.

    The input frame is left untouched. The previous implementation added the
    three new columns to the caller's frame before dropping.

    Parameters:
        data: DataFrame containing at least the columns named above.

    Returns:
        A new DataFrame.
    """
    # assign() builds a new frame, so the caller's frame is never modified.
    with_new_names = data.assign(Latitude=data['Y'],
                                 Longitude=data['X'],
                                 Type=data['Category'])
    unused = ['IncidntNum', 'Descript', 'Resolution', 'Address',
              'X', 'Y', 'Location']
    return with_new_names.drop(unused, axis=1)

def createPartitions(data, n):
    """
    Filter out coordinate outliers and attach grid-region, type-index and
    count columns.

    Rows are kept only when -130 < Longitude < -120 and 37 < Latitude < 40
    (the bounding box the original comments give for the city). On the
    remaining rows the following columns are added:
        xRegion    - bucket number of Latitude from buckets(), 0-based
        yRegion    - bucket number of Longitude from buckets(), plus 1
        Region     - n * yRegion + xRegion
        TypeIndex  - integer code per distinct value of 'Type', in order of
                     first appearance
        CrimeCount - 1.0 for every row (each row is one crime)

    Parameters:
        data: DataFrame with 'Latitude', 'Longitude' and 'Type' columns.
        n: number of buckets per axis.

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    # One combined mask computed on the same frame. The original chained
    # data[mask_a][mask_b], applying a mask built on the unfiltered frame to
    # an already-filtered one, and then assigned columns onto that slice.
    in_bounds = ((data.Longitude > -130) & (data.Longitude < -120) &
                 (data.Latitude > 37) & (data.Latitude < 40))
    data = data[in_bounds].copy()

    # Each row is an occurrence of a single crime.
    data['xRegion'] = buckets(data['Latitude'], n)
    data['yRegion'] = buckets(data['Longitude'], n) + 1
    data['Region'] = n * data.yRegion + data.xRegion

    # Add in the types into the results.
    mapping = {key: i for i, key in enumerate(data['Type'].unique())}
    data['TypeIndex'] = data['Type'].map(mapping)

    # Now we can add the crime counts.
    data['CrimeCount'] = np.ones(len(data))
    return data

def extractDateFeatures(data):
    """
    Return a new frame with numeric date/time features derived from
    'DayOfWeek', 'Date' and 'Time'.

    Assumes the 'Date' and 'Time' columns already hold datetime values.

    Columns produced:
        DoW         - day of week as an integer, Sunday=1 .. Saturday=7
        Month       - calendar month of Date
        DoM         - day of month of Date
        Year        - calendar year of Date minus the year of the earliest Date
        ToD         - the minute component of Time
        Time        - whole seconds between the earliest Date and Time
                      (overwrites the original datetime column)
        TimeFeature - 12 * Year + Month
    'DayOfWeek' and 'Date' are dropped. The input frame is not modified.

    Parameters:
        data: DataFrame with 'DayOfWeek', 'Date' and 'Time' columns.

    Returns:
        A new DataFrame.
    """
    # Create map from week days to integers
    DayOfWeek = {'Sunday': 1,
                 'Monday': 2,
                 'Tuesday': 3,
                 'Wednesday': 4,
                 'Thursday': 5,
                 'Friday': 6,
                 'Saturday': 7}
    # Compute DoW first and attach it after the drop, so the new column is
    # written to the fresh frame instead of the caller's.
    dow = data['DayOfWeek'].map(DayOfWeek)
    data = data.drop('DayOfWeek', axis=1)
    data['DoW'] = dow
    print("Created Weeks")

    first_date = data.Date.min()
    data['Month'] = data.Date.map(lambda x: x.month)
    data['DoM'] = data.Date.map(lambda x: x.day)
    data['Year'] = data.Date.map(lambda x: x.year) - first_date.year
    # NOTE(review): this stores the minute-of-the-hour, yet a later plotting
    # cell labels 'ToD' as 'Hour of Day'. Confirm which was intended before
    # relying on this column.
    data['ToD'] = data.Time.map(lambda x: x.minute)
    # `.value` is nanoseconds since the epoch. Floor division keeps the result
    # in whole seconds on both Python 2 and Python 3.
    data['Time'] = data.Time.map(lambda x: x.value // 10**9) - first_date.value // 10**9

    # Number of months since the beginning, combining the year and the month.
    # This is plain column arithmetic; the original used a row-wise apply over
    # `.ix`, an indexer that newer pandas releases no longer provide.
    data['TimeFeature'] = 12 * data['Year'] + data['Month']

    data = data.drop('Date', axis=1)

    print("Created the time data features!")

    return data

In [9]:
# Location of the pickled, cleaned San Francisco frame (written by
# `to_pickle` and read back by the cells below).
sfpickle_file = '../../cs281_data/large_data/sfclean.pk'

In [None]:
# We only need to do this once, afterwards we should load from the 
# saved location.
# DO NOT RUN IF PICKLED FILE ALREADY EXISTS
# Pipeline: rename/drop columns, derive the date features, then pickle the
# result to sfpickle_file.
if not loadPickle:
    sfclean = cleanColumns(sfdata)
    sfclean = extractDateFeatures(sfclean)
    sfclean.to_pickle(sfpickle_file)

In [10]:
# Load sfclean from pickled location!
import pickle
# Pickles are binary data, so open in 'rb'; the `with` block closes the file
# handle. The original opened the file in text mode and never closed it.
# NOTE: pickle.load can execute arbitrary code. Only load pickle files that
# this project produced itself.
with open(sfpickle_file, 'rb') as sfclean_fh:
    sfclean = pickle.load(sfclean_fh)

In [11]:
# Sort the data by time.
# 'TimeFeature' is the months-since-start column (12 * Year + Month) built by
# extractDateFeatures.
sfclean = sfclean.sort_values(by='TimeFeature')

In [12]:
# Save memory by deleting old data
# After this cell `sfdata` no longer exists; re-running any earlier cell that
# uses it requires reloading the CSV first.
del(sfdata)
# del(sfclean_data)

In [None]:
# Extra count plots on the derived date columns of the cleaned SF frame.
# Keys are plot-title descriptions; values are column names.
# NOTE(review): 'ToD' is filled from `x.minute` in extractDateFeatures, so the
# 'Hour of Day' description may not match the data. Confirm.
columnsToPlotSF2 = {
    'Month': 'Month',
    'Day of Month': 'DoM',
    'Year': 'Year',
    'Hour of Day': 'ToD',
}

In [None]:
# `sfclean` is the San Francisco frame, so the titles must say so. The
# original passed 'Boston' here, which mislabelled every plot.
plotCrimeCounts(sfclean, columnsToPlotSF2, 'San Francisco')

In [None]:
# Make histograms of the crimes by month for each given year
# One figure per distinct value of bosData.Year, in order of first appearance.
# Note: `years`, `year` and `data` are notebook-level names and are reused
# (overwritten) by later cells.
years = bosData.Year.unique()
for year in years:
    # Subset the data
    data = bosData[bosData.Year == year]
    
    # Make a histogram based on month
    sns.countplot(x = 'Month', data=data)
    plt.title('Boston Crime Histogram for {}'.format(year))
    plt.show()

In [7]:
# Repeat the above procedure but do it for the sf data
# sfclean.Year is an offset from the earliest year in the data (see
# extractDateFeatures), so a constant is added back for the title.
# NOTE(review): the 2001 offset presumes the earliest SF record is from 2001.
# Verify against the data.
years = sfclean.Year.unique()
for year in years:
    data = sfclean[sfclean.Year == year]
    
    sns.countplot(x = 'Month', data=data)
    plt.title('San Francisco Crime Histogram for {}'.format(2001 + year))
    plt.show()

In [16]:
# Let's generate some heatmaps for both of these data crimes
# Can we overlay these on top of the geographical location???
# n is the number of buckets per axis, giving an n x n grid of regions.
n = 30
partitionedData = createPartitions(sfclean, n)

In [113]:
# Let's do one really large heatmap
def createHeatmapData(data, n =30):
    """
    Count rows per (xRegion, yRegion) grid cell.

    Parameters:
        data: DataFrame with 'xRegion' and 'yRegion' columns.
        n: grid size; cells are indexed 0 .. n-1 on both axes.

    Returns:
        An (n, n) numpy array where entry [i, j] is the number of rows with
        xRegion == i and yRegion == j.

    NOTE(review): createPartitions / createBosPartitions store yRegion as
    bucket + 1, i.e. 1 .. n, while this function only looks at 0 .. n-1. With
    those inputs column 0 is always empty and rows with yRegion == n are not
    counted. Confirm whether that offset is intended.
    """
    res = np.zeros((n, n))
    for i in range(n):
        # Filter on xRegion once per row of the grid rather than once per
        # cell; the counts are identical, but with n full-frame scans
        # instead of n * n.
        y_in_row = data.yRegion[data.xRegion == i]
        for j in range(n):
            res[i, j] = (y_in_row == j).sum()
    return res

In [17]:
# Heatmap of crime counts over the whole San Francisco grid (default n=30).
sns.heatmap(createHeatmapData(partitionedData))
plt.title('Crime Distribution in San Francisco')
plt.show()

  if self._edgecolors == str('face'):


In [118]:
# Let's redo the heatmap year by year?
def yearByYear(data, folder, city):
    """
    Save one crime heatmap per distinct value of data.Year.

    Parameters:
        data: partitioned DataFrame with 'Year', 'xRegion' and 'yRegion'.
        folder: sub-folder of ../figures/ the images are written to.
        city: city name used in the plot title. For 'San Francisco' the Year
              column is zero-indexed, so 2001 is added for display.

    Each figure is written to ../figures/<folder>/heat_map_<year> and closed.
    """
    for year in data.Year.unique():
        # Filter with the raw value stored in the frame. The original
        # overwrote `year` with the display value first, so the San
        # Francisco subsets never matched any row.
        subset = data[data.Year == year]

        # Deal with data in SF set being zero indexed
        label = year + 2001 if city == 'San Francisco' else year

        sns.heatmap(createHeatmapData(subset))
        plt.title('Crime Distribution in {} for {}'.format(city, label))
        plt.savefig('../figures/{}/heat_map_{}'.format(folder, label))
        plt.close()

In [24]:
# Let's redo it month by month because that's what we want to analyze?
def monthByMonth(data, folder, city):
    """
    Save one crime heatmap per (year, month) combination found in `data`.

    Parameters:
        data: partitioned DataFrame with 'Year', 'Month', 'xRegion' and
              'yRegion' columns.
        folder: sub-folder of ../figures/ the images are written to.
        city: city name used in the plot title. For 'San Francisco' the Year
              column is zero-indexed, so 2001 is added for display.

    The original body ignored all three parameters and always plotted the
    global `partitionedData` into 'sf_data_analysis'. Calling
    monthByMonth(partitionedData, 'sf_data_analysis', 'San Francisco')
    reproduces that output exactly.
    """
    years = data.Year.unique()
    months = data.Month.unique()
    year_offset = 2001 if city == 'San Francisco' else 0
    for year in years:
        for month in months:
            subset = data[(data.Year == year) & (data.Month == month)]
            shown_year = year + year_offset
            sns.heatmap(createHeatmapData(subset))
            plt.title('Crime Distribution in {} for {}, {}'.format(city, month, shown_year))
            plt.savefig('../figures/{}/heat_map_{}_{}'.format(folder, month, shown_year))
            plt.close()

In [76]:
# We now process the Boston Data: Again, we pickle the results.
# When loadBosPickle is True, the `if not loadBosPickle:` cells below
# (cleaning, dropping columns, saving) are skipped.
loadBosPickle = True
bos_pickle_file = '../../cs281_data/large_data/bosclean.pk'

In [77]:
# Let's process the boston data
# Renames the coordinate columns and derives numeric day/month/year/time
# columns from DAY_WEEK and FROMDATE. Skipped when loadBosPickle is True.
if not loadBosPickle:
    # Clean the columns
    # NOTE(review): here Latitude comes from X and Longitude from Y, while
    # cleanColumns (SF data) maps Latitude <- Y and Longitude <- X. Confirm the
    # Boston mapping is intentional.
    bosData['Latitude'] = bosData['X']
    bosData['Longitude'] = bosData['Y']
    
    # Drop unused columns
    toDrop = ['X', 'Y']
    bosData = bosData.drop(toDrop, axis=1)
    
    # Extract date features
    # day of week: replace each day name by an integer, Sunday=0 .. Saturday=6
    day = np.array(bosData.DAY_WEEK)
    day[ day == "Sunday"] = 0
    day[ day == "Monday"] = 1
    day[ day == "Tuesday"] = 2
    day[ day == "Wednesday"] = 3
    day[ day == "Thursday"] = 4
    day[ day == "Friday"] = 5
    day[ day == "Saturday"] = 6

    # Split FROMDATE on whitespace; the code below indexes tokens 0, 1 and 2,
    # i.e. it expects "<date> <time> <AM|PM>" in every row.
    date_time = np.array([x.split() for x in bosData.FROMDATE])
    date = date_time[:,0]
    time = date_time[:,1]
    tod = date_time[:,2]

    # month, day, year (date token is split on '/')
    date = np.array([x.split('/') for x in date])
    month = [int(x) for x in date[:,0]]
    dom = [int(x) for x in date[:,1]]
    year = [int(x) for x in date[:,2]]

    # time of day, as minutes since midnight:
    #   12:mm AM            -> mm
    #   other AM, 12:mm PM  -> 60*hh + mm
    #   everything else     -> 12*60 + 60*hh + mm
    time_c = [x.split(':') for x in time]
    time = [int(x[1]) if (y == 'AM' and int(x[0]) == 12) else 60*int(x[0])+int(x[1]) 
            if (y =='AM' and int(x[0]) != 12) or (int(x[0]) == 12 and y == 'PM') else 12*60+60*int(x[0])+int(x[1]) 
            for x,y in zip(time_c, tod)]
    
    # Add them back to the data frame
    bosData['Day'] = day
    bosData['Month'] = month
    bosData['Dom'] = dom
    bosData['Year'] = year
    bosData['Time'] = time

In [120]:
# Display the cleaned Boston frame.
# NOTE(review): within the visible notebook `bosclean` is first assigned in a
# later cell (the pickle load), so this cell depends on out-of-order execution.
bosclean

Unnamed: 0,SHIFT,Year,Month,DAY_WEEK,UCRPART,Latitude,Longitude,day,month,dom,year,time,xRegion,yRegion,Region
0,Last,2012,7,Sunday,Part One,763273.1791,2951498.962,0,7,8,2012,360,10,21,640
1,Last,2012,7,Sunday,Part One,771223.1638,2940772.099,0,7,8,2012,363,14,16,494
2,Last,2012,7,Sunday,Part One,765118.8605,2950217.536,0,7,8,2012,386,11,21,641
3,Last,2012,7,Sunday,Part One,773591.8648,2940638.174,0,7,8,2012,416,15,16,495
4,Last,2012,7,Sunday,Part One,759042.7315,2923832.681,0,7,8,2012,435,8,7,218
5,Day,2012,7,Sunday,Part One,777066.0032,2939504.801,0,7,8,2012,452,17,15,467
6,Day,2012,7,Sunday,Part One,768066.0298,2943456.962,0,7,8,2012,470,12,17,522
7,Day,2012,7,Sunday,Part Two,774083.6785,2952595.797,0,7,8,2012,470,16,22,676
8,Day,2012,7,Sunday,Part Three,746474.0254,2953382.196,0,7,8,2012,473,1,22,661
9,Day,2012,7,Sunday,Part Three,759766.2593,2919778.597,0,7,8,2012,485,8,5,158


In [79]:
# Drop some more unnecessary columns, then drop every row with a missing value.
if not loadBosPickle:
    # 'Location' used to be listed twice; once is enough.
    toDrop = ['COMPNOS', 'NatureCode', 'Location', 'XSTREETNAME', 'STREETNAME']
    toDrop += ['INCIDENT_TYPE_DESCRIPTION', 'MAIN_CRIMECODE', 'REPTDISTRICT']
    toDrop += ['REPORTINGAREA', 'FROMDATE', 'WEAPONTYPE', 'Shooting', 'DOMESTIC']

    bosData = bosData.drop(toDrop, axis=1)
    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print("Records before: {}".format(len(bosData)))
    bosData = bosData.dropna(axis=0)
    print("Records after: {}".format(len(bosData)))

Records before: 268056
Records after: 253075


In [80]:
# Let's save the data
# Pickles the cleaned frame to bos_pickle_file, then deletes `bosData`; later
# cells work with the reloaded `bosclean` instead.
if not loadBosPickle:
    bosData.to_pickle(bos_pickle_file)
    del(bosData)

In [81]:
# Let's load the data
import pickle
# Pickles are binary data, so open in 'rb'; the `with` block closes the file
# handle. The original opened the file in text mode and never closed it.
# NOTE: pickle.load can execute arbitrary code. Only load pickle files that
# this project produced itself.
with open(bos_pickle_file, 'rb') as bosclean_fh:
    bosclean = pickle.load(bosclean_fh)

In [82]:
# Grid partitioning for the Boston frame
def createBosPartitions(data, n):
    """
    Attach grid-region columns to `data` and return it.

    Adds, in place, on the frame that was passed in:
        xRegion - buckets() of Latitude
        yRegion - buckets() of Longitude, plus 1
        Region  - n * yRegion + xRegion

    Parameters:
        data: DataFrame with 'Latitude' and 'Longitude' columns.
        n: number of buckets per axis.

    Returns:
        The same DataFrame object, now carrying the three extra columns.
    """
    lat_bucket = buckets(data.Latitude, n)
    lon_bucket = buckets(data.Longitude, n) + 1

    data['xRegion'] = lat_bucket
    data['yRegion'] = lon_bucket
    data['Region'] = n * data.yRegion + data.xRegion

    return data

In [115]:
# Let's generate some heatmaps on the boston crime data
# Can we overlay these over a boston map?
# createBosPartitions adds its columns to `bosclean` itself and returns the
# same frame, so `bosPartitions` and `bosclean` refer to one object.
n = 30
bosPartitions = createBosPartitions(bosclean, n)

In [116]:
# Let's do a single large heatmap
# Counts all Boston rows per grid cell (default n=30 in createHeatmapData).
sns.heatmap(createHeatmapData(bosPartitions))
plt.title('Crime Distribution in Boston')
plt.show()

In [119]:
# Save one Boston heatmap per year under ../figures/boston_data_analysis/.
yearByYear(bosPartitions, 'boston_data_analysis/', 'Boston')

In [110]:
# Show the Boston heatmap for the single year 2014.
# Overwrites the notebook-level name `data`.
data = bosPartitions[bosPartitions.Year == 2014]
sns.heatmap(createHeatmapData(data))
plt.show()

In [None]:
# Memo tables shared across calls, keyed by (tuple(numbers), k):
#   res   - whether k is reachable from `numbers`
#   steps - for reachable keys, a string describing the sub-target that was tried
res = {}
steps = {}
def operationsToK(numbers, k, epsilon = 0.0001):
    """
    Decide whether `k` can be produced from `numbers` by peeling off one
    number at a time with +, -, * or /.

    For each `number` in `numbers`, the remaining numbers are asked to reach
    a sub-target t such that combining t with `number` gives k:
        t = k - number      (t + number == k)
        t = number - k      (number - t == k)
        t = number + k      (t - number == k)
        t = number * k      (t / number == k, needs number != 0)
        t = number / k      (number / t == k, needs number != 0 and k != 0)
        t = k / number      (t * number == k, needs number != 0)

    The zero guards avoid ZeroDivisionError and stop 0/0 from being accepted
    as a way to reach an arbitrary k.

    Parameters:
        numbers: sequence of numbers; every one must be used exactly once.
        k: target value.
        epsilon: tolerance for the final single-number comparison. It is now
                 passed down to recursive calls as well.

    Returns:
        True if k is reachable by this peeling scheme, else False. Results
        are memoized in the module-level `res`. The memo key does not include
        epsilon, so use one epsilon consistently (or clear `res`).
    """
    key = (tuple(numbers), k)
    if key in res:
        return res[key]

    if len(numbers) == 1:
        return abs(numbers[0] - k) < epsilon

    for number in numbers:
        newSet = list(numbers)
        newSet.remove(number)

        # (sub-target for the remaining numbers, step description)
        candidates = [
            (k - number, '{}-{}'.format(k, number)),
            (number - k, '{}-{}'.format(number, k)),
            (number + k, '{}+{}'.format(k, number)),
        ]
        if number != 0:
            candidates.append((number * k, '{}*{}'.format(k, number)))
            if k != 0:
                candidates.append((float(number) / k, '{}/{}'.format(number, k)))
            candidates.append((k / float(number), '{}/{}'.format(k, number)))

        for target, step in candidates:
            if operationsToK(newSet, target, epsilon):
                res[key] = True
                steps[key] = step
                return True

    res[key] = False
    return False