Chicago Data Portal download: https://data.cityofchicago.org/Service-Requests/311-Service-Requests-Pot-Holes-Reported/7as2-ds3y

In [None]:
cd~/Downloads

In [None]:
# Capture the current calendar date once so later cells share one reference value
from datetime import date, datetime

today = date.today()

In [None]:
# Load the pothole service requests from the CSV in the working directory
import pandas as pd
# NOTE: this rebinds the name `datetime` to the module (an earlier cell imported the class)
import datetime
data = pd.read_csv('Potholes.csv')
# Drop rows whose STATUS marks them as duplicates (open or completed).
# .copy() makes the result an independent frame, so the column assignments in
# later cells do not trigger SettingWithCopyWarning.
data = data[~data.STATUS.isin(['Open - Dup', 'Completed - Dup'])].copy()

In [None]:
# Convert creation date to datetime format.
# infer_datetime_format is deprecated in pandas 2.x (a consistent format is
# inferred by default), so it is no longer passed.
data['CREATION DATE'] = pd.to_datetime(data['CREATION DATE'])

In [None]:
# Convert completion date to datetime format.
# errors='coerce' turns any unparseable value into NaT. The previous
# errors='ignore' would hand back the original strings untouched if a single
# value failed to parse, which breaks the date arithmetic in later cells.
data['COMPLETION DATE'] = pd.to_datetime(data['COMPLETION DATE'], errors='coerce')

In [None]:
# Create a new column where every completion date is filled in:
# requests with no completion date are treated as completed "today".
# This helps with the calculations later on.
# `today` is wrapped in pd.Timestamp so the filled column stays datetime64;
# filling with a plain datetime.date can leave it as object dtype.
data['COMPLETION_DATE'] = data['COMPLETION DATE'].fillna(pd.Timestamp(today))

In [None]:
# Create columns holding the time taken to complete each request, in whole days.
#   completion_length_na : NaN where the request has no completion date
#   completion_length    : uses the filled COMPLETION_DATE column instead
created = data['CREATION DATE']
# pd.to_datetime is a no-op on a datetime64 column; it only matters if the
# filled column was left as object dtype by the fill step.
completed_filled = pd.to_datetime(data['COMPLETION_DATE'])
# .dt.days replaces astype('timedelta64[D]'), which newer pandas rejects;
# the float cast keeps both columns as floats for easier calculations.
data['completion_length_na'] = (data['COMPLETION DATE'] - created).dt.days.astype('float64')
data['completion_length'] = (completed_filled - created).dt.days.astype('float64')

In [None]:
# Summarise both columns in one table: a cell only displays its last
# expression, so two separate describe() calls would show just the second one.
data[['completion_length_na', 'completion_length']].describe()
#Almost 400K unique data points
#Including NA values increases the data set by less than 1%
    #more importantly, it does not change the mean or quartile placements
    

In [None]:
#Review boxplots to examine the data distribution and ensure that adding the NA values does not change the outlook
# Plot the durations themselves (not value_counts(), which would box the
# frequency of each distinct duration), side by side on one figure.
data[['completion_length', 'completion_length_na']].plot.box(figsize=(7, 8))

In [None]:
# Look only at requests that are still open
status_open = data[data['STATUS'] == 'Open']
status_open.completion_length.plot.box()
# describe() is kept as the last expression so its table is displayed along
# with the figure; placed before the plot, its output was silently dropped.
status_open.completion_length.describe()
#from the boxplot it appears open requests are mostly older requests
#Mean and median are much higher than for 99% of completed cases

In [None]:
# Report skewness and kurtosis together; as two separate expressions only the
# last one (kurtosis) would be displayed by the cell.
data.completion_length.agg(['skew', 'kurt'])
#skew 3.29 --> very positively skewed
#kurt 22.29 --> very long tails

A few points:
1. The city has a fairly successful rate of completion
2. The first quartile of data falls within being completed on the first day requested
3. Mean > median, showing a positive distribution skew
4. NA values in completion length can be included in our computations without affecting the data distribution
5. Because of the skewness and many outliers it would be 
    best to transform this data for further use

In [None]:
import numpy as np
# About a quarter of requests are created and completed within a day, so the
# column to be transformed holds many zeros.
# A log(x + c) transformation with c = 1 works around log(0).
data['log_transformation'] = np.log(1 + data['completion_length'])
data['log_transformation'].describe()

In [None]:
data['log_transformation'].plot(kind='box')
# Outliers are still present, but to a lesser degree

In [None]:
data['log_transformation'].plot(kind='hist')
# Still positively skewed, again to a lesser degree

In [None]:
# Report skewness and kurtosis together; as two separate expressions only the
# last one (kurtosis) would be displayed by the cell.
data.log_transformation.agg(['skew', 'kurt'])
#skew .17 --> very close to 0
#kurt -1.06 --> kurtosis has been transformed from a leptokurtic distribution
#to a platykurtic distribution, showing less severe tails than a normal distribution

In [None]:
# Split the requests into bands of the log-transformed completion time.
# The cut points are computed from the data. The previous hand-typed constants
# (0.693147, 1.945910, 3.135494) look like the 25%/50%/75% values copied from
# describe() -- TODO confirm. Being truncated, they sat just below the real
# values, so rows exactly on a boundary fell into the wrong band.
log_days = data['log_transformation']
q1, q2, q3 = log_days.quantile([0.25, 0.50, 0.75])

first_quartile = data[log_days <= q1]
# Lower bounds are strict '>' so the bands do not overlap (they were '<',
# which made each band a subset of the one before it).
second_quartile = data[(log_days > q1) & (log_days <= q2)]
third_quartile = data[(log_days > q2) & (log_days <= q3)]
# Everything above the 75th percentile; the existing name `outliers` is kept.
outliers = data[log_days > q3]

The next step is to apply geovisualization for a better understanding of response rate within community locations as well as to locate if there is a reporting pattern.
This will be achieved by using the folium mapping package
and overlaying heat and distribution maps.

In [None]:
import folium
from folium import plugins
import matplotlib.pyplot as plt
import seaborn as sns