## EXPLANATORY ANALYSIS 

In [1]:
## Libraries Used For The Analysis ##
import time
import numpy as np
import pandas as pd
import seaborn as sns
import math as math

import plotly
import plotly.plotly as py
import plotly.graph_objs as go
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

import matplotlib.pyplot as plt
from matplotlib import pylab
from matplotlib import cm
from matplotlib import style
style.use('fivethirtyeight')

%matplotlib inline

# DataSet Input As Data Frame
trips = pd.read_csv('formated_uber_data.csv')
trips.columns = ['timestamp', 'lat', 'lon', 'base','weekday','month','day','hour','minute','dayofweek']

trips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4534327 entries, 0 to 4534326
Data columns (total 10 columns):
timestamp    object
lat          float64
lon          float64
base         object
weekday      object
month        int64
day          int64
hour         int64
minute       int64
dayofweek    int64
dtypes: float64(2), int64(5), object(3)
memory usage: 345.9+ MB


In [2]:
## Histogram Demonstrating Number Of Trips Against Different Times Of A Day ##

trace = go.Histogram(x=trips.hour, opacity=0.750 )
data = [trace]

layout = go.Layout(title='Averege Trips In A Day (April - September 2014)',
                   titlefont=dict(size=30),
                   xaxis=dict(title='Time In Hours',
                              titlefont=dict(
                              family='Courier New, monospace',
                              size=30,color='#7f7f7f')),
                   yaxis=dict(title='Trips Frequency',
                              titlefont=dict(
                              family='Courier New, monospace',
                              size=30,color='#7f7f7f')),
                   
                   bargap=0.2)

fig = go.Figure(data=data, layout=layout)
plotly.offline.plot(fig, filename='Histogram(TRIPS-TIME).html')

'file:///Users/dimitriskemos/Documents/MSc Data Science And Analytics/Visual Communication and Information Design/Assesments/2/Viz/uber-pickups-in-new-york-city/Uber14/Histogram(TRIPS-TIME).html'

In [3]:
## Heatmap Demonstrating The Number Of Trips Against Different Time And Days 

def count_rows(rows):
    return len(rows)

by_cross = trips.groupby('dayofweek hour'.split()).apply(count_rows).unstack()

data = [go.Heatmap(z=by_cross.values.tolist(),colorscale='Inferno')]

layout = go.Layout(
            title='Heatmap Indicating Trip Frequency',
            titlefont=dict(size=30),
    
    xaxis=dict(
            tickfont=dict(size=15),
            tickmode = ['array'],
            tickvals = ['0','1','2','3',' 4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','22','23','24'],
            ticktext = ['00:00','01:00','02:00','03:00',' 04:00','05:00','06:00','07:00','08:00','09:00','10:00','11:00','12:00','13:00','14:00','15:00','16:00','17:00','18:00','19:00','20:00','21:00','22:00','23:00','24:00'],
            title='Time In Hours',
            titlefont=dict(size=25),
            tickangle=45),
    
    yaxis=dict(
            tickfont=dict(size=15),
            tickmode = ['array'],
            tickvals = ['0','1','2','3',' 4','5','6'],
            ticktext = ['Mon','Tue','Wed','Thu',' Frid','Sat','Sun'],
            title='Days',
            titlefont=dict(size=25)))

updatemenus=list([dict(buttons=list([   
                  dict(args=['type', 'surface'],label='3D Surface',method='restyle'),
                  dict(args=['type', 'heatmap'],label='Heatmap',method='restyle')]),
                       
                  direction = 'left',
                  pad = {'r': 10, 't': 10},
                  showactive = True,
                  type = 'buttons',
                  x = 0.1,
                  xanchor = 'left',
                  y = 1.1,
                  yanchor = 'top')])

annotations = list([dict(
                    text='Trace type:',
                    x=0, 
                    y=1.085, 
                    yref='paper', 
                    align='left', 
                    showarrow=False)])

annotations = list([dict(text='Trips',
                         x=1.054, y=1.025, 
                         yref='paper',
                         xref='paper' ,
                         align='middle',
                         showarrow=False,
                         font=dict(size=16)),
                         
                        dict(text='Trace type:', 
                         x=0, 
                         y=1.085, 
                         yref='paper', 
                         align='left', 
                         showarrow=False)])
    
layout['updatemenus'] = updatemenus
layout['annotations'] = annotations


df = pd.DataFrame(by_cross)
df.to_csv('by_cross.csv')
fig = go.Figure(data=data, layout=layout)
plotly.offline.plot(fig, filename='HeatMap(TRIPS-TIME-DAY).html')

'file:///Users/dimitriskemos/Documents/MSc Data Science And Analytics/Visual Communication and Information Design/Assesments/2/Viz/uber-pickups-in-new-york-city/Uber14/HeatMap(TRIPS-TIME-DAY).html'

In [4]:
df = pd.read_csv('by_cross_formated.csv', encoding = "ISO-8859-1")
df.columns = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']
df['Time'] = df.index

Monday = go.Scatter(
                    x = df['Time'],
                    y = df.Monday,
                    name = "Monday",
                    line = dict(color = '#8c510a',width = 4,shape='spline'),
                    mode = 'lines+markers',
                    opacity = 0.8)

Tuesday = go.Scatter(
                    x = df['Time'],
                    y = df.Tuesday,
                    name = "Tuesday",
                    line = dict(color = '#d8b365',width = 4,shape='spline'),
                    mode = 'lines+markers',
                    opacity = 0.8)

Wednesday = go.Scatter(
                    x = df['Time'],
                    y = df.Wednesday,
                    name = "Wednesday",
                    line = dict(color = '#c51b7d',width = 4,shape='spline'),
                    mode = 'lines+markers',
                    opacity = 0.8)

Thursday = go.Scatter(
                    x = df['Time'],
                    y = df.Thursday,
                    name = "Thursday",
                    line = dict(color = '#762a83',width = 4,shape='spline'),
                    mode = 'lines+markers',
                    opacity = 0.8)

Friday = go.Scatter(
                    x = df['Time'],
                    y = df.Friday,
                    name = "Friday",
                    line = dict(color = '#c7eae5',width = 4,shape='spline'),
                    mode = 'lines+markers',
                    opacity = 0.8)

Saturday = go.Scatter(
                    x = df['Time'],
                    y = df.Saturday,
                    name = "Saturday",
                    line = dict(color = '#5ab4ac',width = 4,shape='spline'),
                    mode = 'lines+markers',
                    opacity = 0.8)

Sunday = go.Scatter(
                    x = df['Time'],
                    y = df.Sunday,
                    name = "Sunday",
                    line = dict(color = '#01665e',width = 4,shape='spline'),
                    mode = 'lines+markers',
                    opacity = 0.8)

data = [Monday, Tuesday, Wednesday, Thursday, Friday, Saturday, Sunday]

layout = dict(
            title='Frequency Of Trips During Different Times and Days',
            titlefont=dict(size=30),
    
            xaxis=dict(
                    zeroline = True,
                    showline = True,
                    title='Time In Hours',
                    tickmode = ['array'],
                    tickvals = ['0','1','2','3',' 4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','22','23'],
                    ticktext = ['00:00','01:00','02:00','03:00',' 04:00','05:00','06:00','07:00','08:00','09:00','10:00','11:00','12:00','13:00','14:00','15:00','16:00','17:00','18:00','19:00','20:00','21:00','22:00','23:00'],
                    titlefont=dict(size=20),
                    tickangle=45),
    
            yaxis=dict(rangemode='tozero',
                    zeroline = True,
                    title='Trip Frequency',
                    titlefont=dict(size=20)),updatemenus=updatemenus)

updatemenus = list([
                    dict(active=9,
                         buttons=list([   
                         dict(label = 'Monday',
                             method = 'update',
                             args = [{'visible': [True, False, False,False,False,False,False]},
                             {'title': 'Frequency Of Trips During Different Times On Monday'}]),
                        dict(label = 'Tuesday',
                             method = 'update',
                             args = [{'visible': [False, True, False,False,False,False,False]},
                             {'title': 'Frequency Of Trips During Different Times On Tuesday'}]),
                        dict(label = 'Wednesday',
                             method = 'update',
                             args = [{'visible': [False, False,True,False,False,False,False]},
                             {'title': 'Frequency Of Trips During Different Times On Wednesday'}]),
                        dict(label = 'Thursday',
                             method = 'update',
                             args = [{'visible': [False, False,False,True,False,False,False]},
                             {'title': 'Frequency Of Trips During Different Times On Thursday'}]),
                        dict(label = 'Friday',
                             method = 'update',
                             args = [{'visible': [False, False,False,True,False,False,False]},
                             {'title': 'Frequency Of Trips During Different Times On Friday'}]),
                        dict(label = 'Saturday',
                             method = 'update',
                             args = [{'visible': [False, False,False,False,False,True,False]},
                             {'title': 'Frequency Of Trips During Different Times On Saturday'}]),
                        dict(label = 'Sunday',
                             method = 'update',
                             args = [{'visible': [False, False,False,False,False,False,True]},
                             {'title': 'Frequency Of Trips During Different Times On Sunday'}]),
                        dict(label = 'Weekday',
                             method = 'update',
                             args = [{'visible': [True, True,True,True,True,False,False]},
                             {'title': 'Frequency Of Trips During Different Times On Weekdays'}]),
                        dict(label = 'Weekend',
                             method = 'update',
                             args = [{'visible': [False, False,False,False,False,True,True]},
                             {'title': 'Frequency Of Trips During Different Times On Weekends'}]), 
                        dict(label = 'All',
                             method = 'update',
                             args = [{'visible': [True, True,True,True,True,True,True]},
                             {'title': 'Frequency Of Trips During Different Times and Days'}]),
                        ]),
                direction = 'down',
                pad = {'r': 10, 't': 10},
                showactive = True,
                x = 1.02,
                xanchor = 'left',
                y = 0.50,
                yanchor = 'top')
        ])

annotations = list([dict(
                        text='Day Filter',
                        x=1.075, 
                        y=0.50, 
                        yref='paper',
                        xref='paper',
                        align='left', 
                        showarrow=False)])

layout['updatemenus'] = updatemenus
layout['annotations'] = annotations

fig = dict(data=data,layout=layout)

# Plot Figure #
plotly.offline.plot(fig, filename='Point Plot(TRIPS-DAY-TIME).html')

'file:///Users/dimitriskemos/Documents/MSc Data Science And Analytics/Visual Communication and Information Design/Assesments/2/Viz/uber-pickups-in-new-york-city/Uber14/Point Plot(TRIPS-DAY-TIME).html'

In [5]:
##  Histogram Displaying The Distribution Of Trips Over The Days Of The Week ##

trace = go.Histogram(x=trips.dayofweek,opacity=0.750)
data = [trace]
layout = go.Layout(
                   title=('Uber Pickups Over A Week (April - September 2014)'),
                   titlefont=dict(size=30),
    
                   xaxis=dict(title= 'Days Of Week',
                              tickmode = ['array'],
                              tickvals = ['0','1','2','3',' 4','5','6'],
                              ticktext = ['Monday','Tuesday','Wedenday','Thursday',' Friday','Saturday','Sunday'],
                              titlefont=dict(family='Courier New, monospace',
                                             size=25,
                                             color='#7f7f7f'),
                              tickfont=dict(size=15)),

                   yaxis=dict(title='Trip Frequnecy',
                              titlefont=dict(family='Courier New, monospace',
                                             size=25,
                                             color='#7f7f7f'),
                              tickfont=dict(size=15)),
                     bargap=0.2)

fig = go.Figure(data=data, layout=layout)

# Plot Figure #
plotly.offline.plot(fig, filename='Histogram(TRIPS-DAYS).html')

'file:///Users/dimitriskemos/Documents/MSc Data Science And Analytics/Visual Communication and Information Design/Assesments/2/Viz/uber-pickups-in-new-york-city/Uber14/Histogram(TRIPS-DAYS).html'

In [6]:
## Latitude Ploted Against Number Of Trips With Interactive Elements ##

trace = go.Histogram(x=trips.lat, xbins={'start':40.5, 'end':41, 'size': 0.0008},opacity=0.750)
data = [trace]
layout = go.Layout(title='Uber Trips Against Latitude (April - September 2014)',
                   xaxis=dict(title='Time'),
                   yaxis=dict( title='Trips'))

fig = go.Figure(data=data, layout=layout)
plotly.offline.plot(fig, filename='Histogram(LATITUDE).html')

'file:///Users/dimitriskemos/Documents/MSc Data Science And Analytics/Visual Communication and Information Design/Assesments/2/Viz/uber-pickups-in-new-york-city/Uber14/Histogram(LATITUDE).html'

In [7]:
## Longitude Ploted Against Number Of Trips With Interactive Elemetns ##

trace = go.Histogram(x=trips.lon, xbins={'start':-74.1, 'end':-73.9, 'size': 0.0008},opacity=0.750,xaxis='x2')
data = [trace]
layout = go.Layout(title='Uber Trips Against Longitude (April - September 2014)',
                   xaxis=dict(title='Time'),
                   yaxis=dict( title='Trips'))

fig = go.Figure(data=data, layout=layout)

# Plot Figure #
plotly.offline.plot(fig, filename='Histogram(LONGITUDE).html')

'file:///Users/dimitriskemos/Documents/MSc Data Science And Analytics/Visual Communication and Information Design/Assesments/2/Viz/uber-pickups-in-new-york-city/Uber14/Histogram(LONGITUDE).html'

In [8]:
## Latitude And Longitude Ploted Together With Number Of Trips ##

Latitude = go.Histogram(
           x=trips.lat, 
           xbins={'start':40.5, 'end':41, 'size': 0.0016},
           opacity=0.750,xaxis='x1',
           name='Latitude')

Longitude = go.Histogram(
            x=trips.lon, 
            xbins={'start':-74.1,'end':-73.9, 'size': 0.0008},
            opacity=0.750,xaxis='x2',
            name='Longitude')

layout = go.Layout(yaxis=dict(title = "Trip Frequency",ticks='outside'),
                   xaxis=dict(title= 'Latitude'),
                   xaxis2=dict(title= 'Longitude',
                   side = 'top',overlaying='x',
                   showgrid=False))
            

annotations = list([dict(text='Trips Frequency On Different Coordinates',
                         x=0, y=1.130, 
                         yref='paper',
                         xref='paper' ,
                         align='left',
                         showarrow=False,
                         font=dict(size=25))])

layout['annotations'] = annotations

data = [Latitude, Longitude]

fig = go.Figure(data=data, layout=layout)

plotly.offline.plot(fig, filename='Histogram(LATITUDE-LONGITUDE).html')


'file:///Users/dimitriskemos/Documents/MSc Data Science And Analytics/Visual Communication and Information Design/Assesments/2/Viz/uber-pickups-in-new-york-city/Uber14/Histogram(LATITUDE-LONGITUDE).html'

## CLUSTER ANALYSIS

In [9]:
from sklearn.cluster import DBSCAN
from geopy.distance import great_circle
from shapely.geometry import MultiPoint
import mplleaflet as mpl

# clustering function that:
# takes the maximum distance between two points, the minimum number of points required within each cluster
# and the lat - lon from the trips data set, returrning the hot-spots in [lon,lat,number of members]
def hotspots(max_dist,min_trips,trip_data):
    point = trip_data.as_matrix(columns=['lat', 'lon'])
    
    kms_per_radian = 6371.0088
    epsilon = max_dist / kms_per_radian
    
    # epsilon and coordinates get converted to radians, because scikit-learn’s haversine metric needs radian units
    db = DBSCAN(eps=epsilon, min_samples=min_trips,
                algorithm='ball_tree', metric='haversine').fit(np.radians(point))
    
    cluster_labels = db.labels_
    num_clusters = len(set(cluster_labels))
    clusters = pd.Series([point[cluster_labels == n] for n in range(num_clusters)])
    
    print('Number of clusters: {}'.format(num_clusters))
    
    lat,lon,num_members = [],[],[]
    
    # get the cluster centroids
    for ii in range(len(clusters)):
        if clusters[ii].any():
            lat.append(MultiPoint(clusters[ii]).centroid.x)
            lon.append(MultiPoint(clusters[ii]).centroid.y)
            num_members.append(len(clusters[ii]))
    hot_spots = [lon,lat,num_members]
    
    return hot_spots


trip_data = trips.loc[(trips['weekday']=='Monday')&(trips['hour']>18)]

# maximum distance between two points 
# contained in the same cluster measured in kilometers
# can be redifined
max_dist = 0.07

#minimum number of pickup points inside the same cluster
# can be redifined
min_trips = 200

#calling the function containing the clustering procedure 
# using the values defined above
hot_spots = hotspots(max_dist,min_trips, trip_data)

df = pd.DataFrame(hot_spots)
df.to_csv('Cluster_Data.csv')


Number of clusters: 47


## CORREALTION COEFFICIENT

In [3]:
import pandas as pd
weather_trips = pd.read_csv('uber_nyc_enriched.csv')
weather_trips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29101 entries, 0 to 29100
Data columns (total 13 columns):
pickup_dt    29101 non-null object
borough      26058 non-null object
pickups      29101 non-null int64
spd          29101 non-null float64
vsb          29101 non-null float64
temp         29101 non-null float64
dewp         29101 non-null float64
slp          29101 non-null float64
pcp01        29101 non-null float64
pcp06        29101 non-null float64
pcp24        29101 non-null float64
sd           29101 non-null float64
hday         29101 non-null object
dtypes: float64(9), int64(1), object(3)
memory usage: 2.9+ MB


In [None]:
weather_trips.info() , weather_trips[['pickups','temp']].describe

In [None]:
## Visual Check For Spearmans Assumptions, 
weather_trips.plot.scatter("temp", "pickups")

In [None]:
import scipy.stats as stats

## Checking Assumptions For Spearman Correlation
stats.levene(weather_trips['pickups'], weather_trips['temp'])

In [None]:
## Acording To Lavenes Test Spearman Cant Be Used On Our Data
## Spearman Calculated Out Of Curiosity 
weather_trips['pickups'].corr(weather_trips['temp'],method= 'spearman')