In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
# Python 3 program to calculate Distance Between Two Points on Earth
from math import radians, cos, sin, asin, sqrt
def distance(lat1, lat2, lon1, lon2):
    """Return the great-circle (haversine) distance between two points, in km.

    Note the argument order: both latitudes come first, then both longitudes.

    Parameters
    ----------
    lat1, lat2 : float
        Latitudes of the first and second point, in decimal degrees.
    lon1, lon2 : float
        Longitudes of the first and second point, in decimal degrees.

    Returns
    -------
    float
        Distance along the sphere's surface, using an Earth radius of 6371 km.
        NaN inputs propagate to a NaN result.
    """
    # The trigonometric functions from ``math`` work in radians.
    lat1, lat2 = radians(lat1), radians(lat2)
    lon1, lon2 = radians(lon1), radians(lon2)

    # Haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2

    # Mathematically a <= 1, but floating-point rounding can push it slightly
    # above 1 for (near-)antipodal points, which would make asin() raise a
    # domain error. The explicit comparison leaves NaN untouched.
    if a > 1.0:
        a = 1.0

    c = 2 * asin(sqrt(a))

    # Radius of earth in kilometers. Use 3956 for miles
    r = 6371

    return c * r
	
	
def getDistance(A,B):
    """Return the distance between two records, as computed by ``distance``.

    ``A`` and ``B`` are any mappings (dict, pandas row, ...) exposing
    "latitude" and "longitude" entries.
    """
    lat_a = A["latitude"]
    lat_b = B["latitude"]
    lon_a = A["longitude"]
    lon_b = B["longitude"]
    # ``distance`` expects both latitudes first, then both longitudes.
    return distance(lat_a, lat_b, lon_a, lon_b)

def isinBeijing(data):
    """Flag each row of ``data`` that lies within a fixed circle around Beijing.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide "latitude" and "longitude" columns in decimal degrees.

    Returns
    -------
    list of bool
        One entry per row, in row order: True when the row's distance to the
        reference point is at most the reference radius.
    """
    # NOTE(review): 16411 looks like Beijing's area in km^2; a circle of that
    # area would have radius sqrt(area / pi), not sqrt(area / (2 * pi)).
    # Kept as-is -- confirm the intended radius.
    beijing = {"lat":40.190632, "lon":116.412144,"radius":sqrt(16411/(np.pi*2))}
    # Walk the two columns directly instead of building a row Series with
    # data.iloc[i] twice per row, which is very slow on large frames.
    return [
        beijing["radius"] >= distance(lat, beijing["lat"], lon, beijing["lon"])
        for lat, lon in zip(data["latitude"], data["longitude"])
    ]
def defineZones(data,zoneRadius=15):
    """Greedily pick zone centers from the distinct positions in ``data``.

    Distinct (longitude, latitude) pairs are visited in the order produced by
    ``value_counts()``. A pair becomes a new center unless an already chosen
    center is closer than ``zoneRadius`` (same unit as ``distance`` returns).

    Returns
    -------
    list of tuple
        The chosen centers as (longitude, latitude) tuples, in selection order.
    """
    candidates = data[["longitude","latitude"]].value_counts().index.to_list()
    centers = []
    for candidate in candidates:
        lon, lat = candidate
        # any() stops at the first center that is too close.
        too_close = any(
            distance(lat, chosen[1], lon, chosen[0]) < zoneRadius
            for chosen in centers
        )
        if not too_close:
            centers.append(candidate)

    return centers
def fillZone(data,centers):
    """Assign every row of ``data`` to its nearest zone center.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide "latitude" and "longitude" columns.
    centers : sequence of (longitude, latitude)
        Zone centers, e.g. as returned by ``defineZones``. Must not be empty:
        ``np.argmin`` raises ValueError on an empty list.

    Returns
    -------
    list
        For each row, in row order, the index into ``centers`` of the closest
        center (the ``np.argmin`` result; the first one wins on ties).
    """
    res = []
    # Iterate the coordinate columns directly. The previous version printed
    # the row counter on every iteration (leftover debug output) and looked
    # each row up with data.iloc[i] inside the inner loop.
    for lat, lon in zip(data["latitude"], data["longitude"]):
        dists = [distance(c[1], lat, c[0], lon) for c in centers]
        res.append(np.argmin(dists))
    return res

In [3]:
# Reference point and radius used to centre the maps on Beijing.
# NOTE(review): the radius expression duplicates the one inside isinBeijing();
# 16411 looks like an area in km^2 -- confirm the sqrt(area / (2 * pi)) formula.
beijing = dict(
    lat=40.190632,
    lon=116.412144,
    radius=sqrt(16411 / (np.pi * 2)),
)

In [4]:
# Load the taxi records with the "datetime" column as the index, then turn
# that index into real timestamps.
taxis = pd.read_csv("taxis2.csv", index_col="datetime")
taxis = taxis.set_index(pd.to_datetime(taxis.index))

Average taxi speed between two points

In [5]:
# Preview the first rows to check the columns and the datetime index.
taxis.head()

Unnamed: 0_level_0,taxiId,longitude,latitude,dayMoment,hours,days,res_T,res_D,speed,isinBeijing,zone
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2008-02-02 15:36:08,1,116.51172,39.92123,afternoon,15,Sat,,,,True,2
2008-02-02 15:46:08,1,116.51135,39.93883,afternoon,15,Sat,10.0,1.957285,0.195728,True,2
2008-02-02 15:56:08,1,116.51627,39.91034,afternoon,15,Sat,10.0,3.195604,0.31956,True,2
2008-02-02 16:06:08,1,116.47186,39.91248,afternoon,16,Sat,10.0,3.795226,0.379523,True,2
2008-02-02 16:16:08,1,116.47217,39.92498,afternoon,16,Sat,10.0,1.390188,0.139019,True,2


---- 

Cleaning

In [6]:
# (number of rows, number of columns) of the loaded frame.
taxis.shape

(474911, 11)

In [7]:
import folium

In [8]:
# Plot every recorded position of taxis 1, 2 and 5, coloured by part of day.
t1 = taxis[taxis["taxiId"].isin([1, 2, 5])]
color = {
    "morning": "yellow",
    "afternoon": "orange",
    "night": "blue",
    "afterMidnight": "black",
}
# NOTE(review): the name `map` shadows the Python builtin for the rest of the
# kernel session; kept because the notebook uses it as the map variable.
map = folium.Map(location=[beijing["lat"], beijing["lon"]])
# NOTE(review): confirm the 'stamenterrain' tile name is still accepted by the
# installed folium version.
folium.TileLayer('stamenterrain').add_to(map)
for lat, lon, moment in t1[['latitude', 'longitude', 'dayMoment']].itertuples(index=False):
    marker = folium.RegularPolygonMarker(
        location=[lat, lon], color=color[moment], radius=5
    )
    marker.add_to(map)

map

In [9]:
from matplotlib import colors as mcolors


# Flat list of colour values: the base colours followed by the CSS4 colours
# (CSS4 wins on duplicate names), with the first 20 values dropped.
# NOTE(review): 20 is a magic offset -- confirm why these entries are skipped.
colors = [*{**mcolors.BASE_COLORS, **mcolors.CSS4_COLORS}.values()][20:]

In [10]:
# Derive the zone centers from every recorded position, using defineZones'
# default zoneRadius of 15.
centers = defineZones(taxis)

In [11]:
# Plot taxi 2's positions coloured by their stored zone, then overlay every
# zone center with a filled circle.
# NOTE(review): this assumes the "zone" column was produced from the same
# `centers` ordering, so colors[zone] and colors[i] agree -- confirm.
t1 = taxis.loc[taxis["taxiId"] == 2]
map = folium.Map(location=[beijing["lat"], beijing["lon"]])
folium.TileLayer('stamenterrain').add_to(map)
for lat, lon, zone in t1[['latitude', 'longitude', 'zone']].itertuples(index=False):
    folium.RegularPolygonMarker(
        location=[lat, lon], color=colors[zone], radius=3
    ).add_to(map)
# centers holds (longitude, latitude) tuples; folium expects [lat, lon].
for i, (center_lon, center_lat) in enumerate(centers):
    folium.RegularPolygonMarker(
        location=[center_lat, center_lon], color="black", radius=5
    ).add_to(map)
    # NOTE(review): CircleMarker's radius is a screen size, not a ground
    # distance, so this circle does not scale with the 15 km zone radius;
    # confirm whether folium.Circle was intended.
    folium.CircleMarker(
        location=[center_lat, center_lon],
        popup="zone_{}".format(i),
        color=colors[i],
        radius=75,
        fill=True,
    ).add_to(map)
map

In [12]:
# Number of rows per zone label.
taxis["zone"].value_counts()

2     163306
1     124851
0      87802
11     18567
3      18505
12     15229
10     14525
7       8736
4       5171
25      3846
13      3438
8       1799
24      1705
6       1337
17      1328
15      1191
18       994
14       899
5        599
23       596
19       170
16       129
22        82
21        53
9         31
20        21
26         1
Name: zone, dtype: int64

In [13]:
import datetime

In [14]:
# Re-read the taxi records with a default integer index, parse the timestamps
# and derive weekday / hour / minute columns from them.
train1_df = (
    pd.read_csv("taxis2.csv")
    .assign(datetime=lambda frame: pd.to_datetime(frame["datetime"]))
    .assign(
        weekday=lambda frame: frame["datetime"].dt.weekday,
        hour=lambda frame: frame["datetime"].dt.hour,
        minute=lambda frame: frame["datetime"].dt.minute,
    )
)

In [15]:
# Display the frame, including the derived weekday / hour / minute columns.
train1_df

Unnamed: 0,datetime,taxiId,longitude,latitude,dayMoment,hours,days,res_T,res_D,speed,isinBeijing,zone,weekday,hour,minute
0,2008-02-02 15:36:08,1,116.51172,39.92123,afternoon,15,Sat,,,,True,2,5,15,36
1,2008-02-02 15:46:08,1,116.51135,39.93883,afternoon,15,Sat,10.000000,1.957285,0.195728,True,2,5,15,46
2,2008-02-02 15:56:08,1,116.51627,39.91034,afternoon,15,Sat,10.000000,3.195604,0.319560,True,2,5,15,56
3,2008-02-02 16:06:08,1,116.47186,39.91248,afternoon,16,Sat,10.000000,3.795226,0.379523,True,2,5,16,6
4,2008-02-02 16:16:08,1,116.47217,39.92498,afternoon,16,Sat,10.000000,1.390188,0.139019,True,2,5,16,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
474906,2008-02-07 13:03:05,10174,116.45776,39.79692,afternoon,13,Thu,4.716667,1.330247,0.282031,True,1,3,13,3
474907,2008-02-07 06:15:44,10216,116.35936,39.96986,morning,6,Thu,5.000000,0.110728,0.022146,True,2,3,6,15
474908,2008-02-06 08:51:30,10252,116.77860,40.36083,morning,8,Wed,4.733333,0.000000,0.000000,True,13,2,8,51
474909,2008-02-05 06:05:50,10299,116.31430,39.85684,morning,6,Tue,5.000000,0.001402,0.000280,True,0,1,6,5


In [16]:
from iteration_utilities import duplicates
from iteration_utilities import unique_everseen

In [17]:
# Keep only the coordinate columns of train1_df (same rows, same index).
dup = train1_df.reindex(columns=['longitude', 'latitude'])

In [18]:
# Display the longitude / latitude frame.
dup

Unnamed: 0,longitude,latitude
0,116.51172,39.92123
1,116.51135,39.93883
2,116.51627,39.91034
3,116.47186,39.91248
4,116.47217,39.92498
...,...,...
474906,116.45776,39.79692
474907,116.35936,39.96986
474908,116.77860,40.36083
474909,116.31430,39.85684


In [38]:
# Collect the longitude values of `dup` that occur more than once, each listed
# a single time (presumably: `duplicates` yields repeated items and
# `unique_everseen` removes repeats among them -- verify against the
# iteration_utilities documentation).
# NOTE(review): only longitude is compared, so a value listed here is not
# necessarily a repeated (longitude, latitude) point.
sam =list(unique_everseen(duplicates(dup['longitude'])))

In [39]:
# Display the collected longitude values.
sam

[116.69168,
 116.69167,
 116.69172,
 116.69171,
 116.6917,
 116.69164,
 116.69176,
 116.69156,
 116.69162,
 116.69161,
 116.69165,
 116.69169,
 116.69157,
 116.69155,
 116.69159,
 116.6916,
 116.69158,
 116.69163,
 116.69166,
 116.69149,
 116.6915,
 116.69152,
 116.69177,
 116.69154,
 116.69153,
 116.47179,
 116.69151,
 116.69173,
 116.69182,
 116.69185,
 116.69174,
 116.69146,
 116.69175,
 116.69179,
 116.69136,
 116.69141,
 116.69184,
 116.69137,
 116.69145,
 116.69147,
 116.48347,
 116.4215,
 116.4554,
 116.45478,
 116.48377,
 116.45531,
 116.48364,
 116.47237,
 116.45105,
 116.45053,
 116.44439,
 116.45766,
 116.4496,
 116.45525,
 116.45551,
 116.45507,
 116.44961,
 116.4581,
 116.45,
 116.38529,
 116.37216,
 116.45527,
 116.47166,
 116.45774,
 116.44972,
 116.42931,
 116.3571,
 116.41531,
 116.40533,
 116.45765,
 116.45601,
 116.45552,
 116.45474,
 116.45526,
 116.45529,
 116.44953,
 116.44324,
 116.45535,
 116.45208,
 116.44429,
 116.44434,
 116.35982,
 116.4555,
 116.455,
 116.4

In [46]:
# Rows recorded at either of two exact coordinate pairs.
# NOTE(review): exact float equality on hard-coded coordinates; `sam` is also
# reused here for a DataFrame after previously holding a list.
sam = train1_df[
    ((train1_df['longitude'] == 116.51172) & (train1_df['latitude'] == 39.92123))
    | ((train1_df['longitude'] == 116.51135) & (train1_df['latitude'] == 39.93883))
]

In [47]:
# Display the rows matching either coordinate pair.
sam

Unnamed: 0,datetime,taxiId,longitude,latitude,dayMoment,hours,days,res_T,res_D,speed,isinBeijing,zone,weekday,hour,minute
0,2008-02-02 15:36:08,1,116.51172,39.92123,afternoon,15,Sat,,,,True,2,5,15,36
1,2008-02-02 15:46:08,1,116.51135,39.93883,afternoon,15,Sat,10.0,1.957285,0.195728,True,2,5,15,46
404559,2008-02-07 19:00:30,715,116.51172,39.92123,night,19,Thu,0.166667,0.03423,0.20538,True,2,3,19,0


In [48]:
# Keep only the rows that belong to taxi 1.
sam = sam.loc[sam['taxiId'] == 1]

In [49]:
# Minutes elapsed since the earliest selected record, taken from the full
# timestamp. The previous hour*60+minute encoding dropped the date and the
# seconds, so differences were wrong for records on different days or across
# midnight.
sam1 = (sam['datetime'] - sam['datetime'].min()).dt.total_seconds() / 60

In [50]:
# Gap from each record to the next one; the last record has no successor and
# becomes NaN.
sam = sam1.shift(-1).sub(sam1)

In [51]:
# Average of the gaps, ignoring the trailing NaN.
# NOTE(review): `sam` ends up as a scalar here, so the earlier `sam` cells
# cannot be re-run without starting again from the cell that builds it.
sam = np.mean(sam.dropna())

In [52]:
# Display the resulting mean value.
sam

10.0