In [1]:
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv(r'../data-sample/TaxiData-sample', header=None)
data.columns = ['VehicleNum', 'Stime', 'Lng', 'Lat', 'OpenStatus', 'Speed']
data.head()

Unnamed: 0,VehicleNum,Stime,Lng,Lat,OpenStatus,Speed
0,22271,22:54:04,114.167,22.718399,0,0
1,22271,18:26:26,114.190598,22.6478,0,4
2,22271,18:35:18,114.201401,22.6497,0,0
3,22271,16:02:46,114.233498,22.725901,0,24
4,22271,21:41:17,114.233597,22.7209,0,19


In [3]:
# 筛选特定车辆ID的序列
data[data['VehicleNum']==22271].head()
# 使用合适的方式去除数据
data[-(data['VehicleNum']==22271)].head()

Unnamed: 0,VehicleNum,Stime,Lng,Lat,OpenStatus,Speed
1437,35807,01:53:46,113.809898,22.626801,0,0
1438,35807,01:43:46,113.813301,22.6236,0,0
1439,35807,01:14:15,113.847,22.5947,0,41
1440,35807,02:01:41,113.852501,22.6257,0,22
1441,35807,01:01:59,113.897003,22.551901,0,42


In [7]:
# 筛选特定列的数据
data[['Stime']].head()
# 可以通过两种形式创建新的列数据
data['Speed1']=data['Speed']*2
# Or
data.loc[:, 'Speed1']=data['Speed']*2
data.head()

Unnamed: 0,VehicleNum,Stime,Lng,Lat,OpenStatus,Speed,Speed1
0,22271,22:54:04,114.167,22.718399,0,0,0
1,22271,18:26:26,114.190598,22.6478,0,4,8
2,22271,18:35:18,114.201401,22.6497,0,0,0
3,22271,16:02:46,114.233498,22.725901,0,24,48
4,22271,21:41:17,114.233597,22.7209,0,19,38


In [8]:
# drop方法不会直接对数据生效，需要赋值
data = data.drop(['Speed1'], axis=1)
data.head()
# 数据操作后不应使用标签索引，这会带来错误，应该使用整数索引
data['Stime'].iloc[3]

'16:02:46'

In [9]:
# 排序默认升序，以第一列值作为主键
data=data.sort_values(by=['VehicleNum', 'Stime'])
data.head()

Unnamed: 0,VehicleNum,Stime,Lng,Lat,OpenStatus,Speed
39,22271,00:00:49,114.266502,22.728201,0,0
397,22271,00:01:48,114.266502,22.728201,0,0
1413,22271,00:02:47,114.266502,22.728201,0,0
244,22271,00:03:46,114.266502,22.728201,0,0
247,22271,00:04:45,114.268898,22.7295,0,11


data[condition] is to keep qualified data
data[-(condition)] is to delete qualified data

In [None]:
# 去除异常值 shift()将序列向后移动，shift(-1)序列向前移动
data = data[-(
    (data['OpenStatus'].shift(-1) == data['OpenStatus'].shift()) &
    (data['VehicleNum'].shift(-1) != data['OpenStatus']) &
    (data['VehicleNum'].shift(-1) == data['VehicleNum'].shift()) &
    (data['VehicleNum'].shift(-1) == data['VehicleNum'])
)]
# 定义所谓异常值，OpenStatus数据应为连续的，通过移动序列判断突变值，以进行筛选

In [10]:
# 按照索引段提取车辆的运行段
data.loc[:, 'OpenStatus1'] = data['OpenStatus'].shift(-1)
data.loc[:, 'VehicleNum1'] = data['VehicleNum'].shift(-1)
data.loc[:, 'Lng1'] = data['Lng'].shift(-1)
data.loc[:, 'Lat1'] = data['Lat'].shift(-1)
data.loc[:, 'Stime1'] = data['Stime'].shift(-1)
data.loc[:, 'StatusChage'] = data['OpenStatus1']-data['OpenStatus']
data.head()
# 找出发生条件转换的时间片，同时要求车辆编号没有发生变化
data = data[(
    (data['StatusChage']==-1 | data['StatusChage']==1) &
    (data['VehicleNum']==data['VehicleNum1'])
)]
data = data[['VehicleNum', 'Stime', 'Lng', 'Lat', 'StatusChange']]
# 重命名列名
data = data.rename(columns={
    'Lng' : 'SLng',
    'Lat' : 'SLat'
})
# 将索引的下一个项作为OD的重点数值
data['ELng'] = data['SLng'].shift(-1)
data['ELat'] = data['SLat'].shift(-1)
data['Etime'] = data['Stime'].shift(-1)
# 定义StatusChage为正条件下，即起始点
data = data[data['StatusChage']==1]
data = data.drop('StatusChange', axis=1)

Unnamed: 0,VehicleNum,Stime,Lng,Lat,OpenStatus,Speed,OpenStatus1,VehicleNum1,Lng1,Lat1,Stime1,StatusChage
39,22271,00:00:49,114.266502,22.728201,0,0,0.0,22271.0,114.266502,22.728201,00:01:48,0.0
397,22271,00:01:48,114.266502,22.728201,0,0,0.0,22271.0,114.266502,22.728201,00:02:47,0.0
1413,22271,00:02:47,114.266502,22.728201,0,0,0.0,22271.0,114.266502,22.728201,00:03:46,0.0
244,22271,00:03:46,114.266502,22.728201,0,0,0.0,22271.0,114.268898,22.7295,00:04:45,0.0
247,22271,00:04:45,114.268898,22.7295,0,11,0.0,22271.0,114.272003,22.731199,00:05:44,0.0
