In [1]:
import os
import folium
import random
import collections
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from shapely.geometry import Polygon, Point

In [2]:
bikes = pd.read_csv('../cleaned_data/removed_abnormal.csv')
if 'Unnamed: 0' in bikes.columns:
    bikes.drop('Unnamed: 0', axis = 1)

In [3]:
inbikes = bikes[bikes['LOCK_STATUS'] == 1]
outbikes = bikes[bikes['LOCK_STATUS'] == 0]
fences = pd.read_csv('../cleaned_data/fence_position.csv')

In [4]:
fences.sample(5)

Unnamed: 0,FENCE_ID,LONGITUDE_0,LATITUDE_0,LONGITUDE_1,LATITUDE_1,LONGITUDE_2,LATITUDE_2,LONGITUDE_3,LATITUDE_3,LONGITUDE_4,LATITUDE_4,LATITUDE,LONGITUDE,ROAD,AREA,FID
12420,金民路_L_B30004,118.145106,24.4969,118.145121,24.496947,118.145135,24.496943,118.14512,24.496896,118.145106,24.4969,24.496921,118.145121,金民路,4.196997,12420
3939,后浦二路0_L_A20009,118.138429,24.499611,118.13847,24.499612,118.138471,24.499598,118.138429,24.499596,118.138429,24.499611,24.499604,118.13845,后浦二路,3.521087,3939
10084,祥店路_R_5,118.137135,24.504188,118.137206,24.504168,118.137201,24.504154,118.137131,24.504174,118.137135,24.504188,24.504171,118.137168,祥店路,6.244606,10084
8157,海天路_R_14,118.094397,24.507145,118.094313,24.507143,118.094313,24.507157,118.094396,24.507159,118.094397,24.507145,24.507151,118.094355,海天路,7.042396,8157
9900,白鹭洲路0_L_7,118.086445,24.473495,118.086435,24.47344,118.086418,24.473443,118.086428,24.473497,118.086445,24.473495,24.473469,118.086432,白鹭洲路,5.734251,9900


In [5]:
random.seed(0)
def random_points_within(poly, num_points):
    min_x, min_y, max_x, max_y = poly.bounds

    points = []

    while len(points) < num_points:
        random_point = Point([random.uniform(min_x, max_x), random.uniform(min_y, max_y)])
        if (random_point.within(poly)):
            points.append(random_point)

    return points


fid_posdic = dict()
for i in range(len(fences)):
    area = fences['AREA'][i]
    p1, p2, p3, p4, p5 = (fences['LATITUDE_0'][i], fences['LONGITUDE_0'][i]), (fences['LATITUDE_1'][i], fences['LONGITUDE_1'][i]), (fences['LATITUDE_2'][i], fences['LONGITUDE_2'][i]), (fences['LATITUDE_3'][i], fences['LONGITUDE_3'][i]), (fences['LATITUDE_4'][i], fences['LONGITUDE_4'][i])
    base = [p1, p2, p3, p4]
    if area > 5:
        num_points = area // 1 + 1
        poly = Polygon([p1, p2, p3, p4, p5])
        points = random_points_within(poly, num_points)
        for p in points:
            base.append((p.x, p.y))
    fid_posdic[i] = base


In [6]:
fid_reverse_proj = collections.defaultdict(list)
x = set()
for k, v in fid_posdic.items():
    fence_id = k
    for pos in v:
        fid_reverse_proj[pos].append(k)
        x.add(pos)
x = list(x)

In [7]:
fence_cluster = DBSCAN(eps = 0.00015, min_samples = 1).fit_predict(x)

In [8]:
cluster_cnts = collections.Counter(fence_cluster)
point_cnts = sorted([(v, k) for k, v in cluster_cnts.items()], reverse = True)
print("Number of clusters")
print(str(len(cluster_cnts)))
print("Max/min number of points in clusters")
print(str(point_cnts[0]))
print(str(point_cnts[-1]))

Number of clusters
6073
Max/min number of points in clusters
(426, 16)
(4, 76)


In [9]:
fence_belong = dict()
for i in range(len(fence_cluster)):
    point = x[i]
    belong = fence_cluster[i]
    curfences = fid_reverse_proj[point]
    for f in curfences:
        if f in fence_belong and fence_belong[f] != belong:
            print("ERROR")
            print(fence_belong[f])
        fence_belong[f] = belong

In [10]:
clusters = []
for i in range(len(fences)):
    clusters.append(fence_belong[i])
fences['CLUSTER'] = clusters

In [11]:
cluster_area = fences.groupby('CLUSTER').agg({'AREA':sum})
cluster_area

Unnamed: 0_level_0,AREA
CLUSTER,Unnamed: 1_level_1
0,72.535292
1,8.842340
2,18.079511
3,92.350318
4,33.660985
...,...
6068,4.024273
6069,4.053196
6070,4.878191
6071,3.731820


In [12]:
in_outliers = inbikes[(inbikes['NEAREST_FENCE'] == -1) | (inbikes['DISTANCE'] > 50)]
out_outliers = outbikes[(outbikes['NEAREST_FENCE'] == -1)|(outbikes['DISTANCE'] > 50)]
inbikes = inbikes[(inbikes['NEAREST_FENCE'] != -1) & (inbikes['DISTANCE'] <= 50)]
outbikes = outbikes[(outbikes['NEAREST_FENCE'] != -1) & (outbikes['DISTANCE'] <= 50)]

In [13]:
bikes_in = pd.merge(inbikes,fences, left_on='NEAREST_FENCE', right_on='FID',how='left')
bikes_in.drop('NEAREST_FENCE', axis = 1, inplace = True)
bikes_out = pd.merge(outbikes,fences, left_on='NEAREST_FENCE', right_on='FID',how='left')
bikes_out.drop('NEAREST_FENCE', axis = 1, inplace = True)

In [14]:
in_flow = bikes_in.groupby('CLUSTER').agg({'CLUSTER':'count'})
in_flow

Unnamed: 0_level_0,CLUSTER
CLUSTER,Unnamed: 1_level_1
0,45
1,25
2,98
3,47
4,9
...,...
6067,5
6069,16
6070,10
6071,37


In [15]:
out_flow = bikes_out.groupby('CLUSTER').agg({'CLUSTER':'count'})

In [16]:
cluster_df = pd.DataFrame()
cluster_df['CLUSTER'] = cluster_area.index
cluster_df['AREA'] = cluster_area.values.ravel()
inflow = [0]*len(cluster_cnts)
for k in cluster_area.index:
    if k in in_flow.index:
        inflow[k] = in_flow.loc[k].values[0]
cluster_df['INFLOW'] = inflow
outflow = [0]*len(cluster_cnts)
for k in cluster_area.index:
    if k in out_flow.index:
        outflow[k] = out_flow.loc[k].values[0]
cluster_df['OUTFLOW'] = outflow
cluster_df['INFLOW_DENSITY'] = cluster_df.apply(lambda x: x['INFLOW']/x['AREA'], axis = 1)
cluster_df['FLOW'] = cluster_df.apply(lambda x: x['INFLOW'] - x['OUTFLOW'], axis = 1)
cluster_df['FLOW_DENSITY'] = cluster_df.apply(lambda x: (x['INFLOW'] - x['OUTFLOW'])/x['AREA'], axis = 1)

In [17]:
cluster_df.head(5)

Unnamed: 0,CLUSTER,AREA,INFLOW,OUTFLOW,INFLOW_DENSITY,FLOW,FLOW_DENSITY
0,0,72.535292,45,57,0.620388,-12.0,-0.165437
1,1,8.84234,25,15,2.827306,10.0,1.130922
2,2,18.079511,98,36,5.420501,62.0,3.429296
3,3,92.350318,47,97,0.508932,-50.0,-0.541417
4,4,33.660985,9,197,0.267372,-188.0,-5.585101


In [18]:
sort_by_inflow = cluster_df.sort_values("INFLOW", ascending = False)
top_40_inflow = sort_by_inflow['CLUSTER'].values[:40]
top_40_inflow

array([1390, 3660,  176,  961, 3951, 1522, 1115,  598, 1659, 1301,   68,
        240, 1549, 1320, 1238, 3777, 1674, 2268, 3599, 2664, 2166, 2378,
       5663, 3813, 1100, 3046, 3684,   64,  995,  311, 4639, 4089, 3104,
       1261, 2846, 3063,  792,  692, 4736, 4519], dtype=int64)

In [19]:
sort_by_inflow_density = cluster_df.sort_values("INFLOW_DENSITY", ascending = False)
top_40_inflow_density = sort_by_inflow_density['CLUSTER'].values[:40]
top_40_inflow_density

array([ 176, 3951, 3684, 5764,   76, 3660, 4687, 4639, 3599, 4600, 4184,
       1390, 5399, 5199, 5752, 4252, 2681, 5404, 5663, 4712, 3063, 4185,
       4736, 1301, 3929, 4576, 2995, 5498, 1853, 3886, 4847, 1592, 3887,
        484, 5723, 5210,  442, 5833, 1381,   64], dtype=int64)

In [20]:
sort_by_flow = cluster_df.sort_values("FLOW", ascending = False)
top_40_flow = sort_by_flow['CLUSTER'].values[:40]
top_40_flow

array([1390, 3951,  176, 3660, 1659,  961,  598, 1549, 1320, 2268, 1674,
       2166, 1100,  995, 3684, 3777,   64, 3104, 2378, 1702,  311, 1642,
       1853, 4519, 4736, 2681,  384, 1960, 3063,  720, 1301, 4687, 1952,
       2955,  240, 1023, 4600,  890, 5446, 4176], dtype=int64)

In [21]:
sort_by_flow_density = cluster_df.sort_values("FLOW_DENSITY", ascending = False)
top_40_flow_density = sort_by_flow_density['CLUSTER'].values[:40]
top_40_flow_density

array([ 176, 3951, 3684, 4687, 3660, 4600, 1390, 5399, 5752, 2681, 4712,
       1853, 5498, 4736, 3929, 3063,  484, 4252,   64, 5210,  384, 2122,
       5833, 4639, 3777, 3599, 4887, 2558,  720, 5388, 4185, 4184, 4251,
       5696, 2927, 4519, 5448, 5205, 5121, 1301], dtype=int64)

In [22]:
def export_res(cluster_keys, fname, fences_df):
    f = open(fname, 'w', encoding = 'utf-8')
    f.writelines('FENCE_ID|FENCE_TYPE|BELONG_AREA')
    f.write('\n')
    for i in range(len(fences_df)):
        fence_id = fences_df['FENCE_ID'][i]
        cluster = fences_df['CLUSTER'][i]
        if cluster in cluster_keys:
            is_crowded = 1
        else:
            is_crowded = 0
        string = fence_id + '|' + str(is_crowded) + '|' + str(cluster)
        f.writelines(string)
        f.write('\n')
    f.close()

In [23]:
def crowd_fence_position(cluster_keys, fences_df):
    pos = []
    for i in range(len(fences_df)):
        fence_id = fences_df['FENCE_ID'][i]
        cluster = fences_df['CLUSTER'][i]
        if cluster in cluster_keys:
            pos.append((fences_df['LATITUDE'][i], fences_df['LONGITUDE'][i]))
    return pos

In [24]:
sort_by_inflow_points = crowd_fence_position(top_40_inflow, fences)
print(len(sort_by_inflow_points))

207


In [25]:
bikes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 339810 entries, 0 to 339809
Data columns (total 11 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Unnamed: 0     339810 non-null  int64  
 1   BID            339810 non-null  int64  
 2   BICYCLE_ID     339810 non-null  object 
 3   LATITUDE       339810 non-null  float64
 4   LONGITUDE      339810 non-null  float64
 5   LOCK_STATUS    339810 non-null  int64  
 6   UPDATE_TIME    339810 non-null  object 
 7   GRID           339810 non-null  object 
 8   NEAREST_FENCE  339810 non-null  int64  
 9   DISTANCE       339810 non-null  float64
 10  MKTIME         339810 non-null  float64
dtypes: float64(4), int64(4), object(3)
memory usage: 28.5+ MB


In [26]:
inbikes.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 151876 entries, 1 to 339808
Data columns (total 11 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Unnamed: 0     151876 non-null  int64  
 1   BID            151876 non-null  int64  
 2   BICYCLE_ID     151876 non-null  object 
 3   LATITUDE       151876 non-null  float64
 4   LONGITUDE      151876 non-null  float64
 5   LOCK_STATUS    151876 non-null  int64  
 6   UPDATE_TIME    151876 non-null  object 
 7   GRID           151876 non-null  object 
 8   NEAREST_FENCE  151876 non-null  int64  
 9   DISTANCE       151876 non-null  float64
 10  MKTIME         151876 non-null  float64
dtypes: float64(4), int64(4), object(3)
memory usage: 13.9+ MB


In [27]:
outbikes.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 144675 entries, 0 to 339809
Data columns (total 11 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Unnamed: 0     144675 non-null  int64  
 1   BID            144675 non-null  int64  
 2   BICYCLE_ID     144675 non-null  object 
 3   LATITUDE       144675 non-null  float64
 4   LONGITUDE      144675 non-null  float64
 5   LOCK_STATUS    144675 non-null  int64  
 6   UPDATE_TIME    144675 non-null  object 
 7   GRID           144675 non-null  object 
 8   NEAREST_FENCE  144675 non-null  int64  
 9   DISTANCE       144675 non-null  float64
 10  MKTIME         144675 non-null  float64
dtypes: float64(4), int64(4), object(3)
memory usage: 13.2+ MB


In [28]:
m = folium.Map(location=[24.527352, 118.10321675], zoom_start=13, zoom_control='False',
               attr='AutoNavi')
for point in sort_by_inflow_points:
    folium.Marker(point).add_to(m)
m

In [29]:
sort_by_inflow_density_points = crowd_fence_position(top_40_inflow_density, fences)
print(len(sort_by_inflow_density_points))
m = folium.Map(location=[24.527352, 118.10321675], zoom_start=13, zoom_control='False',
               attr='AutoNavi')
for point in sort_by_inflow_density_points:
    folium.Marker(point).add_to(m)
m

49


In [30]:
sort_by_flow_points = crowd_fence_position(top_40_flow, fences)
print(len(sort_by_flow_points))
m = folium.Map(location=[24.527352, 118.10321675], zoom_start=13, zoom_control='False',
               attr='AutoNavi')
for point in sort_by_flow_points:
    folium.Marker(point).add_to(m)
m

187


In [31]:
sort_by_flow_density_points = crowd_fence_position(top_40_flow_density, fences)
print(len(sort_by_flow_density_points))
m = folium.Map(location=[24.527352, 118.10321675], zoom_start=13, zoom_control='False',
               attr='AutoNavi')
for point in sort_by_flow_density_points:
    folium.Marker(point).add_to(m)
m

56


In [32]:
export_res(top_40_flow,'../result.txt',fences) 
#eps=20m, 50m外不算,得分19.6322
#eps=15m, 50m外不算

In [33]:
f = pd.read_csv('../result.txt', sep = '|')

In [34]:
f.head()

Unnamed: 0,FENCE_ID,FENCE_TYPE,BELONG_AREA
0,长乐路0_L_A17001,0,508
1,长乐路0_L_A17002,0,508
2,长乐路0_L_A17003,0,508
3,长乐路0_L_A17004,0,508
4,长乐路0_L_A17005,0,508


In [35]:
sum(f['FENCE_TYPE'])

187